Example #1
def download(url, number, timeout):
    print("bitrating_alerts")
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    # Check whether the page loaded successfully
    if response.status_code == 200:
        try:
            html = etree.HTML(response.text)
            # Extract the title, article body, author, and source
            title = html.xpath(
                '/html/body/section/div[1]/div/header/div[1]/h1/a/text()')[0]
            texts = html.xpath(
                '/html/body/section/div[1]/div/article/p/text()')
            author = html.xpath(
                '/html/body/section/div[1]/div/article/div/text()')[0].split(
                )[0]
            source = "比特评级--快讯:" + url
            # The article carries a recommendation block and a disclaimer
            recommends = html.xpath('//div[@class="asb-post-footer"]')[0]
            recommend = etree.tostring(recommends,
                                       method="text",
                                       encoding="utf8").decode("utf8")
            recommend += ": https://bitrating.com/wenda"
            statement = html.xpath(
                '/html/body/section/div[1]/div/div[3]/text()')[0]
            # Store the extracted data in the database
            storage(number, title, timeout, author, source, recommend,
                    statement, texts)
        except Exception as err:
            mistake(url, err)
    else:
        err = response.status_code
        mistake(url, err)
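
All of these examples lean on project helpers that the listing does not show: `headers.header()` (request headers), `storage(...)` (database write), `mistake(url, err)` (error logging), plus `requests` and `lxml.etree`. A minimal sketch of plausible stand-ins so a snippet can be exercised in isolation; the bodies below are assumptions, not the project's real implementations:

import logging

import requests
from lxml import etree  # the snippets use this as `etree`


class headers:
    # Assumed shape: header() returns a plain dict of request headers.
    @staticmethod
    def header():
        return {"User-Agent": "Mozilla/5.0"}


def header():
    # A few examples call header() unqualified; same assumption as above.
    return headers.header()


def mistake(url, err):
    # Assumed behavior: record the failing URL and the error.
    logging.error("failed to crawl %s: %s", url, err)


def storage(*fields, **named):
    # Assumed behavior: persist the extracted fields; stubbed out here.
    print("would store:", fields, named)
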
Example #2
def download(url, number):
    print('hashcaijing')
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    # Check whether the page loaded successfully
    if response.status_code == 200:
        try:
            # Extract the title, publish time, author, and source
            html = etree.HTML(response.text)
            title = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[1]/b/text()')[0]
            if not title:
                return True
            timeout = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[2]/i[1]/text()')[0]
            author = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[2]/i[2]/text()')[0]
            source = "哈希财经" + ":" + url
            texts = html.xpath('//div[@class="contentNews"]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            storage(number, title, timeout, author, source, text)
        except Exception as err:
            mistake(url, err)
    else:
        err = response.status_code
        mistake(url, err)
Example #3
def download(html, number, url):
    try:
        print("youjiatuanjian")
        # Extract the title
        title = html.xpath(
            '//*[@id="article-wrap"]/div/div[1]/div[1]/text()')[0]
        # Extract the source name and URL
        source_name = html.xpath(
            '//*[@id="article-wrap"]/div/div[1]/div[3]/p[1]/span/text()')[0]
        source_url = html.xpath(
            '//*[@id="article-wrap"]/div/div[2]/div[1]/div[1]/a/@href')[0]
        source = source_name + "--http://youjiatuanjian.com" + source_url
        # Extract the publish time
        timeout = html.xpath(
            '//*[@id="article-wrap"]/div/div[1]/div[2]/div[2]/span/text()')[0]
        # Extract the article author
        author = html.xpath(
            '//*[@id="article-wrap"]/div/div[1]/div[2]/div[1]/span/text()')[0]
        # Extract the article body
        texts = html.xpath('//*[@id="article-wrap"]/div/div[1]/div[3]')[0]
        main_text = etree.tostring(texts, method="text",
                                   encoding="utf8").decode("utf8").split()
        mains = main_text[1:]
        storage(number, title, author, timeout, source, mains)
    except Exception as err:
        mistake(url, err)
Example #4
def starts(headers):
    # Start from the homepage and collect article URLs
    s = 0
    url = "https://www.jinse.com"
    while True:
        response = requests.get(url, headers=headers)
        response.encoding = "utf-8"
        if response.status_code == 200:
            html = response.text
            # Collect all news URLs on the homepage
            pattern = re.compile(r'[a-zA-Z]+://www\.jinse\.com[^\s]*\.html')
            urls = re.findall(pattern, html)
            urls = list(set(urls))
            for url in urls:
                if response.status_code == 200:
                    titleUrl(url, headers)
                else:
                    err = response.status_code
                    mistake(url, err)
                    break
            break
        else:
            # Retry loading the page up to three times
            if s == 3:
                err = response.status_code
                mistake(url, err)
                break
            s += 1
Example #5
def download(number):
    url = 'http://www.zhilianfm.com/zlfmCms/kx/%s.jhtml' % number
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = etree.HTML(response.text)
        classify = html.xpath(
            '/html/body/section/legend/a[2]/text()')[0].split()[0]
        if classify != "快讯":
            return
        print("zhilianfm_alerts")
        # Extract the title
        title = html.xpath('/html/body/div[2]/section/h1/text()')[0]
        author_timeout_source = html.xpath(
            '/html/body/div[2]/section/div[1]/text()')[0].split()
        # Extract the author
        author = author_timeout_source[1]
        # Extract the publish time
        timeout = author_timeout_source[0]
        # Extract the information source
        source = author_timeout_source[2]
        # Extract the article body
        texts = html.xpath(
            '/html/body/div[2]/section/div[2]/text()')[0].split()
        text = " ".join(texts)
        storage(number, title, author, timeout, source, text, classify)
    else:
        err = response.status_code
        mistake(url, err)
        return True
Example #6
def getUrl(response):
    html = response.text
    pattern = re.compile(r"news/[^\s]*/\d+")
    urls = re.findall(pattern, html)
    url_number = []
    # Find the highest article number on the page
    for i in urls:
        pattern_num = re.compile(r"\d+")
        num = re.findall(pattern_num, i)
        url_number.append(int(num[0]))
    max_number = max(url_number)
    reload = 0
    while True:
        try:
            url = "https://www.btc123.com/news/newsDetails/%s" % max_number
            response_news = requests.get(url, headers=headers.header())
            response_news.encoding = "utf-8"
            # Skip if this entry is already stored in the database
            if rechecking(max_number, come_from="btc123"):
                break
            if response_news.status_code == 200:
                download(response_news, url, max_number)
                max_number -= 1
            else:
                err = response_news.status_code
                mistake(url, err)
                # Retry loading the page up to three times
                if reload == 3:
                    break
                reload += 1
        except Exception:
            if reload == 3:
                break
            reload += 1
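
Several examples deduplicate against the database through the unshown `rechecking(number, come_from=...)` helper before downloading. A minimal sqlite3 sketch of that idea; the `news.db` file and the `news` table layout are assumptions:

import sqlite3

conn = sqlite3.connect("news.db")
conn.execute("CREATE TABLE IF NOT EXISTS news "
             "(number TEXT, come_from TEXT, UNIQUE(number, come_from))")


def rechecking(number, come_from):
    # True if this (number, source) pair is already stored.
    row = conn.execute(
        "SELECT 1 FROM news WHERE number = ? AND come_from = ?",
        (str(number), come_from),
    ).fetchone()
    return row is not None
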
Example #7
def starts():
    url = "http://www.huoxing24.com/"
    try:
        # Look up news items from the homepage
        response = requests.get(url, headers=header())
        response.encoding = "utf-8"
        html = response.text
        pattern = re.compile(r'<div class="index-news-list">[\s\S]*?<div class="shadow">')
        texts = re.findall(pattern, html)
        for text in texts:
            # Load each matched news URL in turn
            pattern = re.compile(r'[a-zA-Z]+://[^\s]*\.html')
            url = re.findall(pattern, text)[0]
            response = requests.get(url, headers=header())
            response.encoding = "utf-8"
            # Check whether the URL loads
            if response.status_code == 200:
                html = response.text
                download(html, url)
            else:
                err = response.status_code
                mistake(url, err)
    except Exception as err:
        mistake(url, err)
Example #8
def starts():
    # News listing endpoints for the site
    urls = [
        'http://39.108.117.97:8082/hotNewsList?size=10&page=%s&subType=',
        "http://39.108.117.97:8082/blockChainList?size=10&page=%s&subType="
    ]
    for i in urls:
        subType = 0
        # Each endpoint has three subtypes, six news categories in total
        while subType < 3:
            page = 1
            url_page = i + str(subType)
            # Page through the listing to collect article URLs
            while True:
                url = url_page % page
                response = requests.get(url, headers=headers.header())
                response.encoding = "utf-8"
                if response.status_code == 200:
                    data = getUrl(response)
                    if data:
                        break
                else:
                    err = response.status_code
                    mistake(url, err)
                    break
                page += 1
            subType += 1
Example #9
def download(response, url, number):
    try:
        print("tuoluocaijing")
        html = etree.HTML(response.text)
        # Extract the title
        title = html.xpath('/html/body/div[6]/div[1]/div/h1/text()')[0]
        # Extract the publish time
        timeout = html.xpath(
            '/html/body/div[6]/div[1]/div/div[1]/span[3]/text()')[0]
        # Extract the author and the author's page URL
        author_name = html.xpath(
            '/html/body/div[6]/div[1]/div/div[1]/span[1]/a/text()')[0]
        author_url = html.xpath(
            '/html/body/div[6]/div[1]/div/div[1]/span[1]/a/@href')[0]
        author = author_name + "--https://www.tuoluocaijing.cn" + author_url
        # Extract the tags
        label = html.xpath('/html/body/div[6]/div[1]/div/div[3]/a/text()')
        # Extract the body text
        texts = html.xpath("/html/body/div[6]/div[1]/div/div[2]")[0]
        text = etree.tostring(texts, method="text",
                              encoding="utf8").decode("utf8").split()
        # The article carries a disclaimer
        statement = html.xpath('/html/body/div[6]/div[1]/div/p/text()')
        source = "陀螺财经--:https://www.tuoluocaijing.cn/"
        storage(number, title, timeout, author, source, text, statement, label)
    except Exception as err:
        mistake(url, err)
Example #10
def starts():
    n = 1
    reload = 0
    while True:
        url = "http://weilaicaijing.com/api/Fastnews/lists?search_str=&page=%s" % n
        response = requests.get(url, headers=headers.header())
        response.encoding = "utf-8"
        # Check whether the page loaded
        if response.status_code == 200:
            already = download(response, url)
            if already:
                break
            n += 1
        # This site often returns 503; retry up to five times
        elif response.status_code == 503:
            if reload == 5:
                err = response.status_code
                mistake(url, err)
                break
            reload += 1
        else:
            # Otherwise retry up to three times
            if reload == 2:
                err = response.status_code
                mistake(url, err)
                break
            reload += 1
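
The hand-rolled retry counters in this and the surrounding examples all follow the same shape: re-request on failure, give up after a fixed number of attempts. A sketch of that pattern factored into one helper; the name `get_with_retries` and its defaults are hypothetical:

import time

import requests


def get_with_retries(url, headers, retries=3, retry_statuses=(503,), delay=1):
    # Hypothetical helper: GET a URL, retrying on exceptions or on the given
    # status codes; returns the response, or None if every attempt failed.
    for attempt in range(retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.encoding = "utf-8"
            if response.status_code == 200:
                return response
            if response.status_code not in retry_statuses:
                return response  # let the caller report the error
        except requests.RequestException:
            pass  # fall through and retry
        if attempt < retries:
            time.sleep(delay)
    return None
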
Example #11
def download(number):
    # URL of the article detail endpoint
    url = 'http://39.108.117.97:8082/geek/infoDetail/1/%s' % number
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        # Fetch the response body
        html = response.text
        # Parse the JSON text into a dict
        texts = json.loads(html)
        data = texts["data"]
        # Convert the millisecond timestamp into a publish time
        num = float(data["releasedTime"]) / 1000
        timeout = time.asctime(time.localtime(num))
        # Extract the article body from the HTML fragment
        main_html = data["content"]
        main_html = etree.HTML(main_html)
        main_text = etree.tostring(main_html, method="text",
                                   encoding="utf8").decode("utf8").split()
        # Collect the images used in the article
        img = main_html.xpath('//img/@src')
        storage(data, timeout, main_text, img)
    else:
        err = response.status_code
        mistake(url, err)
Example #12
def divide(i):
    try:
        print("tuoniaox_alerts")
        # Parse the HTML fragment matched by the regex into an element tree
        html = etree.HTML(i)
        # Extract the publish time
        timeout = html.xpath('//span/text()')[0]
        # Extract the body text
        texts = html.xpath('//p/text()')[0].split()
        text = " ".join(texts)
        # Split out the title and the publish date
        pattern = re.compile(r"【[\s\S]*?】")
        title = re.findall(pattern, text)[0]
        # There is no explicit id, so deduplicate by title
        if title_find(title, come_from="tuoniaox_alerts"):
            return
        pattern = re.compile(r"\d月\d日")
        accurate = re.findall(pattern, text)
        if accurate:
            accurate = accurate[0]
        else:
            accurate = ""
        author = "鸵鸟区块链:https://www.tuoniaox.com/"
        # Check whether the piece is an original by this site
        source = html.xpath('//a/@href')
        if source:
            source = "负责编译--原文:" + source[0]
        else:
            source = "鸵鸟区块链--快讯"
        storage(title, author, timeout, accurate, source, text)
    except Exception as err:
        mistake(url="https://www.tuoniaox.com/", err=err)
Example #13
def getUrl(response):
    pattern = re.compile(r'/kuaixun/detail-\d+\.html')
    urls = re.findall(pattern, response.text)
    urls = list(set(urls))
    for i in urls:
        reload = 0
        try:
            url = "https://www.tuoluocaijing.cn" + i
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                pattern_num = re.compile(r"\d+")
                number = re.findall(pattern_num, url)[0]
                if rechecking(number, come_from="tuoluocaijing_alerts"):
                    break
                download(response, number, url)
            else:
                err = response.status_code
                mistake(url, err)
                if reload == 3:
                    break
                reload += 1
        except Exception:
            if reload == 3:
                break
            reload += 1
Example #14
def download(url):
    print("jinse_alerts")
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    # Check whether the page loaded
    if response.status_code == 200:
        # Match the time and body pieces, then combine them
        try:
            html = etree.HTML(response.text)
            texts = html.xpath('//*[@class="tc"]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            times = text[0] + text[1]
            texts = html.xpath('//*[@class="time-detail"]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            times = times + "--" + text[0]
            texts = html.xpath('//*[@class="intro-detail"]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            titles = text[0:3]
            authors = text[-5:]
            mains = text[3:-5]
            storage(titles, authors, times, url, mains)
        except Exception as err:
            mistake(url, err)
        return False
    else:
        err = response.status_code
        mistake(url, err)
        return True
Example #15
def starts():
    n = 8208
    tf = True
    # Check whether the database already has content
    number = max_id(come_from="polo321_alerts")
    if number:
        n = number + 20
        tf = False
    while True:
        try:
            url = "http://39.108.117.97:8082/lives/getList?Id=%s&flag=down" % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                data = classify(response)
                if data:
                    break
            else:
                err = response.status_code
                mistake(url, err)
                break
            # On the first run walk the ids downward, otherwise upward
            if tf:
                n -= 20
            else:
                n += 20
        except TimeoutError:
            time.sleep(10)
Example #16
def download(html, number, url):
    # Check whether the article embeds images
    img = html.xpath('//*[@id="main"]/div/div[1]/div[1]/p/img/@src')
    if not img:
        try:
            # No embedded images suggests a flash-news item
            text = html.xpath('//*[@id="main"]/div/div[1]/div[1]/p/text()')
            # Double-check, using the body content to decide whether it is flash news
            if len(text) == 1 and text[0] == "\n":
                pass
            else:
                print("longkuai_alerts")
                # Extract the title
                title = html.xpath('//*[@id="main"]/div/div[1]/div[1]/h1/text()')[0]
                # Extract the publish time
                timeout = html.xpath('//*[@id="main"]/div/div[1]/div[1]/div[1]/span[1]/text()')[0]
                # Extract the article's editor as the author
                author = html.xpath('//*[@id="main"]/div/div[2]/div[1]/h1/text()')[0]
                # The content's source, or a mark that it is original
                content_source_object = html.xpath('//*[@id="main"]/div/div[1]/div[1]/div[2]')[0]
                content_source_text = etree.tostring(content_source_object, method="text", encoding="utf8").decode("utf8").split()
                content_source = "".join(content_source_text)
                # The article's disclaimer
                statement = html.xpath('//*[@id="main"]/div/div[1]/div[1]/div[3]/text()')[0]
                storage(number, title, author, timeout, content_source, statement, text)
        except Exception as err:
            mistake(url, err)
Example #17
def download(url, number):
    print("shilian_alerts")
    response = requests.get(url, headers=headers.header())
    response.encoding = "gbk"
    if response.status_code == 200:
        try:
            html = etree.HTML(response.text)
            # Extract the title, body, author, and source
            title = html.xpath('/html/body/div[2]/div[1]/div/h1/text()')[0]
            theSidebar = html.xpath('/html/body/div[2]/div[1]/div/div[1]/text()')
            author = "世链财经--快讯:" + theSidebar[0]
            timeout = theSidebar[1]
            # The article's category
            classify = html.xpath('/html/body/div[2]/div[1]/div/div[1]/a/text()')[0]
            source = ("世链财经--资讯--%s:" % classify) + url
            texts = html.xpath('/html/body/div[2]/div[1]/div/div[3]')[0]
            text = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
            # The article carries a disclaimer
            statements = html.xpath('/html/body/div[2]/div[1]/div/div[5]')[0]
            statement = etree.tostring(statements, method="text", encoding="utf8").decode("utf8").split()
            # The article's tags
            label_head = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/text()')[0]
            label_word = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/a/text()')[0]
            label_url = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/a/@href')[0]
            label = label_head + label_word + "--http://www.shilian.com" + label_url
            # Store the extracted data in the database
            storage(number, title, timeout, author, source, statement, text, label)
        except Exception as err:
            mistake(url, err)
    else:
        err = response.status_code
        mistake(url, err)
Example #18
def download(url):
    print("huoxing24_alerts")
    response = requests.get(url, headers=header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        try:
            # Extract the article number
            pattern_num = re.compile(r'\d+')
            number = re.findall(pattern_num, url)[1]
            # Skip if it has already been stored in the database
            if rechecking(number, come_from="huoxing_alerts"):
                return
            html = response.text
            down = etree.HTML(html)
            texts = down.xpath('/html/body/div[5]/div[1]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            time = text[1] + text[0] + "日" + "--" + text[3] + "--" + text[4]
            title = text[5]
            mains = text[6:-4]
            main = " ".join(mains)
            source = "火星财经快讯"
            storage(title, time, source, main)
        except Exception as err:
            mistake(url, err)
    else:
        err = "response.status_code: %s" % response.status_code
        mistake(url, err)
Example #19
def starts():
    # The flash-news items are not listed in order, so crawl by known ids
    n = 8300
    tf = True
    # Check whether the database already has content
    number = max_id(come_from="hangliancj_alerts")
    if number:
        n = number + 1
        tf = False
    while True:
        try:
            url = "http://hangliancj.com/article/%s.html" % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                data = download(url, response, n)
                if data:
                    break
            else:
                err = response.status_code
                mistake(url, err)
                break
            # On the first run walk the ids downward, otherwise upward
            if tf:
                n -= 1
            else:
                n += 1
        except TimeoutError:
            time.sleep(10)
Example #20
def download(url, html):
    try:
        print("fn")
        # Filter the data
        # The article number
        pattern_num = re.compile(r'\d+')
        number = re.findall(pattern_num, url)[0]
        # Skip if it has already been stored in the database
        if rechecking(number, come_from="fn"):
            return
        # Publish date (YYYY-MM-DD, with month/day validation)
        pattern_time = re.compile(r'([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|((0[469]|11)-(0[1-9]|[12][0-9]|30))|(02-(0[1-9]|[1][0-9]|2[0-8])))')
        times = re.findall(pattern_time, html)[0][0: 2]
        html = etree.HTML(html)
        # Article title
        texts = html.xpath('//h1[@class="entry-title"]')[0]
        title = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()[0]
        # Article author
        texts = html.xpath('//div[@class="entry-info"]')[0]
        text = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
        num = text.index('•')
        authors = text[0: num]
        # Article lead (the "subtitle")
        texts = html.xpath('//div[@class="entry-excerpt"]')[0]
        subtitle = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()[0]
        # Article body
        texts = html.xpath('//div[@class="entry-content clearfix"]')[0]
        mains = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
        main = mains[0: -1]
        # Article source
        source = main[-1]
        # Store the result
        storage(title, authors, times, source, main, number, subtitle)
    except Exception as err:
        mistake(url, err)
Example #21
def titleUrl(url, headers):
    s = 0
    while True:
        try:
            response = requests.get(url, headers=headers)
            response.encoding = "utf-8"
            # Extract the article number for the database lookup
            pattern_num = re.compile(r'\d+')
            number = re.findall(pattern_num, url)[0]
            # Skip if it has already been stored in the database
            if rechecking(number, come_from="jinse"):
                break
            # Check whether the page loaded
            if response.status_code == 200:
                download(url, response, number)
                # Match the URL of the next article
                html = response.text
                pattern = re.compile(r'<ol>下一篇</ol>[\s\S]*?</h2>')
                texts = re.findall(pattern, html)[0]
                pattern = re.compile(r'https://[\s\S]*?\d+\.html')
                url = re.findall(pattern, texts)[0]
            else:
                err = response.status_code
                mistake(url, err)
                if s == 3:
                    break
                s += 1
        except Exception:
            if s == 3:
                break
            s += 1
Example #22
def starts():
    urls = [
        "https://www.7234.cn/fetch_articles/news",
        "https://www.7234.cn/fetch_articles/blockchain",
        "https://www.7234.cn/fetch_articles/tech",
        "https://www.7234.cn/fetch_articles/huodong",
        "https://www.7234.cn/fetch_articles/column"
    ]
    for i in urls:
        n = 1
        while True:
            url = i + "?page=%s" % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                # Parse the JSON response into a dict
                data = response.text
                data = json.loads(data)
                html = etree.HTML(data["html"])
                data = getUrl(html)
                if data:
                    break
                n += 1
            else:
                err = response.status_code
                mistake(url, err)
                break
Example #23
def starts():
    n = 1
    s = 0
    while True:
        url = "https://apibtc.btc123.com/v1/index/getFlashPage?pageSize=20&pageNumber=%s" % n
        try:
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            # Check whether the page loaded
            if response.status_code == 200:
                data = download(response, url)
                if data:
                    break
                n += 1
            else:
                err = response.status_code
                mistake(url, err)
                # Retry loading the page up to three times
                if s == 2:
                    break
                s += 1
        except Exception:
            # Allow the page to be retried on errors as well
            if s == 2:
                break
            s += 1
Example #24
def starts():
    n = 2016
    tf = True
    # Check whether the database already has content
    number = max_id(come_from="shangxia_alerts")
    if number:
        n = number + 1
        tf = False
    while True:
        try:
            url = "https://www.shangxia.net/kuaixun/1/%s.html" % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                data = download(response, n, url)
                if data:
                    break
            else:
                err = response.status_code
                mistake(url, err)
                break
            # On the first run walk the ids downward, otherwise upward
            if tf:
                n -= 1
            else:
                n += 1
        except TimeoutError:
            time.sleep(10)
Example #25
def download(urls):
    for url_one in urls:
        try:
            print("hecaijing_alerts")
            url = "https://www.hecaijing.com" + url_one
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            html = etree.HTML(response.text)
            if response.status_code == 200:
                # Extract the article number
                pattern_num = re.compile(r'\d+')
                number = re.findall(pattern_num, url)[0]
                # Skip if it has already been stored in the database
                if rechecking(number, come_from="hecaijing_alerts"):
                    return
                times = html.xpath('/html/body/div[5]/div[1]/h2/text()')
                time_hour = html.xpath(
                    '/html/body/div[5]/div[1]/div/p[1]/span/text()')[0].split(
                    )
                timeout = times[1] + time_hour[0]
                title = html.xpath(
                    '/html/body/div[5]/div[1]/div/p[2]/text()')[0]
                main_texts = html.xpath(
                    '/html/body/div[5]/div[1]/div/div[1]/text()')[0].split()
                source = "核财经:" + url
                storage(number, title, timeout, main_texts, source)
            else:
                err = response.status_code
                mistake(url, err)
        except Exception as err:
            mistake(url_one, err)
Example #26
def download(response, number, url):
    try:
        print("shangxia_alerts")
        html = etree.HTML(response.text)
        # Extract the title
        title = html.xpath('//*[@id="title"]/text()')[0]
        if not title:
            return True
        # Extract the author
        author = html.xpath('//div[@class="title_trade2"]/a/text()')[0]
        # Extract the publish time
        timeout = html.xpath('//div[@class="title_trade2"]/text()')[0].split()[1:3]
        # Extract the information source
        source = html.xpath('/html/body/div[11]/div[6]/text()')[1].split()[0]
        # The article's disclaimer
        statement_object = html.xpath('/html/body/div[11]/div[7]')[0]
        statement_list = etree.tostring(statement_object, method="text", encoding="utf8").decode("utf8").split()
        statement = " ".join(statement_list)
        # Extract the article body
        texts = html.xpath('//*[@id="content"]')[0]
        text = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
        # Collect the images used in the article
        img_img = html.xpath('//*[@id="content"]/img/@src')
        img_div = html.xpath('//*[@id="content"]/div/img/@src')
        img_p = html.xpath('//*[@id="content"]/p/img/@src')
        img = img_img + img_div + img_p
        storage(number, title, author, timeout, source, text, statement, img)
    except Exception as err:
        mistake(url, err)
Example #27
def download(html, author, number, url):
    try:
        print("epcnn")
        # Extract the title
        title = html.xpath(
            '/html/body/section/div[1]/div/header/h1/a/text()')[0]
        # Extract the publish time
        timeout = html.xpath(
            '/html/body/section/div[1]/div/header/div/span[1]/text()')[0]
        # The article's category
        classify = html.xpath(
            '/html/body/section/div[1]/div/header/div/span[2]/a/text()')[0]
        # The article's tags
        label = html.xpath('/html/body/section/div[1]/div/div[5]/a/text()')
        # The site's disclaimer for this article
        statement = html.xpath(
            '/html/body/section/div[1]/div/div[3]/text()')[0]
        # Images referenced in the article
        img = html.xpath('/html/body/section/div[1]/div/article//img/@src')
        # The article's source breadcrumb
        source_location = html.xpath('/html/body/div[2]/div/a/text()')
        source = "e能链财经"
        for i in source_location:
            source += "-" + i
        # The article's body text
        texts = html.xpath('/html/body/section/div[1]/div/article')[0]
        text = etree.tostring(texts, method="text",
                              encoding="utf8").decode("utf8")
        storage(number, title, author, timeout, source, text, label, classify,
                statement, img)
    except Exception as err:
        mistake(url, err)
Example #28
def starts():
    n = 33588
    # Used to decide whether to fetch the previous or the next article
    tf = True
    # Check whether the database already has content
    number = max_id(come_from="chainfor")
    if number:
        n = int(number) + 1
        tf = False
    url = "https://www.chainfor.com/news/show/%s.html" % n
    while True:
        response = requests.get(url, headers=headers.header())
        response.encoding = "utf-8"
        if response.status_code == 200:
            data = getUrl(response, tf, url)
            if data == "end":
                break
            elif data == "continue":
                continue
            # getUrl returned the next URL; update it
            url = data
        else:
            err = response.status_code
            mistake(url, err)
            break
Example #29
def starts():
    url = "http://bishequ.com/article/getArticleList"
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        connent(response)
    else:
        err = response.status_code
        mistake(url, err)
Example #30
def starts():
    url = "http://www.btc798.com/"
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        getUrl(response)
    else:
        err = response.status_code
        mistake(url, err)