Example #1
def starts():
    url = "http://news.enorth.com.cn/system/count/0017000/000000000000/count_page_list_0017000000000000000.js"
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = response.text
        pattern = re.compile(r'maxpage = (\d+);')
        page = int(re.findall(pattern, html)[0])
    else:
        return
    n = 0
    while True:
        if n == 0:
            url = "http://news.enorth.com.cn/gd/"
        else:
            url = "http://news.enorth.com.cn/system/more/17000000000000000/0082/17000000000000000_0000%s.shtml" % page
        response = requests.get(url, headers=headers.header())
        response.encoding = "gb2312"
        if response.status_code == 200:
            html = response.text
            mistake = getURL(html)
            if mistake:
                break
            # step back one listing page per pass; the first pass fetched
            # the front page, so skip the decrement then
            if n > 0:
                page -= 1
            n += 1
        else:
            break
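
Note: Examples #1 and #2 read the page counter out of a count_page_list_*.js file with a regex. A self-contained sketch of just that extraction step, run against a made-up sample of the JS payload:

import re

# Hypothetical sample of the count_page_list_*.js payload; the real file
# contains more fields around the counter.
sample = "var maxpage = 8287;"

match = re.search(r"maxpage = (\d+);", sample)
if match:
    print(int(match.group(1)))  # -> 8287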
Example #2
def starts():
    url = "https://heilongjiang.dbw.cn/system/count/0015037/000000000000/count_page_list_0015037000000000000.js"
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = response.text
        pattern = re.compile(r'maxpage = (\d+);')
        num = int(re.findall(pattern, html)[0])
    else:
        return
    n = 0
    while True:
        if n == 0:
            url = "https://heilongjiang.dbw.cn/rc/index.shtml"
        else:
            page = num - n
            url = "https://heilongjiang.dbw.cn/system/count//0015037/000000000000/000/000/c0015037000000000000_000000%s.shtml" % page
        response = requests.get(url, headers=headers.header())
        response.encoding = "gbk"
        if response.status_code == 200:
            html = response.text
            mistake = getURL(html)
            if mistake == "end":
                break
            n += 1
        else:
            break
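
Note: nearly every snippet calls headers.header(), a project module not shown in these examples. A minimal hypothetical stand-in (the User-Agent value is an assumption, not the project's actual header set):

# headers.py -- hypothetical stand-in for the project's headers module
def header():
    # Return browser-like request headers so the target sites
    # are less likely to reject the crawler.
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/90.0 Safari/537.36"
    }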
Example #3
def starts():
    url = "http://www.huoxing24.com/"
    try:
        # find the news blocks on the home page
        response = requests.get(url, headers=header())
        response.encoding = "utf-8"
        html = response.text
        pattern = re.compile(r'<div class="index-news-list">[\s\S]*?<div class="shadow">')
        texts = re.findall(pattern, html)
        for text in texts:
            # extract each article URL from the block
            pattern = re.compile(r'[a-zA-Z]+://[^\s]*\.html')
            url = re.findall(pattern, text)[0]
            response = requests.get(url, headers=header())
            response.encoding = "utf-8"
            # check that the article page loaded
            if response.status_code == 200:
                html = response.text
                download(html, url)
            else:
                err = response.status_code
                mistake(url, err)
    except Exception as err:
        mistake(url, err)
Example #4
def starts():
    urls = [
        "http://www.chinanews.com/scroll-news/news%s.html",
        "http://www.chinanews.com/mil/news.shtml"
    ]
    for i in urls:
        # paginate only the URLs that carry a page placeholder
        if "%s" in i:
            n = 1
            while True:
                url = i % n
                response = requests.get(url, headers=headers.header())
                response.encoding = "gbk"
                if response.status_code == 200:
                    html = response.text
                    msg = getURL(html)
                    if msg:
                        break
                    n += 1
                else:
                    break
        else:
            url = i
            response = requests.get(url, headers=headers.header())
            response.encoding = "gbk"
            if response.status_code == 200:
                html = response.text
                getURL(html)
Example #5
def download(url, number):
    print("shilian_alerts")
    response = requests.get(url, headers=headers.header())
    response.encoding = "gbk"
    if response.status_code == 200:
        try:
            html = etree.HTML(response.text)
            # extract title, body, author and source
            title = html.xpath('/html/body/div[2]/div[1]/div/h1/text()')[0]
            theSidebar = html.xpath('/html/body/div[2]/div[1]/div/div[1]/text()')
            author = "世链财经--快讯:" + theSidebar[0]
            timeout = theSidebar[1]
            # article category
            classify = html.xpath('/html/body/div[2]/div[1]/div/div[1]/a/text()')[0]
            source = ("世链财经--资讯--%s:" % classify) + url
            texts = html.xpath('/html/body/div[2]/div[1]/div/div[3]')[0]
            text = etree.tostring(texts, method="text", encoding="utf8").decode("utf8").split()
            # disclaimer attached to the article
            statements = html.xpath('/html/body/div[2]/div[1]/div/div[5]')[0]
            statement = etree.tostring(statements, method="text", encoding="utf8").decode("utf8").split()
            # article tags
            label_head = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/text()')[0]
            label_word = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/a/text()')[0]
            label_url = html.xpath('/html/body/div[2]/div[1]/div/div[6]/div/a/@href')[0]
            label = label_head + label_word + "--http://www.shilian.com" + label_url
            # store the scraped fields in the database
            storage(number, title, timeout, author, source, statement, text, label)
        except Exception as err:
            mistake(url, err)
    else:
        err = response.status_code
        mistake(url, err)
Example #6
def starts():
    n = 8208
    tf = True
    # check whether the database already holds records
    number = max_id(come_from="polo321_alerts")
    if number:
        n = number + 20
        tf = False
    while True:
        try:
            url = "http://39.108.117.97:8082/lives/getList?Id=%s&flag=down" % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                data = classify(response)
                if data:
                    break
            else:
                err = response.status_code
                mistake(url, err)
                break
            # only relevant on the first run: walk backwards through the IDs
            if tf:
                n -= 20
            else:
                n += 20
        except TimeoutError:
            time.sleep(10)
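
Note: Examples #6, #19, #21 and #22 resume from max_id(come_from=...), a database helper that is not shown. A plausible sketch, assuming an SQLite table with number and come_from columns (the file, table and column names are guesses):

import sqlite3

def max_id(come_from):
    # Return the largest stored article id for one source,
    # or None when nothing is stored for it yet.
    conn = sqlite3.connect("news.db")  # hypothetical database file
    try:
        cur = conn.execute(
            "SELECT MAX(number) FROM news WHERE come_from = ?",
            (come_from,))
        return cur.fetchone()[0]  # None when no rows match
    finally:
        conn.close()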
Example #7
def connect(url, number, title):
    url = "http:" + url
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = response.text
        download(html, number, title)
Example #8
def starts():
    n = 1
    s = 0
    while True:
        url = "https://apibtc.btc123.com/v1/index/getFlashPage?pageSize=20&pageNumber=%s" % n
        try:
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            # check that the page loaded
            if response.status_code == 200:
                data = download(response, url)
                if data:
                    break
                n += 1
            else:
                err = response.status_code
                mistake(url, err)
                # allow up to three attempts per page
                if s == 2:
                    break
                s += 1
        except Exception:
            # retry the request up to three times
            if s == 2:
                break
            s += 1
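
Note: the three-attempt retry in Example #8 recurs in Example #17. One way to factor it out; this helper is a sketch, not part of the original project:

import requests

def fetch_with_retries(url, headers, attempts=3, encoding="utf-8"):
    # GET the URL up to `attempts` times; return the response on
    # HTTP 200, or None once every attempt has failed.
    for _ in range(attempts):
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            continue
        response.encoding = encoding
        if response.status_code == 200:
            return response
    return None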
Example #9
def starts():
    url = "https://www.huxiu.com/v2_action/article_list"
    n = 1
    last_dateline = ""
    while True:
        data = {
            "huxiu_hash_code": "9501c2ced764ebbe029807a9f17790fa",
            "page": str(n),
            "last_dateline": str(last_dateline)
        }
        if n == 1:
            # the first page comes from the site front page
            response = requests.get("https://www.huxiu.com", headers=headers.header())
        else:
            # the form payload implies a POST to the article_list endpoint
            response = requests.post(url, data=data, headers=headers.header())
        response.encoding = "utf-8"
        if response.status_code == 200:
            html = response.text
            result = getURL(html, n)
            if result == "end":
                break
            last_dateline = result
            n += 1
        else:
            break
Example #10
def starts():
    urls = [
        "https://www.thepaper.cn/channel_25950",
        "https://www.thepaper.cn/channel_25951",
        "https://www.thepaper.cn/channel_36079",
        "https://www.thepaper.cn/channel_25952",
        "https://www.thepaper.cn/channel_25953",
        "https://www.thepaper.cn/gov_publish.jsp"
    ]
    for i in urls:
        n = 1
        lastTime = ""
        while True:
            url = combination(i, n, lastTime)
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                html = response.text
                data = getURL(html, n, i)
                if data == "end":
                    break
                lastTime = data
                n += 1
            else:
                break
Example #11
def download(url, number):
    print('hashcaijing')
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    # check that the page loaded successfully
    if response.status_code == 200:
        try:
            # extract title, publish time, author and source
            html = etree.HTML(response.text)
            title = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[1]/b/text()')[0]
            if not title:
                return True
            timeout = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[2]/i[1]/text()')[0]
            author = html.xpath(
                '/html/body/div[2]/div[1]/div/div[1]/ul/li[2]/i[2]/text()')[0]
            source = "哈希财经" + ":" + url
            texts = html.xpath('//div[@class="contentNews"]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            storage(number, title, timeout, author, source, text)
        except Exception as err:
            mistake(url, err)
    else:
        err = response.status_code
        mistake(url, err)
Example #12
def starts():
    url = "http://news.mtime.com/#nav"
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = response.text
        getURL(html)
Example #13
def download(number):
    url = 'http://www.zhilianfm.com/zlfmCms/kx/%s.jhtml' % number
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = etree.HTML(response.text)
        classify = html.xpath(
            '/html/body/section/legend/a[2]/text()')[0].split()[0]
        if classify != "快讯":
            return
        print("zhilianfm_alerts")
        # title
        title = html.xpath('/html/body/div[2]/section/h1/text()')[0]
        author_timeout_source = html.xpath(
            '/html/body/div[2]/section/div[1]/text()')[0].split()
        # author
        author = author_timeout_source[1]
        # publish time
        timeout = author_timeout_source[0]
        # source of the article
        source = author_timeout_source[2]
        # article body
        texts = html.xpath(
            '/html/body/div[2]/section/div[2]/text()')[0].split()
        text = " ".join(texts)
        storage(number, title, author, timeout, source, text, classify)
    else:
        err = response.status_code
        mistake(url, err)
        return True
Example #14
def download(url):
    print("huoxing24_alerts")
    response = requests.get(url, headers=header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        try:
            # extract the article number from the URL
            pattern_num = re.compile(r'\d+')
            number = re.findall(pattern_num, url)[1]
            # skip articles already stored in the database
            if rechecking(number, come_from="huoxing_alerts"):
                return
            html = response.text
            down = etree.HTML(html)
            texts = down.xpath('/html/body/div[5]/div[1]')[0]
            text = etree.tostring(texts, method="text",
                                  encoding="utf8").decode("utf8").split()
            pub_time = text[1] + text[0] + "日" + "--" + text[3] + "--" + text[4]
            title = text[5]
            mains = text[6:-4]
            main = " ".join(mains)
            source = "火星财经快讯"
            storage(title, pub_time, source, main)
        except Exception as err:
            mistake(url, err)
    else:
        # status_code is an int, so format it instead of concatenating
        err = "response.status_code: %s" % response.status_code
        mistake(url, err)
Example #15
def starts():
    urls = [
        "http://www.taiwan.cn/taiwan/index.htm",
        "http://www.taiwan.cn/plzhx/index.htm",
        "http://www.taiwan.cn/lilunpindao/index.htm",
        "http://www.taiwan.cn/xwzx/la/index.htm",
        "http://www.taiwan.cn/xwzx/index.htm",
        "http://culture.taiwan.cn/index.htm"
    ]
    for i in urls:
        n = 0
        while True:
            if n == 0:
                url = i
            else:
                page = "_" + str(n) + ".htm"
                url = re.sub(".htm", page, i)
            response = requests.get(url, headers=headers.header())
            response.encoding = "gbk"
            if response.status_code == 200:
                html = response.text
                mistake = getURL(html)
                if mistake:
                    break
                n += 1
            else:
                break
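
Note: Examples #15, #20, #25, #26 and #27 all build page-N URLs by rewriting the file suffix of a section index. A quick check of the anchored pattern:

import re

url = "http://www.taiwan.cn/taiwan/index.htm"
print(re.sub(r"\.htm$", "_2.htm", url))
# -> http://www.taiwan.cn/taiwan/index_2.htm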
Example #16
def starts():
    urls = [
        "http://www.qlwb.com.cn/news/domesticnews/%s.shtml",
        "http://www.qlwb.com.cn/news/overseas/%s.shtml",
        "http://www.qlwb.com.cn/news/SocialNews/%s.shtml",
        "http://www.qlwb.com.cn/news/sports/%s.shtml",
        "http://www.qlwb.com.cn/news/importantnews/%s.shtml",
        "http://www.qlwb.com.cn/news/commentary/%s.shtml",
        "http://yule.qlwb.com.cn/"
    ]
    for i in urls:
        n = 1
        while True:
            if i == "http://yule.qlwb.com.cn":
                url = i
            else:
                url = i % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                html = response.text
                mistake = getURL(html)
                if mistake:
                    break
                n += 1
            else:
                break
Example #17
def getUrl(response):
    html = response.text
    pattern = re.compile(r"news/[^\s]*/\d+")
    urls = re.findall(pattern, html)
    url_number = []
    # collect the article numbers found on the page
    for i in urls:
        pattern_num = re.compile(r"\d+")
        num = re.findall(pattern_num, i)
        url_number.append(int(num[0]))
    # the largest number is the newest article
    max_number = max(url_number)
    reload = 0
    while True:
        try:
            url = "https://www.btc123.com/news/newsDetails/%s" % max_number
            response_news = requests.get(url, headers=headers.header())
            response_news.encoding = "utf-8"
            # stop once an article is already in the database
            if rechecking(max_number, come_from="btc123"):
                break
            if response_news.status_code == 200:
                download(response_news, url, max_number)
                max_number -= 1
            else:
                err = response_news.status_code
                mistake(url, err)
                # allow up to three reloads of the page
                if reload == 3:
                    break
                reload += 1
        except Exception:
            if reload == 3:
                break
            reload += 1
Example #18
def starts():
    url = "http://gansu.gscn.com.cn/bwyc/index.html"
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = response.text
        getURL(html)
Example #19
def starts():
    # the flash-news pages are not ordered on the site, so crawl by known IDs
    n = 8300
    tf = True
    # check whether the database already holds records
    number = max_id(come_from="hangliancj_alerts")
    if number:
        n = number + 1
        tf = False
    while True:
        try:
            url = "http://hangliancj.com/article/%s.html" % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                data = download(url, response, n)
                if data:
                    break
            else:
                err = response.status_code
                mistake(url, err)
                break
            # only relevant on the first run: walk backwards through the IDs
            if tf:
                n -= 1
            else:
                n += 1
        except TimeoutError:
            time.sleep(10)
Example #20
def starts():
    urls = [
        "http://www.yulefm.com/star/index.html",
        "http://www.yulefm.com/movie/index.html",
        "http://www.yulefm.com/v/index.html",
        "http://www.yulefm.com/music/index.html",
        "http://www.yulefm.com/shishang/index.html"
    ]
    for i in urls:
        n = 1
        while True:
            if n == 1:
                url = i
            else:
                # replace only the trailing ".html" with the page suffix
                page = "_" + str(n) + ".html"
                url = re.sub(r"\.html$", page, i)
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                html = response.text
                mistake = getURL(html)
                if mistake:
                    break
                n += 1
            else:
                break
Example #21
def starts():
    n = 33588
    # decides whether to fetch the previous or the next article
    tf = True
    # check whether the database already holds records
    number = max_id(come_from="chainfor")
    if number:
        n = int(number) + 1
        tf = False
    url = "https://www.chainfor.com/news/show/%s.html" % n
    while True:
        response = requests.get(url, headers=headers.header())
        response.encoding = "utf-8"
        if response.status_code == 200:
            data = getUrl(response, tf, url)
            if data == "end":
                break
            elif data == "continue":
                continue
            # getUrl returned the next URL; update it and keep going
            url = data
        else:
            err = response.status_code
            mistake(url, err)
            break
Example #22
def starts():
    n = 2016
    tf = True
    # check whether the database already holds records
    number = max_id(come_from="shangxia_alerts")
    if number:
        n = number + 1
        tf = False
    while True:
        try:
            url = "https://www.shangxia.net/kuaixun/1/%s.html" % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                data = download(response, n, url)
                if data:
                    break
            else:
                err = response.status_code
                mistake(url, err)
                break
            # only relevant on the first run: walk backwards through the IDs
            if tf:
                n -= 1
            else:
                n += 1
        except TimeoutError:
            time.sleep(10)
Example #23
def starts():
    url = "https://www.kg.com/jinrong"
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = response.text
        getUrl(html)
Example #24
def starts():
    urls = [
        "http://www.artsbj.com/list-17-%s.html",
        "http://www.artsbj.com/list-18-%s.html",
        "http://www.artsbj.com/list-19-%s.html",
        "http://www.artsbj.com/list-20-%s.html",
        "http://www.artsbj.com/list-21-%s.html",
        "http://www.artsbj.com/list-22-%s.html",
        "http://www.artsbj.com/list-23-%s.html",
        "http://www.artsbj.com/list-24-%s.html"
    ]
    for i in urls:
        n = 1
        while True:
            url = i % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                html = response.text
                mistake = getURL(html)
                if mistake:
                    break
                n += 1
            else:
                break
Example #25
def starts():
    urls = [
        "http://www.legaldaily.com.cn/index_article/node_5955.htm",
        "http://www.legaldaily.com.cn/Finance_and_Economics/node_75684.htm",
        "http://www.legaldaily.com.cn/IT/node_69471.htm",
        "http://www.legaldaily.com.cn/society/node_55564.htm",
        "http://www.legaldaily.com.cn/army/node_80560.htm"
    ]
    for i in urls:
        n = 1
        while True:
            if n == 1:
                url = i
            else:
                page = "_" + str(n) + ".htm"
                url = re.sub(".htm", page, i)
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                html = response.text
                mistake = getURL(html)
                if mistake:
                    break
                n += 1
            else:
                break
Example #26
def starts():
    urls = [
        "http://www.cqcb.com/highlights/index.json",
        "http://www.cqcb.com/shishi/index.json",
        "http://www.cqcb.com/entertainment/index.json",
        "http://www.cqcb.com/science/index.json"
    ]
    for i in urls:
        n = 1
        while True:
            if n == 1:
                url = i
            else:
                page = "_" + str(n) + ".json"
                url = re.sub(".json", page, i)
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                html = response.text
                mistake = getURL(html)
                if mistake:
                    break
                n += 1
            else:
                break
Example #27
def starts():
    urls = [
        "http://www.huaxia.com/xw/dlxw/index.html",
        "http://www.huaxia.com/xw/twxw/index.html",
        "http://www.huaxia.com/xw/gaxw/index.html",
        "http://www.huaxia.com/xw/gjxw/index.html",
        "http://www.huaxia.com/xw/zhxw/index.html"
    ]
    for i in urls:
        n = 1
        while True:
            if n == 1:
                url = i
            else:
                page = "_" + str(n) + ".html"
                url = re.sub(".html", page, i)
            response = requests.get(url, headers=headers.header())
            response.encoding = "gbk"
            if response.status_code == 200:
                html = response.text
                mistake = getURL(html)
                if mistake:
                    break
                n += 1
            else:
                break
Example #28
def connect(url, number):
    url = "https://www.shobserver.com" + url
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = response.text
        download(html, number)
Example #29
def starts():
    url = "https://idol001.com/"
    response = requests.get(url, headers=headers.header())
    response.encoding = "utf-8"
    if response.status_code == 200:
        html = response.text
        getURL(html)
Example #30
def starts():
    urls = [
        "https://www.7234.cn/fetch_articles/news",
        "https://www.7234.cn/fetch_articles/blockchain",
        "https://www.7234.cn/fetch_articles/tech",
        "https://www.7234.cn/fetch_articles/huodong",
        "https://www.7234.cn/fetch_articles/column"
    ]
    for i in urls:
        n = 1
        while True:
            url = i + "?page=%s" % n
            response = requests.get(url, headers=headers.header())
            response.encoding = "utf-8"
            if response.status_code == 200:
                # the response is JSON; parse it and extract the embedded HTML
                data = json.loads(response.text)
                html = etree.HTML(data["html"])
                data = getUrl(html)
                if data:
                    break
                n += 1
            else:
                err = response.status_code
                mistake(url, err)
                break
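
Note: the endpoint in Example #30 answers with JSON that wraps a rendered HTML fragment under the "html" key. A self-contained illustration of that parse step, with a made-up payload:

import json
from lxml import etree

payload = '{"html": "<ul><li><a href=\\"/article/1\\">t1</a></li></ul>"}'

data = json.loads(payload)
html = etree.HTML(data["html"])
print(html.xpath("//a/@href"))  # -> ['/article/1']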