Example #1
def read_detial(url, i):
    detial_html = htmlSource.get_html(url_p=url, type_p='rg')
    #print(detial_html)
    # write the detail HTML to a file
    files = File_file()
    file_name = url.split('/')[-1]

    files.save_source(path=path,
                      file=file_name,
                      all_the_text=detial_html,
                      encoding_='utf-8')
    colum = [
        ('title', '//h1[@class="articleHead"]//text()', 'l'),
        ('pushDate',
         '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()',
         'l'), ('content', '//div[@class="articleText"]//text()', 'sarra', ',')
    ]
    result = rule.html_content_analysis_detial(html_text=detial_html,
                                               column=colum,
                                               url=url)
    print(result)
    #sql="insert into cancer value('%s','%s','%s','%s','%s')"%(result[0][1][0],str(result[1][1][0]).replace('患者,图片因隐私问题无法显示','').replace("患者,","患者:").replace("医生,","医生:").replace('\'','"'),type,'春雨医生',url)
    #print(sql)
    # write one row to the data CSV
    # columns: web_name (网站名), web_url (网址), title (标题), text (新闻内容), publish_date (发布时间)
    csv = Csv_base()
    csv.write_csv_file_line(file_path=path + "/data.csv",
                            str=[
                                '参考消息', url, result[0][1], result[1][1],
                                result[2][1], i,
                                time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime(time.time()))
                            ])
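These snippets all lean on an in-house common package (HtmlSource, Rule, Csv_base, File_file, File_floder) whose source is not shown. Judging purely from the call sites, each entry in colum is a tuple of the form (field_name, xpath, mode[, extra]): mode 'l' appears to return the raw list of XPath matches, 'sab' appears to prepend the extra string (a base URL) to every match, and 'sarra' appears to join the matched text nodes with the extra string as separator. A minimal lxml-only stand-in written under those assumptions (the function name and semantics are guesses, not the real Rule API):

from lxml import html


def analyse_columns(html_text, columns):
    """Illustrative stand-in for rule.html_content_analysis_detial (assumed semantics)."""
    tree = html.fromstring(html_text)
    result = []
    for col in columns:
        name, xpath, mode = col[0], col[1], col[2]
        matches = tree.xpath(xpath)
        if mode == 'sab':      # assumed: prefix each (relative) match with a base URL
            value = [col[3] + str(m) for m in matches]
        elif mode == 'sarra':  # assumed: join matched text nodes into one string
            value = col[3].join(str(m).strip() for m in matches)
        else:                  # assumed: 'l' keeps the raw list of matches
            value = matches
        result.append((name, value))
    return result

Under that reading, result[0][1] in read_detial above would be the list of title text nodes and result[2][1] the comma-joined article body.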
Example #2
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler
    start_url = "http://mil.huanqiu.com/world/%s.html"
    for i in range(1, 101):
        s = 'index' if i == 1 else i
        url = start_url % (s)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="fallsFlow"]//ul//li//a//@href', 'l')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Example #3
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler
    start_url = "http://www.81.cn/rd/node_92585%s.htm"
    for i in range(1, 6):
        ss = '' if i == 1 else '_%d' % i
        url = start_url % (ss)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="content"]//ul//li//a//@href', 'sab',
                  'http://www.81.cn/rd/')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Example #4
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler
    start_url = "http://junshi.xilu.com/dfjs/index_1372_%d.html"
    for i in range(1, 101):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [(
            'a',
            '//div[@class="newslist_box"]//ul//li//div[@class="newslist_tit"]//a//@href',
            'l')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Example #5
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler
    start_url = "http://www.cankaoxiaoxi.com/mil/gjjq/%d.shtml"
    for i in range(1, 101):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [(
            'a',
            '//div[@class="inner"]//ul[@class="txt-list-a fz-14"]//li//a//@href',
            'sab', '')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Example #6
    def runDetail(self, file_path, confs, listconfs):
        csv = Csv_base()
        # read the list data
        for listconf in listconfs:
            dic_list = []
            for row in listconf['columns']:
                dic_list.append(row['名称'])
            listList = csv.read_csv_file_dict(file_path=file_path,
                                              fieldnames=dic_list)

            for row in listList:
                self.crawlerDetail(confs, url=row[confs['urlname']])
Example #7
 def runDict(self, url, conf):
     rule = Rule()
     result = rule.crawler_list(url, conf)
     # TODO: write the data to the database
     dic_list = []
     for row in conf['columns']:
         dic_list.append(row['名称'])
     # write the results to a CSV file
     csv = Csv_base()
     csv.write_csv_file_dictLines(file_path='../data/xuexi111.csv',
                                  strs=result,
                                  fieldnames=dic_list)
Example #8
    def runList(self, file_path, confs, dictconf):
        csv = Csv_base()
        # read the data
        dic_list = []
        for row in dictconf['columns']:
            dic_list.append(row['名称'])
        dictList = csv.read_csv_file_dict(file_path=file_path,
                                          fieldnames=dic_list)

        # write the data
        for dict in dictList:
            for conf in confs:
                self.crawlerNext(conf, url=dict[conf['urlname']])
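The runDetail, runDict and runList methods never show what conf/dictconf actually contain; from the way they are indexed, each config seems to be a dict with a 'columns' list (every entry carrying at least a '名称' key used as the CSV field name) and, for the list/detail steps, a 'urlname' key naming the column that holds the URL to crawl next. A purely hypothetical config of that shape:

# Hypothetical config, inferred only from how conf['columns'] and conf['urlname'] are accessed above.
conf = {
    'urlname': '网址',  # which column of the previously written CSV holds the next URL
    'columns': [
        {'名称': '标题'},
        {'名称': '网址'},
    ],
}
fieldnames = [row['名称'] for row in conf['columns']]  # same list the methods build
print(fieldnames)  # ['标题', '网址']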
Example #9
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler
    start_url = "http://qc.wa.news.cn/nodeart/list?nid=11139636&pgnum=%d&cnt=1000&tp=1&orderby=1"
    for i in range(1, 6):
        url = start_url % (i)

        read_detial(url, i)
Example #10
 def crawlerNext(self, conf, url=''):
     rule = Rule()
     csv = Csv_base()
     list_list = []
     for row in conf['columns']:
         list_list.append(row['名称'])
     result, next_page = rule.crawler_list(url, conf)
     print(result)
     if (len(result) > 0):
         csv.write_csv_file_dictLines(file_path='../data/xuexi111List.csv',
                                      strs=result,
                                      fieldnames=list_list)
         if (next_page):
             self.crawlerNext(conf, url=next_page)
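crawlerNext follows pagination by calling itself with next_page, so a long listing costs one Python stack frame per page and can eventually hit the default recursion limit (about 1000 frames). A sketch of an equivalent iterative loop, reusing the same Rule/Csv_base calls that appear above (their implementations are still assumed, not shown):

from common.Rule import Rule
from common.inc_csv import Csv_base


def crawl_all_pages(conf, url=''):
    # Iterative variant of crawlerNext: same assumed interfaces, no recursion depth to worry about.
    rule = Rule()
    csv = Csv_base()
    fieldnames = [row['名称'] for row in conf['columns']]
    while url:
        result, next_page = rule.crawler_list(url, conf)
        if len(result) == 0:
            break
        csv.write_csv_file_dictLines(file_path='../data/xuexi111List.csv',
                                     strs=result,
                                     fieldnames=fieldnames)
        url = next_page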
Example #11
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler
    start_url = "https://www.thepaper.cn/load_index.jsp?nodeids=25430&topCids=&pageidx=%d&isList=true&lastTime=1550211962471"
    for i in range(1, 26):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="news_li"]//h2//a//@href', 'sab',
                  'https://www.thepaper.cn/')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Example #12
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(file_path=path + "/data.csv",
                            mode='w+',
                            str=['网站名', '网址', '标题', '新闻内容', '发布时间', '采集时间'])
    # crawler
    start_url = "https://military.china.com/news/"

    #print(url)
    list_html = htmlSource.get_html(url_p=start_url, type_p='rg')
    #print(list_html)
    colum = [('a', '//div[@class="column-list"]//h3[@class="tit"]//a//@href',
              'l')]
    list = rule.html_content_analysis_detial(html_text=list_html,
                                             column=colum,
                                             url=start_url)
    #print(list)
    for a in list[0][1]:
        read_detial(a)
Example #13
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler
    start_url = "http://www.dsti.net/Information/HyeList/arms/%d"
    for i in range(1, 815):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="listMidContent"]//ul//li//a//@href',
                  'sab', 'http://www.dsti.net')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Example #14
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler
    start_url = "http://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page=%d"
    for i in range(1, 26):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="fixList"]//ul//li//a//@href', 'l')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            if a.endswith('.shtml'):
                read_detial(a, i)
Example #15
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler
    start_url = "http://military.people.com.cn/GB/1077/index%d.html"
    for i in range(1, 8):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="ej_list_box clear"]//ul//li//a//@href',
                  'sab', 'http://military.people.com.cn')]
        list = rule.html_content_analysis_detial(html_text=list_html,
                                                 column=colum,
                                                 url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
Example #16
#!/usr/bin/env python
# coding=utf-8

from common.inc_csv import Csv_base
from common.inc_file import File_file
from lxml import html
import re

csv = Csv_base()
file = File_file()


def replaceStr(a):
    print(a)
    a = re.sub(re.compile(r"收藏查看我的收藏(\d+)有用(.*?)(\d+)已投票(\d+)", re.S), "", a)
    a = str(a).replace("编辑锁定", " ").strip()
    a = str(a).replace("讨论999", " ").strip()
    a = str(a).replace("本词条缺少概述图,补充相关内容使词条更完整,还能快速升级,赶紧来编辑吧!", " ").strip()
    a = str(a).replace(
        "百度百科内容由网友共同编辑,如您发现自己的词条内容不准确或不完善,欢迎使用本人词条编辑服务(免费)参与修正。", " ").strip()
    a = str(a).replace("立即前往 >>", " ").strip()
    print(a)
    return a
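A quick illustration of what replaceStr strips; the input string here is synthetic, made up for the example:

sample = "测试词条编辑锁定这里是正文内容。立即前往 >>"
cleaned = replaceStr(sample)
# the boilerplate markers are replaced by spaces and the ends are stripped,
# leaving: "测试词条 这里是正文内容。"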


if __name__ == '__main__':

    csv_data_path = "../../data/百科候选关键词.csv"
    rows = csv.read_csv_file(csv_data_path)
    for row in rows:
        try:
Example #17
# -- coding: UTF-8 --

from common.inc_csv import Csv_base
from common.inc_file import File_floder
import requests
if __name__ == '__main__':
    file_path = '../data/百科候选关键词.csv'
    folder_path ="../data/百科候选关键词/img"
    floder = File_floder()
    floder.add(folder_path)

    file = Csv_base()
    list = file.read_csv_file(file_path)
    for row in list:
        try:
            img_url = str(row[4]).replace('`','')
            if(img_url!=''):
                img_name=img_url.split("/")[-1]
                if(img_name.find("?")>-1):
                    img_name = img_name[0:img_name.find("?")]
                #img_content = requests.get(img_url).content
                #with open('../data/百科候选关键词/img/%s.jpg' % img_name, 'wb') as f:
                #    f.write(img_content)
                rw_str=[img_url,img_name,row[11].replace('`','')]
                file.write_csv_file_line(file_path="../data/百科候选关键词_img.csv",str=rw_str)
        except Exception as e:
            print(e)
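The actual image download in this example is commented out. If it were re-enabled, a timeout and a status check are cheap safeguards; a sketch along the lines of the commented code above:

resp = requests.get(img_url, timeout=10)  # img_url as built inside the loop above
if resp.ok:
    with open('../data/百科候选关键词/img/%s.jpg' % img_name, 'wb') as f:
        f.write(resp.content)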
Example #18
# -- coding: UTF-8 --

from common.HtmlSource import HtmlSource
from common.Rule import Rule
#from common.inc_conn import Conn_mysql
from common.inc_csv import Csv_base
from common.inc_file import File_file, File_floder
import requests
from lxml import html
import time

floder = File_floder()
htmlSource = HtmlSource()
rule = Rule()
csv = Csv_base()
flag = 0
commontitle = 1


# multithreading
def read_detial(url, path):
    if (str(url[0][1]).startswith("http") and url[1][1] != "收起"):
        # TODO: pagination
        for i in range(1, 11):
            print(url[0][1])
            detial_html = htmlSource.get_html(url_p=url[0][1] +
                                              "/?page=%d" % i,
                                              type_p='rg')
            tree = html.fromstring(detial_html)
            hreflist = tree.xpath(
                '//ul[@class="list"]/li/div/div/p[@class="name"]/a/@href')
Example #19
#!/usr/bin/env python
# coding=utf-8

from common.inc_csv import Csv_base
from lxml import html
import re


def readFile(filePath=''):
    pass


if __name__ == '__main__':
    filepath = '../data/问答语料_1.0.txt'
    file = Csv_base()
    list = file.read_csv_file(filepath)
    for i in range(len(list)):
        if (i > 3):
            row = list[i]
            rows = str(row[0]).split("\t")
            html_text = rows[1].replace("[", '').replace("]", "")
            # regex cleanup: re.sub strips <script>...</script> blocks (re.S lets '.' match newlines)
            html_text = re.sub(re.compile(r"<script.*?</script>", re.S), "",
                               html_text)

            print(html_text)
            tree = html.fromstring(html_text)
            texts = tree.xpath('.//text()')
            text = ""
            for a in texts:
                text = text + str(a).replace("\\n", ".").strip()