Example #1
    def start_crawl(self):
        '''
        Start crawling the data.
        '''
        # Page and item counts of the news index
        fileName = ''
        stopFlag = False
        CrawlData = {}
        #FilterData={}
        pages, indexcount = 2275, 22757
        fields = [
            "reporttime", "reporter", "title", "sitename", "keyword",
            "content", "imgUrl"
        ]
        # Proceed according to whether a record file is needed
        EventInfoExtract = fact_triple_extraction1chen.EventInfoExtract(
            r"3.3.0\ltp_data", 'out123.txt')
        #        index = 0
        # Walk through every news index page
        for i in range(1, pages + 1):
            # Build the URL of the current index page
            urls = self.starturl.replace("page=1", "page=%d" % i)
            # Returns a list; each element holds the summary info of one news item
            infodexs = self.index_info(urls, self.my_headers, i)
            csv_data = []
            if len(infodexs) > 0:
                for infodex in infodexs:
                    # Stop once the report date falls before the deadline
                    if self.deadlineTime != 0 and infodex["reporttime"][0:10] < self.deadlineTime:
                        stopFlag = True
                        if EventInfoExtract.segmentor is not None:
                            EventInfoExtract.release_module()
                        break

                    # Skip items whose keyword marks a video-only story ("视频" = "video")
                    # if infodex["keyword"] == "视频":
                    #     continue
                    print "=====================News info=========================="
                    print infodex["url"]
                    # body = self.get_news_body(infodex["url"], self.my_headers)
                    # if body != None:
                    #     infodex["content"] = body["content"]
                    #     infodex["reporter"] = u"新华社"
                    # index += 1
                    # Restore the original paragraphs of the article
                    # datas = infodex["content"].split(u"  ")
                    # EventInfoExtract.InitModule()
                    #
                    # # print 'data -----------------', datas
                    # for data in datas:
                    #     print data.encode("utf-8")
                    #
                    #     if len(data.encode("utf-8")) < 30 or data.encode("utf-8") == None:
                    #         continue
                    #     TimeAndAddress = EventInfoExtract.addresssTime_extract(data.encode("utf-8"))
                    #     # print TimeAndAddress
                    #     fact_attribute = EventInfoExtract.fact_attribute_from_text(data.encode("utf-8"))
                    #     orgnization = EventInfoExtract.organization_from_text(data.encode("utf-8"))
                    #     death_num, hurt_num, total_num = EventInfoExtract.death_num_from_text(data.encode("utf-8"))
                    #     if TimeAndAddress[0]["date"] == "" and TimeAndAddress[0]["address"] == "":
                    #         continue
                    #     print '''
                    #     time\tplace\tevent type\tattacking organization\ttotal casualties\tdeaths\tinjuries
                    #     %s--%s--%s--%s--%s--%s--%s''' % (TimeAndAddress[0]['date'], TimeAndAddress[0]['address'], fact_attribute, orgnization, total_num, death_num, hurt_num)
                    # # print("start to releases")

                    # Also save the original text of the news item

                    # imgUrl = infodex["imgUrl"]
                    # imgName = ""
                    # if imgUrl != None and imgUrl != "":
                    #     imgName = imgUrl.split("/")[-1]
                    #     urlretrieve("http://tpic.home.news.cn/xhCloudNewsPic/" + imgUrl, "./imgs/" + imgName)
                    print infodex
                    news = {}
                    news["title"] = infodex["title"]
                    news["des"] = infodex["des"]
                    news["pubtime"] = infodex["reporttime"]
                    news["content"] = infodex["content"]
                    # news["img"] = imgName
                    news["url"] = infodex["url"]
                    csv_data.append(news)
                    # news['time'] = TimeAndAddress[0]['date']
                    # news['address'] = TimeAndAddress[0]['address']
                    # news['type'] = fact_attribute

                    # if total_num != None:
                    #     news['total'] = "伤亡:" + total_num
                    # else:
                    #     if death_num == None:
                    #         death_num = "0"
                    #     if hurt_num == None:
                    #         hurt_num = "0"
                    #     # print death_num, hurt_num
                    #     news['total'] = "死亡:" + death_num + ",受伤:" + hurt_num
                    # news["gname"] = orgnization
                    # news['nwound'] = hurt_num
                    # news['nkill'] = death_num

                    # insertSql = db_connect.generateSQL(news)
                    # print insertSql
                    # db_connect.insertOneData(insertSql)

                    # PostData(data, hosturl)
                    # EventInfoExtract.release_module()

            self.save_to_file(csv_data, 'cctv.csv', i)
            if stopFlag:
                print "Stopping: deadline reached"
                break
        if EventInfoExtract.segmentor is not None:
            EventInfoExtract.release_module()
        print "LTP modules released, crawl finished"
        sys.exit(0)
Example #2
    def craw(self, root_url):
        count = 1
        for i in range(30):  # walk the index pages, stepping the offset by 20
            start_url = root_url.replace("offset=20", "offset=%d" % count)
            html = self.downloader.download(start_url)
            # print html
            new_urls = self.parser.parse(start_url, html)
            self.urls.add_new_urls(new_urls)
            print 'craw %d %s' % (count, start_url)
            count = count + 20

        count = 1
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('id %d' % count)
                html_cont = self.downloader.download(new_url)
                self.content = []
                title, time, des, content, img_url = self.parser.get_data(new_url, html_cont)
                print(title, time, des, self.content)
                print('==========================================')
            except Exception as e:
                print(e)
                continue


            # Load the LTP models for this article; they are released again at the end of the loop
            EventInfoExtract = fact_triple_extraction1chen.EventInfoExtract(r"3.3.0\ltp_data", 'out123.txt')
            EventInfoExtract.InitModule()

            # Extract time/place, event type, organization and casualty counts from the article body
            TimeAndAddress = EventInfoExtract.addresssTime_extract(content.encode("utf-8"))
            # print TimeAndAddress
            fact_attribute = EventInfoExtract.fact_attribute_from_text(content.encode("utf-8"))
            orgnization = EventInfoExtract.organization_from_text(content.encode("utf-8"))
            death_num, hurt_num, total_num = EventInfoExtract.death_num_from_text(content.encode("utf-8"))
            # if TimeAndAddress[0]["date"] == "" and TimeAndAddress[0]["address"] == "":
            #     continue
            print '''time\taddress\ttype\torganization\ttotal\tdead\thurt
%s--%s--%s--%s--%s--%s--%s''' % (
                TimeAndAddress[0]['date'], TimeAndAddress[0]['address'], fact_attribute,
                orgnization, total_num, death_num, hurt_num)

            # Basic fields scraped from the page
            news = {}
            news['id'] = count
            news['pubtime'] = time
            news['title'] = title
            news['des'] = des
            news['content'] = content
            news['url'] = new_url
            news['img'] = img_url

            # Fields extracted by the event-information module
            news['time'] = TimeAndAddress[0]['date']
            news['address'] = TimeAndAddress[0]['address']
            news['type'] = fact_attribute
            if total_num is not None:
                news['total'] = "total:" + total_num
            else:
                if death_num is None:
                    death_num = "0"
                if hurt_num is None:
                    hurt_num = "0"
                # print death_num, hurt_num
                news['total'] = "dead:" + death_num + ", hurt:" + hurt_num
            news['gname'] = orgnization
            news['nwound'] = hurt_num
            news['nkill'] = death_num

            datas = []
            datas.append(news)
            self.outputer.output_execl(datas, "SputnikNews.csv", count)
            count = count + 1
            EventInfoExtract.release_module()
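The example collects one dictionary per article and hands it to self.outputer.output_execl(datas, "SputnikNews.csv", count), whose implementation is not shown. As a rough idea of what such an output step could look like, the sketch below appends the same fields to a CSV file using only the standard library; the helper name output_csv and its behaviour are assumptions for illustration, not the example's actual outputer.

# Assumed sketch of a CSV output step similar to outputer.output_execl (Python 2).
# The field list mirrors the keys built above; the helper itself is illustrative.
import csv

FIELDS = ["id", "pubtime", "title", "des", "content", "url", "img",
          "time", "address", "type", "total", "gname", "nwound", "nkill"]

def output_csv(rows, filename, write_header):
    # Append rows to the CSV file, writing the header only on the first call.
    with open(filename, "ab") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        if write_header:
            writer.writeheader()
        for row in rows:
            # Encode unicode values so the Python 2 csv module can write them.
            writer.writerow({k: (v.encode("utf-8") if isinstance(v, unicode) else v)
                             for k, v in row.items()})

# Example call, mirroring output_execl(datas, "SputnikNews.csv", count)
news = dict((k, "") for k in FIELDS)
news.update({"id": 1, "title": "sample", "total": "dead:0, hurt:0"})
output_csv([news], "SputnikNews.csv", write_header=True)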