def analyzeInfo_one(item):
    """Parse one www.cibeicn.com list entry and store it as a rainstorm record.

    The first ``<a>`` in ``item`` supplies the article link. The article's
    source line, body text and release time are fetched through
    ``get_source``, ``get_original`` and ``get_releaseTime``; location and
    casualty fields are derived from "title + body" via ``address.placeMany``
    and the ``toYc`` helpers. The finished record is passed to
    ``postgreCommand.insertData`` under the name ``rainstorm_ZH001``.

    Args:
        item: Parsed HTML node for one list entry. It must provide
            ``find_all`` and ``find`` (presumably a BeautifulSoup tag --
            confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    anchors = item.find_all('a')
    link = 'http://www.cibeicn.com' + anchors[0]['href']

    # Take the first match of the source marker; fall back to an empty string
    # instead of raising IndexError when the page has no such marker.
    source_match = re.search(r'来源:(.+)', get_source(link))
    source = source_match.group(1) if source_match else ''
    body = get_original(link)

    # Keep only the digits of the release time, then parse and reformat them.
    release_digits = re.sub(r"\D", "", get_releaseTime(link))
    release_time = parser.parse(release_digits).strftime('%Y-%m-%d %H:%M:%S')

    # Prefer the <strong> text as the title; otherwise use the link text.
    strong = item.find('strong')
    if strong is None:
        headline = anchors[0].get_text().strip()
    else:
        headline = strong.get_text().strip()

    full_text = headline + ',' + body
    location = address.placeMany(full_text)
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)
    loss_number = toYc.loss(full_text)

    # NOTE(review): unlike the other postgreCommand-based parsers in this
    # file, no 'isrellonandlat' key is set here -- confirm that is intended.
    result = {
        'disasterid': '10107',           # category: rainstorm
        'link': link,                    # news link
        'source': source,                # news source
        'originalText': body,            # original news text
        'releaseTime': release_time,     # release time
        'title': headline,               # title
        'place': location[0],            # place of occurrence
        'longitude': str(location[1]),   # longitude of the place
        'latitude': str(location[2]),    # latitude of the place
        'strength': '',                  # disaster strength
        'occurTime': release_time,       # occurrence time
        'loss': str(loss_number),        # economic loss
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'pictures': '',                  # multiple paths separated by semicolons
        'more': '',                      # special field
        'regional': '国内',
        'province': location[3],         # first-level administrative division
        'country': location[4],          # country where the disaster occurred
        'current_website': '防灾网',      # website the record came from
        'isreleasetime': '1',            # occurrence time replaced by release time
    }
    resultSun = {
        'title': result['title'],
        'originalText': result['originalText'],
        'pictures': result['pictures'],
    }

    table = 'rainstorm_ZH001'
    try:
        res = postgreCommand.insertData(result, resultSun, table)
        if res == 1:
            print(table, '数据插入成功!')
        elif res == 0:
            print(table, '数据更新成功!')
    except Exception as e:
        print("插入数据失败", str(e))
def analyzeInfo(item):
    """Parse one search-result entry (website tag '央视网') and store it.

    Title, link and release time are read from ``item``; body text, source and
    pictures come from ``analyzeInfoSun(link)`` (indices 0, 1 and 2 of its
    result). Location and casualty fields are derived from "title + body" and
    the record is passed to ``postgreCommand.insertData`` under the name
    ``landslide_ZH006``.

    Args:
        item: Parsed HTML node for one entry; must provide ``find_all``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    first_anchor = item.find_all('a', limit=1)
    heading = item.find_all('h3', attrs={'class': 'tit'}, limit=1)
    link_span = heading[0].find_all('span', limit=1)
    meta_div = item.find_all('div', attrs={'class': 'src-tim'}, limit=1)
    time_span = meta_div[0].find_all('span', attrs={'class': 'tim'}, limit=1)

    # Keep only the digits of the displayed time, then parse and reformat.
    time_digits = re.sub(r'\D', "", time_span[0].get_text().strip())
    release_time = parser.parse(time_digits).strftime('%Y-%m-%d %H:%M:%S')

    # The article URL is stored in the span's 'lanmu1' attribute.
    link = link_span[0]['lanmu1']
    headline = first_anchor[0].get_text().strip()

    print(link)
    analyze = analyzeInfoSun(link)
    body = analyze[0]

    full_text = headline + ',' + body
    location = address.placeSingle(full_text)
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)
    loss_number = toYc.loss(full_text)

    result = {
        'disasterid': '10002',           # news category
        'link': link,                    # news link
        'title': headline,               # news title
        'releaseTime': release_time,     # release time
        'source': analyze[1],            # news source
        'originalText': body,            # original news text
        'place': location[0],            # place of occurrence
        'longitude': str(location[1]),   # longitude of the place
        'latitude': str(location[2]),    # latitude of the place
        'strength': '',                  # disaster strength
        'occurTime': release_time,       # occurrence time
        'loss': str(loss_number),        # economic loss
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'pictures': analyze[2],          # multiple paths separated by semicolons
        'more': '',                      # special field
        'regional': '国内',
        'province': location[3],         # first-level administrative division
        'country': location[4],          # country where the disaster occurred
        'current_website': '央视网',      # website the record came from
        'isreleasetime': '1',            # occurrence time replaced by release time
        'isrellonandlat': '0',
    }
    resultSun = {
        'title': result['title'],
        'originalText': result['originalText'],
        'pictures': result['pictures'],
    }

    table = 'landslide_ZH006'
    try:
        res = postgreCommand.insertData(result, resultSun, table)
        if res == 1:
            print(table, '数据插入成功!')
        elif res == 0:
            print(table, '数据更新成功!')
    except Exception as e:
        print("插入数据失败", str(e))
def analyzeInfo(item):
    """Parse one search-result entry (website tag '百度新闻') and store it.

    Link and title come from the first ``<a>`` inside the first ``<h3>``; the
    release time is rebuilt from the 2nd and 3rd whitespace-separated tokens
    of the summary paragraph. Source, body text and pictures come from
    ``get_original(link)`` (indices 0, 1 and 2). The record is passed to
    ``postgreCommand.insertData`` under the name ``collapse_ZH003``.

    Args:
        item: Parsed HTML node for one entry; must provide ``find_all`` and
            ``find`` (presumably a BeautifulSoup tag -- confirm against the
            caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    heading = item.find_all('h3', limit=1)
    anchors = heading[0].find_all('a', limit=1)
    # The trailing space in the class string is kept exactly as it was.
    summary_div = item.find('div', attrs={'class': 'c-summary c-row '})
    summary_tokens = summary_div.find('p').get_text().split()

    # Tokens 1 and 2 carry the time; keep their digits, parse and reformat.
    time_digits = re.sub(r"\D", "", summary_tokens[1] + summary_tokens[2])
    release_time = parser.parse(time_digits).strftime('%Y-%m-%d %H:%M:%S')

    link = anchors[0]['href']
    headline = anchors[0].get_text().strip()

    article = get_original(link)
    body = article[1]

    full_text = headline + ',' + body
    location = address.placeSingle(full_text)
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)
    loss_number = toYc.loss(full_text)

    result = {
        'link': link,
        'title': headline,
        'releaseTime': release_time,
        'disasterid': '1000104',
        'source': article[0],
        'originalText': body,
        'pictures': article[2],          # multiple paths separated by semicolons
        'place': location[0],            # place of occurrence
        'longitude': str(location[1]),   # longitude of the place
        'latitude': str(location[2]),    # latitude of the place
        'loss': str(loss_number),        # economic loss
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'province': location[3],         # first-level administrative division
        'country': location[4],          # country where the disaster occurred
        'strength': '',
        'occurTime': release_time,
        'more': '',                      # special field
        'regional': '国内',               # region where the news was published
        'current_website': '百度新闻',    # website the record came from
        'isreleasetime': '1',            # occurrence time replaced by release time
        'isrellonandlat': '0',
    }
    resultSun = {
        'title': result['title'],
        'originalText': result['originalText'],
        'pictures': result['pictures'],
    }

    table = 'collapse_ZH003'
    try:
        res = postgreCommand.insertData(result, resultSun, table)
        if res == 1:
            print(table, '数据插入成功!')
        elif res == 0:
            print(table, '数据更新成功!')
    except Exception as e:
        print("插入数据失败", str(e))
def analyzeInfo_One(item):
    """Parse one www.qxkp.net list entry and store it as a rainstorm record.

    The first ``<div>`` of ``item`` holds the title link and the second holds
    the release time. Source, body text and pictures are read from the mapping
    returned by ``analyzeInfo_Two(link)``. Location and casualty fields are
    derived from "title + body" and the record is passed to
    ``postgreCommand.insertData`` under the name ``rainstorm_ZH002``.

    Args:
        item: Parsed HTML node for one entry; must provide ``find_all``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    divs = item.find_all('div')
    # Look the anchor up once; it provides both the title and the href.
    anchor = divs[0].find('a')
    headline = anchor.get_text().strip()
    link = 'http://www.qxkp.net' + anchor['href']

    # Keep only the digits of the displayed time, then parse and reformat.
    time_digits = re.sub(r"\D", "", divs[1].get_text())
    release_time = parser.parse(time_digits).strftime('%Y-%m-%d %H:%M:%S')

    detail = analyzeInfo_Two(link)
    body = detail['originalText']

    full_text = headline + ',' + body
    location = address.placeMany(full_text)
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)
    loss_number = toYc.loss(full_text)

    result = {
        'disasterid': '10107',           # category: rainstorm
        'link': link,                    # news link
        'source': detail['source'],      # news source
        'originalText': body,            # original news text
        'releaseTime': release_time,     # release time
        'title': headline,               # title
        'place': location[0],            # place of occurrence
        'longitude': str(location[1]),   # longitude of the place
        'latitude': str(location[2]),    # latitude of the place
        'strength': '',                  # disaster strength
        'occurTime': release_time,       # occurrence time
        'loss': str(loss_number),        # economic loss
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'pictures': detail['pictures'],  # multiple paths separated by semicolons
        'more': '',                      # special field
        'regional': '国内',
        'province': location[3],         # first-level administrative division
        'country': location[4],          # country where the disaster occurred
        'current_website': '气象科普网',  # website the record came from
        'isreleasetime': '1',            # occurrence time replaced by release time
        'isrellonandlat': '0',
    }
    resultSun = {
        'title': result['title'],
        'originalText': result['originalText'],
        'pictures': result['pictures'],
    }

    table = 'rainstorm_ZH002'
    try:
        res = postgreCommand.insertData(result, resultSun, table)
        if res == 1:
            print(table, '数据插入成功!')
        elif res == 0:
            print(table, '数据更新成功!')
    except Exception as e:
        print("插入数据失败", str(e))
def analyzeInfo(item):
    """Parse one search-result entry (website tag '新浪网') and store it.

    Link and title come from the first ``<a>`` inside the first ``<h2>``; the
    release time is the 2nd and 3rd whitespace-separated tokens of the first
    ``<span>`` in that heading, joined by a space. Source, body text and
    pictures come from ``get_original(link)`` (indices 0, 1 and 2); index 3 is
    used as a flag deciding whether the entry is processed at all. The record
    is passed to ``postgreCommand.insertData`` under the name
    ``collapse_ZH005``.

    Args:
        item: Parsed HTML node for one entry; must provide ``find_all``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    heading = item.find_all('h2', limit=1)[0]
    anchors = heading.find_all('a')
    spans = heading.find_all('span')
    span_tokens = spans[0].get_text().strip().split()

    link = anchors[0]['href']
    headline = anchors[0].get_text().strip()
    release_time = span_tokens[1] + ' ' + span_tokens[2]

    article = get_original(link)
    # NOTE(review): the file's indentation was lost; this assumes everything
    # after the flag check belonged to the `if` body -- confirm.
    if not article[3]:
        return

    body = article[1]
    full_text = headline + ',' + body
    location = address.placeSingle(full_text)
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)
    loss_number = toYc.loss(full_text)

    result = {
        'link': link,
        'title': headline,
        'releaseTime': release_time,
        'source': article[0],
        'originalText': body,
        'pictures': article[2],          # multiple paths separated by semicolons
        'disasterid': '1000104',
        'place': location[0],            # place of occurrence
        'longitude': str(location[1]),   # longitude of the place
        'latitude': str(location[2]),    # latitude of the place
        'loss': str(loss_number),        # economic loss
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'province': location[3],         # first-level administrative division
        'country': location[4],          # country where the disaster occurred
        'strength': '',
        'occurTime': release_time,
        'more': '',                      # special field
        'regional': '国内',               # region where the news was published
        'current_website': '新浪网',      # website the record came from
        'isreleasetime': '1',            # occurrence time replaced by release time
        'isrellonandlat': '0',
    }
    resultSun = {
        'title': result['title'],
        'originalText': result['originalText'],
        'pictures': result['pictures'],
    }

    table = 'collapse_ZH005'
    try:
        res = postgreCommand.insertData(result, resultSun, table)
        if res == 1:
            print(table, '数据插入成功!')
        elif res == 0:
            print(table, '数据更新成功!')
    except Exception as e:
        print("插入数据失败", str(e))
def analyzeInfo(item):
    """Parse one list entry (website tag '天气网') and store it as a rainstorm record.

    Link and title come from the first ``<a>`` inside the first ``<h4>``; the
    release time is the text of the first ``<i>`` with its first three
    characters dropped, parsed and reformatted. Body text, source and pictures
    come from ``get_original(link)`` (indices 0, 1 and 2). The record is
    passed to ``postgreCommand.insertData`` under the name ``rainstorm_ZH006``.

    Args:
        item: Parsed HTML node for one entry; must provide ``find_all``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    heading = item.find_all('h4', limit=1)
    anchors = heading[0].find_all('a', limit=1)
    time_nodes = item.find_all('i', limit=1)

    # Skip the first three characters of the stripped text before parsing.
    time_text = time_nodes[0].get_text().strip()[3:]
    release_time = parser.parse(time_text).strftime('%Y-%m-%d %H:%M:%S')

    result = {}
    result['link'] = anchors[0]['href']                   # news link
    result['title'] = anchors[0].get_text().strip()       # news title
    result['releaseTime'] = release_time                  # release time

    article = get_original(result['link'])
    result['source'] = article[1]                         # news source
    result['originalText'] = article[0]                   # original news text

    full_text = result['title'] + ',' + result['originalText']
    location = address.placeMany(full_text)
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)
    loss_number = toYc.loss(full_text)

    result.update({
        'disasterid': '10107',           # disaster type
        'place': location[0],            # place of occurrence
        'longitude': str(location[1]),   # longitude of the place
        'latitude': str(location[2]),    # latitude of the place
        'strength': '',
        'occurTime': release_time,
        'loss': str(loss_number),        # economic loss
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'pictures': article[2],          # multiple paths separated by semicolons
        'more': '',                      # special field
        'regional': '国内',               # region where the news was published
        'province': location[3],         # first-level administrative division
        'country': location[4],          # country where the disaster occurred
        'current_website': '天气网',      # website the record came from
        'isreleasetime': '1',            # occurrence time replaced by release time
        'isrellonandlat': '0',
    })

    resultSun = {key: result[key] for key in ('title', 'originalText', 'pictures')}

    table = 'rainstorm_ZH006'
    try:
        res = postgreCommand.insertData(result, resultSun, table)
        if res == 1:
            print(table, '数据插入成功!')
        elif res == 0:
            print(table, '数据更新成功!')
    except Exception as e:
        print("插入数据失败", str(e))
def analyzeInfo_one(item):
    """Parse one www.cibeicn.com list entry and store it via ``mysqlCommand``.

    The record id is ``mysqlCommand.getLastId() + 1``. The first ``<a>`` in
    ``item`` supplies the article link; source line, body text and release
    time are fetched through ``get_source``, ``get_original`` and
    ``get_releaseTime``. The place is the concatenation of the first three
    columns of the first row of ``cpca.transform([body])``; when it is
    non-empty, ``geocode`` supplies longitude and latitude. The record is
    passed to ``mysqlCommand.insertData`` under the name ``rainstorm_ZH001``.

    Args:
        item: Parsed HTML node for one list entry. It must provide
            ``find_all`` and ``find`` (presumably a BeautifulSoup tag --
            confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    anchors = item.find_all('a')
    record_id = int(mysqlCommand.getLastId()) + 1
    link = 'http://www.cibeicn.com' + anchors[0]['href']

    # Take the first match of the source marker; fall back to an empty string
    # instead of raising IndexError when the page has no such marker.
    source_match = re.search(r'来源:(.+)', get_source(link))
    source = source_match.group(1) if source_match else ''
    body = get_original(link)

    # The release time is stored as its bare digits (no reformatting here).
    release_time = re.sub(r"\D", "", get_releaseTime(link))

    # Prefer the <strong> text as the title; otherwise use the link text.
    strong = item.find('strong')
    if strong is None:
        headline = anchors[0].get_text().strip()
    else:
        headline = strong.get_text().strip()

    first_row = cpca.transform([body]).values[0]
    place = first_row[0] + first_row[1] + first_row[2]
    if place != '':
        coords = geocode(place)
        longitude, latitude = coords[0], coords[1]
    else:
        longitude, latitude = '0', '0'

    # Body first, then title, with no separator between them.
    full_text = body + headline
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)

    result = {
        'id': str(record_id),
        'disasterid': '0006',            # category: rainstorm
        'link': link,                    # news link
        'source': source,                # news source
        'originalText': body,            # original news text
        'releaseTime': release_time,     # release time
        'title': headline,               # title
        'place': place,                  # place of occurrence
        'longitude': longitude,          # longitude of the place
        'latitude': latitude,            # latitude of the place
        'strength': '暴雨',               # disaster strength
        'occurTime': '',                 # occurrence time
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'loss': '0',                     # economic loss
        'pictures': '',                  # multiple paths separated by semicolons
        'more': '',                      # special field
    }

    try:
        # Insert the record; the original comment says an already existing
        # record is not inserted again. The return value was only stored in
        # an unused local, so it is no longer kept.
        mysqlCommand.insertData(result, 'rainstorm_ZH001')
    except Exception as e:
        # Print the error message of the failed insert.
        print("插入数据失败", str(e))
def analyzeInfo(item):
    """Parse one search-result entry and store it as a debris-flow record.

    Title, link, release time and source are read from ``item``; the body text
    comes from ``get_original(link)``. The place is built from the entities
    that ``fool.analysis`` tags as ``'location'`` in the body. The record id
    is ``mysqlCommand.getLastId() + 1`` and the record is passed to
    ``mysqlCommand.insertData`` under the name ``debrisFlow``.

    Args:
        item: Parsed HTML node for one entry; must provide ``find_all``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    first_anchor = item.find_all('a', limit=1)
    heading = item.find_all('h3', attrs={'class': 'tit'}, limit=1)
    link_span = heading[0].find_all('span', limit=1)
    meta_div = item.find_all('div', attrs={'class': 'src-tim'}, limit=1)
    source_span = meta_div[0].find_all('span', attrs={'class': 'src'}, limit=1)
    time_span = meta_div[0].find_all('span', attrs={'class': 'tim'}, limit=1)

    record_id = int(mysqlCommand.getLastId()) + 1

    # The article URL is stored in the span's 'lanmu1' attribute.
    link = link_span[0]['lanmu1']
    headline = first_anchor[0].get_text().strip()
    # The release time is stored as its bare digits (no reformatting here).
    release_time = re.sub(r"\D", "", time_span[0].get_text().strip())
    source = source_span[0].get_text().strip().replace('来源:', '')
    body = get_original(link)

    _words, ners = fool.analysis([body])
    place = ''
    for entity in ners[0]:
        if entity[2] != 'location':
            continue
        # NOTE(review): collection stops at the first location whose text is
        # already contained in the accumulated string (substring test). This
        # mirrors the original `break`; confirm `continue` was not intended.
        if entity[3] in place:
            break
        place = place + entity[3] + ','

    full_text = body + ',' + headline
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)

    result = {
        'id': str(record_id),
        'disasterid': '0008',            # category: debris flow
        'link': link,
        'title': headline,               # news title
        'releaseTime': release_time,     # release time
        'source': source,                # news source
        'originalText': body,            # original news text
        'place': place,                  # place of occurrence
        'longitude': '0',                # longitude of the place
        'latitude': '0',                 # latitude of the place
        'strength': '',                  # disaster strength
        'occurTime': '',                 # occurrence time
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'loss': '0',                     # economic loss
        'pictures': '',                  # multiple paths separated by semicolons
        'more': '',                      # special field
    }

    try:
        # Insert the record; the original comment says an already existing
        # record is not inserted again. The return value was only stored in
        # an unused local, so it is no longer kept.
        mysqlCommand.insertData(result, 'debrisFlow')
    except Exception as e:
        # Print the error message of the failed insert.
        print("插入数据失败", str(e))
def analyzeInfo(item):
    """Parse one japan.people.com.cn list entry and store it if classified.

    Link and title come from the first ``<a>``; the release time is the text
    of the first ``<span>``, parsed and reformatted. Body text, source and
    pictures come from ``analyzeInfoSun(link)`` (indices 0, 1 and 2). The
    disaster id is whatever ``disasterNB`` returns for "title + body"; an id
    of ``'0'`` means the entry is skipped. Otherwise the record is passed to
    ``postgreCommand.insertData`` under the name ``comprehensive_ZH001``.

    Args:
        item: Parsed HTML node for one entry; must provide ``find``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    anchor = item.find('a')
    link = 'http://japan.people.com.cn' + anchor['href']
    headline = anchor.get_text().strip()

    published = parser.parse(item.find('span').get_text().strip())
    release_time = published.strftime('%Y-%m-%d %H:%M:%S')

    analyze = analyzeInfoSun(link)
    body = analyze[0]
    full_text = headline + ',' + body

    disaster_id = disasterNB(full_text)   # news category
    # NOTE(review): the file's indentation was lost; this assumes everything
    # after the id check belonged to the `if` body -- confirm.
    if disaster_id == '0':
        return

    location = placeSingle(full_text)
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)
    loss_number = toYc.loss(full_text)

    result = {
        'link': link,
        'title': headline,
        'releaseTime': release_time,
        'source': analyze[1],
        'originalText': body,
        'disasterid': disaster_id,
        'place': location[0],            # place of occurrence
        'longitude': str(location[1]),   # longitude of the place
        'latitude': str(location[2]),    # latitude of the place
        'strength': '',                  # disaster strength
        'occurTime': release_time,       # occurrence time
        'loss': str(loss_number),        # economic loss
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'pictures': analyze[2],          # multiple paths separated by semicolons
        'more': '',                      # special field
        'regional': '国内',               # region where the news was published
        'province': '',                  # first-level administrative division
        'country': '日本',                # country where the disaster occurred
        'current_website': '人民网',      # website the record came from
        'isreleasetime': '1',            # occurrence time replaced by release time
        'isrellonandlat': '0',
    }
    resultSun = {
        'title': result['title'],
        'originalText': result['originalText'],
        'pictures': result['pictures'],
    }

    table = 'comprehensive_ZH001'
    try:
        res = postgreCommand.insertData(result, resultSun, table)
        if res == 1:
            print(table, '数据插入成功!')
        elif res == 0:
            print(table, '数据更新成功!')
    except Exception as e:
        print("插入数据失败", str(e))
def analyzeInfo_One(item):
    """Parse one www.qxkp.net list entry and store it via ``mysqlCommand``.

    The first ``<div>`` of ``item`` holds the title link and the second holds
    the release time (stored as its bare digits). Source, body text and
    pictures are read from the mapping returned by ``analyzeInfo_Two(link)``.
    The place is the concatenation of the first three columns of the first
    row of ``cpca.transform([body])``; when it is non-empty, ``geocode``
    supplies longitude and latitude. The record id is
    ``mysqlCommand.getLastId() + 1`` and the record is passed to
    ``mysqlCommand.insertData`` under the name ``rainstorm_ZH002``.

    Args:
        item: Parsed HTML node for one entry; must provide ``find_all``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    divs = item.find_all('div')
    # Look the anchor up once; it provides both the title and the href.
    anchor = divs[0].find('a')
    headline = anchor.get_text().strip()
    link = 'http://www.qxkp.net' + anchor['href']
    release_time = re.sub(r"\D", "", divs[1].get_text())

    record_id = int(mysqlCommand.getLastId()) + 1

    detail = analyzeInfo_Two(link)
    body = detail['originalText']

    first_row = cpca.transform([body]).values[0]
    place = first_row[0] + first_row[1] + first_row[2]
    if place != '':
        coords = geocode(place)
        longitude, latitude = coords[0], coords[1]
    else:
        longitude, latitude = '0', '0'

    # Body first, then title, with no separator between them.
    full_text = body + headline
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)

    result = {
        'id': str(record_id),
        'disasterid': '0006',            # category: rainstorm
        'link': link,                    # news link
        'source': detail['source'],      # news source
        'originalText': body,            # original news text
        'releaseTime': release_time,     # release time
        'title': headline,               # title
        'place': place,                  # place of occurrence
        'longitude': longitude,          # longitude of the place
        'latitude': latitude,            # latitude of the place
        'strength': '暴雨',               # disaster strength
        'occurTime': '',                 # occurrence time
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'loss': '0',                     # economic loss
        'pictures': detail['pictures'],  # multiple paths separated by semicolons
        'more': '',                      # special field
    }

    try:
        # Insert the record; the original comment says an already existing
        # record is not inserted again. The return value was only stored in
        # an unused local, so it is no longer kept.
        mysqlCommand.insertData(result, 'rainstorm_ZH002')
    except Exception as e:
        # Print the error message of the failed insert.
        print("插入数据失败", str(e))
def analyzeInfo(item):
    """Parse one table row (website tag '大众网') and store it as a tsunami record.

    The stripped text of the first ``<td>`` is used as the article link.
    ``get_original(link)`` supplies title, source, release time, body text and
    pictures (indices 0 to 4). Location and casualty fields are derived from
    "title + body" and the record is passed to ``postgreCommand.insertData``
    under the name ``tsunami_ZH003``.

    Args:
        item: Parsed HTML node for one row; must provide ``find_all``
            (presumably a BeautifulSoup tag -- confirm against the caller).

    Returns:
        None. Errors raised by the insert are caught and printed.
    """
    cells = item.find_all('td', limit=1)
    link = cells[0].get_text().strip()

    article = get_original(link)
    headline = article[0]
    release_time = article[2]
    body = article[3]

    full_text = headline + ',' + body
    location = address.placeSingle(full_text)
    death = toYc.death(full_text)
    injured = toYc.Injured(full_text)
    loss_number = toYc.loss(full_text)

    result = {
        'link': link,                    # news link
        'title': headline,               # news title
        'source': article[1],            # news source
        'releaseTime': release_time,     # release time
        'originalText': body,            # original news text
        'disasterid': '10205',           # disaster type
        'pictures': article[4],          # news pictures
        'place': location[0],            # place of occurrence
        'longitude': str(location[1]),   # longitude of the place
        'latitude': str(location[2]),    # latitude of the place
        'loss': str(loss_number),        # economic loss
        'injured': str(injured),         # number of injured
        'death': str(death),             # number of deaths
        'province': location[3],         # first-level administrative division
        'country': location[4],          # country where the disaster occurred
        'strength': '',
        'occurTime': release_time,
        'more': '',                      # special field
        'regional': '国内',               # region where the news was published
        'current_website': '大众网',      # website the record came from
        'isreleasetime': '1',            # occurrence time replaced by release time
        'isrellonandlat': '1',
    }
    resultSun = {key: result[key] for key in ('title', 'originalText', 'pictures')}

    table = 'tsunami_ZH003'
    try:
        res = postgreCommand.insertData(result, resultSun, table)
        if res == 1:
            print(table, '数据插入成功!')
        elif res == 0:
            print(table, '数据更新成功!')
    except Exception as e:
        print("插入数据失败", str(e))