import os

# Assumption: crawl, handle, crawlInfo and linkList are this project's own
# helper modules. The plain-import form below is assumed, since the original
# import lines are not part of this excerpt.
import crawl
import handle
import crawlInfo
import linkList


def getEventInfoList(linkIndexList, logFileName):
    hostName = 'http://newseed.pedaily.cn'
    eventInfoList = []
    if linkIndexList:
        i = 1
        for linkIndex in linkIndexList:
            link = hostName + linkIndex.strip()
            # statusCode = handle.getUrlStatus(link)
            # if statusCode == 200:
            if link:
                ## Extract the event fields
                # Fetch the parsed page (BeautifulSoup object)
                hooshSoup = crawl.getHooshSoup(link, logFileName)
                if hooshSoup:
                    # print('got hooshSoup')
                    # Initialise the record fields
                    investTitle = ''
                    investTime = ''
                    investType = ''
                    investMoney = ''
                    productCompanyInfoList = ''
                    investCompanyInfoList = ''
                    investIntroduce = ''
                    try:
                        if hooshSoup.find('div', class_='main').find(
                                'div', class_='record invest').find(
                                    'div', class_='col-md-860'):
                            # Narrow down to the smallest HTML container
                            soup = hooshSoup.find('div', class_='main').find(
                                'div', class_='record invest').find(
                                    'div', class_='col-md-860')
                            # Event title
                            investTitle = getEventTitle(soup)
                            # Time, type and amount of the investment
                            investTime, investType, investMoney = getTimeTypeAndMoney(soup)
                            # Companies involved in the deal, each returned as a
                            # list of (name, link) pairs
                            productCompanyInfoList, investCompanyInfoList = getInvestRelatedCompany(soup)
                            # Event description
                            investIntroduce = getInvestIntroduce(soup)
                        else:
                            print('This record is no longer available...')
                        ## Assemble the fields into one record
                        # print('reached the data-validation step')
                        recordList = createRecordList(
                            hostName, investTitle, investTime, investType,
                            investMoney, productCompanyInfoList,
                            investCompanyInfoList, investIntroduce)
                        if recordList != -1:
                            # Append the finished record to the result list
                            eventInfoList.append(recordList)
                            print(recordList)
                    except Exception as ex:
                        print(ex)
                print('Processed record no.', str(i))
                i += 1
    return eventInfoList
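# Hedged usage sketch for getEventInfoList, not part of the original module.
# The index path and log file name below are illustrative placeholders; the
# function only assumes relative hrefs that can be appended to the host name.
def demoGetEventInfoList():
    sampleIndexList = ['/invest/123456.shtml']  # hypothetical event index path
    for record in getEventInfoList(sampleIndexList, 'crawl_event.log'):
        print(record)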
def getProductCompanyInfoList(linkIndexList, logFileName):
    hostName = 'http://newseed.pedaily.cn'
    productInfoList = []
    if linkIndexList:
        i = 1
        for linkIndex in linkIndexList:
            link = hostName + linkIndex.strip()
            # statusCode = handle.getUrlStatus(link)
            # if statusCode == 200:
            if link:
                ## Extract the company fields
                # Fetch the parsed page (BeautifulSoup object)
                hooshSoup = crawl.getHooshSoup(link, logFileName)
                if hooshSoup:
                    # print('got hooshSoup')
                    # Initialise the record fields
                    productCompanyName = ''
                    productCompanyFullName = ''
                    createTime = ''
                    area = ''
                    productCompanyHomepage = ''
                    companyIntroduce = ''
                    try:
                        if hooshSoup.find('div', class_='main').find(
                                'div', class_='record').find(
                                    'div', class_='col-md-860'):
                            # Narrow down to the smallest HTML container
                            soup = hooshSoup.find('div', class_='main').find(
                                'div', class_='record').find(
                                    'div', class_='col-md-860')
                            # Company display name and registered name
                            productCompanyName, productCompanyFullName = crawlInfo.getProductCompanyName(soup)
                            # Founding time and region
                            createTime, area = crawlInfo.getProductCompanyCreateTimeAndArea(soup)
                            # Company homepage link
                            productCompanyHomepage = crawlInfo.getHomepage(soup)
                            # Company description
                            companyIntroduce = getCompanyIntroduce(soup)
                        else:
                            print('This record is no longer available...')
                        ## Assemble the fields into one record
                        recordList = crawlInfo.createProductCompanyRecordList(
                            productCompanyName, link, productCompanyFullName,
                            createTime, area, productCompanyHomepage,
                            companyIntroduce)
                        if recordList != -1:
                            # Append the finished record to the result list
                            productInfoList.append(recordList)
                            print(recordList)
                    except Exception as ex:
                        print(ex)
                print('Processed record no.', str(i))
                i += 1
    return productInfoList
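# The chained .find() calls in both functions above raise AttributeError as
# soon as one container is missing, which is what the broad try/except
# catches. The helper below is a sketch of an alternative, not part of the
# original module; it assumes callers pass a BeautifulSoup tag.
def findChain(soup, *selectors):
    '''Follow a chain of (name, class_) selectors; return None if any hop fails.'''
    node = soup
    for name, cls in selectors:
        if node is None:
            return None
        node = node.find(name, class_=cls)
    return node

# e.g. findChain(hooshSoup, ('div', 'main'), ('div', 'record'), ('div', 'col-md-860'))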
def getTotalRecordNum(initUrl):
    '''Return the total record count shown on the page (as a string).'''
    recordNum = ''
    # Validate the URL first
    statusCode = handle.getUrlStatus(initUrl)
    if statusCode == 200:
        # Fetch the parsed page
        hooshSoup = crawl.getHooshSoup(initUrl)
        # Read the total record count
        if hooshSoup.find('span', id='total'):
            recordNum = hooshSoup.find('span', id='total').string
    return recordNum
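# getTotalRecordNum returns the count as a string (the text of the '#total'
# span), and callers still have to derive the listing-page URLs themselves.
# Hedged sketch of that step: the page size and the '?page=N' query parameter
# are illustrative assumptions, not read from the target site.
import math

def buildPageLinkList(initUrl, recordNum, pageSize=20):
    total = int(recordNum)  # recordNum arrives as a string
    pageCount = math.ceil(total / pageSize)
    return [initUrl + '?page=' + str(n) for n in range(1, pageCount + 1)]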
def getEventLinkIndexList(pageLinkList, logFileName=''):
    '''Collect the event detail-page hrefs from every listing page.'''
    eventLinkIndexList = []
    if pageLinkList:
        i = 1
        for pageLink in pageLinkList:
            if handle.getUrlStatus(pageLink) == 200:
                hooshSoup = crawl.getHooshSoup(pageLink, logFileName)
                if hooshSoup:
                    tbodySoup = hooshSoup.find('tbody')
                    for trTag in tbodySoup.find_all('tr'):
                        linkIndex = trTag.find_all(
                            'td', class_='td6')[0].a.get('href')
                        eventLinkIndexList.append(linkIndex)
                        print('Indexes collected:', str(i), linkIndex)
                        i += 1
    print('Length of eventLinkIndexList:', str(len(eventLinkIndexList)))
    return eventLinkIndexList
def getCompanyLinkIndexList(pageLinkList, logFileName=''):
    '''Collect the company detail-page hrefs from every listing page.'''
    companyLinkIndexList = []
    if pageLinkList:
        i = 1
        for pageLink in pageLinkList:
            if handle.getUrlStatus(pageLink) == 200:
                hooshSoup = crawl.getHooshSoup(pageLink, logFileName)
                if hooshSoup:
                    ulSoup = hooshSoup.find('ul', id='newslist')
                    # Iterate the <li> entries (the original named this trTag)
                    for liTag in ulSoup.find_all('li'):
                        linkIndex = liTag.find_all(
                            'div', class_='user-pic')[0].a.get('href')
                        companyLinkIndexList.append(linkIndex)
                        print('Indexes collected:', str(i), linkIndex)
                        i += 1
    print('Length of companyLinkIndexList:', str(len(companyLinkIndexList)))
    return companyLinkIndexList
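# Taken together, the functions above form a three-stage pipeline: count the
# records, collect the per-page index links, then crawl each detail page.
# Minimal sketch, not part of the original module: initUrl and logFileName are
# placeholders, and buildPageLinkList is the assumed helper sketched after
# getTotalRecordNum.
def crawlEventPipeline(initUrl, logFileName):
    recordNum = getTotalRecordNum(initUrl)
    if not recordNum:
        return []
    pageLinkList = buildPageLinkList(initUrl, recordNum)
    linkIndexList = getEventLinkIndexList(pageLinkList, logFileName)
    return getEventInfoList(linkIndexList, logFileName)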
# Walk the index list and crawl the details
# 2 Build the output file path
outputFilePath = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
    'data', 'newseed_data', 'resultSet', outputFileName)
# Collected data records
infoList = []
i = 1
for linkIndex in linkIndexList:
    link = hostName + linkIndex.strip()
    try:
        statusCode = handle.getUrlStatus(link)
        if statusCode == 200:
            ## Extract the event fields
            # Fetch the parsed page (BeautifulSoup object)
            hooshSoup = crawl.getHooshSoup(link)
            # Initialise the record fields
            investTitle = ''
            investTime = ''
            investType = ''
            investMoney = ''
            productCompanyInfoList = ''
            investCompanyInfoList = ''
            investIntroduce = ''
            if hooshSoup.find('div', class_='main').find(
                    'div', class_='record invest').find(
                        'div', class_='col-md-860'):
                # Narrow down to the smallest HTML container
                soup = hooshSoup.find('div', class_='main').find(
                    'div', class_='record invest').find(
                        'div', class_='col-md-860')
                # Event title
                investTitle = linkList.getEventTitle(soup)
                # Time, type and amount of the investment
                investTime, investType, investMoney = linkList.getTimeTypeAndMoney(soup)