def getTitleAndContentList(strline):
    """Split *strline* (HTML, one tag/fragment per line) into articles.

    A new article starts at any line matching a numbered heading such as
    ``>1.`` / ``>1、`` / ``>[1]`` / ``>【1】``.  Lines collected before the
    first heading form their own (title-less) segment, matching the original
    segmentation behavior.

    Returns a list of ``[title, content]`` pairs, where both elements are
    plain text extracted via ``crawl.extractContentFromHtmlString``.
    """
    # Raw string avoids the deprecated invalid escape sequences the original
    # non-raw literal relied on ('\[', '\]').
    pattern = re.compile(r'(>[0-9][.)::、,,])|(>[【|\[][0-9][\]|】])', re.S)
    articleList = []
    boatList = []  # lines of the segment currently being collected
    finalArticleList = []
    # Segmentation: a heading line flushes the current segment and opens
    # a new one; any other line is appended to the current segment.
    for item in strline.split('\n'):
        if pattern.findall(item):
            if boatList:
                articleList.append(boatList)
            boatList = [item]
        else:
            boatList.append(item)
    # BUG FIX: the original never flushed the final segment, so the last
    # article of every input was silently dropped.
    if boatList:
        articleList.append(boatList)
    # Formatting: first line of each segment is the title, the remainder
    # (joined) is the content.
    for item in articleList:
        if item[0]:
            title = crawl.extractContentFromHtmlString(item[0])
        else:
            title = ''
        content = crawl.extractContentFromHtmlString(''.join(item[1:]))
        finalArticleList.append([title, content])
    return finalArticleList
def getProductCompanyCreateTimeAndArea(soup):
    """Extract the creation time and area from the ``div.info`` section.

    Returns a ``(time, area)`` tuple of strings; either element is ``''``
    when the corresponding piece of information is absent.
    """
    createTime = ''
    area = ''
    infoSoup = soup.find('div', class_='info')
    if infoSoup:
        # Text between the info div's opening tag and the keyword paragraph.
        # Raw string fixes the deprecated '\s' escape; the search is run once
        # instead of twice as in the original.
        match = re.search(r'class="info">\s*(.*?)\s*<p class="keyword">',
                          str(infoSoup), re.S)
        if match:
            timeAndAreaList = crawl.extractContentFromHtmlString(
                match.group(1))
            # Drop the '/' separators between the two fields.
            cleanedList = [item for item in timeAndAreaList if item != '/']
            if len(cleanedList) == 2:
                createTime, area = cleanedList
            elif len(cleanedList) == 1:
                temp = cleanedList[0]
                # A lone field containing 年/月/日 is a date; otherwise it
                # is treated as the area.
                if '年' in temp or '月' in temp or '日' in temp:
                    createTime = temp
                else:
                    area = temp
    return createTime, area
def getInfoFromHtml(url):
    """Crawler main routine: fetch *url* and extract one article's fields.

    Returns ``[title, articleTime, url, originTags, tags, author,
    contentUrlList, content]`` on success, or ``-1`` when the expected page
    structure is missing.  Commas are stripped from the text fields
    (presumably because the caller writes comma-separated output — TODO
    confirm against the caller).
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.94 Safari/537.36'
    }
    r = requests.get(url=url, headers=header)
    html = r.content.decode('utf-8')
    # FIX: name the parser explicitly.  The original BeautifulSoup(html)
    # emitted GuessedAtParserWarning and its behavior depended on which
    # parser happened to be installed; html.parser is always available.
    soup = BeautifulSoup(html, 'html.parser')
    # Each soup.find(...) is bound once instead of being executed twice
    # (once as the condition, once for the assignment) as in the original.
    oneSoup = soup.find('div', id='log-send-article')
    if oneSoup:
        bodySoup = oneSoup.find('div', id='related-article-wrap')
        if bodySoup:
            # Output fields, in the order they are returned.
            title = ''
            author = ''
            articleTime = ''
            contentUrlList = ''
            content = ''
            tags = ''
            originTags = ''
            # Title
            titleTag = bodySoup.find('h1', class_='t-h1')
            if titleTag:
                title = titleTag.string.replace(',', '')
            # Author
            authorTag = bodySoup.find('span', class_='author-name')
            if authorTag:
                author = authorTag.find('a').string.replace(',', '')
            # Publication time
            timeTag = bodySoup.find('span', class_='article-time')
            if timeTag:
                articleTime = timeTag.string.replace(',', '')
            # Article body: URLs, plain-text content, and extracted tags
            contentSoup = bodySoup.find('div', id='article_content')
            if contentSoup:
                contentHtml = str(contentSoup)
                if contentHtml:
                    # URL links found inside the content
                    contentUrlList = getUrlListFromContentHtml(contentHtml)
                    # Plain-text content
                    content = crawl.extractContentFromHtmlString(
                        contentHtml).replace(',', '')
                    # Tags extracted from the content
                    if content:
                        tags = handle.extractTagsFromContent(content)
            return [
                title, articleTime, url, originTags, tags, author,
                contentUrlList, content
            ]
    return -1
def getCompanyIntroduce(soup):
    """Return the introduction text from the ``div.info`` section.

    Returns ``''`` when the section or the expected markup is absent.
    """
    infoSoup = soup.find('div', class_='info')
    if infoSoup:
        # Text between the closing </p> of the "link" paragraph and the
        # closing </div>.  Raw string fixes the deprecated '\s' escapes;
        # re.findall is run once instead of twice as in the original.
        found = re.findall(r'link">\s*.*?\s*</p>\s*(.*?)\s*</div>',
                           str(infoSoup), re.S)
        if found:
            # Join the extracted fragments (replaces the original
            # quadratic '+' concatenation loop).
            contentList = crawl.extractContentFromHtmlString(found[0])
            return ''.join(contentList)
    return ''
def getTimeTypeAndMoney(soup):
    """Extract (investTime, investType, investMoney) from ``div.info``.

    Each fragment of the info text is classified by marker substrings.
    A single fragment may populate several fields, and when several
    fragments match the same category the last one wins — both behaviors
    match the original implementation.  Unmatched fields stay ``''``.
    """
    # Marker substrings used for classification.  Renamed from timeSet /
    # typeSet / moneySet: they are lists, and the originals shadowed the
    # module name `time` and the builtin `type` via their loop variables.
    timeMarkers = ['年', '月', '日']
    typeMarkers = [
        '不详', 'E轮', 'F轮', 'IPO上市及以后', 'D轮', 'A+轮', '其他轮', 'Pre-A',
        'C轮', '天使', '种子', '并购', '股权投资', 'B轮', 'A轮'
    ]
    moneyMarkers = [
        '万日元', '万韩国元', '万新加坡元', '万人民币', '万港币', '万英镑',
        '万澳大利亚元', '万欧元', '万美元', '万新台币'
    ]
    investTime = ''
    investType = ''
    investMoney = ''
    infoSoup = soup.find('div', class_='info')
    if infoSoup:
        # Raw string fixes the deprecated '\s' escapes; the search is run
        # once instead of twice as in the original.
        match = re.search(r'info">\s*(.*?)\s*<p class="keyword">',
                          str(infoSoup), re.S)
        if match:
            contentList = crawl.extractContentFromHtmlString(match.group(1))
            for content in contentList:
                # any() replaces the original inner for/break loops.
                if any(marker in content for marker in timeMarkers):
                    investTime = content
                if any(marker in content for marker in typeMarkers):
                    investType = content
                if any(marker in content for marker in moneyMarkers):
                    investMoney = content
    return investTime, investType, investMoney
def getVcCompanyCreateTimePlaceAndArea(soup):
    """Extract (createTime, vcCompanyPlace, vcCompanyArea) from ``div.info``.

    Fragments of the info text are classified by marker substrings.  One
    fragment may populate several fields, and a later matching fragment
    overwrites an earlier one — both behaviors match the original.
    Unmatched fields stay ``''``.
    """
    # Marker substrings (renamed from *Set — they are lists).
    timeMarkers = ['年', '月', '日']
    placeMarkers = [
        '市', '省', '香港', '澳门', '台湾', '地区', '共和国', '国', '州', '巴黎',
        '瑞士', '柬埔寨', '城', '台北', '纽约', '加拿大'
    ]
    areaMarkers = ['本土', '外资', '合资', '海外']
    createTime = ''
    vcCompanyPlace = ''
    vcCompanyArea = ''
    infoSoup = soup.find('div', class_='info')
    if infoSoup:
        # Raw string fixes the deprecated '\s' escapes; the search is run
        # once instead of twice as in the original.
        match = re.search(r'class="info">\s*(.*?)\s*<p class="keyword">',
                          str(infoSoup), re.S)
        if match:
            timeAndAreaList = crawl.extractContentFromHtmlString(
                match.group(1))
            for content in timeAndAreaList:
                # any() replaces the original inner for/break loops.
                if any(marker in content for marker in timeMarkers):
                    createTime = content
                if any(marker in content for marker in placeMarkers):
                    vcCompanyPlace = content
                if any(marker in content for marker in areaMarkers):
                    vcCompanyArea = content
    return createTime, vcCompanyPlace, vcCompanyArea
initDic = getInitDic() # 这里将初始化部分提前 initDic['title'] = lineDic['data']['title'] initDic['url'] = lineDic['data']['currentUrl'] if lineDic['data']['user']: initDic['author'] = lineDic['data']['user']['name'] else: initDic['author'] = '' # 提取时间信息 postTime = getPostTime(lineDic['data']['published_at']) initDic['time'] = postTime # 提取内容content中的url链接 urlList = getUrlListFromContentHtml(lineDic['data']['content']) initDic['content_url'] = urlList # 提取内容 content = crawl.extractContentFromHtmlString( lineDic['data']['content']) initDic['content'] = content # 提取原始标签 originTags = getOriginTag(lineDic['data']['extraction_tags']) initDic['originTag'] = originTags # 提取标签 tags = handle.extractTagsFromContent(content) initDic['tag'] = tags jsonRecord = json.dumps(initDic, ensure_ascii=False) fw.write(jsonRecord + '\n') # print(i,jsonRecord) # i += 1 except Exception as ex: print('这条记录数据有问题...') i += 1
if isValid(lineDic['title']): # try: # 初始化记录字典 initDic = getInitDic() # 这里将初始化部分提前 initDic['title'] = lineDic['title'] initDic['url'] = lineDic['posturl'] initDic['author'] = lineDic['name'] # 提取时间信息 postTime = getTimeFromJson(lineDic['gtime'], timeStamp) initDic['time'] = postTime # 提取内容content中的url链接 urlList = getUrlListFromContentHtml(lineDic['contenthtml']) initDic['content_url'] = urlList # 提取内容 content = crawl.extractContentFromHtmlString( lineDic['contenthtml']) initDic['content'] = content # -提取标签- tags = handle.extractTagsFromContent(content) initDic['tag'] = tags jsonRecord = json.dumps(initDic, ensure_ascii=False) fw.write(jsonRecord + '\n') # print(i,jsonRecord) # print(i) # i += 1 # except Exception as ex: # print(ex) i += 1 else: break