def getTitleAndContentList(strline):
    """Split *strline* (HTML, one tag/fragment per line) into articles.

    A new article starts at any line matching a numbered heading such as
    ``>1.`` / ``>1、`` / ``>[1]`` / ``>【1】``.  Lines collected before the
    first heading form their own (title-less) segment, matching the original
    segmentation behavior.

    Returns a list of ``[title, content]`` pairs, where both elements are
    plain text extracted via ``crawl.extractContentFromHtmlString``.
    """
    # Raw string avoids the deprecated invalid escape sequences the original
    # non-raw literal relied on ('\[', '\]').
    pattern = re.compile(r'(>[0-9][.)::、,,])|(>[【|\[][0-9][\]|】])', re.S)
    articleList = []
    boatList = []  # lines of the segment currently being collected
    finalArticleList = []
    # Segmentation: a heading line flushes the current segment and opens
    # a new one; any other line is appended to the current segment.
    for item in strline.split('\n'):
        if pattern.findall(item):
            if boatList:
                articleList.append(boatList)
            boatList = [item]
        else:
            boatList.append(item)
    # BUG FIX: the original never flushed the final segment, so the last
    # article of every input was silently dropped.
    if boatList:
        articleList.append(boatList)
    # Formatting: first line of each segment is the title, the remainder
    # (joined) is the content.
    for item in articleList:
        if item[0]:
            title = crawl.extractContentFromHtmlString(item[0])
        else:
            title = ''
        content = crawl.extractContentFromHtmlString(''.join(item[1:]))
        finalArticleList.append([title, content])
    return finalArticleList
def getProductCompanyCreateTimeAndArea(soup):
    """Extract the creation time and area from the ``div.info`` section.

    Returns a ``(time, area)`` tuple of strings; either element is ``''``
    when the corresponding piece of information is absent.
    """
    createTime = ''
    area = ''
    infoSoup = soup.find('div', class_='info')
    if infoSoup:
        # Text between the info div's opening tag and the keyword paragraph.
        # Raw string fixes the deprecated '\s' escape; the search is run once
        # instead of twice as in the original.
        match = re.search(r'class="info">\s*(.*?)\s*<p class="keyword">',
                          str(infoSoup), re.S)
        if match:
            timeAndAreaList = crawl.extractContentFromHtmlString(
                match.group(1))
            # Drop the '/' separators between the two fields.
            cleanedList = [item for item in timeAndAreaList if item != '/']
            if len(cleanedList) == 2:
                createTime, area = cleanedList
            elif len(cleanedList) == 1:
                temp = cleanedList[0]
                # A lone field containing 年/月/日 is a date; otherwise it
                # is treated as the area.
                if '年' in temp or '月' in temp or '日' in temp:
                    createTime = temp
                else:
                    area = temp
    return createTime, area
def getInfoFromHtml(url):
    """Crawler main routine: fetch *url* and extract one article's fields.

    Returns ``[title, articleTime, url, originTags, tags, author,
    contentUrlList, content]`` on success, or ``-1`` when the expected page
    structure is missing.  Commas are stripped from the text fields
    (presumably because the caller writes comma-separated output — TODO
    confirm against the caller).
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.94 Safari/537.36'
    }
    r = requests.get(url=url, headers=header)
    html = r.content.decode('utf-8')
    # FIX: name the parser explicitly.  The original BeautifulSoup(html)
    # emitted GuessedAtParserWarning and its behavior depended on which
    # parser happened to be installed; html.parser is always available.
    soup = BeautifulSoup(html, 'html.parser')
    # Each soup.find(...) is bound once instead of being executed twice
    # (once as the condition, once for the assignment) as in the original.
    oneSoup = soup.find('div', id='log-send-article')
    if oneSoup:
        bodySoup = oneSoup.find('div', id='related-article-wrap')
        if bodySoup:
            # Output fields, in the order they are returned.
            title = ''
            author = ''
            articleTime = ''
            contentUrlList = ''
            content = ''
            tags = ''
            originTags = ''
            # Title
            titleTag = bodySoup.find('h1', class_='t-h1')
            if titleTag:
                title = titleTag.string.replace(',', '')
            # Author
            authorTag = bodySoup.find('span', class_='author-name')
            if authorTag:
                author = authorTag.find('a').string.replace(',', '')
            # Publication time
            timeTag = bodySoup.find('span', class_='article-time')
            if timeTag:
                articleTime = timeTag.string.replace(',', '')
            # Article body: URLs, plain-text content, and extracted tags
            contentSoup = bodySoup.find('div', id='article_content')
            if contentSoup:
                contentHtml = str(contentSoup)
                if contentHtml:
                    # URL links found inside the content
                    contentUrlList = getUrlListFromContentHtml(contentHtml)
                    # Plain-text content
                    content = crawl.extractContentFromHtmlString(
                        contentHtml).replace(',', '')
                    # Tags extracted from the content
                    if content:
                        tags = handle.extractTagsFromContent(content)
            return [
                title, articleTime, url, originTags, tags, author,
                contentUrlList, content
            ]
    return -1
def getCompanyIntroduce(soup):
    """Return the introduction text from the ``div.info`` section.

    Returns ``''`` when the section or the expected markup is absent.
    """
    infoSoup = soup.find('div', class_='info')
    if infoSoup:
        # Text between the closing </p> of the "link" paragraph and the
        # closing </div>.  Raw string fixes the deprecated '\s' escapes;
        # re.findall is run once instead of twice as in the original.
        found = re.findall(r'link">\s*.*?\s*</p>\s*(.*?)\s*</div>',
                           str(infoSoup), re.S)
        if found:
            # Join the extracted fragments (replaces the original
            # quadratic '+' concatenation loop).
            contentList = crawl.extractContentFromHtmlString(found[0])
            return ''.join(contentList)
    return ''
def getTimeTypeAndMoney(soup):
    """Extract (investTime, investType, investMoney) from ``div.info``.

    Each fragment of the info text is classified by marker substrings.
    A single fragment may populate several fields, and when several
    fragments match the same category the last one wins — both behaviors
    match the original implementation.  Unmatched fields stay ``''``.
    """
    # Marker substrings used for classification.  Renamed from timeSet /
    # typeSet / moneySet: they are lists, and the originals shadowed the
    # module name `time` and the builtin `type` via their loop variables.
    timeMarkers = ['年', '月', '日']
    typeMarkers = [
        '不详', 'E轮', 'F轮', 'IPO上市及以后', 'D轮', 'A+轮', '其他轮', 'Pre-A',
        'C轮', '天使', '种子', '并购', '股权投资', 'B轮', 'A轮'
    ]
    moneyMarkers = [
        '万日元', '万韩国元', '万新加坡元', '万人民币', '万港币', '万英镑',
        '万澳大利亚元', '万欧元', '万美元', '万新台币'
    ]
    investTime = ''
    investType = ''
    investMoney = ''
    infoSoup = soup.find('div', class_='info')
    if infoSoup:
        # Raw string fixes the deprecated '\s' escapes; the search is run
        # once instead of twice as in the original.
        match = re.search(r'info">\s*(.*?)\s*<p class="keyword">',
                          str(infoSoup), re.S)
        if match:
            contentList = crawl.extractContentFromHtmlString(match.group(1))
            for content in contentList:
                # any() replaces the original inner for/break loops.
                if any(marker in content for marker in timeMarkers):
                    investTime = content
                if any(marker in content for marker in typeMarkers):
                    investType = content
                if any(marker in content for marker in moneyMarkers):
                    investMoney = content
    return investTime, investType, investMoney
def getVcCompanyCreateTimePlaceAndArea(soup):
    """Extract (createTime, vcCompanyPlace, vcCompanyArea) from ``div.info``.

    Fragments of the info text are classified by marker substrings.  One
    fragment may populate several fields, and a later matching fragment
    overwrites an earlier one — both behaviors match the original.
    Unmatched fields stay ``''``.
    """
    # Marker substrings (renamed from *Set — they are lists).
    timeMarkers = ['年', '月', '日']
    placeMarkers = [
        '市', '省', '香港', '澳门', '台湾', '地区', '共和国', '国', '州', '巴黎',
        '瑞士', '柬埔寨', '城', '台北', '纽约', '加拿大'
    ]
    areaMarkers = ['本土', '外资', '合资', '海外']
    createTime = ''
    vcCompanyPlace = ''
    vcCompanyArea = ''
    infoSoup = soup.find('div', class_='info')
    if infoSoup:
        # Raw string fixes the deprecated '\s' escapes; the search is run
        # once instead of twice as in the original.
        match = re.search(r'class="info">\s*(.*?)\s*<p class="keyword">',
                          str(infoSoup), re.S)
        if match:
            timeAndAreaList = crawl.extractContentFromHtmlString(
                match.group(1))
            for content in timeAndAreaList:
                # any() replaces the original inner for/break loops.
                if any(marker in content for marker in timeMarkers):
                    createTime = content
                if any(marker in content for marker in placeMarkers):
                    vcCompanyPlace = content
                if any(marker in content for marker in areaMarkers):
                    vcCompanyArea = content
    return createTime, vcCompanyPlace, vcCompanyArea
initDic = getInitDic() # 这里将初始化部分提前 initDic['title'] = lineDic['data']['title'] initDic['url'] = lineDic['data']['currentUrl'] if lineDic['data']['user']: initDic['author'] = lineDic['data']['user']['name'] else: initDic['author'] = '' # 提取时间信息 postTime = getPostTime(lineDic['data']['published_at']) initDic['time'] = postTime # 提取内容content中的url链接 urlList = getUrlListFromContentHtml(lineDic['data']['content']) initDic['content_url'] = urlList # 提取内容 content = crawl.extractContentFromHtmlString( lineDic['data']['content']) initDic['content'] = content # 提取原始标签 originTags = getOriginTag(lineDic['data']['extraction_tags']) initDic['originTag'] = originTags # 提取标签 tags = handle.extractTagsFromContent(content) initDic['tag'] = tags jsonRecord = json.dumps(initDic, ensure_ascii=False) fw.write(jsonRecord + '\n') # print(i,jsonRecord) # i += 1 except Exception as ex: print('这条记录数据有问题...') i += 1
if isValid(lineDic['title']): # try: # 初始化记录字典 initDic = getInitDic() # 这里将初始化部分提前 initDic['title'] = lineDic['title'] initDic['url'] = lineDic['posturl'] initDic['author'] = lineDic['name'] # 提取时间信息 postTime = getTimeFromJson(lineDic['gtime'], timeStamp) initDic['time'] = postTime # 提取内容content中的url链接 urlList = getUrlListFromContentHtml(lineDic['contenthtml']) initDic['content_url'] = urlList # 提取内容 content = crawl.extractContentFromHtmlString( lineDic['contenthtml']) initDic['content'] = content # -提取标签- tags = handle.extractTagsFromContent(content) initDic['tag'] = tags jsonRecord = json.dumps(initDic, ensure_ascii=False) fw.write(jsonRecord + '\n') # print(i,jsonRecord) # print(i) # i += 1 # except Exception as ex: # print(ex) i += 1 else: break