Example #1
def getEventInfoList(linkIndexList, logFileName):
    hostName = 'http://newseed.pedaily.cn'
    eventInfoList = []
    if linkIndexList:
        i = 1
        for linkIndex in linkIndexList:
            link = hostName + linkIndex.strip()
            # statusCode = handle.getUrlStatus(link)
            # if statusCode == 200:
            if link:
                ## Fetch the relevant information
                # Get the BeautifulSoup object for this page
                hooshSoup = crawl.getHooshSoup(link, logFileName)
                if hooshSoup:
                    # Initialize the record fields
                    investTitle = ''
                    investTime = ''
                    investType = ''
                    investMoney = ''
                    productCompanyInfoList = ''
                    investCompanyInfoList = ''
                    investIntroduce = ''
                    try:
                        # Narrow down to the smallest HTML element that
                        # holds the record
                        soup = hooshSoup.find('div', class_='main').find(
                            'div', class_='record invest').find(
                                'div', class_='col-md-860')
                        if soup:
                            # Get the event title
                            investTitle = getEventTitle(soup)
                            # Get the time, round type, and amount
                            investTime, investType, investMoney = getTimeTypeAndMoney(
                                soup)
                            # Get the names and links of the related companies
                            # (each company is returned as a (name, link) pair)
                            productCompanyInfoList, investCompanyInfoList = getInvestRelatedCompany(
                                soup)
                            # Get the event description
                            investIntroduce = getInvestIntroduce(soup)
                        else:
                            print('The data for this record has been lost...')
                        ## Assemble the fields into a record list
                        recordList = createRecordList(hostName, investTitle,
                                                      investTime, investType,
                                                      investMoney,
                                                      productCompanyInfoList,
                                                      investCompanyInfoList,
                                                      investIntroduce)
                        if recordList != -1:
                            # Append the finished record to the result list
                            eventInfoList.append(recordList)
                            print(recordList)
                    except Exception as ex:
                        print(ex)
            print('Processed record #', i)
            i += 1
    return eventInfoList
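
The examples rely on an external helper, crawl.getHooshSoup, whose implementation is not shown. A minimal sketch of what such a helper might look like, assuming it is built on requests and BeautifulSoup and that logFileName names a plain-text error log (the timeout value and log format are illustrative assumptions, not part of the source):

import requests
from bs4 import BeautifulSoup


def getHooshSoup(url, logFileName='', timeout=10):
    '''Fetch a page and return its BeautifulSoup object, or None on failure.'''
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Let requests guess the encoding from the response body
        response.encoding = response.apparent_encoding
        return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as ex:
        if logFileName:
            # Append the failed URL and the error to the log file (assumed format)
            with open(logFileName, 'a', encoding='utf-8') as logFile:
                logFile.write('{}\t{}\n'.format(url, ex))
        return None
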
Example #2
def getProductCompanyInfoList(linkIndexList, logFileName):
    hostName = 'http://newseed.pedaily.cn'
    productInfoList = []
    if linkIndexList:
        i = 1
        for linkIndex in linkIndexList:
            link = hostName + linkIndex.strip()
            # statusCode = handle.getUrlStatus(link)
            # if statusCode == 200:
            if link:
                ## Fetch the relevant information
                # Get the BeautifulSoup object for this page
                hooshSoup = crawl.getHooshSoup(link, logFileName)
                if hooshSoup:
                    # Initialize the record fields
                    productCompanyName = ''
                    productCompanyFullName = ''
                    createTime = ''
                    area = ''
                    productCompanyHomepage = ''
                    companyIntroduce = ''
                    try:
                        # Narrow down to the smallest HTML element that
                        # holds the record
                        soup = hooshSoup.find('div', class_='main').find(
                            'div', class_='record').find(
                                'div', class_='col-md-860')
                        if soup:
                            # Get the company's short name and registered full name
                            productCompanyName, productCompanyFullName = crawlInfo.getProductCompanyName(
                                soup)
                            # Get the company's founding date and region
                            createTime, area = crawlInfo.getProductCompanyCreateTimeAndArea(
                                soup)
                            # Get the company's homepage link
                            productCompanyHomepage = crawlInfo.getHomepage(
                                soup)
                            # Get the company description
                            companyIntroduce = getCompanyIntroduce(soup)
                        else:
                            print('The data for this record has been lost...')
                        ## Assemble the fields into a record list
                        recordList = crawlInfo.createProductCompanyRecordList(
                            productCompanyName, link, productCompanyFullName,
                            createTime, area, productCompanyHomepage,
                            companyIntroduce)
                        if recordList != -1:
                            # Append the finished record to the result list
                            productInfoList.append(recordList)
                            print(recordList)
                    except Exception as ex:
                        print(ex)
            print('Processed record #', i)
            i += 1
    return productInfoList
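
Both record builders (createRecordList in Example #1 and crawlInfo.createProductCompanyRecordList here) are defined elsewhere; the only contract visible in these examples is that they return -1 when a record fails validation. A hypothetical sketch of that contract, with the empty-name check standing in for whatever validation the real helper performs:

def createProductCompanyRecordList(productCompanyName, link,
                                   productCompanyFullName, createTime, area,
                                   productCompanyHomepage, companyIntroduce):
    # -1 is the failure sentinel the callers test for; the actual
    # validation rule here is an assumption
    if not productCompanyName:
        return -1
    return [
        productCompanyName, link, productCompanyFullName, createTime, area,
        productCompanyHomepage, companyIntroduce
    ]
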
Example #3
def getTotalRecordNum(initUrl):
    '''
    Get the total number of records on the page
    '''
    recordNum = ''
    # Check the URL status
    statusCode = handle.getUrlStatus(initUrl)
    if statusCode == 200:
        # Get the initial BeautifulSoup object
        hooshSoup = crawl.getHooshSoup(initUrl)
        # Get the total record count
        totalSpan = hooshSoup.find('span', id='total')
        if totalSpan:
            recordNum = totalSpan.string
    return recordNum
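
getTotalRecordNum also depends on handle.getUrlStatus, defined elsewhere. A plausible minimal version, assuming it simply returns the HTTP status code (the HEAD request, timeout, and -1 failure value are assumptions):

import requests


def getUrlStatus(url, timeout=10):
    '''Return the HTTP status code for a URL, or -1 if the request fails.'''
    try:
        # A HEAD request reads the status line without downloading the body
        response = requests.head(url, timeout=timeout, allow_redirects=True)
        return response.status_code
    except requests.RequestException:
        return -1
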
Example #4
def getEventLinkIndexList(pageLinkList, logFileName=''):
    '''
    Get the list of event link indexes
    '''
    eventLinkIndexList = []
    if pageLinkList:
        i = 1
        for pageLink in pageLinkList:
            if handle.getUrlStatus(pageLink) == 200:
                hooshSoup = crawl.getHooshSoup(pageLink, logFileName)
                if hooshSoup:
                    tbodySoup = hooshSoup.find('tbody')
                    for trTag in tbodySoup.find_all('tr'):
                        linkIndex = trTag.find_all(
                            'td', class_='td6')[0].a.get('href')
                        eventLinkIndexList.append(linkIndex)
                        print('Indexes fetched:', i, linkIndex)
                        i += 1
    print('Length of eventLinkIndexList:', len(eventLinkIndexList))
    return eventLinkIndexList
Example #5
def getCompanyLinkIndexList(pageLinkList, logFileName=''):
    '''
    Get the list of company link indexes
    '''
    companyLinkIndexList = []
    if pageLinkList:
        i = 1
        for pageLink in pageLinkList:
            if handle.getUrlStatus(pageLink) == 200:
                hooshSoup = crawl.getHooshSoup(pageLink, logFileName)
                if hooshSoup:
                    ulSoup = hooshSoup.find('ul', id='newslist')
                    for liTag in ulSoup.find_all('li'):
                        linkIndex = liTag.find_all(
                            'div', class_='user-pic')[0].a.get('href')
                        companyLinkIndexList.append(linkIndex)
                        print('Indexes fetched:', i, linkIndex)
                        i += 1
    print('Length of companyLinkIndexList:', len(companyLinkIndexList))
    return companyLinkIndexList
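
Taken together, Examples #1 to #5 form a small crawling pipeline: read the total record count, build the paginated listing URLs, collect the per-record detail links, then fetch and parse each detail page. A sketch of a driver that wires the event side together (the listing path, the page size of 20, and the /pN URL pattern are assumptions; the real pagination scheme is not shown in these examples):

import math


def crawlEvents(logFileName='crawl_error.log'):
    initUrl = 'http://newseed.pedaily.cn/invest'  # assumed listing URL
    # Total number of records reported by the listing page
    recordNum = getTotalRecordNum(initUrl)
    if not recordNum:
        return []
    # Assume 20 records per listing page
    pageNum = math.ceil(int(recordNum) / 20)
    pageLinkList = [
        '{}/p{}'.format(initUrl, page) for page in range(1, pageNum + 1)
    ]
    # Collect the per-event detail links, then parse every detail page
    linkIndexList = getEventLinkIndexList(pageLinkList, logFileName)
    return getEventInfoList(linkIndexList, logFileName)
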
Example #6
    # Iterate over the indexes and crawl the information
    # 2. Build the output file path
    outputFilePath = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
        'data', 'newseed_data', 'resultSet', outputFileName)

    # Collection of data records
    infoList = []

    i = 1
    for linkIndex in linkIndexList:
        link = hostName + linkIndex.strip()
        try:
            statusCode = handle.getUrlStatus(link)
            if statusCode == 200:
                ## Fetch the relevant information
                # Get the BeautifulSoup object for this page
                hooshSoup = crawl.getHooshSoup(link)
                # Initialize the record fields
                investTitle = ''
                investTime = ''
                investType = ''
                investMoney = ''
                productCompanyInfoList = ''
                investCompanyInfoList = ''
                investIntroduce = ''
                # Narrow down to the smallest HTML element that holds the record
                soup = hooshSoup.find('div', class_='main').find(
                    'div', class_='record invest').find(
                        'div', class_='col-md-860')
                if soup:
                    # Get the event title
                    investTitle = linkList.getEventTitle(soup)
                    # Get the time, round type, and amount
                    investTime, investType, investMoney = linkList.getTimeTypeAndMoney(soup)