Python getSoupByStr示例，framework.htmlParser.getSoupByStr Python示例

示例#1

0

显示文件

def koolearn(muluUrl, stage):

    global conn, csor
    if not conn or (not csor):
        conn, csor = getTmathConnCsor()

    muluHtmlContent = getContentWithUA(muluUrl, defaultPCUa)
    muluSoup = getSoupByStr(muluHtmlContent)
    for pageLi in muluSoup.select('.list01 ul li'):
        try:
            title = pageLi.select_one('h3').get_text()
            if u'下载' in title:
                continue
            descTag = pageLi.select_one('.js2 p')
            if not descTag:
                descTag = pageLi.select_one('.js p')
            desc = descTag.get_text()

            tags = pageLi.select_one('.c_lv')['title']
            ntype = tags  #从标签中选择一个具有代表性的作为类型，一般为第二个
            if len(tags) > 3:
                ts = tags.split(' ')
                if len(ts) > 2:
                    ntype = ts[1]
            contentUrl = pageLi.select_one('h3 a')['href']

            kooleanStartByContentUrl(conn, contentUrl, csor, desc, ntype,
                                     stage, tags, title)
        except Exception as ee:
            print traceback.format_exc()

    #获取下一页
    footLinks = muluSoup.select('#page a')
    nextUrl = footLinks[len(footLinks) - 1]['href']
    koolearn(urlparse.urljoin(muluUrl, nextUrl), stage)

示例#2

0

显示文件

文件： dbImgSrcUrlJoin.py 项目： zyq001/pycrawler

def handleHtml(baseUrl, htmlContent):
    soup = getSoupByStr(htmlContent)
    modfied = False
    for img in soup.select('img'):
        imgSrc = img['src']
        if not imgSrc.startswith('http'):
            img['src'] = urlparse.urljoin(baseUrl, imgSrc)
            modfied = True
    return unicode(soup), modfied

示例#3

0

显示文件

文件： dbImgSrcUrlJoin.py 项目： zyq001/pycrawler

def cleanHtml(htmlContent):
    soup = getSoupByStr(htmlContent)
    modfied = False
    [a.unwrap() for a in soup.select('a')]

    for img in soup.select('img'):
        if not img.has_key('style') or len(img['style']) < 1:
            img['style'] = 'max-width:100%'
        else:
            preStyle = img['style']
            if preStyle.endswith(';'):
                img['style'] = img['style'] + 'max-width:100%;'
            else:
                img['style'] = img['style'] + ';max-width:100%'

        modfied = True
    return unicode(soup), modfied

示例#4

0

显示文件

文件： dealMathFormateMidHigh.py 项目： zyq001/pycrawler

# sql = "select * from t_topic_new where id = 2645 or id = 2678"
# sql = "select * from t_topic_new where id > 2994 and id < 3094"
# sql = "select * from t_topic_new where id > 3411 and  id < 3730"#初中
sql = "select * from t_topic_new where id > 1647 and  id < 1757"  #大学

print sql
csor.execute(sql)

results = csor.fetchall()
for row in results:
    content = row[4]
    # content = row[4].replace('mi', 'mo')
    id = row[0]

    soup = getSoupByStr(content)

    t = soup.select('.title')
    if t and len(t) == 1:
        t[0].extract()
    else:
        print 'exteact Title fail, title:', unicode(t).encode('utf-8')

    pointStr = u""
    sampleStr = u""

    sampleBgn = False
    answerBgn = False
    answerEnd = False  #多个例题时碰到 需要标记解析何时结束

    answerTag = soup.new_tag('div')

示例#5

0

显示文件

def handleByMTID(mid):
    baseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getbaseinfo.ajax?contentid='
    capListBaseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getcatalog.ajax?contentid=' + str(mid) \
                     +'&pageindex=1&pagesize=100000000'
    capContentBaseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getcharpter.ajax?chapterindex='  #2&contentid=171117'
    bookDetailUrl = 'http://m.yingyangcan.com.cn/interface/template/content/book_detail.vhtml?id='
    url = baseUrl + str(mid)
    baseInfoContent = getContentWithUA(url, ua)
    if not baseInfoContent:
        baseInfoContent = getContentWithUA(url, ua)
    baseObj = json.loads(baseInfoContent)

    baseData = baseObj['data']
    author = baseData['author']
    title = baseData['name']
    coverUrl = baseData['coverUrl']
    contentUrl = baseData['contentUrl']
    count = baseData['count']
    isOver = baseData['isOver']
    BookType = '连载'
    if isOver == 1:
        BookType = '完结'

    bookDetailHtml = getContentWithUA(bookDetailUrl + str(mid), ua)
    bookDetailSoup = getSoupByStr(bookDetailHtml)
    bookDesc = bookDetailSoup.select_one('#J-desc').get_text().replace(
        '\n', '').replace('\t\t', '\t')

    bookObj = dict()
    bookObj['subtitle'] = bookDesc
    bookObj['source'] = "" + str(mid)
    bookObj['rawUrl'] = url
    bookObj['title'] = title
    bookObj['chapterNum'] = count
    bookObj['imgUrl'] = coverUrl
    bookObj['author'] = author
    bookObj['size'] = count * 1000
    bookObj['category'] = '仙侠'
    bookObj['type'] = '重生'

    bookObj['bookType'] = BookType

    bookObj['typeCode'] = 4
    bookObj['categoryCode'] = 1

    bookObj['viewNum'] = random.randint(500000, 1000000)

    m2 = hashlib.md5()
    forDigest = title + u'#' + author
    m2.update(forDigest.encode('utf-8'))
    digest = m2.hexdigest()

    bookObj['digest'] = digest

    bookObj = insertBookWithConn(bookObj, conn2, csor2)

    # myBookId = bookObj['id']
    #
    for cid in range(1047, count + 1):

        capContentUrl = capContentBaseUrl + str(cid) + '&contentid=' + str(mid)
        capContent = getContentWithUA(capContentUrl, ua)
        if not capContent:
            capContent = getContentWithUA(capContentUrl, ua)
        capListJsonObj = json.loads(capContent)
        if not (capListJsonObj['status'] == 1000
                and capListJsonObj['message'] == u'成功'):
            capListJsonObj = json.loads(capContent)
            if not (capListJsonObj['status'] == 1000
                    and capListJsonObj['message'] == u'成功'):
                continue
        capObj = dict()
        orgContent = capListJsonObj['data']['chapter']
        contentSoup = getSoupByStr(orgContent)
        del contentSoup.body['style']
        content = unicode(contentSoup.body).replace(u'<body>', '').replace(
            u'</body>', '').replace(u'\n\n', u'\n').replace(
                u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
        capObj['content'] = content
        capObj['title'] = unicode(contentSoup.title.get_text())
        capObj['rawUrl'] = capContentUrl
        # capObj['size'] = int(WordsCount)
        capObj['size'] = len(content)
        capObj['bookId'] = bookObj['id']
        capObj['source'] = bookObj['source']
        capObj['idx'] = cid
        capObj['bookUUID'] = bookObj['digest']

        m2 = hashlib.md5()
        forDigest = bookObj['digest'] + capObj['title'] + u'#' + str(cid)
        m2.update(forDigest.encode('utf-8'))
        digest = m2.hexdigest()
        capObj['digest'] = digest

        capId = insertCapWithCapObj(capObj, conn2, csor2)
        if not capId:
            continue
        upload2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))

示例#6

0

显示文件

def kooleanStartByContentUrl(conn,
                             contentUrl,
                             csor,
                             desc='',
                             ntype='',
                             stage='',
                             tags='',
                             title=''):
    detailHtmlContent = getContentWithUA(contentUrl, defaultPCUa)
    detailContentSoup = getSoupByStr(detailHtmlContent)
    detailContent = ''
    contentDiv = detailContentSoup.select_one('.show_l2 .mt40')
    contentDiv.select('p')[0].extract()  # 第一个p标签为介绍，删掉
    cps = contentDiv.select('p')
    for ci in range(0, len(cps)):
        if cps[ci].select('a'):
            print 'has link ,extract,  contentUrl:'
            cps[ci].extract()

        if ci in [len(cps) - 1, len(cps) - 2,
                  len(cps) - 3] and (u'新东方' in cps[ci].get_text()
                                     or u'来源' in cps[ci].get_text()):
            for cc in range(ci, len(cps)):
                cps[cc].extract()
            break
    detailContent = detailContent + unicode(contentDiv)
    # 如果有分页，不算最后一个回链页
    for page in range(2, 100):
        cUrl = contentUrl.replace('.html', '_' + str(page) + '.html')
        moreContentHtmlContent = getContentWithUA(cUrl, defaultPCUa)
        if not moreContentHtmlContent:
            print 'no more content, ', cUrl
            break
        moreContentSoup = getSoupByStr(moreContentHtmlContent)
        # 去掉最后两个p
        moreContentDiv = moreContentSoup.select_one('.show_l2 .mt40')
        pps = moreContentDiv.select('p')
        for ci in range(0, len(pps)):
            if pps[ci].select('a'):
                # print 'has link ,extract, link:',unicode(pps[ci]),' contentUrl:',cUrl
                print 'has link ,extract, link2:'
                pps[ci].extract()

            if ci in [len(pps) - 1, len(pps) - 2,
                      len(pps) - 3] and (u'新东方' in pps[ci].get_text()
                                         or u'来源' in pps[ci].get_text()):
                for cc in range(ci, len(pps)):
                    pps[cc].extract()
                break
        # pps[len(pps) - 1].extract()
        # pps[len(pps) - 2].extract()

        [a.unwrap() for a in moreContentDiv.select('a')]
        for img in moreContentDiv.select('img'):
            if not img.has_key('style') or len(img['style']) < 1:
                img['style'] = 'max-width:100%'
            else:
                preStyle = img['style']
                if preStyle.endswith(';'):
                    img['style'] = img['style'] + 'max-width:100%;'
                else:
                    img['style'] = img['style'] + ';max-width:100%'

        detailContent = detailContent + unicode(moreContentDiv)

    # 入库
    csor.execute(
        'insert ignore into daily_news_copy (name,type,content,stage,author,tag,contentUrl,description) VALUES (%s,'
        '%s,%s,%s,%s,%s,%s,%s)',
        (title, ntype, detailContent.replace(u'新东方在线论坛', '').replace(
            u'相关链接：',
            '').replace(u'来源：新东方在线论坛', '').replace(u'新东方在线', '').replace(
                u'新东方', ''), stage, u'新东方', tags, contentUrl, desc))
    conn.commit()

示例#7

0

显示文件

def juren():
    csor, conn = getConn()

    #小升初笑话
    for i in range(1, 25):
        #笑话
        # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/index_' + str(i) + '.html'
        #故事
        url = 'http://aoshu.juren.com/tiku/mryt/yimryt/index_' + str(
            i) + '.html'
        #名人
        # url =
        if i == 1:
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaogushi/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/neirongsucai/'
            url = 'http://aoshu.juren.com/tiku/mryt/yimryt/'
        content = getContent(url)
        if not content:
            print 'get content failed, url: ', url
            continue
        soup = getSoupByStr(content)
        if not soup:
            print 'get soup filed, url:', url
            continue
        for listting in soup.select(".listing1"):
            for a in listting.select('a'):
                text = a.get_text()
                titles = text.split(u'：')
                if len(titles) < 2:
                    titles = text.split(u':')
                if len(titles) < 2:
                    title = text
                else:
                    title = titles[1]
                deatilUrl = a['href']
                contentHtml = getContent(deatilUrl)
                if not contentHtml:
                    print 'get detail failed'
                    continue
                contentSoup = getSoupByStr(contentHtml).select('.mainContent')
                content = ''
                ps = contentSoup[0].select('p')
                length = len(ps)
                for j in range(1, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    if u'本期精彩专题推荐' in pText or u'本期' in pText or u'精彩推荐' in pText\
                            or u'点击下一页查看答案' in pText or u'下一页查看答案' in pText or u'查看答案' in pText\
                            or len(pJ.select('a')) > 0:
                        print 'not content,break,  text:' + pText
                        break
                    content += unicode(pJ)
                contentHtml2 = getContent(deatilUrl.replace(
                    '.html', '_2.html'))
                if not contentHtml2:
                    print 'get detail failed'
                    continue
                # contentSoup2 = getSoupByStr(contentHtml2.replace('<br /></p>','')).select('.mainContent')
                contentSoup2 = getSoupByStr(contentHtml2).select(
                    '.mainContent')
                ps = contentSoup2[0].select('p')
                length = len(ps)
                for j in range(0, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    if u'本期精彩专题推荐' in pText or u'本期' in pText or u'精彩推荐' in pText or len(
                            pJ.select('a')) > 0:
                        print 'not content,break,  text:' + pText
                        break
                    content += unicode(pJ)

                sql = "INSERT ignore INTO daily(name, \
                                        type, content,stage, gred) \
                                        VALUES ('%s', '%d', '%s', '%s', '%d')"                                                                               % \
                      (title, 3, content, '3', 1)
                try:
                    # 执行sql语句
                    print sql
                    csor.execute(sql)
                    # 提交到数据库执行
                    print conn.commit()
                except:
                    # 发生错误时回滚
                    conn.rollback()
    conn.close()

示例#8

0

显示文件

def today():
    baseUrl = 'http://www.todayonhistory.com/'
    conn, csor = getTmathConnCsor()
    for month in range(1, 13):
        for day in range(1, 32):
            type = '全部'
            jsonurl = baseUrl + str(month) + '/' + str(day)
            htmlContent = getContentWithUA(jsonurl, defaultPCUa)
            if not htmlContent or u'404-历史上的今天' in htmlContent:
                print 'no content skip month:', str(month), ' day:', str(day)
                continue
            soup = getSoupByStr(htmlContent)
            if '404' in soup.title:
                print '404 skip month:', str(month), ' day:', str(day)
                continue
            listUl = soup.select_one('ul.oh')
            for listLi in listUl.select('li'):
                liClasses = listLi['class']
                if 'typeid_53' in liClasses:
                    type = u'纪念'
                elif 'typeid_54' in liClasses:
                    type = u'节假日'
                elif 'typeid_55' in liClasses:
                    type = u'逝世'
                elif 'typeid_56' in liClasses:
                    type = u'出生'
                elif 'typeid_57' in liClasses:
                    type = u'事件'
                solarYear = listLi.select_one('span[class="poh"]').get_text()
                link = listLi.select_one('a')
                if not link:
                    print 'no link content, maybe bs4 bug, skip'
                    continue
                contentUrl = link['href']
                title = link['title']
                contentText = ''

                imgUrl = ''
                imgTag = listLi.select_one('img')
                if imgTag:
                    imgUrl = urlparse.urljoin(baseUrl, imgTag['src'])

                detailContentHtml = getContentWithUA(contentUrl, defaultPCUa)
                if detailContentHtml:
                    contentSoup = getSoupByStr(detailContentHtml)
                    contentBody = contentSoup.select_one('.body')

                    n1 = contentBody.select_one('.page')
                    if n1:
                        n1.extract()
                    n2 = contentBody.select_one('.keyword')
                    if n2:
                        n2.extract()
                    n3 = contentBody.select_one('.extra')
                    if n3:
                        n3.extract()
                    n4 = contentBody.select_one('.mgg')
                    if n4:
                        n4.extract()
                    n5 = contentBody.select_one('.poh')
                    if n5:
                        n5.extract()
                    n6 = contentBody.select_one('.framebox')
                    if n6:
                        n6.extract()

                    # for divTag in contentBody.select('div'):
                    #     divTag.extract()# 去除多余的div

                    contentText = unicode(contentBody)
                csor.execute(
                    'insert ignore into daily_today (name ,type ,content  '
                    ',month ,day ,thumbImg ,solaryear,srcUrl) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                    (title, type, contentText, month, day, imgUrl, solarYear,
                     contentUrl))
                conn.commit()

            jsonBaseUrl = 'http://www.todayonhistory.com/index.php?m=content&c=index&a=json_event&page='
            #&pagesize=40&month=2&day=13'
            for page in range(1, 5):
                jsonurl = jsonBaseUrl + str(
                    page) + '&pagesize=40&month=' + str(month) + '&day=' + str(
                        day)
                jsonContent = getContentWithUA(jsonurl, defaultPCUa)
                if not jsonContent or len(jsonContent) < 10:
                    print 'json url return null or too short, maybe finished'
                    break
                jsonLists = json.loads(jsonContent)
                for jsonObj in jsonLists:
                    tid = jsonObj['id']
                    contentUrl2 = jsonObj['url']
                    title = jsonObj['title']
                    thumb = urlparse.urljoin(baseUrl, jsonObj['thumb'])
                    solaryear = jsonObj['solaryear']

                    contentText = ''

                    detailContentHtml = getContentWithUA(
                        contentUrl2, defaultPCUa)
                    if detailContentHtml:
                        contentSoup = getSoupByStr(detailContentHtml)
                        contentBody = contentSoup.select_one('.body')
                        # for divTag in contentBody.select('div'):
                        #     divTag.extract()  # 去除多余的div

                        n1 = contentBody.select_one('.page')
                        if n1:
                            n1.extract()
                        n2 = contentBody.select_one('.keyword')
                        if n2:
                            n2.extract()
                        n3 = contentBody.select_one('.extra')
                        if n3:
                            n3.extract()
                        n4 = contentBody.select_one('.mgg')
                        if n4:
                            n4.extract()
                        n5 = contentBody.select_one('.poh')
                        if n5:
                            n5.extract()
                        n6 = contentBody.select_one('.framebox')
                        if n6:
                            n6.extract()
                        n7 = contentBody.select_one('.mad')
                        if n7:
                            n7.extract()

                        contentText = unicode(contentBody)
                    csor.execute(
                        'insert ignore into daily_today (name ,type ,content  '
                        ',month ,day ,thumbImg ,solaryear,srcUrl) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                        (title, '全部', contentText, month, day, thumb,
                         solaryear, contentUrl2))
                    conn.commit()

            print 'done month:', str(month), ' day: ', str(day)

示例#9

0

显示文件

文件： dushuCleanerEsou.py 项目： zyq001/pycrawler

        if not (u'easou' in url or u'3dllc' in url):
            continue
        try:
            newContent, redUrl = getContentAndRedictedUrl(url)

        except Exception as e:
            print e
            continue
        except requests.exceptions.ConnectionError as er:
            print er
            continue
        if not (redUrl and u'3dllc' in redUrl):
            urlContents[url] = content.encode('utf-8')
            continue

        soup = getSoupByStr(newContent)

        ps = soup.select('.zhang-txt-nei-rong')[0]

        for ad in ps.select('.con_ad'):
            ad.extract()

        for ad in ps.select('.bd-load-s'):
            ad.extract()

        for ad in ps.select('#pageselect'):
            ad.extract()

        for ad in ps.select('iframe'):
            ad.extract()

示例#10

0

显示文件

文件： mutiThreadDxecutor.py 项目： zyq001/pycrawler

def getAndParse(url):
    # 跳过的host
    continue1 = False
    for ig in ignores['hosts']:
        if ig in url:
            continue1 = True
            break
    if continue1:
        return None

    try:
        newContent, redUrl = getContentAndRedictedUrl(url)

    except Exception as e:
        print 'new content1', e
        try:
            newContent, redUrl = getContentAndRedictedUrl(url)

        except Exception as e:
            print 'new content1', e
            return None

        except requests.exceptions.ConnectionError as er:
            print 'new content2', er
            return None

    except requests.exceptions.ConnectionError as er:
        print 'new content2', er
        try:
            newContent, redUrl = getContentAndRedictedUrl(url)

        except Exception as e:
            print 'new content1', e
            return None

        except requests.exceptions.ConnectionError as er:
            print 'new content2', er
            return None

    if not redUrl:
        return None

    # 对跳转后的url，再过滤一遍
    continue2 = False
    for ig in ignores['hosts']:
        if ig in redUrl:
            continue2 = True
            return None

    if continue2:
        return None

    urlHost = urlparse(redUrl).hostname

    # new2 = newContent.encode('utf-8')
    soup = getSoupByStr(newContent)
    # soup = getSoupByStr(new2, "utf-8")

    # 统一清理通用噪声

    for rm in rules['common']['rm']:
        removeNodesFromSoup(rm, soup)  # 删除停止node

    if rules.has_key(urlHost):
        contentRule = rules[urlHost]['content']
        if contentRule:  # 有配置正文规则
            specContent = soup.select(contentRule)  # 根据配置，抽取正文
            if specContent and len(specContent) > 0:
                del specContent[0].attrs
                soup = specContent[0]
        # 不管有没有配置正文规则，都应该遍历删除rm节点
        if rules[urlHost]['rm'] and len(rules[urlHost]['rm']) > 0:
            for rm in rules[urlHost]['rm']:
                removeNodesFromSoup(rm, soup)  # 删除停止node

        unwrapUseless(soup)

        content = unicode(soup).replace(u'<body>', '').replace(u'</body>', '') \
            .replace(u'</div>', '').replace(u'<div>', '')

        newContent2 = cleanTailHead(urlHost, content)
        if newContent2 != content:
            content = newContent2

    else:  # m没有配置任何规则，自动抽取正文
        print urlHost, ' : ', url
        return None

        attemp = soup.select('#content')  #很多小说网站正文都是#content
        if attemp and len(attemp):
            #猜中了
            unwrapUseless(soup)
            content = unicode(soup).replace(u'<body>', '').replace(u'</body>', '') \
                .replace(u'</div>', '').replace(u'<div>', '')
        else:
            # doc = Document(unicode(soup))
            # content = doc.summary(html_partial=True)
            print 'content no change : ', urlHost

    if content and len(content) < 10:
        return None

    # newSoup = getSoupByStr(content)
    # newSoup.select('div')[0].unwrap()

    # content = unicode(newSoup).replace(u'<body>','').replace(u'</body>','')
    # content = content.replace(r'<p>\d+、.*</b></p>', '')

    # content = re.sub(u'<p>\d+、((?:.|\n)*?)</p>', "", content, 1)
    content = content.replace(u'�', u'')
    content = content.replace(u'\'', r'\'')
    return content, urlHost