def koolearn(muluUrl, stage):
    global conn, csor
    if not conn or not csor:
        conn, csor = getTmathConnCsor()
    muluHtmlContent = getContentWithUA(muluUrl, defaultPCUa)
    muluSoup = getSoupByStr(muluHtmlContent)
    for pageLi in muluSoup.select('.list01 ul li'):
        try:
            title = pageLi.select_one('h3').get_text()
            if u'下载' in title:
                continue
            descTag = pageLi.select_one('.js2 p')
            if not descTag:
                descTag = pageLi.select_one('.js p')
            desc = descTag.get_text()
            tags = pageLi.select_one('.c_lv')['title']
            ntype = tags
            # Pick one representative tag as the type; usually the second one
            if len(tags) > 3:
                ts = tags.split(' ')
                if len(ts) > 2:
                    ntype = ts[1]
            contentUrl = pageLi.select_one('h3 a')['href']
            kooleanStartByContentUrl(conn, contentUrl, csor, desc, ntype,
                                     stage, tags, title)
        except Exception:
            print traceback.format_exc()
    # Fetch the next page: the pager's last link points to it
    footLinks = muluSoup.select('#page a')
    if not footLinks:
        return
    nextUrl = footLinks[-1]['href']
    koolearn(urlparse.urljoin(muluUrl, nextUrl), stage)
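# koolearn() recurses page-to-page via the last '#page' link, so a very long
# catalog could hit the recursion limit. An equivalent iterative walk with a
# cycle guard (hypothetical helper; it reuses the same fetch/parse helpers
# assumed above):
def walkCatalog(startUrl, handlePage, maxPages=1000):
    url, seen = startUrl, set()
    for _ in range(maxPages):
        if url in seen:
            break  # the pager looped back on itself; stop
        seen.add(url)
        soup = getSoupByStr(getContentWithUA(url, defaultPCUa))
        handlePage(soup)
        links = soup.select('#page a')
        if not links:
            break
        url = urlparse.urljoin(url, links[-1]['href'])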
def handleHtml(baseUrl, htmlContent):
    soup = getSoupByStr(htmlContent)
    modified = False
    for img in soup.select('img'):
        imgSrc = img['src']
        if not imgSrc.startswith('http'):
            # Rewrite relative image paths to absolute URLs
            img['src'] = urlparse.urljoin(baseUrl, imgSrc)
            modified = True
    return unicode(soup), modified
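# A self-contained demo of the src-absolutising step in handleHtml(); needs
# only bs4 and the stdlib, and the sample markup/base URL are made up:
def _demoAbsolutise():
    import urlparse
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(
        u'<p><img src="/pics/a.png"/><img src="http://cdn.example.com/b.png"/></p>',
        'html.parser')
    for img in soup.select('img'):
        if not img['src'].startswith('http'):
            img['src'] = urlparse.urljoin('http://www.example.com/page/1.html',
                                          img['src'])
    # first src becomes http://www.example.com/pics/a.png; the second is untouched
    return unicode(soup)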
def cleanHtml(htmlContent):
    soup = getSoupByStr(htmlContent)
    modified = False
    # Drop link wrappers but keep their text and children
    for a in soup.select('a'):
        a.unwrap()
    for img in soup.select('img'):
        if not img.get('style'):
            img['style'] = 'max-width:100%'
        else:
            preStyle = img['style']
            if preStyle.endswith(';'):
                img['style'] = preStyle + 'max-width:100%;'
            else:
                img['style'] = preStyle + ';max-width:100%'
        modified = True
    return unicode(soup), modified
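# Self-contained check of the style-appending branch in cleanHtml() (bs4 only;
# the sample markup is made up). Note cleanHtml() appends on every call, so
# running it twice would duplicate the max-width rule:
def _demoMaxWidth():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(u'<img src="a.png" style="float:left"/>', 'html.parser')
    img = soup.select_one('img')
    if not img.get('style'):
        img['style'] = 'max-width:100%'
    elif img['style'].endswith(';'):
        img['style'] += 'max-width:100%;'
    else:
        img['style'] += ';max-width:100%'
    return unicode(soup)  # style is now "float:left;max-width:100%"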
# sql = "select * from t_topic_new where id = 2645 or id = 2678" # sql = "select * from t_topic_new where id > 2994 and id < 3094" # sql = "select * from t_topic_new where id > 3411 and id < 3730"#初中 sql = "select * from t_topic_new where id > 1647 and id < 1757" #大学 print sql csor.execute(sql) results = csor.fetchall() for row in results: content = row[4] # content = row[4].replace('mi', 'mo') id = row[0] soup = getSoupByStr(content) t = soup.select('.title') if t and len(t) == 1: t[0].extract() else: print 'exteact Title fail, title:', unicode(t).encode('utf-8') pointStr = u"" sampleStr = u"" sampleBgn = False answerBgn = False answerEnd = False #多个例题时碰到 需要标记解析何时结束 answerTag = soup.new_tag('div')
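# Minimal demo of the title-stripping step above (bs4 only; the HTML is made up):
def _demoStripTitle():
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(u'<div class="title">T</div><p>body</p>', 'html.parser')
    t = soup.select('.title')
    if t and len(t) == 1:
        t[0].extract()  # detaches the node from the tree and returns it
    return unicode(soup)  # -> u'<p>body</p>'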
def handleByMTID(mid):
    baseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getbaseinfo.ajax?contentid='
    capListBaseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getcatalog.ajax?contentid=' \
        + str(mid) + '&pageindex=1&pagesize=100000000'
    capContentBaseUrl = 'http://api.yingyangcan.com.cn/interface/ajax/book/getcharpter.ajax?chapterindex='
    # e.g. chapterindex=2&contentid=171117
    bookDetailUrl = 'http://m.yingyangcan.com.cn/interface/template/content/book_detail.vhtml?id='

    url = baseUrl + str(mid)
    baseInfoContent = getContentWithUA(url, ua)
    if not baseInfoContent:  # one retry on an empty response
        baseInfoContent = getContentWithUA(url, ua)
    baseObj = json.loads(baseInfoContent)
    baseData = baseObj['data']
    author = baseData['author']
    title = baseData['name']
    coverUrl = baseData['coverUrl']
    contentUrl = baseData['contentUrl']
    count = baseData['count']
    isOver = baseData['isOver']
    BookType = '连载'
    if isOver == 1:
        BookType = '完结'

    bookDetailHtml = getContentWithUA(bookDetailUrl + str(mid), ua)
    bookDetailSoup = getSoupByStr(bookDetailHtml)
    bookDesc = bookDetailSoup.select_one('#J-desc').get_text().replace(
        '\n', '').replace('\t\t', '\t')

    bookObj = dict()
    bookObj['subtitle'] = bookDesc
    bookObj['source'] = "" + str(mid)
    bookObj['rawUrl'] = url
    bookObj['title'] = title
    bookObj['chapterNum'] = count
    bookObj['imgUrl'] = coverUrl
    bookObj['author'] = author
    bookObj['size'] = count * 1000
    bookObj['category'] = '仙侠'
    bookObj['type'] = '重生'
    bookObj['bookType'] = BookType
    bookObj['typeCode'] = 4
    bookObj['categoryCode'] = 1
    bookObj['viewNum'] = random.randint(500000, 1000000)

    m2 = hashlib.md5()
    forDigest = title + u'#' + author
    m2.update(forDigest.encode('utf-8'))
    bookObj['digest'] = m2.hexdigest()

    bookObj = insertBookWithConn(bookObj, conn2, csor2)

    # myBookId = bookObj['id']
    # for cid in range(1047, count + 1):  # resume point for an interrupted run
    for cid in range(1, count + 1):
        capContentUrl = capContentBaseUrl + str(cid) + '&contentid=' + str(mid)
        capContent = getContentWithUA(capContentUrl, ua)
        if not capContent:
            capContent = getContentWithUA(capContentUrl, ua)
        capListJsonObj = json.loads(capContent)
        if not (capListJsonObj['status'] == 1000
                and capListJsonObj['message'] == u'成功'):
            # re-fetch once; re-parsing the same body could never change the result
            capContent = getContentWithUA(capContentUrl, ua)
            capListJsonObj = json.loads(capContent)
            if not (capListJsonObj['status'] == 1000
                    and capListJsonObj['message'] == u'成功'):
                continue
        capObj = dict()
        orgContent = capListJsonObj['data']['chapter']
        contentSoup = getSoupByStr(orgContent)
        del contentSoup.body['style']
        content = unicode(contentSoup.body).replace(u'<body>', '').replace(
            u'</body>', '').replace(u'\n\n', u'\n').replace(
            u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
        capObj['content'] = content
        capObj['title'] = unicode(contentSoup.title.get_text())
        capObj['rawUrl'] = capContentUrl
        # capObj['size'] = int(WordsCount)
        capObj['size'] = len(content)
        capObj['bookId'] = bookObj['id']
        capObj['source'] = bookObj['source']
        capObj['idx'] = cid
        capObj['bookUUID'] = bookObj['digest']

        m2 = hashlib.md5()
        forDigest = bookObj['digest'] + capObj['title'] + u'#' + str(cid)
        m2.update(forDigest.encode('utf-8'))
        capObj['digest'] = m2.hexdigest()

        capId = insertCapWithCapObj(capObj, conn2, csor2)
        if not capId:
            continue
        upload2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))
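# Both md5 digests above are hex md5 over concatenated parts: "title#author"
# for books, and bookDigest + chapterTitle + "#" + index for chapters. A
# hypothetical helper (not defined elsewhere in this file) that reproduces
# both when given the parts in order:
def makeDigest(*parts):
    import hashlib
    m = hashlib.md5()
    m.update(u''.join(unicode(p) for p in parts).encode('utf-8'))
    return m.hexdigest()

# makeDigest(title, u'#', author)                      == the book digest
# makeDigest(bookDigest, capTitle, u'#', unicode(cid)) == the chapter digest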
def kooleanStartByContentUrl(conn, contentUrl, csor, desc='', ntype='',
                             stage='', tags='', title=''):
    detailHtmlContent = getContentWithUA(contentUrl, defaultPCUa)
    detailContentSoup = getSoupByStr(detailHtmlContent)
    detailContent = ''
    contentDiv = detailContentSoup.select_one('.show_l2 .mt40')
    contentDiv.select('p')[0].extract()  # the first <p> is an intro blurb; drop it
    cps = contentDiv.select('p')
    for ci in range(0, len(cps)):
        if cps[ci].select('a'):
            print 'has link, extract, contentUrl:', contentUrl
            cps[ci].extract()
        # Promo/source lines sit in the last three paragraphs; once one is
        # found, drop it and everything after it
        if ci in [len(cps) - 1, len(cps) - 2, len(cps) - 3] and (
                u'新东方' in cps[ci].get_text() or u'来源' in cps[ci].get_text()):
            for cc in range(ci, len(cps)):
                cps[cc].extract()
            break
    detailContent = detailContent + unicode(contentDiv)

    # Follow pagination if present; the trailing back-link page is not content
    for page in range(2, 100):
        cUrl = contentUrl.replace('.html', '_' + str(page) + '.html')
        moreContentHtmlContent = getContentWithUA(cUrl, defaultPCUa)
        if not moreContentHtmlContent:
            print 'no more content, ', cUrl
            break
        moreContentSoup = getSoupByStr(moreContentHtmlContent)
        moreContentDiv = moreContentSoup.select_one('.show_l2 .mt40')
        pps = moreContentDiv.select('p')
        for ci in range(0, len(pps)):
            if pps[ci].select('a'):
                print 'has link, extract, contentUrl:', cUrl
                pps[ci].extract()
            if ci in [len(pps) - 1, len(pps) - 2, len(pps) - 3] and (
                    u'新东方' in pps[ci].get_text() or u'来源' in pps[ci].get_text()):
                for cc in range(ci, len(pps)):
                    pps[cc].extract()
                break
        for a in moreContentDiv.select('a'):
            a.unwrap()
        for img in moreContentDiv.select('img'):
            if not img.get('style'):
                img['style'] = 'max-width:100%'
            elif img['style'].endswith(';'):
                img['style'] = img['style'] + 'max-width:100%;'
            else:
                img['style'] = img['style'] + ';max-width:100%'
        detailContent = detailContent + unicode(moreContentDiv)

    # Persist, stripping source watermarks from the body text
    csor.execute(
        'insert ignore into daily_news_copy (name,type,content,stage,author,'
        'tag,contentUrl,description) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
        (title, ntype,
         detailContent.replace(u'新东方在线论坛', '').replace(u'相关链接:', '')
         .replace(u'来源:新东方在线论坛', '').replace(u'新东方在线', '')
         .replace(u'新东方', ''),
         stage, u'新东方', tags, contentUrl, desc))
    conn.commit()
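# The pagination probe above rewrites "x.html" into "x_2.html", "x_3.html",
# ... and stops at the first empty fetch. A generic version of that probe,
# sketched as a standalone generator (fetch is any callable that returns a
# falsy value past the last page):
def iterPages(contentUrl, fetch, maxPages=100):
    for page in range(2, maxPages):
        pageUrl = contentUrl.replace('.html', '_' + str(page) + '.html')
        html = fetch(pageUrl)
        if not html:
            break  # no such page: pagination is exhausted
        yield pageUrl, html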
def juren():
    csor, conn = getConn()
    # primary-to-junior-high (小升初) material: jokes / stories / daily problems
    for i in range(1, 25):
        # jokes:
        # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/index_' + str(i) + '.html'
        # famous-figure daily problems:
        url = 'http://aoshu.juren.com/tiku/mryt/yimryt/index_' + str(i) + '.html'
        if i == 1:  # the first page has no index_ suffix
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaoxiaohua/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/xiaogushi/'
            # url = 'http://aoshu.juren.com/chzt/shuxueshouchaobao/neirongsucai/'
            url = 'http://aoshu.juren.com/tiku/mryt/yimryt/'
        content = getContent(url)
        if not content:
            print 'get content failed, url: ', url
            continue
        soup = getSoupByStr(content)
        if not soup:
            print 'get soup failed, url:', url
            continue
        for listing in soup.select('.listing1'):
            for a in listing.select('a'):
                text = a.get_text()
                # Titles look like "xxx：yyy"; try the full-width colon first
                titles = text.split(u'：')
                if len(titles) < 2:
                    titles = text.split(u':')
                if len(titles) < 2:
                    title = text
                else:
                    title = titles[1]
                detailUrl = a['href']
                contentHtml = getContent(detailUrl)
                if not contentHtml:
                    print 'get detail failed'
                    continue
                contentSoup = getSoupByStr(contentHtml).select('.mainContent')
                content = ''
                ps = contentSoup[0].select('p')
                length = len(ps)
                for j in range(1, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    # Stop at promo or answer-link paragraphs
                    if u'本期精彩专题推荐' in pText or u'本期' in pText or u'精彩推荐' in pText \
                            or u'点击下一页查看答案' in pText or u'下一页查看答案' in pText \
                            or u'查看答案' in pText or len(pJ.select('a')) > 0:
                        print 'not content, break, text:' + pText
                        break
                    content += unicode(pJ)
                # The answer lives on page 2 of the detail article
                contentHtml2 = getContent(detailUrl.replace('.html', '_2.html'))
                if not contentHtml2:
                    print 'get detail failed'
                    continue
                contentSoup2 = getSoupByStr(contentHtml2).select('.mainContent')
                ps = contentSoup2[0].select('p')
                length = len(ps)
                for j in range(0, length):
                    pJ = ps[j]
                    pText = pJ.get_text()
                    if u'本期精彩专题推荐' in pText or u'本期' in pText \
                            or u'精彩推荐' in pText or len(pJ.select('a')) > 0:
                        print 'not content, break, text:' + pText
                        break
                    content += unicode(pJ)
                # Parameterised insert; string-formatting the SQL broke on
                # quotes inside title/content
                try:
                    csor.execute(
                        "INSERT ignore INTO daily(name, type, content, stage, gred) "
                        "VALUES (%s, %s, %s, %s, %s)",
                        (title, 3, content, '3', 1))
                    conn.commit()
                except Exception:
                    conn.rollback()  # roll back on any insert error
    conn.close()
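# The colon splitting in juren() prefers the full-width colon and falls back
# to the ASCII one. The same logic as a small standalone helper (hypothetical,
# for illustration; juren() keeps its inline version):
def titleAfterColon(text):
    for sep in (u'：', u':'):
        parts = text.split(sep)
        if len(parts) >= 2:
            return parts[1]
    return text  # no colon at all: use the full text as the title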
def stripNoise(contentBody, selectors):
    # Remove the first match of each noise selector from the article body
    for sel in selectors:
        node = contentBody.select_one(sel)
        if node:
            node.extract()


def today():
    baseUrl = 'http://www.todayonhistory.com/'
    conn, csor = getTmathConnCsor()
    for month in range(1, 13):
        for day in range(1, 32):
            type = '全部'
            jsonurl = baseUrl + str(month) + '/' + str(day)
            htmlContent = getContentWithUA(jsonurl, defaultPCUa)
            if not htmlContent or u'404-历史上的今天' in htmlContent:
                print 'no content, skip month:', str(month), ' day:', str(day)
                continue
            soup = getSoupByStr(htmlContent)
            if '404' in soup.title.get_text():
                print '404 skip month:', str(month), ' day:', str(day)
                continue
            listUl = soup.select_one('ul.oh')
            for listLi in listUl.select('li'):
                liClasses = listLi['class']
                if 'typeid_53' in liClasses:
                    type = u'纪念'
                elif 'typeid_54' in liClasses:
                    type = u'节假日'
                elif 'typeid_55' in liClasses:
                    type = u'逝世'
                elif 'typeid_56' in liClasses:
                    type = u'出生'
                elif 'typeid_57' in liClasses:
                    type = u'事件'
                solarYear = listLi.select_one('span[class="poh"]').get_text()
                link = listLi.select_one('a')
                if not link:
                    print 'no link content, maybe bs4 bug, skip'
                    continue
                contentUrl = link['href']
                title = link['title']
                contentText = ''
                imgUrl = ''
                imgTag = listLi.select_one('img')
                if imgTag:
                    imgUrl = urlparse.urljoin(baseUrl, imgTag['src'])
                detailContentHtml = getContentWithUA(contentUrl, defaultPCUa)
                if detailContentHtml:
                    contentSoup = getSoupByStr(detailContentHtml)
                    contentBody = contentSoup.select_one('.body')
                    stripNoise(contentBody, ('.page', '.keyword', '.extra',
                                             '.mgg', '.poh', '.framebox'))
                    contentText = unicode(contentBody)
                csor.execute(
                    'insert ignore into daily_today (name,type,content,'
                    'month,day,thumbImg,solaryear,srcUrl) '
                    'VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                    (title, type, contentText, month, day, imgUrl,
                     solarYear, contentUrl))
                conn.commit()

            # Remaining events come from a paged JSON endpoint
            jsonBaseUrl = ('http://www.todayonhistory.com/index.php?m=content'
                           '&c=index&a=json_event&page=')
            for page in range(1, 5):
                jsonurl = jsonBaseUrl + str(page) + '&pagesize=40&month=' + \
                    str(month) + '&day=' + str(day)
                jsonContent = getContentWithUA(jsonurl, defaultPCUa)
                if not jsonContent or len(jsonContent) < 10:
                    print 'json url returned null or too short, maybe finished'
                    break
                jsonLists = json.loads(jsonContent)
                for jsonObj in jsonLists:
                    tid = jsonObj['id']
                    contentUrl2 = jsonObj['url']
                    title = jsonObj['title']
                    thumb = urlparse.urljoin(baseUrl, jsonObj['thumb'])
                    solaryear = jsonObj['solaryear']
                    contentText = ''
                    detailContentHtml = getContentWithUA(contentUrl2, defaultPCUa)
                    if detailContentHtml:
                        contentSoup = getSoupByStr(detailContentHtml)
                        contentBody = contentSoup.select_one('.body')
                        stripNoise(contentBody, ('.page', '.keyword', '.extra',
                                                 '.mgg', '.poh', '.framebox', '.mad'))
                        contentText = unicode(contentBody)
                    csor.execute(
                        'insert ignore into daily_today (name,type,content,'
                        'month,day,thumbImg,solaryear,srcUrl) '
                        'VALUES (%s,%s,%s,%s,%s,%s,%s,%s)',
                        (title, '全部', contentText, month, day, thumb,
                         solaryear, contentUrl2))
                    conn.commit()
            print 'done month:', str(month), ' day: ', str(day)
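# The JSON listing call in today(), isolated for clarity. URL and parameters
# are copied from above; getContentWithUA/defaultPCUa are the helpers used
# throughout this file. Returns the parsed event list, or None past the end.
def fetchEventPage(month, day, page):
    import json
    url = ('http://www.todayonhistory.com/index.php?m=content&c=index'
           '&a=json_event&page=%d&pagesize=40&month=%d&day=%d'
           % (page, month, day))
    body = getContentWithUA(url, defaultPCUa)
    if not body or len(body) < 10:  # a near-empty body means no more pages
        return None
    return json.loads(body)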
# Fragment of a larger crawl loop: url, content, and urlContents come from
# the enclosing scope
if not (u'easou' in url or u'3dllc' in url):
    continue
try:
    newContent, redUrl = getContentAndRedictedUrl(url)
except requests.exceptions.ConnectionError as er:
    # must be caught before the generic Exception handler to be reachable
    print er
    continue
except Exception as e:
    print e
    continue
if not (redUrl and u'3dllc' in redUrl):
    urlContents[url] = content.encode('utf-8')
    continue
soup = getSoupByStr(newContent)
ps = soup.select('.zhang-txt-nei-rong')[0]
# Strip ad and pager widgets from the chapter body
for ad in ps.select('.con_ad'):
    ad.extract()
for ad in ps.select('.bd-load-s'):
    ad.extract()
for ad in ps.select('#pageselect'):
    ad.extract()
for ad in ps.select('iframe'):
    ad.extract()
def getAndParse(url):
    # Skip blacklisted hosts
    for ig in ignores['hosts']:
        if ig in url:
            return None

    # Fetch with up to three attempts; ConnectionError must be caught before
    # the generic Exception handler or its clause is unreachable
    newContent, redUrl = None, None
    for attempt in range(3):
        try:
            newContent, redUrl = getContentAndRedictedUrl(url)
            break
        except requests.exceptions.ConnectionError as er:
            print 'new content2', er
        except Exception as e:
            print 'new content1', e
    if not redUrl:
        return None

    # Filter again on the post-redirect URL
    for ig in ignores['hosts']:
        if ig in redUrl:
            return None

    urlHost = urlparse.urlparse(redUrl).hostname
    soup = getSoupByStr(newContent)

    # Strip noise common to all hosts first
    for rm in rules['common']['rm']:
        removeNodesFromSoup(rm, soup)

    if urlHost in rules:
        contentRule = rules[urlHost]['content']
        if contentRule:
            # A body selector is configured for this host: extract it
            specContent = soup.select(contentRule)
            if specContent and len(specContent) > 0:
                del specContent[0].attrs
                soup = specContent[0]
        # rm nodes are removed whether or not a content rule exists
        if rules[urlHost]['rm'] and len(rules[urlHost]['rm']) > 0:
            for rm in rules[urlHost]['rm']:
                removeNodesFromSoup(rm, soup)
        unwrapUseless(soup)
        content = unicode(soup).replace(u'<body>', '').replace(u'</body>', '') \
            .replace(u'</div>', '').replace(u'<div>', '')
        newContent2 = cleanTailHead(urlHost, content)
        if newContent2 != content:
            content = newContent2
    else:
        # No rules configured for this host: fall back to guessing the body
        print urlHost, ' : ', url
        attemp = soup.select('#content')  # many novel sites keep the body in #content
        if attemp and len(attemp):  # the guess hit
            unwrapUseless(soup)
            content = unicode(soup).replace(u'<body>', '').replace(u'</body>', '') \
                .replace(u'</div>', '').replace(u'<div>', '')
        else:
            # doc = Document(unicode(soup))
            # content = doc.summary(html_partial=True)
            print 'content no change : ', urlHost
            return None  # nothing extracted; bail out instead of using an unbound name

    if content and len(content) < 10:
        return None

    content = content.replace(u'�', u'')
    content = content.replace(u'\'', r'\'')
    return content, urlHost
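# getAndParse() consults per-host extraction rules. Their assumed shape, with
# made-up hosts and selectors (the real rules/ignores are defined elsewhere):
def exampleRules():
    rules = {
        'common': {'rm': ['script', 'style']},  # stripped on every host
        'www.example.com': {
            'content': '#content',              # CSS selector for the body
            'rm': ['.footer', '.pager'],        # noise inside the body
        },
    }
    ignores = {'hosts': ['ads.example.com']}    # skipped outright
    return rules, ignores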