def find1stPermalink():
    (isFound, retInfo) = (False, "Unknown error!")
    try:
        # http://blog.csdn.net/chdhust
        # try to find the last article list page
        homeUrl = gVal['blogEntryUrl']
        homeRespHtml = crifanLib.getUrlRespHtml(homeUrl)
        logging.debug("homeRespHtml for %s, is:\n%s", homeUrl, homeRespHtml)

        # <a href="/chenglinhust/article/list/22">尾页</a>
        # <span> 1079条 共54页</span><strong>1</strong> <a href="/chenglinhust/article/list/2">2</a> ... <a href="/chenglinhust/article/list/2">下一页</a> <a href="/chenglinhust/article/list/54">尾页</a>
        foundLastListPageUrl = re.search('<a\s+?href="(?P<lastListPageUrl>/\w+?/article/list/\d+)">尾页</a>', homeRespHtml, re.I)
        logging.debug("foundLastListPageUrl=%s", foundLastListPageUrl)
        if (foundLastListPageUrl):
            lastListPageUrl = foundLastListPageUrl.group("lastListPageUrl")
            lastListPageUrl = gConst['spaceDomain'] + lastListPageUrl
            logging.debug("lastListPageUrl=%s", lastListPageUrl)

            # http://blog.csdn.net/chenglinhust/article/list/54
            respHtml = crifanLib.getUrlRespHtml(lastListPageUrl)
            logging.debug("ret html for %s, is:\n%s", lastListPageUrl, respHtml)
            soup = htmlToSoup(respHtml)

            # <span class="link_title"><a href="/v_july_v/article/details/5934051">
            # 算法面试:精选微软经典的算法面试100题(第1-20题)
            # </a></span>
            foundTitLink = soup.findAll(attrs={"class": "link_title"})
            articleNum = len(foundTitLink)
            logging.debug("articleNum=%s", articleNum)
            if (foundTitLink):
                lastArticle = foundTitLink[-1]
                aVal = lastArticle.a
                href = aVal['href']
                firstLink = gConst['spaceDomain'] + href
                retInfo = firstLink
                isFound = True
                logging.debug("retInfo=%s,isFound=%s", retInfo, isFound)
    except:
        (isFound, retInfo) = (False, "Unknown error!")
    return (isFound, retInfo)

def parse(self, url, encoding='utf-8', **kwargs):
    try:
        html = crifanLib.getUrlRespHtml(url)
        selector = lxml.etree.HTML(html)
        results = {}
        for key, xpathExpr in kwargs.items():
            res = selector.xpath(xpathExpr)
            for index, value in enumerate(res):
                data = value
                if isinstance(data, lxml.etree._Element):
                    data = value.xpath('string(.)')
                if isinstance(data, unicode):
                    data = data.encode('gbk', 'ignore')
                data = re.sub('\s', '', data, flags=re.UNICODE)
                res[index] = data
            results[key] = res
        return results
    except:
        # append the traceback to a per-day error log
        date = time.strftime('%Y-%m-%d')
        f = open(date + 'errors.log', 'a')
        hms = time.strftime('%H:%M:%S : \r\n')
        f.write(hms)
        traceback.print_exc(file=f)
        f.write('\n')
        f.flush()
        f.close()

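# A hedged usage sketch for the parse() method above: each keyword argument is an
# XPath expression, and the returned dict maps that keyword to the matched, cleaned
# strings. The class and instance names here are hypothetical, not from the original code.
#
#   spider = MySpider()
#   results = spider.parse(
#       "http://example.com/list.html",
#       titles='//span[@class="link_title"]/a/text()',
#       links='//span[@class="link_title"]/a/@href',
#   )
#   # results -> {'titles': [...], 'links': [...]}
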
def extractBlogTitAndDesc(blogEntryUrl):
    (blogTitle, blogDescription) = ("", "")
    try:
        logging.debug("Now extract blog title and description from blogEntryUrl=%s", blogEntryUrl)
        respHtml = crifanLib.getUrlRespHtml(blogEntryUrl)
        #logging.debug("url=%s return html=\n%s", blogEntryUrl, respHtml)

        # <div class="headerinner">
        #     <h1><a href="http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898">素颜素语</a><img src="/images/mb2.gif" style="margin-left:15px" align="absmiddle"></h1>
        #     <div class="blog-header-rul"><a href="http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898">http://susu7788.blog.tianya.cn</a> <a href="rss.asp?BlogID=2723898" target="_blank">[RSS订阅]</a></div>
        #     <div class="blogsign">真诚待人,用心写文。</div>
        # </div>
        soup = htmlToSoup(respHtml)
        foundHeaderInner = soup.find(attrs={"class": "headerinner"})
        logging.debug("foundHeaderInner=%s", foundHeaderInner)
        h1 = foundHeaderInner.h1
        h1a = h1.a
        logging.debug("h1a=%s", h1a)
        h1aStr = h1a.string
        blogTitle = h1aStr
        logging.debug("blogTitle=%s", blogTitle)

        foundBlogsign = foundHeaderInner.find(attrs={"class": "blogsign"})
        logging.debug("foundBlogsign=%s", foundBlogsign)
        blogDescription = foundBlogsign.string
        logging.debug("blogDescription=%s", blogDescription)
    except:
        (blogTitle, blogDescription) = ("", "")
    return (blogTitle, blogDescription)

def fetchAndParseComments(url, html):
    parsedCommentsList = []
    try:
        soup = getSoupFromUrl(url)

        # <a href="#allcomments">评论:235</a>
        foundAllCmtNum = re.search('<a href="#allcomments">.+?(?P<allCmtNum>\d+)</a>', html)
        logging.debug("foundAllCmtNum=%s", foundAllCmtNum)
        if (foundAllCmtNum):
            allCmtNum = foundAllCmtNum.group("allCmtNum")
            allCmtNumInt = int(allCmtNum)
            logging.debug("total comment number=%d", allCmtNumInt)

            if (allCmtNumInt > 0):
                allCmtCellDictList = []
                # calc total comment page num
                maxCmtNumPerPage = 50
                totalCmtPageNum = int(math.ceil(float(allCmtNumInt) / float(maxCmtNumPerPage)))
                logging.debug("totalCmtPageNum=%d", totalCmtPageNum)

                # get all comment cell dict list
                if (totalCmtPageNum <= 1):
                    allCmtCellDictList = extractCmtCellDictList(html)
                else:
                    postId = extractPostIdFromPermaLink(url)
                    # for each comment page, get html and extract comment cell string list
                    for eachPageIdx in range(totalCmtPageNum):
                        eachPageNum = eachPageIdx + 1
                        # generate comment url for each page, e.g.
                        # http://blog.tianya.cn/blogger/post_read.asp?BlogID=3210723&PostID=41963783&page=2
                        postReadUrl = genPostReadUrl(gVal['blogId'], postId, eachPageNum)
                        respHtml = crifanLib.getUrlRespHtml(postReadUrl)
                        singlePageCmtCellStrList = extractCmtCellDictList(respHtml)
                        logging.debug("len(singlePageCmtCellStrList)=%s", len(singlePageCmtCellStrList))
                        allCmtCellDictList.extend(singlePageCmtCellStrList)
                logging.debug("len(allCmtCellDictList)=%s", len(allCmtCellDictList))

                # parse each comment cell string into dest comment dict
                parseAllCommentsList(allCmtCellDictList, parsedCommentsList)
                logging.debug("total parsed %d comments", len(parsedCommentsList))
    except:
        logging.debug("Error while fetch and parse comment for %s", url)
    return parsedCommentsList

def extractBlogTitAndDesc(blogEntryUrl):
    (blogTitle, blogDescription) = ("", "")
    try:
        respHtml = crifanLib.getUrlRespHtml(blogEntryUrl)
        #logging.debug("url=%s return html=\n%s", blogEntryUrl, respHtml)

        # <div id="blog_title">
        #     <h1>
        #         <a href="/v_JULY_v">结构之法 算法之道</a></h1>
        #     <h2></h2>
        #     <div class="clear">
        #     </div>
        # </div>

        # <div id="blog_title">
        #     <h1>
        #         <a href="/MoreWindows">MoreWindows</a></h1>
        #     <h2>学习不在于学了多少,而在于学会多少。</h2>
        #     <div class="clear">
        #     </div>
        # </div>
        soup = htmlToSoup(respHtml)
        foundTitle = soup.find(id="blog_title")
        foundTitleA = foundTitle.a
        titStr = foundTitleA.string
        blogTitle = unicode(titStr)

        h2 = foundTitle.h2
        h2Str = h2.string
        blogDescription = unicode(h2Str)
    except:
        (blogTitle, blogDescription) = ("", "")
    return (blogTitle, blogDescription)

def __get_urls(self, page, reg, encoding='utf-8'):
    try:
        html = crifanLib.getUrlRespHtml(page)
        selector = lxml.etree.HTML(html)
        urls = selector.xpath(reg)
        return urls
    except:
        # append the traceback to a per-day error log
        date = time.strftime('%Y-%m-%d')
        f = open(date + 'errors.log', 'a')
        hms = time.strftime('%H:%M:%S : \n')
        f.write(hms)
        traceback.print_exc(file=f)
        f.write('\n')
        f.flush()
        f.close()

def extractBlogUserFromBlogId(blogId):
    blogUser = ""
    # http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898
    blogEntryUrl = genEntryUrl(str(blogId))

    # extract blogUser
    # <div class="blog-header-rul"><a href="http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898">http://susu7788.blog.tianya.cn</a> <a href="rss.asp?BlogID=2723898" target="_blank">[RSS订阅]</a></div>
    respHtml = crifanLib.getUrlRespHtml(blogEntryUrl)
    foundBlogHeaderRurl = re.search('<div\s+class="blog-header-rul"><a\s+href="http://blog\.tianya\.cn/blogger/blog_main\.asp\?BlogID=' + blogId + '">http://(?P<blogUser>\w+).blog.tianya.cn</a>', respHtml)
    logging.debug("foundBlogHeaderRurl=%s", foundBlogHeaderRurl)
    if (foundBlogHeaderRurl):
        blogUser = foundBlogHeaderRurl.group("blogUser")
        logging.debug("Extracted blog user %s from entry url %s", blogUser, blogEntryUrl)
    else:
        blogUser = ""
    return blogUser

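# A minimal sketch of the genEntryUrl() helper used above, assuming it simply fills in
# the Tianya blog main-page URL pattern shown in the comments; the real implementation
# may differ.
def genEntryUrl(blogId):
    # e.g. http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898
    return "http://blog.tianya.cn/blogger/blog_main.asp?BlogID=" + str(blogId)
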
def getSoupFromUrl(url):
    soup = None
    if (url in gVal['soupDict']):
        logging.debug("%s exist in soupDict, so get soup from cache", url)
        soup = gVal['soupDict'][url]
    else:
        logging.debug("%s not in soupDict, so get html then get soup", url)
        # get url's soup
        html = crifanLib.getUrlRespHtml(url)
        soup = htmlToSoup(html)
        # store soup
        gVal['soupDict'][url] = soup
    return soup

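# A minimal sketch of the htmlToSoup() helper these functions assume exists, using the
# Python 2 BeautifulSoup 3.x API seen elsewhere in this code; the real helper may pass
# an explicit encoding or do extra cleanup.
from BeautifulSoup import BeautifulSoup

def htmlToSoup(html):
    # let BeautifulSoup detect the page encoding itself
    soup = BeautifulSoup(html)
    return soup
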
def find1stPermalink():
    (isFound, retInfo) = (False, "Unknown error!")
    try:
        #someUrl = gVal['blogEntryUrl'] + "???"
        logging.debug("someUrl=%s", someUrl)
        respHtml = crifanLib.getUrlRespHtml(someUrl)
        logging.debug("respHtml=%s", respHtml)

        #TODO: write your own logic to find the first permanent link of post

        #retInfo = lastHref
        #isFound = True
    except:
        (isFound, retInfo) = (False, "Unknown error!")
    return (isFound, retInfo)

def extractBlogTitAndDesc(blogEntryUrl):
    (blogTitle, blogDescription) = ("", "")
    try:
        foundTitDesc = False
        logging.debug("Now extract blog title and description from blogEntryUrl=%s", blogEntryUrl)
        respHtml = crifanLib.getUrlRespHtml(blogEntryUrl)
        #logging.debug("blogEntryUrl=%s return html=\n%s", blogEntryUrl, respHtml)
        soup = htmlToSoup(respHtml)

        #TODO: write your own logic to extract blog title and description
        #blogDescription = metaContent
        #foundTitDesc = True

        if (foundTitDesc):
            gVal['blogTitle'] = blogTitle
            gVal['blogDescription'] = blogDescription
    except:
        (blogTitle, blogDescription) = ("", "")
    return (blogTitle, blogDescription)

def find1stPermalink():
    (isFound, retInfo) = (False, "Unknown error!")
    try:
        # http://blog.csdn.net/v_JULY_v/article/list/7
        #pageNum = 7
        pageNum = 10000
        getPostUrl = gVal['blogEntryUrl'] + "/article/list/" + str(pageNum)
        respHtml = crifanLib.getUrlRespHtml(getPostUrl)
        #logging.debug("ret html for %s, is:\n%s", getPostUrl, respHtml)
        soup = htmlToSoup(respHtml)

        # <span class="link_title"><a href="/v_july_v/article/details/5934051">
        # 算法面试:精选微软经典的算法面试100题(第1-20题)
        # </a></span>
        foundTitLink = soup.findAll(attrs={"class": "link_title"})
        articleNum = len(foundTitLink)
        if (foundTitLink):
            lastArticle = foundTitLink[-1]
            aVal = lastArticle.a
            href = aVal['href']
            firstLink = gConst['spaceDomain'] + href
            retInfo = firstLink
            isFound = True
    except:
        (isFound, retInfo) = (False, "Unknown error!")
    return (isFound, retInfo)

def extractSongInfoDictList(singleAllrecUrl):
    songInfoDictList = []
    logging.debug("singleAllrecUrl=%s", singleAllrecUrl)
    respHtml = crifanLib.getUrlRespHtml(singleAllrecUrl)
    #logging.debug("respHtml=\n%s", respHtml)
    soup = BeautifulSoup(respHtml, fromEncoding=gConst["stHtmlEnc"])
    pretifiedHtml = unicode(soup)
    #logging.debug("pretifiedHtml=%s", pretifiedHtml)

    # <table cellpadding="0" cellspacing="0" border="0" width="420" class="u_song_tab u_song_all">
    # ...
    # <script>
    # ...
    # WL("1", "3055432","E.R‖<拯救听觉 前奏一起就注定是极品>丨 -- 忆の蓝色 ","<span class=date>6</span>天前");
    # ...
    # </script>
    # </table>
    # WL("133", "255907","10.Two Moon Butterflies ","2012-01-02 16:21:18");
    foundAllWl = re.findall(r'WL\("\d+",\s*?"\d+",\s*?".+",\s*?".+?"\);', pretifiedHtml)
    foundAllWlNum = len(foundAllWl)
    logging.info("foundAllWlNum=%d", foundAllWlNum)

    for singleWl in foundAllWl:
        # extract all song info
        songInfoDict = {
            "number": "",
            "id": "",
            "title": "",
            "singer": "",
            "time": None,  # finally will store datetime type value
        }

        # WL("26", "3080141","Lone Wanderer -- Rameses B ","<span class=date>24</span>天前");
        # WL("31", "3155354","泰语 我不会要求 -- Da Endorphine ","一个月0天前");
        # WL("35", "3154813","北京北京(中国好声音) -- 梁博 ft 黄勇 ","一个月1天前");
        # WL("73", "3048771","< 小漠 迟来生日快乐 > 超赞大气的节奏 Sexy的嗓音 -- 羊氏 Club° ","一个月26天前");
        # WL("80", "3084927","Not Inveted Love -- Stan Crown ","2012-08-08 11:27:03");
        # WL("133", "255907","10.Two Moon Butterflies ","2012-01-02 16:21:18");
        foundSongInfo = re.search(
            r'WL\("(?P<number>\d+)",\s*?"(?P<id>\d+)",\s*?"(?P<title>.+?)( -- (?P<singer>.+?))? ",\s*?"(?P<time>.+?)"\);',
            singleWl,
        )
        logging.debug("foundSongInfo=%s", foundSongInfo)
        if foundSongInfo:
            songInfoDict["number"] = foundSongInfo.group("number")
            songInfoDict["id"] = foundSongInfo.group("id")
            songInfoDict["title"] = foundSongInfo.group("title")
            songInfoDict["singer"] = foundSongInfo.group("singer")

            if not songInfoDict["singer"]:
                logging.debug("singer is empty for %s", singleWl)
                # WL("254", "226169","妞子 - 郝爽 ","2009-01-09 15:33:13");
                # WL("255", "161306","Kate Havnevik-Solo ","2009-01-09 15:32:59");
                # WL("256", "341131","when you say nothing at all - Alison Krauss ","2009-01-09 15:32:34");
                # try to split the title into title and singer at the hyphen
                foundHyphen = re.search(r"(?P<subTitle>.+)\s*\-\s*(?P<subSinger>.+)", songInfoDict["title"])
                logging.debug("foundHyphen=%s", foundHyphen)
                if foundHyphen:
                    subTitle = foundHyphen.group("subTitle").strip()
                    subSinger = foundHyphen.group("subSinger").strip()
                    if subTitle and subSinger:
                        songInfoDict["title"] = subTitle
                        songInfoDict["singer"] = subSinger
                        logging.info("Extract sub title and singer OK. subTitle=%s, subSinger=%s", subTitle, subSinger)

            datetimeStr = foundSongInfo.group("time")
            convertedDatetime = None
            # absolute time of form YYYY-MM-DD hh:mm:ss
            foundDatetime = re.search(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", datetimeStr)
            if foundDatetime:
                convertedDatetime = datetime.strptime(datetimeStr, "%Y-%m-%d %H:%M:%S")
            else:
                # relative time such as "6天前" or "一个月1天前"
                curDatetime = datetime.now()
                convertedDatetime = curDatetime - convertStrToDatetimeDelta(datetimeStr)
            logging.debug("convertedDatetime=%s", convertedDatetime)

            songInfoDict["time"] = convertedDatetime
            logging.debug("songInfoDict=%s", songInfoDict)
            songInfoDictList.append(songInfoDict)
        else:
            logging.warning("Can not parse single song info string %s !", singleWl)

    return songInfoDictList

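# A rough sketch of what convertStrToDatetimeDelta() might do; the real helper is not
# shown in this snippet. It converts SongTaste's relative time strings, e.g.
# u"<span class=date>6</span>天前" or u"一个月1天前", into a timedelta, approximating
# one month as 30 days.
from datetime import timedelta

def convertStrToDatetimeDelta(datetimeStr):
    # strip embedded tags: "<span class=date>6</span>天前" -> "6天前"
    plainStr = re.sub(r"<[^>]+>", "", datetimeStr)
    months = 0
    days = 0
    foundRelative = re.search(u"((?P<months>\d+|一)个月)?((?P<days>\d+)天)?前", plainStr)
    if foundRelative:
        monthStr = foundRelative.group("months")
        if monthStr:
            months = 1 if monthStr == u"一" else int(monthStr)
        dayStr = foundRelative.group("days")
        if dayStr:
            days = int(dayStr)
    return timedelta(days=months * 30 + days)
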
def fetchAndParseComments(url, html):
    parsedCommentsList = []
    try:
        soup = getSoupFromUrl(url)

        # <span class="link_comments" title="评论次数"><a href="#comments">评论</a>(207)</span>
        foundCmtNum = soup.find(attrs={"class": "link_comments"})
        if (foundCmtNum):
            cmtNumStr = foundCmtNum.contents[1]
            cmtNumStr = cmtNumStr.strip()
            cmtNumStr = cmtNumStr.replace("(", "")
            cmtNumStr = cmtNumStr.replace(")", "")
            totalCmtNum = int(cmtNumStr)
            logging.debug("total comment number=%d", totalCmtNum)

            allCmtList = []
            if (totalCmtNum > 0):
                # fetch comments, page by page
                needGetMore = True
                curCmtPageIdx = 1
                # http://blog.csdn.net/v_july_v/comment/list/6543438?page=1&_0.8271710660267246
                getCmtMainUrl = gConst['spaceDomain'] + "/" + gVal['blogUser'] + "/comment/list/" + str(extractArticleId(url))
                while (needGetMore):
                    getCmtUrl = getCmtMainUrl + "?page=" + str(curCmtPageIdx) + "&_" + str(random.random())
                    logging.debug("for get comments, generated url=%s", getCmtUrl)
                    respJson = crifanLib.getUrlRespHtml(getCmtUrl)
                    logging.debug("return comment json string=\n%s", respJson)
                    cmtDict = json.loads(respJson)

                    # {"list":[{"ArticleId":6543438,"BlogId":943376,"CommentId":2177665,"Content":"...","ParentId":0,"PostTime":"3天前 12:50","Replies":null,"UserName":"******","Userface":"http://avatar.csdn.net/6/3/7/3_zxyzlx.jpg"},
                    cmtList = cmtDict['list']
                    cmtListLen = len(cmtList)
                    allCmtList.extend(cmtList)
                    logging.debug("Add %d returned comment list into all comment list", cmtListLen)

                    # ,"page":{"PageSize":100,"PageIndex":1,"RecordCount":173,"PageCount":2},"fileName":"6543438"}
                    cmtPage = cmtDict['page']
                    pageIndex = cmtPage['PageIndex']
                    pageCount = cmtPage['PageCount']
                    logging.debug("Returned comment info: page=%s, fileName=%s", cmtDict['page'], cmtDict['fileName'])
                    if (curCmtPageIdx < pageCount):
                        curCmtPageIdx += 1
                    else:
                        needGetMore = False

            gVal['dbgSubCmtNum'] = 0
            parseAllCommentsList(allCmtList, parsedCommentsList)
            logging.debug("total parsed %d comments, which include %d sub comments", len(parsedCommentsList), gVal['dbgSubCmtNum'])
    except:
        logging.debug("Error while fetch and parse comment for %s", url)
    return parsedCommentsList

def fetchUrlList(pageNum):
    (totalUrlNum, totalPageNum, urlList) = (0, 0, [])
    articleListUrl = gVal['blogEntryUrl'] + "/article/list/" + str(pageNum)
    logging.debug("from %s to extract url list", articleListUrl)
    respHtml = crifanLib.getUrlRespHtml(articleListUrl)

    # http://blog.csdn.net/aomandeshangxiao/article/list/2
    # <div id="papelist" class="pagelist">
    #     <span> 106条数据  共3页</span><strong>1</strong> <a href="/aomandeshangxiao/article/list/2">2</a> <a href="/aomandeshangxiao/article/list/3">3</a> <a href="/aomandeshangxiao/article/list/2">下一页</a> <a href="/aomandeshangxiao/article/list/3">尾页</a>
    # </div>
    # http://blog.csdn.net/chenglinhust/article/list/54
    # <span> 1079条 共54页</span>
    foundPageNum = re.search("<span>\s*?(?P<totalUrlNum>\d+)条(数据)?\s+?共(?P<totalPageNum>\d+)页</span>", respHtml)
    logging.debug("foundPageNum=%s", foundPageNum)
    if (foundPageNum):
        totalUrlNum = int(foundPageNum.group("totalUrlNum"))
        totalPageNum = int(foundPageNum.group("totalPageNum"))
        logging.debug("totalUrlNum=%s, totalPageNum=%s", totalUrlNum, totalPageNum)

        # extract url list
        # <span class="link_title"><a href="/v_july_v/article/details/7382693">
        foundArticleIds = re.findall('<span class="link_title"><a href="/' + gVal['blogUser'] + '/article/details/(\d+)">', respHtml, re.I)
        for eachId in foundArticleIds:
            eachUrl = genPermaLink(eachId)
            urlList.append(eachUrl)

    if (not foundPageNum):
        #logging.debug("Not extract totalUrlNum from respHtml=\n%s", respHtml)
        # http://blog.csdn.net/qq1059458376/article/list/1
        # <div class="list_item article_item">
        #     <div class="article_title">
        #         <span class="ico ico_type_Original"></span>
        #         <h3>
        #             <span class="link_title"><a href="/qq1059458376/article/details/8145497">
        #             50个Android开发人员必备UI效果源码[转载]
        #             </a></span>
        #         </h3>
        #     </div>
        #     ...
        # </div>
        soup = htmlToSoup(respHtml)
        foundAllListItem = soup.findAll(name="div", attrs={"class": "list_item article_item"})
        logging.debug("foundAllListItem=%s", foundAllListItem)
        if (foundAllListItem):
            for eachListItem in foundAllListItem:
                foundLinkTitle = eachListItem.find(name="span", attrs={"class": "link_title"})
                logging.debug("foundLinkTitle=%s", foundLinkTitle)
                if (foundLinkTitle):
                    linkTitleA = foundLinkTitle.a
                    linkTitleAHref = linkTitleA['href']
                    fullLink = gConst['spaceDomain'] + linkTitleAHref
                    logging.debug("fullLink=%s", fullLink)
                    urlList.append(fullLink)

    logging.debug("for pageNum=%d, extracted info: totalUrlNum=%d, totalPageNum=%d, urlList=%s", pageNum, totalUrlNum, totalPageNum, urlList)
    return (totalUrlNum, totalPageNum, urlList)

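# A minimal sketch of the genPermaLink() helper used above, assuming it rebuilds the CSDN
# article permalink from an extracted article id, following the URL pattern shown in the
# comments; the real implementation may differ.
def genPermaLink(articleId):
    # e.g. http://blog.csdn.net/v_july_v/article/details/7382693
    return gConst['spaceDomain'] + "/" + gVal['blogUser'] + "/article/details/" + str(articleId)
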
def fetchUrlList(pageNum):
    (totalUrlNum, totalPageNum, urlList) = (0, 0, [])
    articleListUrl = gVal['blogEntryUrl'] + "/article/list/" + str(pageNum)
    logging.debug("from %s to extract url list", articleListUrl)
    respHtml = crifanLib.getUrlRespHtml(articleListUrl)

    # http://blog.csdn.net/aomandeshangxiao/article/list/2
    # <div id="papelist" class="pagelist">
    #     <span> 106条数据  共3页</span><strong>1</strong> <a href="/aomandeshangxiao/article/list/2">2</a> <a href="/aomandeshangxiao/article/list/3">3</a> <a href="/aomandeshangxiao/article/list/2">下一页</a> <a href="/aomandeshangxiao/article/list/3">尾页</a>
    # </div>
    foundPageNum = re.search("<span>\s*?(?P<totalUrlNum>\d+)条数据\s+?共(?P<totalPageNum>\d+)页</span>", respHtml)
    logging.debug("foundPageNum=%s", foundPageNum)
    if (foundPageNum):
        totalUrlNum = int(foundPageNum.group("totalUrlNum"))
        totalPageNum = int(foundPageNum.group("totalPageNum"))

        # extract url list
        # <span class="link_title"><a href="/v_july_v/article/details/7382693">
        foundArticleIds = re.findall('<span class="link_title"><a href="/' + gVal['blogUser'] + '/article/details/(\d+)">', respHtml, re.I)
        for eachId in foundArticleIds:
            eachUrl = genPermaLink(eachId)
            urlList.append(eachUrl)

    if (not foundPageNum):
        #logging.debug("Not extract totalUrlNum from respHtml=\n%s", respHtml)
        # http://blog.csdn.net/qq1059458376/article/list/1
        # <div class="list_item article_item">
        #     <div class="article_title">
        #         <span class="ico ico_type_Original"></span>
        #         <h3>
        #             <span class="link_title"><a href="/qq1059458376/article/details/8145497">
        #             50个Android开发人员必备UI效果源码[转载]
        #             </a></span>
        #         </h3>
        #     </div>
        #     ...
        # </div>
        soup = htmlToSoup(respHtml)
        foundAllListItem = soup.findAll(name="div", attrs={"class": "list_item article_item"})
        logging.debug("foundAllListItem=%s", foundAllListItem)
        if (foundAllListItem):
            for eachListItem in foundAllListItem:
                foundLinkTitle = eachListItem.find(name="span", attrs={"class": "link_title"})
                logging.debug("foundLinkTitle=%s", foundLinkTitle)
                if (foundLinkTitle):
                    linkTitleA = foundLinkTitle.a
                    linkTitleAHref = linkTitleA['href']
                    fullLink = gConst['spaceDomain'] + linkTitleAHref
                    logging.debug("fullLink=%s", fullLink)
                    urlList.append(fullLink)

    logging.debug("for pageNum=%d, extracted info: totalUrlNum=%d, totalPageNum=%d, urlList=%s", pageNum, totalUrlNum, totalPageNum, urlList)
    return (totalUrlNum, totalPageNum, urlList)

def main():
    global gVal
    global gCfg

    # 0. main procedure begin
    parser = OptionParser()
    parser.add_option(
        "-t", "--timeSpanStr",
        action="store", type="string", dest="timeSpanStr",
        help=u"Time span/duration. eg:20120901-20121001",
    )
    parser.add_option(
        "-o", "--outputFilename",
        action="store", type="string", dest="outputFilename",
        help=u"Output file name. default is recMusicHtml.txt",
    )
    parser.add_option(
        "-u", "--userId",
        action="store", type="string", dest="userId",
        help=u"User's songtaste ID. default is crifan's user id: 351979",
    )

    logging.info(u"版本信息:%s", __VERSION__)
    printDelimiterLine()

    (options, args) = parser.parse_args()
    for i in dir(options):
        exec(i + " = options." + i)

    if userId:
        gCfg["userId"] = userId

    gCfg["timeSpanStr"] = timeSpanStr
    if not gCfg["timeSpanStr"]:
        # set default time span: from the first day of the current month until now
        curDatetime = datetime.now()
        logging.info("curDatetime=%s", curDatetime)
        gVal["timeSpan"]["start"] = curDatetime.replace(day=1)
        gVal["timeSpan"]["end"] = curDatetime
        gVal["timeSpan"]["startDateStr"] = datetime.strftime(gVal["timeSpan"]["start"], "%Y%m%d")
        gVal["timeSpan"]["endDateStr"] = datetime.strftime(gVal["timeSpan"]["end"], "%Y%m%d")
        gCfg["timeSpanStr"] = gVal["timeSpan"]["startDateStr"] + "-" + gVal["timeSpan"]["endDateStr"]
        logging.info("Set default gVal['timeSpan']=%s", gVal["timeSpan"])
    else:
        # parse input time span string
        foundTimeSpanStr = re.search(
            "(?P<startDateStr>(?P<startYear>\d{4})(?P<startMonth>\d{2})(?P<startDay>\d{2}))-(?P<endDateStr>(?P<endYear>\d{4})(?P<endMonth>\d{2})(?P<endDay>\d{2}))",
            gCfg["timeSpanStr"],
        )
        logging.info("foundTimeSpanStr=%s", foundTimeSpanStr)
        if foundTimeSpanStr:
            gVal["timeSpan"]["startDateStr"] = foundTimeSpanStr.group("startDateStr")
            gVal["timeSpan"]["endDateStr"] = foundTimeSpanStr.group("endDateStr")
            startYear = int(foundTimeSpanStr.group("startYear"))
            startMonth = int(foundTimeSpanStr.group("startMonth"))
            startDay = int(foundTimeSpanStr.group("startDay"))
            endYear = int(foundTimeSpanStr.group("endYear"))
            endMonth = int(foundTimeSpanStr.group("endMonth"))
            endDay = int(foundTimeSpanStr.group("endDay"))
            gVal["timeSpan"]["start"] = datetime(year=startYear, month=startMonth, day=startDay)
            gVal["timeSpan"]["end"] = datetime(year=endYear, month=endMonth, day=endDay)
            logging.info("parsed gVal['timeSpan']=%s", gVal["timeSpan"])
        else:
            logging.error("Fail to parse the input time span str %s", gCfg["timeSpanStr"])
            sys.exit(2)

    if outputFilename:
        gCfg["outputFile"]["name"] = outputFilename
    else:
        defaultFilename = str(gCfg["userId"]) + u" 【歌曲批量推荐】" + gCfg["timeSpanStr"] + ".html"
        gCfg["outputFile"]["name"] = defaultFilename

    gVal["userMainUrl"] = generateUserMainUrl(gCfg["userId"])

    logging.info("Time span = %s", gCfg["timeSpanStr"])
    logging.info("Output file name = %s", gCfg["outputFile"]["name"])
    logging.info("Songtaste user id = %s", gCfg["userId"])
    logging.info("User main url = %s", gVal["userMainUrl"])

    respHtml = crifanLib.getUrlRespHtml(gVal["userMainUrl"])
    soup = BeautifulSoup(respHtml, fromEncoding=gConst["stHtmlEnc"])

    # extract user name
    # <h1 class="h1user">crifan</h1>
    foundH1user = soup.find(attrs={"class": "h1user"})
    logging.debug("foundH1user=%s", foundH1user)
    if foundH1user:
        gVal["username"] = foundH1user.string
        logging.info("Extracted songtaste username is %s", gVal["username"])
    else:
        logging.error("Cannot extract user name for songtaste main user url %s !", gVal["userMainUrl"])
        sys.exit(2)

    # extract total rec music number
    # <p class="more"><a href="/user/351979/allrec" class="underline">全部 306 首推荐</a></p>
    soupUni = unicode(soup)
    foundAllrecNum = re.search(u"全部 (?P<allrecNum>\d+) 首推荐", soupUni)
    logging.info("foundAllrecNum=%s", foundAllrecNum)
    if foundAllrecNum:
        gVal["allrec"]["num"] = int(foundAllrecNum.group("allrecNum"))
        logging.info("gVal['allrec']['num']=%d", gVal["allrec"]["num"])
        # integer division in Python 2, so round up when there is a remainder
        gVal["allrec"]["totalPage"] = gVal["allrec"]["num"] / gConst["songNumPerPage"]
        if (gVal["allrec"]["num"] - gConst["songNumPerPage"] * gVal["allrec"]["totalPage"]) > 0:
            gVal["allrec"]["totalPage"] += 1
        logging.info("gVal['allrec']=%s", gVal["allrec"])

    # extract all song info dict list
    totalSongInfoDictList = []
    for pageIdx in range(gVal["allrec"]["totalPage"]):
        pageNum = pageIdx + 1
        singleAllrecUrl = generateAllrecUrl(gCfg["userId"], pageNum)
        logging.info("pageIdx=%d, pageNum=%d, singleAllrecUrl=%s", pageIdx, pageNum, singleAllrecUrl)
        singlePageSongInfoDictList = extractSongInfoDictList(singleAllrecUrl)
        logging.info("Current allrec page extracted %d songs info", len(singlePageSongInfoDictList))
        totalSongInfoDictList.extend(singlePageSongInfoDictList)
    logging.info("Total extracted song info number: %d", len(totalSongInfoDictList))

    # filter out the songs within the designated time span
    withinTimeSpanSongList = []
    for singleSongDict in totalSongInfoDictList:
        if (singleSongDict["time"] >= gVal["timeSpan"]["start"]) and (singleSongDict["time"] <= gVal["timeSpan"]["end"]):
            logging.info("found within time span time=%s", singleSongDict["time"])
            withinTimeSpanSongList.append(singleSongDict)

    # generate output
    outputUni = ""
    # generate header
    outputUni += generateOutputHeader(gVal["timeSpan"]["startDateStr"], gVal["timeSpan"]["endDateStr"])
    # generate each song info
    for singSongInfoDict in withinTimeSpanSongList:
        outputUni += generateSingleSongStr(singSongInfoDict)
    # write the generated content to the output file
    outputToFile(outputUni)

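# A hedged example of how main() would typically be invoked from the command line; the
# script filename below is hypothetical, only the -u/-t/-o options come from the
# OptionParser definitions above:
#
#   python songtaste_export_rec.py -u 351979 -t 20120901-20121001 -o recMusic.html
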
def find1stPermalink():
    (isFound, retInfo) = (False, "Unknown error!")
    try:
        # http://blog.tianya.cn/blogger/post_list.asp?BlogID=2723898&CategoryID=0
        allPostCategoryId = 0
        allCategoryUrl = genPostListUrl(gVal['blogId'], allPostCategoryId)
        respHtml = crifanLib.getUrlRespHtml(allCategoryUrl)
        logging.debug("allCategoryUrl=%s resp respHtml=%s", allCategoryUrl, respHtml)

        # <div class="pages pos-relative">
        #     页码:1/7 << <a href="/blogger/post_list.asp?BlogID=2723898&CategoryID=0&page=1" class="here">1</a> ... <a href="/blogger/post_list.asp?BlogID=2723898&CategoryID=0&page=7" title=尾页>>></a>
        #     <span class="pos-right gototop"><a href="#">返回顶部</a></span>
        # </div>
        foundPagesPos = re.search('<div\s+class="pages pos-relative">\s+.+1/(?P<totalPageNum>\d+)', respHtml)
        logging.debug("foundPagesPos=%s", foundPagesPos)
        if (foundPagesPos):
            totalPageNum = foundPagesPos.group("totalPageNum")
            logging.debug("totalPageNum=%s", totalPageNum)
            totalPageNumInt = int(totalPageNum)

            if (totalPageNumInt == 1):
                lastPageHtml = respHtml
            else:
                # http://blog.tianya.cn/blogger/post_list.asp?BlogID=2723898&CategoryID=0&page=7
                lastPageUrl = genPostListUrl(gVal['blogId'], 0, totalPageNumInt)
                logging.debug("lastPageUrl=%s", lastPageUrl)
                lastPageHtml = crifanLib.getUrlRespHtml(lastPageUrl)

                # special case:
                # http://blog.tianya.cn/blogger/post_list.asp?BlogID=4338249&CategoryID=0&page=26
                # contains no post, so also fetch the previous (last but one) page
                lastButOneInt = totalPageNumInt - 1
                lastButOnePageUrl = genPostListUrl(gVal['blogId'], 0, lastButOneInt)
                logging.debug("lastButOnePageUrl=%s", lastButOnePageUrl)
                lastButOnePageHtml = crifanLib.getUrlRespHtml(lastButOnePageUrl)

            # <li class="articlecell cf"><p class="ptit"><a href="/blogger/post_read.asp?BlogID=2723898&PostID=47717072" target="_blank">再见过去,我们的足迹</a></p><p class="ptime">2012-10-27 08:13</p><p class="pcomments">3</p></li>
            soup = htmlToSoup(lastPageHtml)
            foundArticleCellCf = soup.findAll(attrs={"class": "articlecell cf"})
            if (not foundArticleCellCf):
                # fall back to the last but one page when the last page contains no post
                soup = htmlToSoup(lastButOnePageHtml)
                foundArticleCellCf = soup.findAll(attrs={"class": "articlecell cf"})
            logging.debug("foundArticleCellCf=%s", foundArticleCellCf)
            articleCellCfLen = len(foundArticleCellCf)
            logging.debug("articleCellCfLen=%s", articleCellCfLen)

            lastPostSoup = foundArticleCellCf[-1]
            logging.debug("lastPostSoup=%s", lastPostSoup)
            p0 = lastPostSoup.contents[0]
            logging.debug("p0=%s", p0)
            p0a = p0.a
            logging.debug("p0a=%s", p0a)
            href = p0a['href']
            logging.debug("href=%s", href)
            lastPostUrl = gConst['spaceDomain'] + href
            logging.debug("lastPostUrl=%s", lastPostUrl)
            if (lastPostUrl):
                retInfo = lastPostUrl
                isFound = True
    except:
        (isFound, retInfo) = (False, "Unknown error!")
    return (isFound, retInfo)
