def find1stPermalink():
    (isFound, retInfo) = (False, "Unknown error!")
    try:
        # http://blog.csdn.net/chdhust
        # try to find the last article list page
        homeUrl = gVal['blogEntryUrl']
        homeRespHtml = crifanLib.getUrlRespHtml(homeUrl)
        logging.debug("homeRespHtml for %s, is:\n%s", homeUrl, homeRespHtml)

        # <a href="/chenglinhust/article/list/22">尾页</a>
        # <span> 1079条 共54页</span><strong>1</strong> <a href="/chenglinhust/article/list/2">2</a> ... <a href="/chenglinhust/article/list/2">下一页</a> <a href="/chenglinhust/article/list/54">尾页</a>
        foundLastListPageUrl = re.search('<a\s+?href="(?P<lastListPageUrl>/\w+?/article/list/\d+)">尾页</a>', homeRespHtml, re.I)
        logging.debug("foundLastListPageUrl=%s", foundLastListPageUrl)
        if (foundLastListPageUrl):
            lastListPageUrl = foundLastListPageUrl.group("lastListPageUrl")
            lastListPageUrl = gConst['spaceDomain'] + lastListPageUrl
            logging.debug("lastListPageUrl=%s", lastListPageUrl)

            # http://blog.csdn.net/chenglinhust/article/list/54
            respHtml = crifanLib.getUrlRespHtml(lastListPageUrl)
            logging.debug("ret html for %s, is:\n%s", lastListPageUrl, respHtml)
            soup = htmlToSoup(respHtml)

            # <span class="link_title"><a href="/v_july_v/article/details/5934051">
            # 算法面试:精选微软经典的算法面试100题(第1-20题)
            # </a></span>
            foundTitLink = soup.findAll(attrs={"class": "link_title"})
            articleNum = len(foundTitLink)
            logging.debug("articleNum=%s", articleNum)
            if (foundTitLink):
                lastArticle = foundTitLink[-1]
                aVal = lastArticle.a
                href = aVal['href']
                firstLink = gConst['spaceDomain'] + href
                retInfo = firstLink
                isFound = True
                logging.debug("retInfo=%s,isFound=%s", retInfo, isFound)
    except:
        (isFound, retInfo) = (False, "Unknown error!")
    return (isFound, retInfo)

def parse(self, url, encoding='utf-8', **kwargs):
    try:
        html = crifanLib.getUrlRespHtml(url)
        selector = lxml.etree.HTML(html)
        results = {}
        for key, xpathExpr in kwargs.items():
            res = selector.xpath(xpathExpr)
            for index, value in enumerate(res):
                data = value
                if isinstance(data, lxml.etree._Element):
                    data = value.xpath('string(.)')
                if isinstance(data, unicode):
                    data = data.encode('gbk', 'ignore')
                data = re.sub('\s', '', data, flags=re.UNICODE)
                res[index] = data
            results[key] = res
        return results
    except:
        # append the traceback to a per-day error log
        date = time.strftime('%Y-%m-%d')
        f = open(date + 'errors.log', 'a')
        hms = time.strftime('%H:%M:%S : \r\n')
        f.write(hms)
        traceback.print_exc(file=f)
        f.write('\n')
        f.flush()
        f.close()

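# A hedged usage sketch for the parse() method above: each keyword argument is an
# XPath expression, and the returned dict maps that keyword to the matched, cleaned
# strings. The class and instance names here are hypothetical, not from the original code.
#
#   spider = MySpider()
#   results = spider.parse(
#       "http://example.com/list.html",
#       titles='//span[@class="link_title"]/a/text()',
#       links='//span[@class="link_title"]/a/@href',
#   )
#   # results -> {'titles': [...], 'links': [...]}
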
def extractBlogTitAndDesc(blogEntryUrl):
    (blogTitle, blogDescription) = ("", "")
    try:
        logging.debug("Now extract blog title and description from blogEntryUrl=%s", blogEntryUrl)
        respHtml = crifanLib.getUrlRespHtml(blogEntryUrl)
        #logging.debug("url=%s return html=\n%s", blogEntryUrl, respHtml)

        # <div class="headerinner">
        #     <h1><a href="http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898">素颜素语</a><img src="/images/mb2.gif" style="margin-left:15px" align="absmiddle"></h1>
        #     <div class="blog-header-rul"><a href="http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898">http://susu7788.blog.tianya.cn</a> <a href="rss.asp?BlogID=2723898" target="_blank">[RSS订阅]</a></div>
        #     <div class="blogsign">真诚待人,用心写文。</div>
        # </div>
        soup = htmlToSoup(respHtml)
        foundHeaderInner = soup.find(attrs={"class": "headerinner"})
        logging.debug("foundHeaderInner=%s", foundHeaderInner)
        h1 = foundHeaderInner.h1
        h1a = h1.a
        logging.debug("h1a=%s", h1a)
        h1aStr = h1a.string
        blogTitle = h1aStr
        logging.debug("blogTitle=%s", blogTitle)

        foundBlogsign = foundHeaderInner.find(attrs={"class": "blogsign"})
        logging.debug("foundBlogsign=%s", foundBlogsign)
        blogDescription = foundBlogsign.string
        logging.debug("blogDescription=%s", blogDescription)
    except:
        (blogTitle, blogDescription) = ("", "")
    return (blogTitle, blogDescription)

def fetchAndParseComments(url, html):
    parsedCommentsList = []
    try:
        soup = getSoupFromUrl(url)

        # <a href="#allcomments">评论:235</a>
        foundAllCmtNum = re.search('<a href="#allcomments">.+?(?P<allCmtNum>\d+)</a>', html)
        logging.debug("foundAllCmtNum=%s", foundAllCmtNum)
        if (foundAllCmtNum):
            allCmtNum = foundAllCmtNum.group("allCmtNum")
            allCmtNumInt = int(allCmtNum)
            logging.debug("total comment number=%d", allCmtNumInt)

            if (allCmtNumInt > 0):
                allCmtCellDictList = []
                # calc total comment page num
                maxCmtNumPerPage = 50
                totalCmtPageNum = int(math.ceil(float(allCmtNumInt) / float(maxCmtNumPerPage)))
                logging.debug("totalCmtPageNum=%d", totalCmtPageNum)

                # get all comment cell dict list
                if (totalCmtPageNum <= 1):
                    allCmtCellDictList = extractCmtCellDictList(html)
                else:
                    postId = extractPostIdFromPermaLink(url)
                    # for each comment page, get html and extract comment cell string list
                    for eachPageIdx in range(totalCmtPageNum):
                        eachPageNum = eachPageIdx + 1
                        # generate comment url for each page, e.g.
                        # http://blog.tianya.cn/blogger/post_read.asp?BlogID=3210723&PostID=41963783&page=2
                        postReadUrl = genPostReadUrl(gVal['blogId'], postId, eachPageNum)
                        respHtml = crifanLib.getUrlRespHtml(postReadUrl)
                        singlePageCmtCellStrList = extractCmtCellDictList(respHtml)
                        logging.debug("len(singlePageCmtCellStrList)=%s", len(singlePageCmtCellStrList))
                        allCmtCellDictList.extend(singlePageCmtCellStrList)
                logging.debug("len(allCmtCellDictList)=%s", len(allCmtCellDictList))

                # parse each comment cell string into dest comment dict
                parseAllCommentsList(allCmtCellDictList, parsedCommentsList)
                logging.debug("total parsed %d comments", len(parsedCommentsList))
    except:
        logging.debug("Error while fetch and parse comment for %s", url)
    return parsedCommentsList

def extractBlogTitAndDesc(blogEntryUrl):
    (blogTitle, blogDescription) = ("", "")
    try:
        respHtml = crifanLib.getUrlRespHtml(blogEntryUrl)
        #logging.debug("url=%s return html=\n%s", blogEntryUrl, respHtml)

        # <div id="blog_title">
        #     <h1>
        #         <a href="/v_JULY_v">结构之法 算法之道</a></h1>
        #     <h2></h2>
        #     <div class="clear">
        #     </div>
        # </div>

        # <div id="blog_title">
        #     <h1>
        #         <a href="/MoreWindows">MoreWindows</a></h1>
        #     <h2>学习不在于学了多少,而在于学会多少。</h2>
        #     <div class="clear">
        #     </div>
        # </div>
        soup = htmlToSoup(respHtml)
        foundTitle = soup.find(id="blog_title")
        foundTitleA = foundTitle.a
        titStr = foundTitleA.string
        blogTitle = unicode(titStr)

        h2 = foundTitle.h2
        h2Str = h2.string
        blogDescription = unicode(h2Str)
    except:
        (blogTitle, blogDescription) = ("", "")
    return (blogTitle, blogDescription)

def __get_urls(self, page, reg, encoding='utf-8'):
    try:
        html = crifanLib.getUrlRespHtml(page)
        selector = lxml.etree.HTML(html)
        urls = selector.xpath(reg)
        return urls
    except:
        # append the traceback to a per-day error log
        date = time.strftime('%Y-%m-%d')
        f = open(date + 'errors.log', 'a')
        hms = time.strftime('%H:%M:%S : \n')
        f.write(hms)
        traceback.print_exc(file=f)
        f.write('\n')
        f.flush()
        f.close()

def extractBlogUserFromBlogId(blogId):
    blogUser = ""
    # http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898
    blogEntryUrl = genEntryUrl(str(blogId))

    # extract blogUser
    # <div class="blog-header-rul"><a href="http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898">http://susu7788.blog.tianya.cn</a> <a href="rss.asp?BlogID=2723898" target="_blank">[RSS订阅]</a></div>
    respHtml = crifanLib.getUrlRespHtml(blogEntryUrl)
    foundBlogHeaderRurl = re.search('<div\s+class="blog-header-rul"><a\s+href="http://blog\.tianya\.cn/blogger/blog_main\.asp\?BlogID=' + blogId + '">http://(?P<blogUser>\w+).blog.tianya.cn</a>', respHtml)
    logging.debug("foundBlogHeaderRurl=%s", foundBlogHeaderRurl)
    if (foundBlogHeaderRurl):
        blogUser = foundBlogHeaderRurl.group("blogUser")
        logging.debug("Extracted blog user %s from entry url %s", blogUser, blogEntryUrl)
    else:
        blogUser = ""
    return blogUser

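# A minimal sketch of the genEntryUrl() helper used above, assuming it simply fills in
# the Tianya blog main-page URL pattern shown in the comments; the real implementation
# may differ.
def genEntryUrl(blogId):
    # e.g. http://blog.tianya.cn/blogger/blog_main.asp?BlogID=2723898
    return "http://blog.tianya.cn/blogger/blog_main.asp?BlogID=" + str(blogId)
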
def getSoupFromUrl(url):
    soup = None
    if (url in gVal['soupDict']):
        logging.debug("%s exist in soupDict, so get soup from cache", url)
        soup = gVal['soupDict'][url]
    else:
        logging.debug("%s not in soupDict, so get html then get soup", url)
        # get url's soup
        html = crifanLib.getUrlRespHtml(url)
        soup = htmlToSoup(html)
        # store soup
        gVal['soupDict'][url] = soup
    return soup

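# A minimal sketch of the htmlToSoup() helper these functions assume exists, using the
# Python 2 BeautifulSoup 3.x API seen elsewhere in this code; the real helper may pass
# an explicit encoding or do extra cleanup.
from BeautifulSoup import BeautifulSoup

def htmlToSoup(html):
    # let BeautifulSoup detect the page encoding itself
    soup = BeautifulSoup(html)
    return soup
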
def find1stPermalink():
    (isFound, retInfo) = (False, "Unknown error!")
    try:
        #someUrl = gVal['blogEntryUrl'] + "???"
        logging.debug("someUrl=%s", someUrl)
        respHtml = crifanLib.getUrlRespHtml(someUrl)
        logging.debug("respHtml=%s", respHtml)

        #TODO: write your own logic to find the first permanent link of post

        #retInfo = lastHref
        #isFound = True
    except:
        (isFound, retInfo) = (False, "Unknown error!")
    return (isFound, retInfo)

def extractBlogTitAndDesc(blogEntryUrl):
    (blogTitle, blogDescription) = ("", "")
    try:
        foundTitDesc = False
        logging.debug("Now extract blog title and description from blogEntryUrl=%s", blogEntryUrl)
        respHtml = crifanLib.getUrlRespHtml(blogEntryUrl)
        #logging.debug("blogEntryUrl=%s return html=\n%s", blogEntryUrl, respHtml)
        soup = htmlToSoup(respHtml)

        #TODO: write your own logic to extract blog title and description
        #blogDescription = metaContent
        #foundTitDesc = True

        if (foundTitDesc):
            gVal['blogTitle'] = blogTitle
            gVal['blogDescription'] = blogDescription
    except:
        (blogTitle, blogDescription) = ("", "")
    return (blogTitle, blogDescription)

def find1stPermalink():
    (isFound, retInfo) = (False, "Unknown error!")
    try:
        # http://blog.csdn.net/v_JULY_v/article/list/7
        #pageNum = 7
        pageNum = 10000
        getPostUrl = gVal['blogEntryUrl'] + "/article/list/" + str(pageNum)
        respHtml = crifanLib.getUrlRespHtml(getPostUrl)
        #logging.debug("ret html for %s, is:\n%s", getPostUrl, respHtml)
        soup = htmlToSoup(respHtml)

        # <span class="link_title"><a href="/v_july_v/article/details/5934051">
        # 算法面试:精选微软经典的算法面试100题(第1-20题)
        # </a></span>
        foundTitLink = soup.findAll(attrs={"class": "link_title"})
        articleNum = len(foundTitLink)
        if (foundTitLink):
            lastArticle = foundTitLink[-1]
            aVal = lastArticle.a
            href = aVal['href']
            firstLink = gConst['spaceDomain'] + href
            retInfo = firstLink
            isFound = True
    except:
        (isFound, retInfo) = (False, "Unknown error!")
    return (isFound, retInfo)

def extractSongInfoDictList(singleAllrecUrl):
    songInfoDictList = []
    logging.debug("singleAllrecUrl=%s", singleAllrecUrl)
    respHtml = crifanLib.getUrlRespHtml(singleAllrecUrl)
    #logging.debug("respHtml=\n%s", respHtml)
    soup = BeautifulSoup(respHtml, fromEncoding=gConst["stHtmlEnc"])
    pretifiedHtml = unicode(soup)
    #logging.debug("pretifiedHtml=%s", pretifiedHtml)

    # <table cellpadding="0" cellspacing="0" border="0" width="420" class="u_song_tab u_song_all">
    # ...
    # <script>
    # ...
    # WL("1", "3055432","E.R‖<拯救听觉 前奏一起就注定是极品>丨 -- 忆の蓝色 ","<span class=date>6</span>天前");
    # ...
    # </script>
    # </table>
    # WL("133", "255907","10.Two Moon Butterflies ","2012-01-02 16:21:18");
    foundAllWl = re.findall(r'WL\("\d+",\s*?"\d+",\s*?".+",\s*?".+?"\);', pretifiedHtml)
    foundAllWlNum = len(foundAllWl)
    logging.info("foundAllWlNum=%d", foundAllWlNum)

    for singleWl in foundAllWl:
        # extract all song info
        songInfoDict = {
            "number": "",
            "id": "",
            "title": "",
            "singer": "",
            "time": None,  # finally will store datetime type value
        }

        # WL("26", "3080141","Lone Wanderer -- Rameses B ","<span class=date>24</span>天前");
        # WL("31", "3155354","泰语 我不会要求 -- Da Endorphine ","一个月0天前");
        # WL("35", "3154813","北京北京(中国好声音) -- 梁博 ft 黄勇 ","一个月1天前");
        # WL("73", "3048771","< 小漠 迟来生日快乐 > 超赞大气的节奏 Sexy的嗓音 -- 羊氏 Club° ","一个月26天前");
        # WL("80", "3084927","Not Inveted Love -- Stan Crown ","2012-08-08 11:27:03");
        # WL("133", "255907","10.Two Moon Butterflies ","2012-01-02 16:21:18");
        foundSongInfo = re.search(
            r'WL\("(?P<number>\d+)",\s*?"(?P<id>\d+)",\s*?"(?P<title>.+?)( -- (?P<singer>.+?))? ",\s*?"(?P<time>.+?)"\);',
            singleWl,
        )
        logging.debug("foundSongInfo=%s", foundSongInfo)
        if foundSongInfo:
            songInfoDict["number"] = foundSongInfo.group("number")
            songInfoDict["id"] = foundSongInfo.group("id")
            songInfoDict["title"] = foundSongInfo.group("title")
            songInfoDict["singer"] = foundSongInfo.group("singer")

            if not songInfoDict["singer"]:
                logging.debug("singer is empty for %s", singleWl)
                # WL("254", "226169","妞子 - 郝爽 ","2009-01-09 15:33:13");
                # WL("255", "161306","Kate Havnevik-Solo ","2009-01-09 15:32:59");
                # WL("256", "341131","when you say nothing at all - Alison Krauss ","2009-01-09 15:32:34");
                # try to split the title into title and singer at the hyphen
                foundHyphen = re.search(r"(?P<subTitle>.+)\s*\-\s*(?P<subSinger>.+)", songInfoDict["title"])
                logging.debug("foundHyphen=%s", foundHyphen)
                if foundHyphen:
                    subTitle = foundHyphen.group("subTitle").strip()
                    subSinger = foundHyphen.group("subSinger").strip()
                    if subTitle and subSinger:
                        songInfoDict["title"] = subTitle
                        songInfoDict["singer"] = subSinger
                        logging.info("Extract sub title and singer OK. subTitle=%s, subSinger=%s", subTitle, subSinger)

            datetimeStr = foundSongInfo.group("time")
            convertedDatetime = None
            # absolute time of form YYYY-MM-DD hh:mm:ss
            foundDatetime = re.search(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", datetimeStr)
            if foundDatetime:
                convertedDatetime = datetime.strptime(datetimeStr, "%Y-%m-%d %H:%M:%S")
            else:
                # relative time such as "6天前" or "一个月1天前"
                curDatetime = datetime.now()
                convertedDatetime = curDatetime - convertStrToDatetimeDelta(datetimeStr)
            logging.debug("convertedDatetime=%s", convertedDatetime)

            songInfoDict["time"] = convertedDatetime
            logging.debug("songInfoDict=%s", songInfoDict)
            songInfoDictList.append(songInfoDict)
        else:
            logging.warning("Can not parse single song info string %s !", singleWl)

    return songInfoDictList

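# A rough sketch of what convertStrToDatetimeDelta() might do; the real helper is not
# shown in this snippet. It converts SongTaste's relative time strings, e.g.
# u"<span class=date>6</span>天前" or u"一个月1天前", into a timedelta, approximating
# one month as 30 days.
from datetime import timedelta

def convertStrToDatetimeDelta(datetimeStr):
    # strip embedded tags: "<span class=date>6</span>天前" -> "6天前"
    plainStr = re.sub(r"<[^>]+>", "", datetimeStr)
    months = 0
    days = 0
    foundRelative = re.search(u"((?P<months>\d+|一)个月)?((?P<days>\d+)天)?前", plainStr)
    if foundRelative:
        monthStr = foundRelative.group("months")
        if monthStr:
            months = 1 if monthStr == u"一" else int(monthStr)
        dayStr = foundRelative.group("days")
        if dayStr:
            days = int(dayStr)
    return timedelta(days=months * 30 + days)
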
def fetchAndParseComments(url, html):
    parsedCommentsList = []
    try:
        soup = getSoupFromUrl(url)

        # <span class="link_comments" title="评论次数"><a href="#comments">评论</a>(207)</span>
        foundCmtNum = soup.find(attrs={"class": "link_comments"})
        if (foundCmtNum):
            cmtNumStr = foundCmtNum.contents[1]
            cmtNumStr = cmtNumStr.strip()
            cmtNumStr = cmtNumStr.replace("(", "")
            cmtNumStr = cmtNumStr.replace(")", "")
            totalCmtNum = int(cmtNumStr)
            logging.debug("total comment number=%d", totalCmtNum)

            allCmtList = []
            if (totalCmtNum > 0):
                # fetch comments, page by page
                needGetMore = True
                curCmtPageIdx = 1
                # http://blog.csdn.net/v_july_v/comment/list/6543438?page=1&_0.8271710660267246
                getCmtMainUrl = gConst['spaceDomain'] + "/" + gVal['blogUser'] + "/comment/list/" + str(extractArticleId(url))
                while (needGetMore):
                    getCmtUrl = getCmtMainUrl + "?page=" + str(curCmtPageIdx) + "&_" + str(random.random())
                    logging.debug("for get comments, generated url=%s", getCmtUrl)
                    respJson = crifanLib.getUrlRespHtml(getCmtUrl)
                    logging.debug("return comment json string=\n%s", respJson)
                    cmtDict = json.loads(respJson)

                    # {"list":[{"ArticleId":6543438,"BlogId":943376,"CommentId":2177665,"Content":"...","ParentId":0,"PostTime":"3天前 12:50","Replies":null,"UserName":"******","Userface":"http://avatar.csdn.net/6/3/7/3_zxyzlx.jpg"},
                    cmtList = cmtDict['list']
                    cmtListLen = len(cmtList)
                    allCmtList.extend(cmtList)
                    logging.debug("Add %d returned comment list into all comment list", cmtListLen)

                    # ,"page":{"PageSize":100,"PageIndex":1,"RecordCount":173,"PageCount":2},"fileName":"6543438"}
                    cmtPage = cmtDict['page']
                    pageIndex = cmtPage['PageIndex']
                    pageCount = cmtPage['PageCount']
                    logging.debug("Returned comment info: page=%s, fileName=%s", cmtDict['page'], cmtDict['fileName'])
                    if (curCmtPageIdx < pageCount):
                        curCmtPageIdx += 1
                    else:
                        needGetMore = False

            gVal['dbgSubCmtNum'] = 0
            parseAllCommentsList(allCmtList, parsedCommentsList)
            logging.debug("total parsed %d comments, which include %d sub comments", len(parsedCommentsList), gVal['dbgSubCmtNum'])
    except:
        logging.debug("Error while fetch and parse comment for %s", url)
    return parsedCommentsList

def fetchUrlList(pageNum):
    (totalUrlNum, totalPageNum, urlList) = (0, 0, [])
    articleListUrl = gVal['blogEntryUrl'] + "/article/list/" + str(pageNum)
    logging.debug("from %s to extract url list", articleListUrl)
    respHtml = crifanLib.getUrlRespHtml(articleListUrl)

    # http://blog.csdn.net/aomandeshangxiao/article/list/2
    # <div id="papelist" class="pagelist">
    #     <span> 106条数据  共3页</span><strong>1</strong> <a href="/aomandeshangxiao/article/list/2">2</a> <a href="/aomandeshangxiao/article/list/3">3</a> <a href="/aomandeshangxiao/article/list/2">下一页</a> <a href="/aomandeshangxiao/article/list/3">尾页</a>
    # </div>
    # http://blog.csdn.net/chenglinhust/article/list/54
    # <span> 1079条 共54页</span>
    foundPageNum = re.search("<span>\s*?(?P<totalUrlNum>\d+)条(数据)?\s+?共(?P<totalPageNum>\d+)页</span>", respHtml)
    logging.debug("foundPageNum=%s", foundPageNum)
    if (foundPageNum):
        totalUrlNum = int(foundPageNum.group("totalUrlNum"))
        totalPageNum = int(foundPageNum.group("totalPageNum"))
        logging.debug("totalUrlNum=%s, totalPageNum=%s", totalUrlNum, totalPageNum)

        # extract url list
        # <span class="link_title"><a href="/v_july_v/article/details/7382693">
        foundArticleIds = re.findall('<span class="link_title"><a href="/' + gVal['blogUser'] + '/article/details/(\d+)">', respHtml, re.I)
        for eachId in foundArticleIds:
            eachUrl = genPermaLink(eachId)
            urlList.append(eachUrl)

    if (not foundPageNum):
        #logging.debug("Not extract totalUrlNum from respHtml=\n%s", respHtml)
        # http://blog.csdn.net/qq1059458376/article/list/1
        # <div class="list_item article_item">
        #     <div class="article_title">
        #         <span class="ico ico_type_Original"></span>
        #         <h3>
        #             <span class="link_title"><a href="/qq1059458376/article/details/8145497">
        #             50个Android开发人员必备UI效果源码[转载]
        #             </a></span>
        #         </h3>
        #     </div>
        #     ...
        # </div>
        soup = htmlToSoup(respHtml)
        foundAllListItem = soup.findAll(name="div", attrs={"class": "list_item article_item"})
        logging.debug("foundAllListItem=%s", foundAllListItem)
        if (foundAllListItem):
            for eachListItem in foundAllListItem:
                foundLinkTitle = eachListItem.find(name="span", attrs={"class": "link_title"})
                logging.debug("foundLinkTitle=%s", foundLinkTitle)
                if (foundLinkTitle):
                    linkTitleA = foundLinkTitle.a
                    linkTitleAHref = linkTitleA['href']
                    fullLink = gConst['spaceDomain'] + linkTitleAHref
                    logging.debug("fullLink=%s", fullLink)
                    urlList.append(fullLink)

    logging.debug("for pageNum=%d, extracted info: totalUrlNum=%d, totalPageNum=%d, urlList=%s", pageNum, totalUrlNum, totalPageNum, urlList)
    return (totalUrlNum, totalPageNum, urlList)

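# A minimal sketch of the genPermaLink() helper used above, assuming it rebuilds the CSDN
# article permalink from an extracted article id, following the URL pattern shown in the
# comments; the real implementation may differ.
def genPermaLink(articleId):
    # e.g. http://blog.csdn.net/v_july_v/article/details/7382693
    return gConst['spaceDomain'] + "/" + gVal['blogUser'] + "/article/details/" + str(articleId)
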
def fetchUrlList(pageNum):
    (totalUrlNum, totalPageNum, urlList) = (0, 0, [])
    articleListUrl = gVal['blogEntryUrl'] + "/article/list/" + str(pageNum)
    logging.debug("from %s to extract url list", articleListUrl)
    respHtml = crifanLib.getUrlRespHtml(articleListUrl)

    # http://blog.csdn.net/aomandeshangxiao/article/list/2
    # <div id="papelist" class="pagelist">
    #     <span> 106条数据  共3页</span><strong>1</strong> <a href="/aomandeshangxiao/article/list/2">2</a> <a href="/aomandeshangxiao/article/list/3">3</a> <a href="/aomandeshangxiao/article/list/2">下一页</a> <a href="/aomandeshangxiao/article/list/3">尾页</a>
    # </div>
    foundPageNum = re.search("<span>\s*?(?P<totalUrlNum>\d+)条数据\s+?共(?P<totalPageNum>\d+)页</span>", respHtml)
    logging.debug("foundPageNum=%s", foundPageNum)
    if (foundPageNum):
        totalUrlNum = int(foundPageNum.group("totalUrlNum"))
        totalPageNum = int(foundPageNum.group("totalPageNum"))

        # extract url list
        # <span class="link_title"><a href="/v_july_v/article/details/7382693">
        foundArticleIds = re.findall('<span class="link_title"><a href="/' + gVal['blogUser'] + '/article/details/(\d+)">', respHtml, re.I)
        for eachId in foundArticleIds:
            eachUrl = genPermaLink(eachId)
            urlList.append(eachUrl)

    if (not foundPageNum):
        #logging.debug("Not extract totalUrlNum from respHtml=\n%s", respHtml)
        # http://blog.csdn.net/qq1059458376/article/list/1
        # <div class="list_item article_item">
        #     <div class="article_title">
        #         <span class="ico ico_type_Original"></span>
        #         <h3>
        #             <span class="link_title"><a href="/qq1059458376/article/details/8145497">
        #             50个Android开发人员必备UI效果源码[转载]
        #             </a></span>
        #         </h3>
        #     </div>
        #     ...
        # </div>
        soup = htmlToSoup(respHtml)
        foundAllListItem = soup.findAll(name="div", attrs={"class": "list_item article_item"})
        logging.debug("foundAllListItem=%s", foundAllListItem)
        if (foundAllListItem):
            for eachListItem in foundAllListItem:
                foundLinkTitle = eachListItem.find(name="span", attrs={"class": "link_title"})
                logging.debug("foundLinkTitle=%s", foundLinkTitle)
                if (foundLinkTitle):
                    linkTitleA = foundLinkTitle.a
                    linkTitleAHref = linkTitleA['href']
                    fullLink = gConst['spaceDomain'] + linkTitleAHref
                    logging.debug("fullLink=%s", fullLink)
                    urlList.append(fullLink)

    logging.debug("for pageNum=%d, extracted info: totalUrlNum=%d, totalPageNum=%d, urlList=%s", pageNum, totalUrlNum, totalPageNum, urlList)
    return (totalUrlNum, totalPageNum, urlList)

def main():
    global gVal
    global gCfg

    # 0. main procedure begin
    parser = OptionParser()
    parser.add_option(
        "-t", "--timeSpanStr",
        action="store", type="string", dest="timeSpanStr",
        help=u"Time span/duration. eg:20120901-20121001",
    )
    parser.add_option(
        "-o", "--outputFilename",
        action="store", type="string", dest="outputFilename",
        help=u"Output file name. default is recMusicHtml.txt",
    )
    parser.add_option(
        "-u", "--userId",
        action="store", type="string", dest="userId",
        help=u"User's songtaste ID. default is crifan's user id: 351979",
    )

    logging.info(u"版本信息:%s", __VERSION__)
    printDelimiterLine()

    (options, args) = parser.parse_args()
    for i in dir(options):
        exec(i + " = options." + i)

    if userId:
        gCfg["userId"] = userId

    gCfg["timeSpanStr"] = timeSpanStr
    if not gCfg["timeSpanStr"]:
        # set default time span: from the first day of the current month until now
        curDatetime = datetime.now()
        logging.info("curDatetime=%s", curDatetime)
        gVal["timeSpan"]["start"] = curDatetime.replace(day=1)
        gVal["timeSpan"]["end"] = curDatetime
        gVal["timeSpan"]["startDateStr"] = datetime.strftime(gVal["timeSpan"]["start"], "%Y%m%d")
        gVal["timeSpan"]["endDateStr"] = datetime.strftime(gVal["timeSpan"]["end"], "%Y%m%d")
        gCfg["timeSpanStr"] = gVal["timeSpan"]["startDateStr"] + "-" + gVal["timeSpan"]["endDateStr"]
        logging.info("Set default gVal['timeSpan']=%s", gVal["timeSpan"])
    else:
        # parse input time span string
        foundTimeSpanStr = re.search(
            "(?P<startDateStr>(?P<startYear>\d{4})(?P<startMonth>\d{2})(?P<startDay>\d{2}))-(?P<endDateStr>(?P<endYear>\d{4})(?P<endMonth>\d{2})(?P<endDay>\d{2}))",
            gCfg["timeSpanStr"],
        )
        logging.info("foundTimeSpanStr=%s", foundTimeSpanStr)
        if foundTimeSpanStr:
            gVal["timeSpan"]["startDateStr"] = foundTimeSpanStr.group("startDateStr")
            gVal["timeSpan"]["endDateStr"] = foundTimeSpanStr.group("endDateStr")
            startYear = int(foundTimeSpanStr.group("startYear"))
            startMonth = int(foundTimeSpanStr.group("startMonth"))
            startDay = int(foundTimeSpanStr.group("startDay"))
            endYear = int(foundTimeSpanStr.group("endYear"))
            endMonth = int(foundTimeSpanStr.group("endMonth"))
            endDay = int(foundTimeSpanStr.group("endDay"))
            gVal["timeSpan"]["start"] = datetime(year=startYear, month=startMonth, day=startDay)
            gVal["timeSpan"]["end"] = datetime(year=endYear, month=endMonth, day=endDay)
            logging.info("parsed gVal['timeSpan']=%s", gVal["timeSpan"])
        else:
            logging.error("Fail to parse the input time span str %s", gCfg["timeSpanStr"])
            sys.exit(2)

    if outputFilename:
        gCfg["outputFile"]["name"] = outputFilename
    else:
        defaultFilename = str(gCfg["userId"]) + u" 【歌曲批量推荐】" + gCfg["timeSpanStr"] + ".html"
        gCfg["outputFile"]["name"] = defaultFilename

    gVal["userMainUrl"] = generateUserMainUrl(gCfg["userId"])

    logging.info("Time span = %s", gCfg["timeSpanStr"])
    logging.info("Output file name = %s", gCfg["outputFile"]["name"])
    logging.info("Songtaste user id = %s", gCfg["userId"])
    logging.info("User main url = %s", gVal["userMainUrl"])

    respHtml = crifanLib.getUrlRespHtml(gVal["userMainUrl"])
    soup = BeautifulSoup(respHtml, fromEncoding=gConst["stHtmlEnc"])

    # extract user name
    # <h1 class="h1user">crifan</h1>
    foundH1user = soup.find(attrs={"class": "h1user"})
    logging.debug("foundH1user=%s", foundH1user)
    if foundH1user:
        gVal["username"] = foundH1user.string
        logging.info("Extracted songtaste username is %s", gVal["username"])
    else:
        logging.error("Cannot extract user name for songtaste main user url %s !", gVal["userMainUrl"])
        sys.exit(2)

    # extract total rec music number
    # <p class="more"><a href="/user/351979/allrec" class="underline">全部 306 首推荐</a></p>
    soupUni = unicode(soup)
    foundAllrecNum = re.search(u"全部 (?P<allrecNum>\d+) 首推荐", soupUni)
    logging.info("foundAllrecNum=%s", foundAllrecNum)
    if foundAllrecNum:
        gVal["allrec"]["num"] = int(foundAllrecNum.group("allrecNum"))
        logging.info("gVal['allrec']['num']=%d", gVal["allrec"]["num"])
        # integer division in Python 2, so round up when there is a remainder
        gVal["allrec"]["totalPage"] = gVal["allrec"]["num"] / gConst["songNumPerPage"]
        if (gVal["allrec"]["num"] - gConst["songNumPerPage"] * gVal["allrec"]["totalPage"]) > 0:
            gVal["allrec"]["totalPage"] += 1
        logging.info("gVal['allrec']=%s", gVal["allrec"])

    # extract all song info dict list
    totalSongInfoDictList = []
    for pageIdx in range(gVal["allrec"]["totalPage"]):
        pageNum = pageIdx + 1
        singleAllrecUrl = generateAllrecUrl(gCfg["userId"], pageNum)
        logging.info("pageIdx=%d, pageNum=%d, singleAllrecUrl=%s", pageIdx, pageNum, singleAllrecUrl)
        singlePageSongInfoDictList = extractSongInfoDictList(singleAllrecUrl)
        logging.info("Current allrec page extracted %d songs info", len(singlePageSongInfoDictList))
        totalSongInfoDictList.extend(singlePageSongInfoDictList)
    logging.info("Total extracted song info number: %d", len(totalSongInfoDictList))

    # filter out the songs within the designated time span
    withinTimeSpanSongList = []
    for singleSongDict in totalSongInfoDictList:
        if (singleSongDict["time"] >= gVal["timeSpan"]["start"]) and (singleSongDict["time"] <= gVal["timeSpan"]["end"]):
            logging.info("found within time span time=%s", singleSongDict["time"])
            withinTimeSpanSongList.append(singleSongDict)

    # generate output
    outputUni = ""
    # generate header
    outputUni += generateOutputHeader(gVal["timeSpan"]["startDateStr"], gVal["timeSpan"]["endDateStr"])
    # generate each song info
    for singSongInfoDict in withinTimeSpanSongList:
        outputUni += generateSingleSongStr(singSongInfoDict)
    # write the generated content to the output file
    outputToFile(outputUni)

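# A hedged example of how main() would typically be invoked from the command line; the
# script filename below is hypothetical, only the -u/-t/-o options come from the
# OptionParser definitions above:
#
#   python songtaste_export_rec.py -u 351979 -t 20120901-20121001 -o recMusic.html
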
def find1stPermalink():
    (isFound, retInfo) = (False, "Unknown error!")
    try:
        # http://blog.tianya.cn/blogger/post_list.asp?BlogID=2723898&CategoryID=0
        allPostCategoryId = 0
        allCategoryUrl = genPostListUrl(gVal['blogId'], allPostCategoryId)
        respHtml = crifanLib.getUrlRespHtml(allCategoryUrl)
        logging.debug("allCategoryUrl=%s resp respHtml=%s", allCategoryUrl, respHtml)

        # <div class="pages pos-relative">
        #     页码:1/7 << <a href="/blogger/post_list.asp?BlogID=2723898&CategoryID=0&page=1" class="here">1</a> ... <a href="/blogger/post_list.asp?BlogID=2723898&CategoryID=0&page=7" title=尾页>>></a>
        #     <span class="pos-right gototop"><a href="#">返回顶部</a></span>
        # </div>
        foundPagesPos = re.search('<div\s+class="pages pos-relative">\s+.+1/(?P<totalPageNum>\d+)', respHtml)
        logging.debug("foundPagesPos=%s", foundPagesPos)
        if (foundPagesPos):
            totalPageNum = foundPagesPos.group("totalPageNum")
            logging.debug("totalPageNum=%s", totalPageNum)
            totalPageNumInt = int(totalPageNum)

            if (totalPageNumInt == 1):
                lastPageHtml = respHtml
            else:
                # http://blog.tianya.cn/blogger/post_list.asp?BlogID=2723898&CategoryID=0&page=7
                lastPageUrl = genPostListUrl(gVal['blogId'], 0, totalPageNumInt)
                logging.debug("lastPageUrl=%s", lastPageUrl)
                lastPageHtml = crifanLib.getUrlRespHtml(lastPageUrl)

                # special case:
                # http://blog.tianya.cn/blogger/post_list.asp?BlogID=4338249&CategoryID=0&page=26
                # contains no post, so also fetch the previous (last but one) page
                lastButOneInt = totalPageNumInt - 1
                lastButOnePageUrl = genPostListUrl(gVal['blogId'], 0, lastButOneInt)
                logging.debug("lastButOnePageUrl=%s", lastButOnePageUrl)
                lastButOnePageHtml = crifanLib.getUrlRespHtml(lastButOnePageUrl)

            # <li class="articlecell cf"><p class="ptit"><a href="/blogger/post_read.asp?BlogID=2723898&PostID=47717072" target="_blank">再见过去,我们的足迹</a></p><p class="ptime">2012-10-27 08:13</p><p class="pcomments">3</p></li>
            soup = htmlToSoup(lastPageHtml)
            foundArticleCellCf = soup.findAll(attrs={"class": "articlecell cf"})
            if (not foundArticleCellCf):
                # fall back to the last but one page when the last page contains no post
                soup = htmlToSoup(lastButOnePageHtml)
                foundArticleCellCf = soup.findAll(attrs={"class": "articlecell cf"})
            logging.debug("foundArticleCellCf=%s", foundArticleCellCf)
            articleCellCfLen = len(foundArticleCellCf)
            logging.debug("articleCellCfLen=%s", articleCellCfLen)

            lastPostSoup = foundArticleCellCf[-1]
            logging.debug("lastPostSoup=%s", lastPostSoup)
            p0 = lastPostSoup.contents[0]
            logging.debug("p0=%s", p0)
            p0a = p0.a
            logging.debug("p0a=%s", p0a)
            href = p0a['href']
            logging.debug("href=%s", href)
            lastPostUrl = gConst['spaceDomain'] + href
            logging.debug("lastPostUrl=%s", lastPostUrl)
            if (lastPostUrl):
                retInfo = lastPostUrl
                isFound = True
    except:
        (isFound, retInfo) = (False, "Unknown error!")
    return (isFound, retInfo)
