Пример #1
0
def google_rank_new_by_html(uid, html):
    """Extract avatar url, follower count, display name and intro text
    from a Google+ profile page and store them via google_rank_new."""
    # Avatar: rebuild the photo url without the ?sz=200 size suffix.
    avatar = txt_wrap_by('height="200" src="//', 'photo.jpg?sz=200', html)
    avatar = '//%sphoto.jpg' % avatar  #?sz=200

    # Follower count appears as ">圈子中有 ... (N)</h4>", possibly with commas.
    count = txt_wrap_by('(', ')', txt_wrap_by('>圈子中有', '</h4>', html))
    if not count:
        count = 0
    else:
        count = count.replace(',', '')
        if not count.isdigit():
            count = 0
    display_name = txt_wrap_by('<title>', '</title>', html).rsplit(' - ')[0]
    intro = txt_wrap_by('介绍</h2><div ', '</div></div><div class="', html)
    if intro:
        # Keep only the text after the note"> marker and strip simple markup.
        intro = intro[intro.find('note">') + 6:]
        for tag, rep in (('</div>', ' '), ('<div>', ' '),
                         ('<span>', ''), ('</span>', '')):
            intro = intro.replace(tag, rep)
        intro = intro.strip()
    return google_rank_new(uid, count, avatar, display_name, intro)
Пример #2
0
def page_parse(htm_file):
    """Parse a saved Zhihu question page and append one JSON line to out_file.

    htm_file -- path of a previously fetched question html file.
    Writes {'title', 'tags', 'body', 'answer': [...]} as a json line.
    """
    # Fix: close the file deterministically instead of leaking the handle.
    with open(htm_file) as f:
        html = f.read()
    title = txt_wrap_by('<title>', '- 知乎', html)
    # NOTE(review): tags is computed but never used; kept for parity.
    tags = txt_wrap_by_all('xgm" href="javascript:;">', '</', html)
    reply_raw_list = txt_wrap_by_all('<div class="xmo">', 'class="xnq xml xnh">', html)
    replies = [htm2txt(x)[0] for x in reply_raw_list]

    # The page embeds question metadata as a JS call; rebuild valid JSON.
    js = '["current_question",' + txt_wrap_by("(['current_question', ", ');', html)
    a = loads(js)

    answer_list = []

    question_info = {}
    question_info['answer'] = answer_list
    question_info['tags'] = [x[0] for x in a[1][3]]
    question_info['title'] = title
    question_info['body'] = htm2txt(txt_wrap_by('<div class="xvrw">', '<a href="javascript', html))[0]
    replies_line = zip(a[1][12], replies)

    for x in replies_line:
        try:
            new_ans = {}
            new_ans['name'] = x[0][2][0]
            new_ans['answer'] = x[1]
            new_ans['id'] = x[0][2][1]
            new_ans['signature'] = x[0][3]
            new_ans['votes'] = x[0][4]
            answer_list.append(new_ans)
        except (IndexError, KeyError, TypeError):
            # Fix: only skip malformed metadata entries instead of a bare
            # except that would also hide programming errors.
            continue
    out_file.write(dumps(question_info) + '\n')
Пример #3
0
 def title(self, data):
     """Return the note title from the table header cell, falling back
     to the page's <title> text when the cell is absent."""
     cell = txt_wrap_by(
         '<tr><td class="tablelc"></td><td class="tablecc"><strong>标题:</strong>',
         '</td>', data)
     return cell or txt_wrap_by('<title>', '</title>', data)
Пример #4
0
def page_parse(htm_file):
    """Parse a saved Zhihu question page and append one JSON line to out_file.

    htm_file -- path of a previously fetched question html file.
    Writes {'title', 'tags', 'body', 'answer': [...]} as a json line.
    """
    # Fix: close the file deterministically instead of leaking the handle.
    with open(htm_file) as f:
        html = f.read()
    title = txt_wrap_by('<title>', '- 知乎', html)
    # NOTE(review): tags is computed but never used; kept for parity.
    tags = txt_wrap_by_all('xgm" href="javascript:;">', '</', html)
    reply_raw_list = txt_wrap_by_all('<div class="xmo">',
                                     'class="xnq xml xnh">', html)
    replies = [htm2txt(x)[0] for x in reply_raw_list]

    # The page embeds question metadata as a JS call; rebuild valid JSON.
    js = '["current_question",' + txt_wrap_by("(['current_question', ", ');',
                                              html)
    a = loads(js)

    answer_list = []

    question_info = {}
    question_info['answer'] = answer_list
    question_info['tags'] = [x[0] for x in a[1][3]]
    question_info['title'] = title
    question_info['body'] = htm2txt(
        txt_wrap_by('<div class="xvrw">', '<a href="javascript', html))[0]
    replies_line = zip(a[1][12], replies)

    for x in replies_line:
        try:
            new_ans = {}
            new_ans['name'] = x[0][2][0]
            new_ans['answer'] = x[1]
            new_ans['id'] = x[0][2][1]
            new_ans['signature'] = x[0][3]
            new_ans['votes'] = x[0][4]
            answer_list.append(new_ans)
        except (IndexError, KeyError, TypeError):
            # Fix: only skip malformed metadata entries instead of a bare
            # except that would also hide programming errors.
            continue
    out_file.write(dumps(question_info) + '\n')
Пример #5
0
    def __call__(self, data, url):
        """Parse a douban feed page, persist it with douban_feed_new and
        yield follow-up (parser, url, *args) fetch tasks for the spider."""
        rid = url_last(url)
        cid = self.cid

        title = self.title(data)

        # Counters default to 0 when the corresponding span is missing.
        rec_num = txt_wrap_by('<span class="rec-num">', '人</span>', data) or 0
        like_num = txt_wrap_by('<span class="fav-num" data-tid="', '</a>喜欢</span>', data) or 0
        if like_num:
            like_num = txt_wrap_by('<a href="#">', '人', like_num)
            # Queue the "liked by" page for parsing.
            yield parse_like , URL_LIKE%(cid, rid), cid, rid

        _topic = _owner = 0

        owner_id = self.user_id(data)
        if owner_id is None:
            return

        try:
            owner_id = int(owner_id)
        except ValueError:
            # Owner id is a vanity url; try resolving it to a numeric id.
            _owner_id = DoubanUser.by_url(owner_id)
            if _owner_id:
                owner_id = _owner_id
            else:
                # Unresolved: remember the raw value in _owner for back-fill.
                _owner = owner_id
                owner_id = 0

        topic_id = self.topic_id(data)
        try:
            topic_id = int(topic_id)
        except ValueError:
            # Non-numeric topic id: remember the raw value in _topic.
            _topic = topic_id
            topic_id = 0

        time = self.time(data)
        if time:
            time = int_by_string(time)

        feed_id = douban_feed_new(
            cid,
            rid,
            rec_num,
            like_num,
            title,
            self.htm(data),
            time,
            owner_id,
            topic_id
        )


        # Keep unresolved owner/topic identifiers so they can be resolved later.
        if _owner or _topic:
            DoubanFeedOwner(id=feed_id, topic=_topic, owner=_owner).save()

        #for user_id in user_id_by_txt(data):
        #    yield douban_recommendation_begin_tuple(user_id)

        if url in EXIST_PARSE:
            EXIST_PARSE.remove(url)
def zhihu_question_parser(html, url):
    """Parse a fetched Zhihu question page and append
    (answer_count, url, name, tags, answers) to the global RESULT list."""
    name = txt_wrap_by(
        '<title>',
        ' - 知乎</title>',
        html
    )
    name = unescape(name)
    # The answer count lives in a different element depending on whether
    # the "invite others to answer" box is present.
    if  '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</', html)

    tag = map(unescape, txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]
    answer_count =  int(answer_count or 0)

    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">','</div>', html))
        if not txt:
            # Page claims answers exist but none were extracted; log it.
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        # Sanity check: pages that mention answers but parsed as zero.
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            print url
            print html
            #raw_input()
        txt = []

    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))

    print how_long.again(), how_long.remain, how_long.done
Пример #7
0
def zhihu_question_parser(html, url):
    """Parse a fetched Zhihu question page and append
    (answer_count, url, name, tags, answers) to the global RESULT list."""
    name = txt_wrap_by('<title>', ' - 知乎</title>', html)
    name = unescape(name)
    # The answer count lives in a different element depending on whether
    # the "invite others to answer" box is present.
    if '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</',
                                   html)

    tag = map(
        unescape,
        txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]
    answer_count = int(answer_count or 0)

    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">', '</div>',
                                           html))
        if not txt:
            # Page claims answers exist but none were extracted; log it.
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        # Sanity check: pages that mention answers but parsed as zero.
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            print url
            print html
            #raw_input()
        txt = []

    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))

    print how_long.again(), how_long.remain, how_long.done
Пример #8
0
 def _parse_user_response(self, callback, xml):
     """Extract the <author> element from an atom response and pass a
     {uid, name} dict (or None when xml is empty) to callback."""
     if not xml:
         callback(None)
         return
     from zkit.bot_txt import txt_wrap_by
     author = txt_wrap_by('<author>', '</author>', xml)
     callback({
         'uid': txt_wrap_by('<email>', '</email>', author),
         'name': txt_wrap_by('<name>', '</name>', author),
     })
Пример #9
0
def main():
    """Fetch the Zhihu explore page and insert every entry into Spider."""
    cookies = ((
        '*****@*****.**',
        '_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In'
    ), )

    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset':
        'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language':
        'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Cache-Control':
        'max-age=0',
        'Connection':
        'keep-alive',
        'Host':
        'www.zhihu.com',
        # Fix: the key was 'Referer:http' with value '//www.zhihu.com/',
        # which sent a bogus header name.
        'Referer':
        'http://www.zhihu.com/',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    }
    headers['cookie'] = cookies[0][1]
    explore_page = fetch('http://www.zhihu.com/explore', headers=headers)

    entry_list = txt_wrap_by_all('<div class="xxn">', '</div', explore_page)
    # The page embeds ratings as a JS array literal; parse it as JSON.
    reting_raw = txt_wrap_by("['explore_list',", ');', explore_page)
    data = loads(reting_raw)
    author_list = [[i[3][1][0].encode('utf-8'), i[3][2].encode('utf-8')]
                   for i in data]
    rating_list = [i[3][3] for i in data]

    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', '</div',
                                 explore_page)
    result_label = [txt_wrap_by_all('">', '</a', i) for i in label_list]

    url_list = txt_wrap_by_all('<h2', '</h2>', explore_page)
    id_list = [txt_wrap_by('question/', '/answer', i) for i in url_list]
    title_list = [
        txt_wrap_by('">', '<', txt_wrap_by('href="', '/a>', i))
        for i in url_list
    ]
    # Fix: url_list/id_list were recomputed a second time with identical
    # results; build the canonical question urls directly.
    url_list = ['http://www.zhihu.com/question/%s' % id for id in id_list]

    entry_list = zip(title_list, rating_list, result_label, author_list,
                     url_list, entry_list)

    for entry in entry_list:
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1],
                      entry[4], [], pic_list)
Пример #10
0
    def parse_page(self,page,url):
        """Parse a Dongxi article page and queue the rating-widget fetch,
        carrying the parsed fields along as yield arguments."""
        print "Dongxi...%s"%url
        # Title/author text sits after the last '>' of the wrapping markup.
        title = txt_wrap_by('<div class="content_title clearfix">','</h1>',page).strip().split('>')[-1].strip()
        author = txt_wrap_by('<a class="link_text_blue" href="','</a>',page).strip().split('>')[-1].strip()

        tags = map(lambda x:x.split('>')[-1],txt_wrap_by_all("<a  class='link_text_blue'",'</a>',page))
        # The rating widget id is the first argument of onclick="favorate(...)"
        rating_num = txt_wrap_by('onclick="favorate(',')',page)

        content = txt_wrap_by('id="full_text">','</div',page)

        yield self.parse_rat,'http://dongxi.net/content/widget/page_id/%s'%rating_num,title,author,tags, url,content
Пример #11
0
 def _parse_user_response(self, callback, xml):
     """Pass a {uid, name} dict parsed from the response's <author>
     element to callback, or None when xml is empty."""
     if not xml:
         callback(None)
         return
     from zkit.bot_txt import txt_wrap_by
     author_xml = txt_wrap_by('<author>', '</author>', xml)
     uid = txt_wrap_by('<email>', '</email>', author_xml)
     name = txt_wrap_by('<name>', '</name>', author_xml)
     callback(dict(uid=uid, name=name))
Пример #12
0
    def __call__(self, data, url):
        """Parse a douban feed page, persist it with douban_feed_new and
        yield follow-up (parser, url, *args) fetch tasks for the spider."""
        rid = url_last(url)
        cid = self.cid

        title = self.title(data)

        # Counters default to 0 when the corresponding span is missing.
        rec_num = txt_wrap_by('<span class="rec-num">', '人</span>', data) or 0
        like_num = txt_wrap_by('<span class="fav-num" data-tid="',
                               '</a>喜欢</span>', data) or 0
        if like_num:
            like_num = txt_wrap_by('<a href="#">', '人', like_num)
            # Queue the "liked by" page for parsing.
            yield parse_like, URL_LIKE % (cid, rid), cid, rid

        _topic = _owner = 0

        owner_id = self.user_id(data)
        if owner_id is None:
            return

        try:
            owner_id = int(owner_id)
        except ValueError:
            # Owner id is a vanity url; try resolving it to a numeric id.
            _owner_id = DoubanUser.by_url(owner_id)
            if _owner_id:
                owner_id = _owner_id
            else:
                # Unresolved: remember the raw value in _owner for back-fill.
                _owner = owner_id
                owner_id = 0

        topic_id = self.topic_id(data)
        try:
            topic_id = int(topic_id)
        except ValueError:
            # Non-numeric topic id: remember the raw value in _topic.
            _topic = topic_id
            topic_id = 0

        time = self.time(data)
        if time:
            time = int_by_string(time)

        feed_id = douban_feed_new(cid, rid, rec_num, like_num, title,
                                  self.htm(data), time, owner_id, topic_id)

        # Keep unresolved owner/topic identifiers so they can be resolved later.
        if _owner or _topic:
            DoubanFeedOwner(id=feed_id, topic=_topic, owner=_owner).save()

        #for user_id in user_id_by_txt(data):
        #    yield douban_recommendation_begin_tuple(user_id)

        if url in EXIST_PARSE:
            EXIST_PARSE.remove(url)
Пример #13
0
    def parse_page(self, filepath):
        """Parse a saved yeeyan article file and store it via Spider.insert."""
        with open(filepath) as f:
            page = f.read()

            title = txt_wrap_by('<title>译言网 | ', '</ti', page)
            tags = txt_wrap_by('wumiiTags = "', '"', page).split(',')
            author = txt_wrap_by('">', '<',
                                 txt_wrap_by('<h2 id="user_info"', '/a', page))
            rating = txt_wrap_by('已有<span class="number">', '</span', page)
            body_html = txt_wrap_by('id="conBox">', '<div class="article_content">', page)
            url = txt_wrap_by('wumiiPermaLink = "', '"', page)
            # No article body extracted: nothing to store.
            if not body_html:
                return
            content, pic_list = htm2txt(body_html)

            content = str(content)

            reply_list = [
                txt_wrap_by('<p>', '</p', wrapper)
                for wrapper in txt_wrap_by_all('class="comment_content">', '</ul', page)
            ]

            Spider.insert(title, tags, content, author, rating, url, reply_list, pic_list)
Пример #14
0
 def parse_index(self, page, url):
     """Scan a Dongxi index page for translated-article links and queue
     each unfetched one for page parsing."""
     for wrap in txt_wrap_by_all('已翻译', '<span', page):
         link = txt_wrap_by('href="', '"', wrap)
         if link and not url_is_fetched(link):
             yield self.parse_page, 'http://dongxi.net/%s' % link
Пример #15
0
    def __call__(self, html, url):
        """Walk the douban event listing and queue each unseen event page,
        marking it in ImportDoubanEvent so it is fetched only once."""
        listing = txt_wrap_by('<ul class="list-m">', '</ul>', html)
        items = (txt_wrap_by_all('<li class="item">', '</div>', listing)
                 or txt_wrap_by_all('<h3><a', '</h3', listing))

        for item in items:
            link = txt_wrap_by('href="', '"', item)
            event_id = int(txt_wrap_by('http://www.douban.com/event/', '/', link))
            if not ImportDoubanEvent.get(event_id):
                yield self.parse_event_page, link, event_id
                ImportDoubanEvent(id=event_id, event_id=0).save()
Пример #16
0
def page_parse(html):
    """Print the tags and title of a Zhihu question page (debug helper)."""
    title = txt_wrap_by('<title>', '- 知乎', html)
    # Tags are carried in data-tip="t$b$<tag>" attributes.
    tags = txt_wrap_by_all('data-tip="t$b$', '"', html)
    for i in tags:
        print i,
    print title
    print ''
    def __call__(self, html, url):
        """Yield (parse_event_page, link, id) for every event in the
        list that has not been imported yet, recording each as seen."""
        listing = txt_wrap_by('<ul class="list-m">', '</ul>', html)
        items = txt_wrap_by_all('<li class="item">', '</div>', listing)
        if not items:
            # Fallback layout: events wrapped in <h3> headers.
            items = txt_wrap_by_all('<h3><a', '</h3', listing)

        for item in items:
            link = txt_wrap_by('href="', '"', item)
            event_id = int(txt_wrap_by('http://www.douban.com/event/', '/', link))
            if ImportDoubanEvent.get(event_id):
                continue
            yield self.parse_event_page, link, event_id
            ImportDoubanEvent(id=event_id, event_id=0).save()
def page_parse(html):
    """Print the tags and title of a Zhihu question page (debug helper)."""
    title = txt_wrap_by('<title>', '- 知乎', html)
    # Tags are carried in data-tip="t$b$<tag>" attributes.
    tags = txt_wrap_by_all('data-tip="t$b$', '"', html)
    for i in tags:
        print i,
    print title
    print ''
Пример #19
0
def wm_parser(html, url):
    """Parse a wumii reader action list: follow pagination, record
    favourites for known articles and queue unseen ones for fetching."""
    user = txt_wrap_by('&u=', '&', url)
    #print user
    time = txt_wrap_by('<li id="maxActionTimeInMs"  m="', '"', html)
    if time and 'm=' + time not in url and int(time) > 0:
        # Next page: replace the trailing m= value with the new timestamp.
        yield wm_parser, '%s%s' % (url[:url.rfind('=') + 1], time)

    user_id = wm_user_id(user)
    for chunk in txt_wrap_by_all(' itemid="', '<p class="operating">', html):
        if 'class="content"' not in chunk:
            continue
        item_id = chunk[:chunk.find('"')]
        wm = SpiderWm.get(wmid=item_id)
        if wm is None:
            yield wm_txt_parser, 'http://www.wumii.com/reader/article?id=%s' % item_id, user_id
        else:
            wm_fav(user_id, wm.id)
Пример #20
0
    def htm(self, data):
        """Return the topic body plus consecutive replies by the topic
        owner, joined with newlines."""
        parts = []
        body = txt_wrap_by('<div class="topic-content">', '</div>', data)
        if body:
            parts.append(body)
        user_id = self.user_id(data)
        reply_block = txt_wrap_by('<ul class="topic-reply">', '</ul>', data)
        for reply in txt_wrap_by_all(' <div class="reply-doc">',
                                     ' class="lnk-reply">回应</a>', reply_block):
            author = txt_wrap_by('<div class="bg-img-green">', '</h4>', reply)
            author = txt_wrap_by('<a href="http://www.douban.com/people/',
                                 '/">', author)
            # Stop at the first reply not written by the topic owner.
            if author != user_id:
                break
            parts.append(txt_wrap_by('</div>', '<div class="operation_div"', reply))

        return '\n'.join(parts)
Пример #21
0
def google_rank_new_by_html(uid, html):
    """Scrape avatar, follower count, name and intro from a Google+
    profile page and hand them to google_rank_new."""
    avatar = txt_wrap_by('height="200" src="//', 'photo.jpg?sz=200', html)
    # Drop the ?sz=200 size suffix from the photo url.
    avatar = '//%sphoto.jpg' % avatar

    followers = txt_wrap_by('(', ')', txt_wrap_by('>圈子中有', '</h4>', html))
    if followers:
        followers = followers.replace(',', '')
    followers = followers if followers and followers.isdigit() else 0
    profile_name = txt_wrap_by('<title>', '</title>', html).rsplit(' - ')[0]
    intro = txt_wrap_by('介绍</h2><div ', '</div></div><div class="', html)
    if intro:
        intro = intro[intro.find('note">') + 6:]
        intro = intro.replace('</div>', ' ').replace('<div>', ' ')
        intro = intro.replace('<span>', '').replace('</span>', '').strip()
    return google_rank_new(uid, followers, avatar, profile_name, intro)
Пример #22
0
def wm_parser(html, url):
    """Walk a wumii reader action list page: queue the next page, queue
    unseen articles and record favourites for known ones."""
    user = txt_wrap_by('&u=', '&', url)
    timestamp = txt_wrap_by('<li id="maxActionTimeInMs"  m="', '"', html)
    if timestamp and 'm=' + timestamp not in url and int(timestamp) > 0:
        # Paginate by swapping the value after the final '='.
        yield wm_parser, url[:url.rfind('=') + 1] + timestamp

    user_id = wm_user_id(user)
    for item in txt_wrap_by_all(' itemid="', '<p class="operating">', html):
        if 'class="content"' in item:
            wmid = item[:item.find('"')]
            known = SpiderWm.get(wmid=wmid)
            if known is not None:
                wm_fav(user_id, known.id)
            else:
                yield wm_txt_parser, 'http://www.wumii.com/reader/article?id=%s' % wmid, user_id
Пример #23
0
def zhihu_question_parser(html, url):
    """Print the (html-unescaped) question title and crawl progress."""
    name = txt_wrap_by(
        '<title>',
        ' - 知乎</title>',
        html
    )
    name = unescape(name)
    print name
    print how_long.again(), how_long.remain, how_long.done
Пример #24
0
def main():
    """Fetch the Zhihu explore page and insert every entry into Spider."""
    cookies = (
        (
            "*****@*****.**",
            "_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In",
        ),
    )

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "Accept-Language": "en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        # Fix: the key was "Referer:http" with value "//www.zhihu.com/",
        # which sent a bogus header name.
        "Referer": "http://www.zhihu.com/",
        "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11",
    }
    headers["cookie"] = cookies[0][1]
    explore_page = fetch("http://www.zhihu.com/explore", headers=headers)

    entry_list = txt_wrap_by_all('<div class="xxn">', "</div", explore_page)
    # The page embeds ratings as a JS array literal; parse it as JSON.
    reting_raw = txt_wrap_by("['explore_list',", ");", explore_page)
    data = loads(reting_raw)
    author_list = [[i[3][1][0].encode("utf-8"), i[3][2].encode("utf-8")] for i in data]
    rating_list = [i[3][3] for i in data]

    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', "</div", explore_page)
    result_label = [txt_wrap_by_all('">', "</a", i) for i in label_list]

    url_list = txt_wrap_by_all("<h2", "</h2>", explore_page)
    id_list = [txt_wrap_by("question/", "/answer", i) for i in url_list]
    title_list = [txt_wrap_by('">', "<", txt_wrap_by('href="', "/a>", i)) for i in url_list]
    # Fix: url_list/id_list were recomputed a second time with identical
    # results; build the canonical question urls directly.
    url_list = ["http://www.zhihu.com/question/%s" % id for id in id_list]

    entry_list = zip(title_list, rating_list, result_label, author_list, url_list, entry_list)

    for entry in entry_list:
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1], entry[4], [], pic_list)
Пример #25
0
 def parse_index(self,page, url):
     """Walk the index page's <h5> headers; queue unfetched article urls
     for saving and parse already-saved ones from disk."""
     print "!"
     link_wrapper_list = txt_wrap_by_all('<h5 clas', '</h5', page)
     link_list = []
     for link_wrapper in link_wrapper_list:
         url = txt_wrap_by('href="', '"', link_wrapper)
         # Local cache filename derived from the article url.
         filename = self.name_builder(url)
         if not url_is_fetched(url):
             yield self.save_page, url
         else:
             self.parse_page(filename)
Пример #26
0
    def htm(self, data):
        """Build the topic html: the topic body followed by every
        consecutive reply posted by the topic owner."""
        pieces = []
        topic_body = txt_wrap_by('<div class="topic-content">', '</div>', data)
        if topic_body:
            pieces.append(topic_body)
        owner = self.user_id(data)
        replies = txt_wrap_by('<ul class="topic-reply">', '</ul>', data)
        replies = txt_wrap_by_all(' <div class="reply-doc">',
                                  ' class="lnk-reply">回应</a>', replies)

        for doc in replies:
            poster = txt_wrap_by('<div class="bg-img-green">', '</h4>', doc)
            poster = txt_wrap_by('<a href="http://www.douban.com/people/',
                                 '/">', poster)
            if poster != owner:
                # First reply by someone else ends the owner's thread.
                break
            pieces.append(txt_wrap_by('</div>', '<div class="operation_div"', doc))

        return '\n'.join(pieces)
Пример #27
0
def wm_txt_parser(html, url, user_id):
    """Parse a wumii article page, save it and record user_id's favourite."""
    article_id = url.rsplit('=')[-1]
    title = txt_wrap_by('target="_blank">', '</a></p>', html)
    author = txt_wrap_by('">来自:', '<', html)
    info = txt_wrap_by('<p class="info', '</p>', html)
    link = txt_wrap_by('href="', '"', info)
    like = txt_wrap_by('class="num-likeIt">', '人喜欢</a>', html)
    txt = txt_wrap_by('<div class="content">', ' <p class="operating">', html)
    time = txt_wrap_by('<span class="time">', '</span>', html)
    wm = wm_save(article_id, like, title, author, link, time, txt)
    wm_fav(user_id, wm.id)
Пример #28
0
def wm_txt_parser(html, url, user_id):
    """Extract a wumii article's fields, persist them with wm_save and
    mark the article as a favourite of user_id."""
    wmid = url.rsplit('=')[-1]
    name = txt_wrap_by('target="_blank">', '</a></p>', html)
    author = txt_wrap_by('">来自:', '<', html)
    link = txt_wrap_by('href="', '"',
                       txt_wrap_by('<p class="info', '</p>', html))
    like = txt_wrap_by('class="num-likeIt">', '人喜欢</a>', html)
    txt = txt_wrap_by('<div class="content">', ' <p class="operating">', html)
    time = txt_wrap_by('<span class="time">', '</span>', html)
    saved = wm_save(wmid, like, name, author, link, time, txt)
    wm_fav(user_id, saved.id)
Пример #29
0
    def parse_page(self,filepath):
        """Parse a saved UCD-China article file and print the json dump
        of [title, content, author, tags]."""
        with open(filepath) as f:
            page = f.read()

            title = txt_wrap_by('<title>', '- UCD大社区', page)
            author = txt_wrap_by('style=" float:left; color:#999;">', '</span', page)
            author = txt_wrap_by('作者:', '|', author)
            content_wrapper = txt_wrap_by('<div id="pageContentWrap" style="font-size:13px; ">', '</div', page)
            url =txt_wrap_by('阅读和发布评论:<a href="','"',page)
            blog_url = txt_wrap_by('>推荐您进入文章源地址阅读和发布评论:<a href="','"',page)

            # No article body extracted: nothing to emit.
            if content_wrapper:
                content,pic_list = htm2txt(content_wrapper.decode('utf-8','ignore' ))
            else:
                return

            content = str(content)
            tags = TAGGER.get_tag(content+title)
            #tags = TAGGER.get_tag(content+title)
            #out = dumps([title,url,tags])
            #print out
            out = dumps([ title, content, author, tags ])
            #out = dumps([ title, content, author, blog_url ])
            print out
Пример #30
0
 def topic_id(self, data):
     """Return the note id from a site-widget notes url."""
     return txt_wrap_by('http://site.douban.com/widget/notes/', '/', data)
Пример #31
0
 def time(self, data):
     """Return the text of the <span class="datetime"> element."""
     stamp = txt_wrap_by('<span class="datetime">', '</span>', data)
     return stamp
Пример #32
0
 def title(self, data):
     """Return the page's <title> text."""
     return txt_wrap_by('<title>', '</title>', data)
Пример #33
0
    def parse_event_page(self, page, url, douban_event_id):
        """Extract a douban event's fields, save the event, and queue its
        poster image download."""
        title = txt_wrap_by('h1>', '</h1>', page)
        pic_url = txt_wrap_by('href="', '"',
                              txt_wrap_by('class="album_photo"', '>', page))
        begin_time = txt_wrap_by('ail">', '<',
                                 txt_wrap_by('开始时间', '/div', page))
        end_time = txt_wrap_by('ail">', '<', txt_wrap_by('结束时间', '/div', page))
        # Address text is space-separated; keep the parts as a list.
        address = unicode(
            txt_wrap_by(
                'span>', '<',
                txt_wrap_by('地点', 'br/>',
                            txt_wrap_by('class="obmo">', '</div',
                                        page)))).split(' ')
        typ = txt_wrap_by('类型: </span>', '<br/', page)
        typ = txt_wrap_by('">', '/', typ)
        intro = txt_wrap_by('play:none">', '<a href="javasc', page)
        phone = txt_wrap_by('电话', '<br/', intro)
        if not intro:
            # Fallback for pages without the collapsed intro block.
            intro = txt_wrap_by('<div class="wr">', '</div>', page)
        if phone:
            # Strip both ascii and fullwidth colons from the phone text.
            phone = phone.replace(':', '').replace(':', '')

        event = save_event(self, phone, address, begin_time, end_time, title,
                           intro, douban_event_id, typ)

        if event:
            yield save_pic, pic_url, event
Пример #34
0
 def time(self, data):
     """Return the publish time: the text following <span class="pl">
     inside the note header."""
     header = txt_wrap_by('<div class="note-header">', '</span>', data)
     return txt_wrap_by('<span class="pl">', None, header)
Пример #35
0
def zhihu_topic_parser(html, url):
    """Load the embedded current_topic JSON and print crawl progress
    plus the topic's leading field."""
    txt = txt_wrap_by( 'DZMT.push(["current_topic",', ')', html )
    global FETCH_COUNT

    print how_long.again(), how_long.done, how_long.remain
    print loads(txt)[:2][0][0]
Пример #36
0
 def group_id(self, data):
     """Return the group id hidden in the topic-search form."""
     form = txt_wrap_by('<form action="/group/topic_search', '</form', data)
     return txt_wrap_by('<input type="hidden" value="', '" name="group', form)
Пример #37
0
 def time(self, data):
     """Return the topic's post time: the text after the green span in
     the topic-doc header."""
     header = txt_wrap_by('<div class="topic-doc">', '</span>', data)
     return txt_wrap_by('<span class="color-green">', None, header)
Пример #38
0
 def title(self, data):
     """Return the note title cell, falling back to the <title> text."""
     cell = txt_wrap_by('<tr><td class="tablelc"></td><td class="tablecc"><strong>标题:</strong>', '</td>', data)
     return cell or txt_wrap_by('<title>', '</title>', data)
Пример #39
0
 def topic_id(self, data):
     """Return the group id parsed from the reply link in the aside."""
     aside = txt_wrap_by('<div class="aside">', '">回', data)
     return txt_wrap_by('"http://www.douban.com/group/', '/', aside)
Пример #40
0
 def htm(self, data):
     """Return the raw note body (<pre class="note"> content)."""
     body = txt_wrap_by('<pre class="note">', '</pre>', data)
     return body
Пример #41
0
 def user_id(self, data):
     """Return the author's douban user id from the pic block."""
     pic = txt_wrap_by('<div class="pic">', '">', data)
     return txt_wrap_by('"http://www.douban.com/people/', '/', pic)
Пример #42
0
 def htm(self, data):
     """Return the note body inside the note-content <pre>."""
     body = txt_wrap_by(' class="note-content"><pre>', '</pre>', data)
     return body
    def parse_event_page(self, page, url, douban_event_id):
        """Extract a douban event's fields, save the event, and queue its
        poster image download."""
        title = txt_wrap_by('h1>', '</h1>', page)
        pic_url = txt_wrap_by('href="', '"', txt_wrap_by('class="album_photo"', '>', page))
        begin_time = txt_wrap_by('ail">', '<', txt_wrap_by('开始时间', '/div', page))
        end_time = txt_wrap_by('ail">', '<', txt_wrap_by('结束时间', '/div', page))
        # Address text is space-separated; keep the parts as a list.
        address = unicode(txt_wrap_by('span>', '<', txt_wrap_by('地点', 'br/>', txt_wrap_by('class="obmo">', '</div', page)))).split(' ')
        typ = txt_wrap_by('类型: </span>', '<br/', page)
        typ = txt_wrap_by('">', '/', typ)
        intro = txt_wrap_by('play:none">', '<a href="javasc', page)
        phone = txt_wrap_by('电话', '<br/', intro)
        if not intro:
            # Fallback for pages without the collapsed intro block.
            intro = txt_wrap_by('<div class="wr">', '</div>', page)
        if phone:
            # Strip both ascii and fullwidth colons from the phone text.
            phone = phone.replace(':', '').replace(':', '')

        event = save_event(self, phone, address, begin_time, end_time, title, intro, douban_event_id, typ)

        if event:
            yield save_pic, pic_url, event
Пример #44
0
 def leader_id(self, data):
     """Return the group leader's user id, or 0 when absent.

     Fix: the people/ lookup previously scanned the whole page (`data`)
     while the scoped '组长:' fragment `t` was assigned but never used,
     so any earlier people/ link could be picked up instead.
     """
     t = txt_wrap_by('组长:', '</a>', data)
     return txt_wrap_by('www.douban.com/people/', '/">', t) or 0
Пример #45
0
 def htm(self, data):
     """Return the raw note text from the note-content <pre> block."""
     note = txt_wrap_by(' class="note-content"><pre>', '</pre>', data)
     return note
Пример #46
0
 def intro(self, data):
     """Return the group intro paragraph from the infobox, with CR/LF
     flattened to spaces first."""
     flat = data.replace('\r', ' ').replace('\n', ' ')
     box = txt_wrap_by('class="infobox">', '<div class="rec-sec', flat)
     return txt_wrap_by('</p>', ' <div', box)
Пример #47
0
def zhihu_topic_parser(html, url):
    """Load the embedded current_topic JSON and print crawl progress
    plus the topic's leading field."""
    txt = txt_wrap_by('DZMT.push(["current_topic",', ')', html)
    global FETCH_COUNT

    print how_long.again(), how_long.done, how_long.remain
    print loads(txt)[:2][0][0]
Пример #48
0
 def topic_id(self, data):
     """Return the widget note id from its site.douban.com url."""
     return txt_wrap_by('http://site.douban.com/widget/notes/', '/', data)
Пример #49
0
 def group_short_url(self, data):
     """Return the group's short name taken from its feed url."""
     return txt_wrap_by('http://www.douban.com/feed/group/', '/discussion', data)
Пример #50
0
 def time(self, data):
     """Return the contents of the <span class="datetime"> element."""
     return txt_wrap_by('<span class="datetime">', '</span>', data)
Пример #51
0
 def name(self, data):
     """Return the group name: the stripped <title> text minus its last
     six bytes (the group suffix), html-unescaped."""
     raw = str(txt_wrap_by('<title>', '</title>', data).strip())
     return unescape(raw[:-6])  #xxx小组
Пример #52
0
 def htm(self, data):
     """Return the note body held in <pre class="note">."""
     note = txt_wrap_by('<pre class="note">', '</pre>', data)
     return note
Пример #53
0
 def member_num(self, data):
     """Return the group's member count, or 0 when not shown.

     The page shows e.g.: 女巫店小组 浏览所有店里的小孩们 (43025
     """
     line = txt_wrap_by('/members">', ')</a>', data)
     return int(txt_wrap_by(' (', None, line)) if line else 0
Пример #54
0
 def user_id(self, data):
     """Return the douban user id found in the pic block's people link."""
     block = txt_wrap_by('<div class="pic">', '">', data)
     return txt_wrap_by('"http://www.douban.com/people/', '/', block)
Пример #55
0
 def time(self, data):
     """Return the note's publish time: text following <span class="pl">
     within the note header."""
     hdr = txt_wrap_by('<div class="note-header">', '</span>', data)
     return txt_wrap_by('<span class="pl">', None, hdr)
Пример #56
0
 def member_num(self, data):
     """Return the group's member count, or 0 when the members link is
     missing. Example page text: 女巫店小组 浏览所有店里的小孩们 (43025
     """
     line = txt_wrap_by('/members">', ')</a>', data)
     if line:
         return int(txt_wrap_by(' (', None, line))
     return 0