Exemplo n.º 1
0
def crawl_dialogue_by_comment_id(cid, mid):
    """Fetch the dialogue belonging to one comment and store it.

    The request URL is built from ``AJAX_URL`` with ``cid`` and the current
    time in milliseconds, fetched through ``get_page`` and handed to
    ``dialogue.get_dialogue`` together with ``mid`` and ``cid``. A truthy
    parse result is saved with ``CommonOper.add_one``.

    Args:
        cid: Comment id; first placeholder of ``AJAX_URL``.
        mid: Forwarded unchanged to ``dialogue.get_dialogue``
            (presumably the id of the post the comment belongs to).

    Returns:
        None.
    """
    # Millisecond timestamp fills the second placeholder of AJAX_URL.
    cur_time = int(time.time() * 1000)
    dialogue_url = AJAX_URL.format(cid, cur_time)

    html = get_page(dialogue_url, auth_level=2, is_ajax=True)
    dialogue_data = dialogue.get_dialogue(html, mid, cid)

    # Nothing was parsed, so there is nothing to store.
    if not dialogue_data:
        return

    CommonOper.add_one(dialogue_data)
Exemplo n.º 2
0
def get_people_and_follows(people_id, selector):
    """Parse a user profile page, store the user, then queue the follow crawl.

    Profile fields are read from ``selector`` with XPath queries and copied
    onto a new ``People`` object, which is saved with ``CommonOper.add_one``
    and registered with ``CommonOper.add_filter_key``. Parsing is best-effort:
    any exception is logged as a warning and the user is not stored.
    The ``tasks.people.do_follow`` task is sent in every case, whether or not
    parsing succeeded.

    Args:
        people_id: Id of the user; stored on the record and passed to the
            follow task.
        selector: Parsed profile page exposing ``xpath()`` (elements are
            expected to offer ``getparent()``/``getnext()``, as lxml does).

    Returns:
        None.
    """
    try:
        people = People()
        people.people_id = people_id
        people.name = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/h1/text()')[0].strip()
        people.desc = "".join(
            selector.xpath(
                '//div[@class="aw-user-center"]/div[1]/div/span/text()'))

        # Optional blocks: each icon is queried once and the result reused,
        # instead of running the same XPath for the test and again for use.
        locate_icons = selector.xpath('//i[contains(@class,"i-user-locate")]')
        if locate_icons:
            user_locate = locate_icons[0].getparent()
            people.province = "".join(user_locate.xpath('a[1]/text()'))
            people.city = "".join(user_locate.xpath('a[2]/text()'))

        post_icons = selector.xpath('//i[contains(@class,"i-user-post")]')
        if post_icons:
            user_post = post_icons[0].getparent()
            people.post = "".join(user_post.xpath('text()')).strip()

        visit_icons = selector.xpath('//i[contains(@class,"i-user-visits")]')
        if visit_icons:
            user_visits = visit_icons[0].getparent()
            user_visits_str = "".join(user_visits.xpath('text()'))
            # Raw string: '\d' in a plain literal is an invalid escape.
            people.home_visit_num = re.findall(r'(\d+)', user_visits_str)[0]

        # The statistics row is addressed by position: user type first, then
        # four counters in a fixed order.
        people_type_spans = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/p[3]/span')
        people.user_type = people_type_spans[0].xpath(
            'a/em/text()')[0].replace("»", "").strip()
        people.weiwang_num = people_type_spans[1].xpath('em/text()')[0]
        people.agree_num = people_type_spans[2].xpath('em/text()')[0]
        people.thanks_num = people_type_spans[3].xpath('em/text()')[0]
        people.gold_num = people_type_spans[4].xpath('em/text()')[0]
        # A value containing '+' is stored as 100 (presumably the page shows
        # a capped figure such as "100+" -- confirm against the site).
        if '+' in people.gold_num:
            people.gold_num = 100

        active_labels = selector.xpath('//span[contains(text(),"最后活跃")]')
        if active_labels:
            # The timestamp is the text of the element following the
            # label's parent.
            last_active_time_str = active_labels[0].getparent().getnext(
            ).xpath('text()')[0]
            people.last_active_time = str2datetime(last_active_time_str)

        CommonOper.add_one(people)
        CommonOper.add_filter_key("people_id", people_id)
    except Exception as e:
        # Deliberate best-effort: a page that does not match the expected
        # layout is logged and skipped rather than failing the task.
        jsl_log.warning(
            "get people info error,people_id:{},here are details {}".format(
                people_id,
                format_tb(e.__traceback__)[0]))

    # Sent outside the try block, so it runs even when parsing failed.
    app.send_task("tasks.people.do_follow",
                  args=(
                      people_id,
                      0,
                  ),
                  queue="people_queue",
                  routing_key="people")
Exemplo n.º 3
0
def crawl_dialogue_by_comment_id(cid, mid):
    """Crawl the dialogue of one comment, store it, and queue its users.

    The page is requested from ``AJAX_URL`` (filled with ``cid`` and the
    current time in milliseconds) and parsed by ``dialogue.get_dialogue``,
    which yields a dialogue record and a collection of user ids. A truthy
    record is saved through ``CommonOper.add_one``; for every user id a
    ``tasks.user.crawl_person_infos_not_in_seed_ids`` task is sent.

    Args:
        cid: Comment id; first placeholder of ``AJAX_URL``.
        mid: Forwarded unchanged to ``dialogue.get_dialogue``.

    Returns:
        None.
    """
    now_ms = int(time.time() * 1000)
    page = get_page(AJAX_URL.format(cid, now_ms), auth_level=2, is_ajax=True)

    record, user_ids = dialogue.get_dialogue(page, mid, cid)
    if record:
        CommonOper.add_one(record)

    # ``or ()`` turns a missing or empty id collection into "no iterations".
    # The user crawl is requested by task name via the app, not called
    # in-process.
    for user_id in user_ids or ():
        app.send_task(
            'tasks.user.crawl_person_infos_not_in_seed_ids',
            args=(user_id,),
            queue='user_crawler',
            routing_key='for_user_info',
        )
Exemplo n.º 4
0
def crawl_dialogue_by_comment_id(cid, mid):
    """Fetch and store a comment's dialogue, then schedule user crawls.

    Builds the request URL from ``AJAX_URL`` using ``cid`` and a millisecond
    timestamp, downloads it with ``get_page`` and parses it with
    ``dialogue.get_dialogue``. The parser's first result is stored via
    ``CommonOper.add_one`` when truthy; each id in its second result is
    dispatched as a ``tasks.user.crawl_person_infos_not_in_seed_ids`` task.

    Args:
        cid: Comment id used to build the request URL.
        mid: Passed through to the parser alongside ``cid``.

    Returns:
        None.
    """
    timestamp_ms = int(time.time() * 1000)
    url = AJAX_URL.format(cid, timestamp_ms)
    response_html = get_page(url, auth_level=2, is_ajax=True)

    parsed_dialogue, participant_ids = dialogue.get_dialogue(
        response_html, mid, cid)

    if parsed_dialogue:
        CommonOper.add_one(parsed_dialogue)

    # No ids reported by the parser: nothing left to schedule.
    if not participant_ids:
        return

    for participant_id in participant_ids:
        # Requested by task name through the app rather than by calling the
        # crawl function directly.
        app.send_task('tasks.user.crawl_person_infos_not_in_seed_ids',
                      args=(participant_id, ),
                      queue='user_crawler',
                      routing_key='for_user_info')
Exemplo n.º 5
0
def get_question_and_agree(question_id, selector):
    """Parse a question page and store the question plus its "agree" entries.

    Fields are extracted from ``selector`` with XPath and copied onto a new
    ``Question``, which is saved with ``CommonOper.add_one``; the
    ``question_id`` argument (not the id parsed from the page) is then
    registered with ``CommonOper.add_filter_key``. One ``Agree`` per user id
    found in the "agree by" list is collected and saved in a single
    ``CommonOper.add_all`` call. The author id and every agreeing user id are
    passed to ``task_filter`` under the ``'people'`` key.

    Any exception is logged as a warning and processing stops at that point.

    Args:
        question_id: Id used for the filter key and in the log message.
        selector: Parsed question page exposing ``xpath()``.

    Returns:
        None.
    """

    def first(path):
        # First match of ``path``; raises IndexError when nothing matches,
        # which the surrounding try block reports.
        return selector.xpath(path)[0]

    try:
        record = Question()
        record.question_id = first(
            '//div[@id="question_topic_editor"]/@data-id')
        record.title = first('//div[@class="aw-mod-head"]/h1/text()')
        record.people_id = first('//dd[@class="pull-left"]/a/@data-id')
        task_filter('people', record.people_id)

        # The publish time is shown with a textual prefix that is stripped
        # before conversion.
        posted_raw = first(
            '//div[@class="aw-question-detail-meta"]/div[1]/span[1]/text()'
        )
        record.post_time = str2datetime(posted_raw.replace("发表时间 ", ""))

        # Sidebar list items are addressed by position: 1 = access time,
        # 2 = read count, 3 = follow count.
        accessed_raw = first(
            '//div[@class="aw-side-bar-mod-body"]/ul/li[1]/span/text()')
        record.access_time = str2datetime(accessed_raw)
        record.read_num = first(
            '//div[@class="aw-side-bar-mod-body"]/ul/li[2]/span/text()')
        record.follow_num = first(
            '//div[@class="aw-side-bar-mod-body"]/ul/li[3]/span/text()')

        body_parts = selector.xpath(
            '//div[contains(@class,"aw-question-detail-txt")]/text()')
        record.content = "".join(body_parts)

        CommonOper.add_one(record)
        CommonOper.add_filter_key("question_id", question_id)

        supporter_ids = selector.xpath(
            '//div[@class="aw-question-detail-meta"]/p[contains(@class,"aw-agree-by")]/a/@data-id'
        )
        agree_rows = []
        for supporter_id in supporter_ids:
            task_filter('people', supporter_id)
            row = Agree()
            row.question_id = record.question_id
            row.people_id = supporter_id
            agree_rows.append(row)
        CommonOper.add_all(agree_rows)
    except Exception as e:
        message = "get question error,question_id:{},here are details {}".format(
            question_id, e)
        jsl_log.warning(message)