Exemplo n.º 1
0
def crawl_dialogue_by_comment_id(cid, mid):
    """Fetch the dialogue page for comment ``cid`` and store the parsed result.

    The request URL is built from ``AJAX_URL`` with the comment id and the
    current time in milliseconds. The response is parsed by
    ``dialogue.get_dialogue(html, mid, cid)`` and whatever it returns is
    handed to ``CommonOper.add_one``.
    """
    timestamp_ms = int(time.time() * 1000)
    request_url = AJAX_URL.format(cid, timestamp_ms)

    response_html = get_page(request_url, auth_level=2, is_ajax=True)
    parsed = dialogue.get_dialogue(response_html, mid, cid)

    CommonOper.add_one(parsed)
Exemplo n.º 2
0
def get_people_and_follows(people_id, selector):
    """Parse a user profile page, store the user, then queue the follow crawl.

    Args:
        people_id: Identifier of the user whose profile page was fetched.
        selector: Parsed page exposing ``.xpath()``; matched elements must
            provide ``getparent()``/``getnext()`` (presumably an lxml tree —
            confirm against the caller).

    Any exception raised while parsing or storing is logged as a warning and
    swallowed. The ``tasks.people.do_follow`` task for page 0 is sent
    afterwards regardless of whether parsing succeeded.
    """
    try:
        people = People()
        people.people_id = people_id
        people.name = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/h1/text()')[0].strip()
        people.desc = "".join(
            selector.xpath(
                '//div[@class="aw-user-center"]/div[1]/div/span/text()'))

        # The optional profile rows are located through their icon element.
        # Each XPath is evaluated once and the result reused.
        locate_icons = selector.xpath('//i[contains(@class,"i-user-locate")]')
        if locate_icons:
            user_locate = locate_icons[0].getparent()
            people.province = "".join(user_locate.xpath('a[1]/text()'))
            people.city = "".join(user_locate.xpath('a[2]/text()'))

        post_icons = selector.xpath('//i[contains(@class,"i-user-post")]')
        if post_icons:
            user_post = post_icons[0].getparent()
            people.post = "".join(user_post.xpath('text()')).strip()

        visits_icons = selector.xpath('//i[contains(@class,"i-user-visits")]')
        if visits_icons:
            user_visits = visits_icons[0].getparent()
            user_visits_str = "".join(user_visits.xpath('text()'))
            # Raw string: '\d' in a plain literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            people.home_visit_num = re.findall(r'(\d+)', user_visits_str)[0]

        people_type_spans = selector.xpath(
            '//div[@class="aw-user-center"]/div[1]/div/p[3]/span')
        people.user_type = people_type_spans[0].xpath(
            'a/em/text()')[0].replace("»", "").strip()
        people.weiwang_num = people_type_spans[1].xpath('em/text()')[0]
        people.agree_num = people_type_spans[2].xpath('em/text()')[0]
        people.thanks_num = people_type_spans[3].xpath('em/text()')[0]
        people.gold_num = people_type_spans[4].xpath('em/text()')[0]
        # A displayed value containing '+' is stored as 100
        # (presumably the page caps the display, e.g. "100+" — confirm).
        if '+' in people.gold_num:
            people.gold_num = 100

        last_active_labels = selector.xpath(
            '//span[contains(text(),"最后活跃")]')
        if last_active_labels:
            # The timestamp text lives in the element following the label's
            # parent.
            last_active_time_str = last_active_labels[0].getparent().getnext(
            ).xpath('text()')[0]
            people.last_active_time = str2datetime(last_active_time_str)

        CommonOper.add_one(people)
        CommonOper.add_filter_key("people_id", people_id)
    except Exception as e:
        # Best-effort parsing: log the first traceback frame and carry on so
        # the follow crawl below is still scheduled.
        jsl_log.warning(
            "get people info error,people_id:{},here are details {}".format(
                people_id,
                format_tb(e.__traceback__)[0]))
    app.send_task("tasks.people.do_follow",
                  args=(
                      people_id,
                      0,
                  ),
                  queue="people_queue",
                  routing_key="people")
Exemplo n.º 3
0
def do_question(question_id):
    """Crawl the page of ``question_id`` unless it is already recorded.

    When ``CommonOper.is_exist("question_id", question_id)`` is truthy the
    question is skipped and only an info message is logged; otherwise the URL
    is built from ``question_url_format`` and passed to
    ``crawl_question_and_answer``.
    """
    if CommonOper.is_exist("question_id", question_id):
        jsl_log.info("question id:{}已存在,跳过".format(question_id))
        return

    target_url = question_url_format.format(question_id)
    jsl_log.info("开始爬取url:{}".format(target_url))
    crawl_question_and_answer(target_url)
Exemplo n.º 4
0
def do_people(people_id):
    """Crawl the profile page of ``people_id`` unless it is already recorded.

    When ``CommonOper.is_exist("people_id", people_id)`` is truthy the user is
    skipped and only an info message is logged; otherwise the URL is built
    from ``people_url_format`` and passed to ``crawl_people``.
    """
    if CommonOper.is_exist("people_id", people_id):
        jsl_log.info("people id:{}已存在,跳过".format(people_id))
        return

    target_url = people_url_format.format(people_id)
    jsl_log.info("开始爬取url:{}".format(target_url))
    crawl_people(target_url)
Exemplo n.º 5
0
def task_filter(task_type, param):
    """Queue a crawl task for ``param`` unless it is already recorded.

    Args:
        task_type: ``'question'`` or ``'people'``; any other value is ignored.
        param: The question id or people id to check and, if new, to crawl.

    For a recognised type, ``CommonOper.is_exist`` is consulted with the
    matching filter key. Unknown ids are sent to the corresponding task via
    ``app.send_task``; known ids only produce an info log line.
    """
    # (task type, filter key, task name, queue, routing key, skip message)
    routes = (
        ('question', "question_id", 'tasks.question.do_question',
         'question_queue', 'question', "相关question已存在,question_id:{}"),
        ('people', "people_id", 'tasks.people.do_people',
         'people_queue', 'people', "相关people已存在,people_id:{}"),
    )
    for kind, filter_key, task_name, queue_name, route, skip_msg in routes:
        if task_type == kind:
            if CommonOper.is_exist(filter_key, param):
                jsl_log.info(skip_msg.format(param))
            else:
                app.send_task(task_name,
                              args=(param, ),
                              queue=queue_name,
                              routing_key=route)
            return
Exemplo n.º 6
0
def crawl_dialogue_by_comment_id(cid, mid):
    """Fetch and store the dialogue of comment ``cid``, then queue its users.

    ``dialogue.get_dialogue`` returns a pair ``(dialogue_data, uids)``. A
    truthy ``dialogue_data`` is stored with ``CommonOper.add_one``; for every
    entry of a truthy ``uids`` a
    ``tasks.user.crawl_person_infos_not_in_seed_ids`` task is sent.
    """
    timestamp_ms = int(time.time() * 1000)
    request_url = AJAX_URL.format(cid, timestamp_ms)

    response_html = get_page(request_url, auth_level=2, is_ajax=True)
    parsed, user_ids = dialogue.get_dialogue(response_html, mid, cid)

    if parsed:
        CommonOper.add_one(parsed)

    if not user_ids:
        return

    # User crawling is handed to the task queue rather than done inline.
    for user_id in user_ids:
        app.send_task('tasks.user.crawl_person_infos_not_in_seed_ids',
                      args=(user_id, ),
                      queue='user_crawler',
                      routing_key='for_user_info')
Exemplo n.º 7
0
def crawl_dialogue_by_comment_id(cid, mid):
    """Crawl the dialogue attached to comment ``cid`` and queue its users.

    The page is requested from ``AJAX_URL`` (comment id plus a millisecond
    timestamp) and parsed by ``dialogue.get_dialogue`` into
    ``(dialogue_data, uids)``. Truthy dialogue data is saved through
    ``CommonOper.add_one``, and each uid in a truthy ``uids`` gets its own
    ``tasks.user.crawl_person_infos_not_in_seed_ids`` task.
    """
    now_ms = int(time.time() * 1000)
    page = get_page(AJAX_URL.format(cid, now_ms), auth_level=2, is_ajax=True)
    record, participants = dialogue.get_dialogue(page, mid, cid)

    if record:
        CommonOper.add_one(record)

    if participants:
        # Routing options shared by every user task sent below.
        routing = {'queue': 'user_crawler', 'routing_key': 'for_user_info'}
        for participant in participants:
            app.send_task('tasks.user.crawl_person_infos_not_in_seed_ids',
                          args=(participant,),
                          **routing)
Exemplo n.º 8
0
def get_answer_comment(answer_id, selector):
    """Parse the comments of one answer and store them in a single batch.

    Args:
        answer_id: Identifier of the answer the comments belong to.
        selector: Parsed comment page exposing ``.xpath()``.

    Each ``//ul/li`` node becomes an ``AnswerComment`` numbered from 1 in
    page order, and its author id is passed to ``task_filter('people', ...)``.
    Any exception aborts the whole batch (nothing is stored) and is logged as
    a warning instead of being raised.
    """
    try:
        collected = []
        for position, item in enumerate(selector.xpath('//ul/li'), start=1):
            record = AnswerComment()
            record.answer_id = answer_id
            record.comment_id = position
            record.people_id = item.xpath('div/a/@data-id')[0]
            task_filter('people', record.people_id)
            posted_raw = item.xpath('div/span/text()')[0]
            record.post_time = str2datetime(posted_raw)
            body_parts = item.xpath('div/p[@class="clearfix"]/text()')
            record.content = "".join(body_parts)
            collected.append(record)
        CommonOper.add_all(collected)
    except Exception as err:
        message = (
            "get answer_comment_list error,answer_id:{},here are details {}".
            format(answer_id, err))
        jsl_log.warning(message)
Exemplo n.º 9
0
def get_answers_and_agree(question_id, selector):
    """Parse the answers of a question page together with their agree lists.

    Args:
        question_id: Identifier of the question the answers belong to.
        selector: Parsed question page exposing ``.xpath()``.

    For every ``div.aw-item`` node an ``Answer`` is built and its author is
    passed to ``task_filter('people', ...)``. When the text of the answer's
    footer link contains a number, a ``tasks.question.do_answer_comment`` task
    is queued for that answer. The ``Agree`` rows of each answer are stored as
    soon as that answer is parsed; the answers themselves are stored in one
    batch after the loop. Any exception is logged as a warning and swallowed,
    so answers parsed before the failure are not stored.
    """
    try:
        answer_list = selector.xpath('//div[@class="aw-item"]')
        answers = []
        for a in answer_list:
            answer = Answer()
            answer.question_id = question_id
            # The element id is split on '_' and its third piece is used as
            # the answer id.
            answer.answer_id = a.xpath('@id')[0].split('_')[2]
            answer.answer_type = 1
            answer.people_id = a.xpath('a/@data-id')[0]
            task_filter('people', answer.people_id)
            answer.content = "".join(a.xpath('div/div/div[1]/div/text()'))
            post_time_str = a.xpath('div/div/div[2]/span/text()')[0]
            answer.post_time = str2datetime(post_time_str)
            answers.append(answer)

            answer_count_str = a.xpath('div/div/div[2]/a/text()')[0]
            # Raw string: '\d' in a plain literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            answer_count = re.search(r'(\d+)', answer_count_str)
            if answer_count:
                # Any digits in the link text trigger the comment crawl
                # (presumably the text is a comment count — confirm).
                app.send_task('tasks.question.do_answer_comment',
                              args=(answer.answer_id, ),
                              queue='answer_comment_queue',
                              routing_key='answer_comment')

            agrees = []
            agree_list = a.xpath('div/div/div[1]/p[2]/a/@data-id')
            for p in agree_list:
                task_filter('people', p)
                agree = Agree()
                agree.question_id = question_id
                agree.answer_id = answer.answer_id
                agree.people_id = p
                agrees.append(agree)
            CommonOper.add_all(agrees)
        CommonOper.add_all(answers)
    except Exception as e:
        jsl_log.warning(
            "get answer_list error,question_id:{},here are details {}".format(
                question_id, e))
Exemplo n.º 10
0
def get_follows(follower_id, page_num, selector):
    """Store the follow relations listed on one page of a user's follow list.

    Args:
        follower_id: Identifier of the user whose follow list is being read.
        page_num: Number of the page being parsed; converted with ``int``
            before the next page is requested.
        selector: Parsed follow-list page exposing ``.xpath()``.

    A page holding exactly 30 ``<li>`` entries is treated as full and queues
    ``tasks.people.do_follow`` for the next page. The entries of every page,
    including a final partial one, are stored as ``Follow`` rows and each
    referenced user is passed to ``task_filter("people", ...)``. Any exception
    is logged as a warning and swallowed.
    """
    # Entry count that marks a full page (assumed to be the site's page
    # size — confirm if the site layout changes).
    full_page_size = 30
    try:
        follow_list = selector.xpath('//li')

        # Only the request for the next page depends on the page being full.
        if len(follow_list) == full_page_size:
            app.send_task("tasks.people.do_follow",
                          args=(
                              follower_id,
                              int(page_num) + 1,
                          ),
                          queue="people_queue",
                          routing_key="people")

        # Previously this loop sat inside the full-page check, so pages with
        # fewer than 30 entries were never stored.
        follows = []
        for f in follow_list:
            follow = Follow()
            follow.refer_id = f.xpath('div/a/@data-id')[0]
            follow.follow_type = 1
            follow.follower_id = follower_id
            follows.append(follow)
            task_filter("people", follow.refer_id)

        # An empty page has nothing to store.
        if follows:
            CommonOper.add_all(follows)
    except Exception as e:
        jsl_log.warning(
            "get follow_list error,follower_id:{},here are details {}".format(
                follower_id, e))
Exemplo n.º 11
0
def get_question_and_agree(question_id, selector):
    """Parse a question page, store the question and its agree list.

    Args:
        question_id: Identifier the page was requested with; used for the
            filter key and for log messages.
        selector: Parsed question page exposing ``.xpath()``.

    The stored ``Question`` takes its id from the page itself. Its author and
    every id listed in the ``aw-agree-by`` paragraph are passed to
    ``task_filter('people', ...)``. Any exception is logged as a warning and
    swallowed.
    """

    def first(path):
        # First match of an XPath on the page; fails when nothing matches.
        return selector.xpath(path)[0]

    try:
        question = Question()
        question.question_id = first(
            '//div[@id="question_topic_editor"]/@data-id')
        question.title = first('//div[@class="aw-mod-head"]/h1/text()')
        question.people_id = first('//dd[@class="pull-left"]/a/@data-id')
        task_filter('people', question.people_id)

        # Strip the label prefix from the text before parsing the date.
        posted_raw = first(
            '//div[@class="aw-question-detail-meta"]/div[1]/span[1]/text()'
        ).replace("发表时间 ", "")
        question.post_time = str2datetime(posted_raw)

        accessed_raw = first(
            '//div[@class="aw-side-bar-mod-body"]/ul/li[1]/span/text()')
        question.access_time = str2datetime(accessed_raw)
        question.read_num = first(
            '//div[@class="aw-side-bar-mod-body"]/ul/li[2]/span/text()')
        question.follow_num = first(
            '//div[@class="aw-side-bar-mod-body"]/ul/li[3]/span/text()')

        body_parts = selector.xpath(
            '//div[contains(@class,"aw-question-detail-txt")]/text()')
        question.content = "".join(body_parts)

        CommonOper.add_one(question)
        CommonOper.add_filter_key("question_id", question_id)

        agrees = []
        supporter_ids = selector.xpath(
            '//div[@class="aw-question-detail-meta"]/p[contains(@class,"aw-agree-by")]/a/@data-id'
        )
        for supporter_id in supporter_ids:
            task_filter('people', supporter_id)
            entry = Agree()
            entry.question_id = question.question_id
            entry.people_id = supporter_id
            agrees.append(entry)
        CommonOper.add_all(agrees)
    except Exception as err:
        jsl_log.warning(
            "get question error,question_id:{},here are details {}".format(
                question_id, err))