Example #1
def analysis_process():
    comment_res_file = 'analysis/profile_comment.json'
    review_res_file = 'analysis/profile_review.json'

    users = load_np_array(CONFIG.user_list_file)
    done_comment = [
        os.path.exists(os.path.join(CONFIG.user_path, user, comment_res_file))
        for user in users
    ]
    done_review = [
        os.path.exists(os.path.join(CONFIG.user_path, user, review_res_file))
        for user in users
    ]
    print("user comment: %d/%d" % (sum(done_comment), len(users)))
    print("user review: %d/%d" % (sum(done_review), len(users)))

    movies = load_np_array(CONFIG.movie_list_file)
    done_comment = [
        os.path.exists(os.path.join(CONFIG.movie_path, movie,
                                    comment_res_file)) for movie in movies
    ]
    done_review = [
        os.path.exists(os.path.join(CONFIG.movie_path, movie, review_res_file))
        for movie in movies
    ]
    print("movie comment: %d/%d" % (sum(done_comment), len(movies)))
    print("movie review: %d/%d" % (sum(done_review), len(movies)))
Example #2
def merge_profile():
    users = load_np_array(CONFIG.user_list_file)
    nums = []
    for user in users:
        profile = dataAnalyzer.merge_review_comment_profiles(user, 'user')
        triple_num = sum([
            sum([len(y.values()) for y in x.values()])
            for x in profile.values()
        ])
        nums.append(triple_num)
    print('user: %d' % len(users))
    print('mean: %f, median: %d, max: %d' %
          (np.mean(nums), np.median(nums), np.max(nums)))
    print(Counter(nums))
    movies = load_np_array(CONFIG.movie_list_file)
    nums = []
    for movie in movies:
        profile = dataAnalyzer.merge_review_comment_profiles(movie)
        triple_num = sum([
            sum([len(y.values()) for y in x.values()])
            for x in profile.values()
        ])
        nums.append(triple_num)
    print('movie: %d' % len(movies))
    print('mean: %f, median: %d, max: %d' %
          (np.mean(nums), np.median(nums), np.max(nums)))
    print(Counter(nums))
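The double-nested sum above implies that a merged profile is a three-level dict (e.g. aspect -> sentiment -> triples; the key names below are invented for illustration). A tiny worked example of how triple_num is counted:

profile = {
    'acting': {'positive': {'t1': 1, 't2': 1}},  # 2 innermost entries
    'plot': {'negative': {'t3': 1}},             # 1 innermost entry
}
triple_num = sum([
    sum([len(y.values()) for y in x.values()])
    for x in profile.values()
])
assert triple_num == 3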
Example #3
def analysis_user_comments():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        details = dataAnalyzer.analyze_user_comments(user)
        if len(details) == 0:
            logging_with_time('empty result for user: %s' % user)
        logging_with_time('done: %s' % user)
Example #4
def remake_collect():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        folder = os.path.join(CONFIG.data_path, 'user', user)
        remove_file = [x for x in os.listdir(folder) if x != "html.json"]
        for file in remove_file:
            os.remove(os.path.join(folder, file))
        userAnalyzer.get_collect(user)
Example #5
def craw_user_reviews(user_list=None):
    if user_list is None:
        user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        reviews = userAnalyzer.get_reviews(user)
        logging_with_time(
            'user: %s, review num: %d, empty num: %d' %
            (user, len(reviews), len([x for x in reviews if len(x) == 0])))
Example #6
    def search_user(self, query):
        users = list(load_np_array(CONFIG.user_list_file))
        infos = []
        for user in users:
            info = self.get_user_info(user)
            if query in info['name'] or query in info['id']:
                infos.append(info)
        return infos
Example #7
def prepare_user_profile():
    for i, uid in enumerate(load_np_array(CONFIG.user_list_file)):
        logging_with_time('user %d: %s' % (i, uid))
        profiles = userAnalyzer.get_basic_info(uid)
        profiles.update(userAnalyzer.get_profile_of_collect(uid))
        sentiment_profile, _ = dataAnalyzer.analyze_profile(uid, 'user')
        profiles.update(
            userAnalyzer.get_profile_of_sentiment(uid, sentiment_profile))
        userAnalyzer.make_tags(profiles)
Example #8
def crawl_movie_info():
    movie_list = load_np_array(CONFIG.movie_list_file)
    count = 0
    total = len(movie_list)
    for movie in movie_list:
        api_crawler.get_movie_info(movie, -1)
        count += 1
        if count % 100 == 0:
            logging_with_time('movie info: %d/%d' % (count, total))
Example #9
def analysis_user_reviews():
    user_list = load_np_array(CONFIG.user_list_file)
    for user in user_list:
        try:
            details = dataAnalyzer.analyze_user_reveiws(user)
        except OSError:
            continue
        if len(details) == 0:
            logging_with_time('empty result for user: %s' % user)
        logging_with_time('done: %s' % user)
Example #10
    def get_active_users(self, num=10):
        users = list(load_np_array(CONFIG.user_list_file))

        def get_sentiment_num(user):
            _, freqs = self.analyze_profile(user, 'user')
            total_freq = sum([x.get('freq', 0) for x in freqs.values()])
            return total_freq

        users.sort(key=get_sentiment_num, reverse=True)
        return list(map(self.get_user_info, users[:num]))
Example #11
def re_crawl_html():
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        if cache_available(html_file):
            logging_with_time('file exists: %s' % html_file)
            continue
        collects, htmls = douban_crawler.get_collect(user)
        save_json_file(html_file, htmls)
        logging_with_time('done: %s, html num: %d' % (user, len(htmls)))
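re_crawl_html and the examples that follow also lean on small JSON-cache helpers (load_json_file, save_json_file, cache_available). A plausible sketch, assuming UTF-8 JSON files and that a cache only counts when the file exists and is non-empty:

import json
import os

def load_json_file(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def save_json_file(path, obj):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False)

def cache_available(path):
    # assumed: a cached file counts only if it exists and is non-empty
    return os.path.exists(path) and os.path.getsize(path) > 0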
Example #12
def tmp():
    user = load_np_array(CONFIG.user_list_file)
    movie = load_np_array(CONFIG.movie_list_file)
    print('user: %d, movie: %d' % (len(user), len(movie)))
    matrix = np.load(CONFIG.rate_matrix_file).astype(bool)
    user_rate_num = np.sum(matrix, axis=1)
    movie_rate_num = np.sum(matrix, axis=0)
    print('rate: %d' % sum(user_rate_num))
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # use SimHei so the Chinese labels render
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.hist(user_rate_num,
             density=True,
             histtype='bar',
             bins=list(range(0, 1200, 60)),
             facecolor='blue',
             edgecolor='white',
             alpha=0.7)
    plt.xlabel("评论数量")  # "number of ratings"
    plt.title("用户评论数量分布直方图")  # "distribution of ratings per user"
    plt.savefig(os.path.join(CONFIG.dataset_path, 'bb.png'))
    plt.clf()
    plt.cla()
    plt.close()
    plt.hist(movie_rate_num,
             density=True,
             histtype='bar',
             bins=list(range(0, 30, 2)),
             facecolor='red',
             edgecolor='white',
             alpha=0.7)
    plt.xlabel("评论数量")  # "number of ratings"
    plt.title("电影评论数量分布直方图")  # "distribution of ratings per movie"
    plt.savefig(os.path.join(CONFIG.dataset_path, 'cc.png'))
    plt.clf()
    plt.cla()
    plt.close()
Example #13
def make_movie_crawler_list(overwrite=False):
    if not overwrite and os.path.exists(CONFIG.movie_freq_file):
        movieid_freq_list = read_lines(
            CONFIG.movie_freq_file, lambda x:
            (x.split()[0], int(x.split()[1])))
        return movieid_freq_list
    movie_list = read_lines(CONFIG.rate_record_all,
                            lambda x: int(x.split()[1]))
    sorted_movie_freq = sorted(Counter(movie_list).items(),
                               key=lambda x: x[1],
                               reverse=True)
    movieid_mapping = load_np_array(CONFIG.movie_list_file)
    movieid_freq_list = list(
        map(lambda x: (movieid_mapping[x[0]], x[1]), sorted_movie_freq))
    write_lines(CONFIG.movie_freq_file, movieid_freq_list, lambda x: '%s %d' %
                (x[0], x[1]))
    return movieid_freq_list
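make_movie_crawler_list relies on line-oriented helpers read_lines and write_lines. Judging from the call sites (a parser callback on read, a formatter callback on write), they plausibly look like this sketch, which is an assumption rather than the project's code:

def read_lines(path, parse=lambda x: x):
    # one record per line; parse turns the raw line into a value
    with open(path, encoding='utf-8') as f:
        return [parse(line.rstrip('\n')) for line in f if line.strip()]

def write_lines(path, items, fmt=str):
    # fmt turns each record back into a line of text
    with open(path, 'w', encoding='utf-8') as f:
        for item in items:
            f.write(fmt(item) + '\n')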
Example #14
def makeup_for_date():
    user_list = load_np_array(
        '/data/wangke/MovieProject/MovieData/data/user_list.npy')
    for user in user_list:
        html_file = os.path.join(CONFIG.data_path, 'user', user, 'html.json')
        collect_file = os.path.join(CONFIG.data_path, 'user', user,
                                    'collect.json')
        collect_file_bk = collect_file + '.bk'
        if os.path.exists(collect_file_bk):
            continue
        htmls = [x['content'] for x in load_json_file(html_file)]
        new_collects = itertools.chain.from_iterable(
            map(lambda x: douban_crawler.parse_collect(None, x)[0], htmls))
        old_collects = load_json_file(collect_file)
        old_collects_dict = dict(
            map(lambda x: (x['movie_id'], x), old_collects))
        new_collects_dict = dict(
            map(lambda x: (x['movie_id'], x), new_collects))
        missed_movies = set(old_collects_dict.keys()) - set(
            new_collects_dict.keys())
        if len(missed_movies) > 0:
            logging_with_time(
                'user: %s, %d movies missed in html: %s' %
                (user, len(missed_movies), ' '.join(missed_movies)))
        extra_movies = set(new_collects_dict.keys()) - set(
            old_collects_dict.keys())
        if len(extra_movies) > 0:
            logging_with_time(
                'user: %s, %d extra movies in html: %s' %
                (user, len(extra_movies), ' '.join(extra_movies)))
        for update_movie in set(old_collects_dict.keys()).intersection(
                set(new_collects_dict.keys())):
            old_collects_dict[update_movie].update(
                new_collects_dict[update_movie])

        os.rename(collect_file, collect_file_bk)
        # save the merged records back as a list of dicts; items() would
        # write (movie_id, record) pairs and change the collect.json format
        save_json_file(collect_file, list(old_collects_dict.values()))
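The merge keeps every record from the old collect.json and overlays whatever the re-parsed HTML provides (here, presumably the missing date field). In miniature, with invented fields:

old = {'123': {'movie_id': '123', 'rate': 5}}
new = {'123': {'movie_id': '123', 'date': '2020-01-01'}}
old['123'].update(new['123'])
assert old['123'] == {'movie_id': '123', 'rate': 5, 'date': '2020-01-01'}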
Example #15
def manual_crawl_user_review_with_login():
    user_list = load_np_array(CONFIG.user_list_file)
    tmp_file = 'tmp.html'
    for user in user_list:
        userAnalyzer.update_uid(user)
        backup_file(userAnalyzer.review_file)
        backup_file(userAnalyzer.review_html_file)
        backup_file(userAnalyzer.review_list_html_file)
        if not os.path.exists(userAnalyzer.review_list_html_file):
            print('review list html missed: %s' % user)
            continue
        if not os.path.exists(userAnalyzer.review_html_file):
            print('review html missed: %s' % user)
            continue
        # re-fetch any review-list pages whose cached html content is empty
        review_list_htmls = load_json_file(userAnalyzer.review_list_html_file)
        review_list_changed = []
        for i, html in enumerate(review_list_htmls):
            if html["content"] == "":
                new_content = make_up_html(html["url"], tmp_file)
                html["content"] = new_content
                if new_content != "":
                    review_list_changed.append(i)
        review_htmls = load_json_file(userAnalyzer.review_html_file)
        if len(review_list_changed) > 0:
            # update review_htmls
            save_json_file(userAnalyzer.review_list_html_file,
                           review_list_htmls)
            for changed_list_html in [
                    review_list_htmls[i] for i in review_list_changed
            ]:
                new_urls, _ = douban_crawler.get_user_review_list(
                    user, changed_list_html, False)
                new_review_htmls = douban_crawler.get_user_review_htmls(
                    new_urls)
                review_htmls.extend(new_review_htmls)
        save_json_file(userAnalyzer.review_html_file, review_htmls)

        s = etree.HTML(review_list_htmls[0]["content"])
        title = (
            s.xpath('//div[@id="db-usr-profile"]/div[@class="info"]/h1/text()')
            or [''])[0]
        if title == '':
            print(
                'Error in review list page of %s, check this page and maybe your cache html'
                % user)
            review_num = 0
        else:
            review_num = int(title.split('(')[-1].split(')')[0])
        review_urls = [x["url"] for x in review_htmls]
        # if not all review urls were crawled, recrawl the review list
        review_html_changed = False
        if review_num != len(review_urls):
            print('unmatched review num: expected %d, got %d' %
                  (review_num, len(review_urls)))
            print("recrawl review_list_htmls for user %s" % user)
            os.remove(userAnalyzer.review_list_html_file)
            new_review_urls, review_list_htmls = douban_crawler.get_user_review_list(
                user)
            save_json_file(userAnalyzer.review_list_html_file,
                           review_list_htmls)
            added_review_urls = list(
                filter(lambda x: x not in review_urls, new_review_urls))
            print("to crawl %d new reviews" % (len(added_review_urls)))
            new_review_htmls = douban_crawler.get_user_review_htmls(
                added_review_urls)
            review_htmls.extend(new_review_htmls)
            save_json_file(userAnalyzer.review_html_file, review_htmls)
            print("done")
            review_html_changed = True
        # re-fetch any review pages whose cached html content is empty
        for html in review_htmls:
            url = html["url"]
            content = html["content"]
            if content == "":
                new_content = make_up_html(url, tmp_file)
                html["content"] = new_content
                review_html_changed = True
        if review_html_changed:
            save_json_file(userAnalyzer.review_html_file, review_htmls)
            new_reviews = douban_crawler.get_user_reviews(review_htmls)
            save_json_file(userAnalyzer.review_file, new_reviews)
    # url_403 is presumably a module-level list of URLs that returned HTTP 403
    print(url_403)
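Example #15 also depends on a manual fallback helper, make_up_html(url, tmp_file): given a URL whose cached content came back empty, it apparently waits for an operator to fetch the page with a logged-in browser and save it to tmp_file. A hypothetical reconstruction (the real helper may differ):

import os

def make_up_html(url, tmp_file):
    # hypothetical: ask the operator to save the page manually, then read it back
    input('Open %s in a logged-in browser, save as %s, then press Enter '
          '(or press Enter without saving to skip)...' % (url, tmp_file))
    if not os.path.exists(tmp_file):
        return ''
    with open(tmp_file, encoding='utf-8') as f:
        content = f.read()
    os.remove(tmp_file)
    return content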