Example #1
import csv
import logging
import sys
from pathlib import Path

logger = logging.getLogger(__name__)

# clean_group and simple_normalizer are helper functions defined elsewhere
# in the source module.
def merge_phrases(data, is_ref_data, hash2group, rep2rank, top_n,
                  topics_count):
    """
    Analyze the provided topics data and detect trends (changes in importance)

    Args:
        data: A list of topics with importance scores
        is_ref_data (Boolean): Was the data extracted from the target/reference corpus
        hash2group: A dictionary storing the data of each topic
        rep2rank: A dict of all groups representatives and their ranks
        top_n (int): Limit the analysis to only the top N phrases of each list
        topics_count (int): The total sum of all topics extracted from both corpora
    """
    logger.info('merge and compare groups for data: %s', str(data))
    ctr = 0
    if not Path(data).exists():
        logger.error('invalid csv file: %s', str(data))
        sys.exit()
    try:
        with open(data, encoding='utf-8', errors='ignore') as csv_file:
            topics = csv.reader(csv_file, delimiter=',')
            for group, imp in topics:
                if ctr == top_n:
                    break
                try:
                    rep = clean_group(group).strip()
                    imp = float(imp) * 100.0
                    rank = ctr + 1
                    hash_id = simple_normalizer(rep)
                    if hash_id not in hash2group:
                        rep2rank[rep] = rank
                        if is_ref_data:
                            hash2group[hash_id] = (rep, imp, 0, rank)
                        else:
                            hash2group[hash_id] = (rep, 0, 1, imp, rank)
                    elif not is_ref_data:
                        data_b = hash2group[hash_id]
                        # Create a trend only in comparison to the reference
                        # topics; ignore cases where different topics share a
                        # hash or the same topic was extracted twice from the
                        # same data.
                        if data_b[2] == 0:
                            old_rep = data_b[0]
                            old_rank = data_b[3]
                            # The rank of a topic that appears in both corpora
                            # is calculated as the average of its two ranks.
                            rep2rank[old_rep] = int((rank + old_rank) / 2)
                            change = imp - float(data_b[1])
                            t_score = (topics_count -
                                       (old_rank + rank)) * abs(change)
                            hash2group[hash_id] = (old_rep, float(data_b[1]),
                                                   2, imp, change, abs(change),
                                                   t_score)  # trend phrase
                    ctr += 1
                except Exception as e:
                    logger.error('bad line: %s. Error: %s', str(ctr), str(e))
    except Exception as e:
        logger.error('Error: %s. Is %s a valid csv file?', str(e), str(data))
        sys.exit()
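
A minimal usage sketch (not part of the original module): the file names and
the stub helpers below are hypothetical, added only so the snippet runs on
its own; in the real module clean_group and simple_normalizer come from the
package's utilities.

def clean_group(group):
    """Hypothetical stand-in for the module's clean_group helper."""
    return group

def simple_normalizer(text):
    """Hypothetical stand-in for the module's simple_normalizer helper."""
    return text.strip().lower()

if __name__ == '__main__':
    # Build two tiny "phrase,score" CSVs like those calc_scores writes.
    with open('ref_scores.csv', 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows([['deep learning', '0.91'], ['nlp', '0.75']])
    with open('tar_scores.csv', 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows([['deep learning', '0.95'], ['graphs', '0.60']])
    hash2group, rep2rank = {}, {}
    merge_phrases('ref_scores.csv', True, hash2group, rep2rank, 10, 4)
    merge_phrases('tar_scores.csv', False, hash2group, rep2rank, 10, 4)
    # Topics found in both corpora carry flag 2 and a t_score.
    print([v for v in hash2group.values() if v[2] == 2])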
Example #2
import operator
import os
from os import path
from shutil import copyfile

# logger, directory, the *_topics_path/*_scores_path constants and the
# helpers calc_scores, simple_normalizer, compute_scatter_subwords and
# save_report_data are defined elsewhere in the source module;
# merge_phrases is shown in Example #1.
def analyze(
    target_data,
    ref_data,
    tar_header,
    ref_header,
    top_n=10000,
    top_n_vectors=500,
    re_analysis=False,
    tfidf_w=0.5,
    cval_w=0.5,
    lm_w=0,
):
    """
    Compare a topics list of a target data to a topics list of a reference data
    and extract hot topics, trends and clusters. Topic lists can be generated
    by running topic_extraction.py

    Args:
        target_data: A list of topics with importance scores extracted from the tagret corpus
        ref_data: A list of topics with importance scores extracted from the reference corpus
        tar_header: The header to appear for the target topics graphs
        ref_header: The header to appear for the reference topics graphs
        top_n (int): Limit the analysis to only the top N phrases of each list
        top_n_vectors (int): The number of vectors to include in the scatter
        re_analysis (Boolean): whether a first analysis has already been made or not
        tfidf_w (Float): the TF_IDF weight for the final score calculation
        cval_w (Float): the C_Value weight for the final score calculation
        lm_w (Float): the Language-Model weight for the final score calculation
    """
    hash2group = {}
    # rep2rank holds the merged set of group representatives extracted from
    # both corpora; it is later sorted by rank.
    rep2rank = {}
    in_model_count = 0
    create_clusters = False
    try:
        if not re_analysis:  # first analysis, not triggered through the UI
            # Copy the initial scores files to constant filenames so the UI
            # can recognize them when re-analyzing.
            copyfile(target_data, target_topics_path)
            copyfile(ref_data, ref_topics_path)
        calc_scores(target_data, tfidf_w, cval_w, lm_w, target_scores_path)
        calc_scores(ref_data, tfidf_w, cval_w, lm_w, ref_scores_path)
        # unify all topics:
        with open(ref_data, encoding="utf-8", errors="ignore") as f:
            topics1 = sum(1 for _ in f)
        with open(target_data, encoding="utf-8", errors="ignore") as f:
            topics2 = sum(1 for _ in f)
        sum_topics = topics1 + topics2
        logger.info("sum of all topics= %s", str(sum_topics))
        merge_phrases(ref_scores_path, True, hash2group, rep2rank, top_n,
                      sum_topics)
        merge_phrases(target_scores_path, False, hash2group, rep2rank, top_n,
                      sum_topics)
        logger.info("Total number of evaluated topics: %s", str(len(rep2rank)))
        all_topics_sorted = sorted(rep2rank, key=rep2rank.get)
        top_n_scatter = len(
            all_topics_sorted) if top_n_vectors is None else top_n_vectors
        # compute 2D space clusters if model exists:
        w2v_loc = path.join(directory, "W2V_Models/model.bin")
        if os.path.isfile(w2v_loc):
            scatter_group = all_topics_sorted[0:top_n_scatter]
            np_scat, x_scat, y_scat, in_model_count = compute_scatter_subwords(
                scatter_group, w2v_loc)
            if np_scat is not None and x_scat is not None:
                create_clusters = True
                for j in range(len(np_scat)):
                    hash2group[simple_normalizer(np_scat[j])] += (x_scat[j],
                                                                  y_scat[j])
        # prepare reports data:
        # flag at index 2: 0 = reference-only, 1 = target-only,
        # 2 = trend (appears in both corpora)
        groups_r = list(
            filter(lambda x: x[2] == 0 or x[2] == 2, hash2group.values()))
        groups_t = list(
            filter(lambda x: x[2] == 1 or x[2] == 2, hash2group.values()))
        trends = list(filter(lambda x: x[2] == 2,
                             hash2group.values()))  # all trends
        groups_r_sorted = sorted(groups_r,
                                 key=operator.itemgetter(1),
                                 reverse=True)
        groups_t_sorted = sorted(groups_t,
                                 key=operator.itemgetter(3),
                                 reverse=True)
        trends_sorted = sorted(trends,
                               key=operator.itemgetter(6),
                               reverse=True)  # sort by t_score

        # save results:
        save_report_data(
            hash2group,
            groups_r_sorted,
            groups_t_sorted,
            trends_sorted,
            all_topics_sorted,
            create_clusters,
            tar_header,
            ref_header,
            tfidf_w,
            cval_w,
            lm_w,
            in_model_count,
            top_n_scatter,
        )
        logger.info("Done analysis.")
    except Exception as e:
        logger.error('Analysis failed: %s', str(e))
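
A hypothetical invocation sketch: analyze expects the topic CSVs produced by
topic_extraction.py. The file names and headers below are illustrative, and
the call only works inside the real module, which supplies the *_path
constants, the helper functions and the optional W2V model.

analyze('target_topics.csv', 'ref_topics.csv',
        tar_header='Target corpus', ref_header='Reference corpus',
        top_n=10000, top_n_vectors=500, re_analysis=False,
        tfidf_w=0.5, cval_w=0.5, lm_w=0)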