def merge_phrases(data, is_ref_data, hash2group, rep2rank, top_n, topics_count):
    """
    Analyze the provided topics data and detect trends (changes in importance).

    Args:
        data: Path to a CSV file of topics with importance scores
        is_ref_data (Boolean): True if the data was extracted from the reference
            corpus, False if from the target corpus
        hash2group: A dictionary storing the data of each topic
        rep2rank: A dict of all group representatives and their ranks
        top_n (int): Limit the analysis to only the top N phrases of each list
        topics_count (int): The total sum of all topics extracted from both corpora
    """
    logger.info('merge and compare groups for data: %s', str(data))
    ctr = 0
    if not Path(data).exists():
        logger.error('invalid csv file: %s', str(data))
        sys.exit()
    try:
        with open(data, encoding='utf-8', errors='ignore') as csv_file:
            topics = csv.reader(csv_file, delimiter=',')
            for group, imp in topics:
                if ctr == top_n:
                    break
                try:
                    rep = clean_group(group).strip()
                    imp = float(imp) * 100.0
                    rank = ctr + 1
                    hash_id = simple_normalizer(rep)
                    if hash_id not in hash2group:
                        rep2rank[rep] = rank
                        if is_ref_data:
                            hash2group[hash_id] = (rep, imp, 0, rank)
                        else:
                            hash2group[hash_id] = (rep, 0, 1, imp, rank)
                    elif not is_ref_data:
                        data_b = hash2group[hash_id]
                        if data_b[2] == 0:
                            # create a trend only in comparison to the ref topics;
                            # ignore cases where different topics have the same hash
                            # or the same topic was extracted twice from the same data
                            old_rep = data_b[0]
                            old_rank = data_b[3]
                            # the rank of a topic that appears in both corpora is
                            # the average of its two ranks
                            rep2rank[old_rep] = int((rank + old_rank) / 2)
                            change = float(imp) - float(data_b[1])
                            t_score = (topics_count - (old_rank + rank)) * abs(change)
                            hash2group[hash_id] = (old_rep, float(data_b[1]), 2, imp,
                                                   change, abs(change), t_score)  # trend phrase
                    ctr += 1
                except Exception as e:
                    logger.error('bad line: %s. Error: %s', str(ctr), str(e))
    except Exception as e:
        logger.error('Error: %s. Is %s a valid csv file?', str(e), str(data))
        sys.exit()
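
# Illustrative sketch only (not part of the original module): how merge_phrases()
# is typically driven over a reference scores file and then a target scores file.
# The file paths and the helper name example_merge below are hypothetical; the
# expected CSV rows are "<phrase>,<importance>", e.g. "deep learning,0.042".
# After the two calls, each hash2group value is one of:
#   (rep, ref_imp, 0, ref_rank)                            reference-only phrase
#   (rep, 0, 1, tar_imp, tar_rank)                         target-only phrase
#   (rep, ref_imp, 2, tar_imp, change, |change|, t_score)  phrase in both (a trend)
def example_merge(ref_csv, tar_csv, top_n=100):
    hash2group, rep2rank = {}, {}
    # total topic count across both files, used for the t_score weighting
    with open(ref_csv, encoding="utf-8", errors="ignore") as f:
        total = sum(1 for _ in f)
    with open(tar_csv, encoding="utf-8", errors="ignore") as f:
        total += sum(1 for _ in f)
    merge_phrases(ref_csv, True, hash2group, rep2rank, top_n, total)
    merge_phrases(tar_csv, False, hash2group, rep2rank, top_n, total)
    return hash2group, rep2rank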
def analyze(
        target_data,
        ref_data,
        tar_header,
        ref_header,
        top_n=10000,
        top_n_vectors=500,
        re_analysis=False,
        tfidf_w=0.5,
        cval_w=0.5,
        lm_w=0,
):
    """
    Compare the topics list of a target corpus to the topics list of a reference
    corpus and extract hot topics, trends and clusters. Topic lists can be
    generated by running topic_extraction.py

    Args:
        target_data: A list of topics with importance scores extracted from the
            target corpus
        ref_data: A list of topics with importance scores extracted from the
            reference corpus
        tar_header: The header to appear for the target topics graphs
        ref_header: The header to appear for the reference topics graphs
        top_n (int): Limit the analysis to only the top N phrases of each list
        top_n_vectors (int): The number of vectors to include in the scatter
        re_analysis (Boolean): Whether a first analysis has already been made
        tfidf_w (Float): The TF-IDF weight for the final score calculation
        cval_w (Float): The C-Value weight for the final score calculation
        lm_w (Float): The Language-Model weight for the final score calculation
    """
    hash2group = {}
    rep2rank = {}  # the merged set of group representatives from both corpora,
    # mapped to their ranks
    in_model_count = 0
    create_clusters = False
    try:
        if not re_analysis:  # first analysis, not through the ui
            # copy the initial scores files to constant filenames so the ui can
            # recognize them when re-analyzing
            copyfile(target_data, target_topics_path)
            copyfile(ref_data, ref_topics_path)
        calc_scores(target_data, tfidf_w, cval_w, lm_w, target_scores_path)
        calc_scores(ref_data, tfidf_w, cval_w, lm_w, ref_scores_path)

        # unify all topics:
        with open(ref_data, encoding="utf-8", errors="ignore") as f:
            topics1 = sum(1 for _ in f)
        with open(target_data, encoding="utf-8", errors="ignore") as f:
            topics2 = sum(1 for _ in f)
        sum_topics = topics1 + topics2
        logger.info("sum of all topics = %s", str(sum_topics))
        merge_phrases(ref_scores_path, True, hash2group, rep2rank, top_n, sum_topics)
        merge_phrases(target_scores_path, False, hash2group, rep2rank, top_n, sum_topics)
        logger.info("Total number of evaluated topics: %s", str(len(rep2rank)))
        all_topics_sorted = sorted(rep2rank, key=rep2rank.get)
        top_n_scatter = len(all_topics_sorted) if top_n_vectors is None else top_n_vectors

        # compute 2D space clusters if a word2vec model exists:
        w2v_loc = path.join(directory, "W2V_Models/model.bin")
        if os.path.isfile(w2v_loc):
            scatter_group = all_topics_sorted[0:top_n_scatter]
            np_scat, x_scat, y_scat, in_model_count = compute_scatter_subwords(
                scatter_group, w2v_loc)
            if np_scat is not None and x_scat is not None:
                create_clusters = True
                for j in range(len(np_scat)):
                    hash2group[simple_normalizer(np_scat[j])] += (x_scat[j], y_scat[j])

        # prepare reports data:
        groups_r = list(filter(lambda x: x[2] == 0 or x[2] == 2, hash2group.values()))
        groups_t = list(filter(lambda x: x[2] == 1 or x[2] == 2, hash2group.values()))
        trends = list(filter(lambda x: x[2] == 2, hash2group.values()))  # all trends
        groups_r_sorted = sorted(groups_r, key=operator.itemgetter(1), reverse=True)
        groups_t_sorted = sorted(groups_t, key=operator.itemgetter(3), reverse=True)
        trends_sorted = sorted(trends, key=operator.itemgetter(6), reverse=True)  # sort by t_score

        # save results:
        save_report_data(
            hash2group,
            groups_r_sorted,
            groups_t_sorted,
            trends_sorted,
            all_topics_sorted,
            create_clusters,
            tar_header,
            ref_header,
            tfidf_w,
            cval_w,
            lm_w,
            in_model_count,
            top_n_scatter,
        )
        logger.info("Done analysis.")
    except Exception as e:
        logger.error(str(e))
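
# Illustrative usage sketch only (not part of the original module): the file
# paths, header strings, and the helper name example_analysis below are
# hypothetical. analyze() expects two CSV topic lists produced by
# topic_extraction.py and stores the resulting report data via
# save_report_data(); with re_analysis=False it also copies the input files to
# the fixed locations the UI reads from.
def example_analysis():
    analyze(
        "data/target_topics.csv",     # topics extracted from the target corpus
        "data/reference_topics.csv",  # topics extracted from the reference corpus
        tar_header="Target corpus (2020)",
        ref_header="Reference corpus (2019)",
        top_n=10000,
        top_n_vectors=500,
        re_analysis=False,
        tfidf_w=0.5,
        cval_w=0.5,
        lm_w=0,
    )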