Example #1
def compute(score_lookup,
            word2vec_model,
            jate_terms_file,
            stopwords,
            jate_terms_folder,
            out_folder,
            append_label=None):
    kcr_lookup = score_lookup

    model = Word2Vec.load(word2vec_model)
    jate_term_base_scores = {
        c[0]: c[1]
        for c in tr.jate_terms_iterator(jate_terms_file)
    }
    term_unigrams = set()
    for term in jate_term_base_scores.keys():
        norm_parts = utils.normalize_string(term)
        for part in norm_parts:
            part = re.sub(r'[^a-zA-Z0-9,/\-\+\s_]', ' ', part).strip()
            if part in stopwords or len(part) < 2:
                continue
            term_unigrams.add(part)

    sum_unigram_scores = {}
    for tu in term_unigrams:
        sum_unigram_scores[tu] = kcr_lookup.get(tu, 0.0)

    sum_unigram_scores = utils.normalize(sum_unigram_scores)

    jate_terms_components = td.generate_term_component_map(
        jate_term_base_scores, 5, model)

    for file in os.listdir(jate_terms_folder):
        print("\t{}".format(file))
        jate_term_base_scores = {
            c[0]: c[1]
            for c in tr.jate_terms_iterator(jate_terms_folder + "/" + file)
        }
        term_rank_scores = ts.SemReRankScorer(sum_unigram_scores,
                                              jate_terms_components,
                                              jate_term_base_scores)
        out_file = out_folder + "/" + file
        if append_label is not None:
            out_file = out_file + "_" + append_label
        # sorted_term_rank_scores = sorted(list(term_rank_scores), key=lambda k: k['score'])
        with open(out_file, 'w') as outfile:
            json.dump(list(term_rank_scores), outfile)
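
A minimal sketch of how compute might be invoked; every path and the kcr score dictionary below are placeholders, not values from the original project.

# Hypothetical invocation of compute(); all arguments are placeholders.
if __name__ == '__main__':
    kcr_scores = {"ontology": 0.8, "gene": 0.5}  # assumed unigram score lookup
    compute(kcr_scores,
            "/data/embeddings/corpus.w2v.model",
            "/data/ate/base_term_scores.json",
            {"the", "of", "and"},
            "/data/ate/per_algorithm_scores",
            "/data/output/semrerank",
            append_label="kcr")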
def calc_word_freq(ttf_term_json, ttf_word_out_file):
    term_ttf_scores = {
        c[0]: c[1]
        for c in srr.jate_terms_iterator(ttf_term_json)
    }
    word_freq = {}

    for t, ttf in term_ttf_scores.items():
        parts = t.split(" ")

        for p in parts:
            p = exp_loader_doc_graph.lemmatizer.lemmatize(p).strip().lower()
            if p in stop or len(p) < 2:
                continue

            word_freq[p] = word_freq.get(p, 0) + ttf

    sorted_w_ttf = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

    with open(ttf_word_out_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for word, freq in sorted_w_ttf:
            writer.writerow([word, freq])
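
The aggregation in calc_word_freq can be shown on toy data; this standalone sketch uses a plain lowercase split and a tiny stopword set as stand-ins for the project's lemmatizer and stop list.

# Standalone illustration of folding per-term frequencies into per-word frequencies;
# lower()/split() below is a stand-in for the project's lemmatizer.
term_ttf = {"gene expression": 12, "expression level": 7, "the cell": 3}
stopwords = {"the"}
word_freq = {}
for term, ttf in term_ttf.items():
    for w in term.lower().split():
        if w in stopwords or len(w) < 2:
            continue
        word_freq[w] = word_freq.get(w, 0) + ttf
print(sorted(word_freq.items(), key=lambda x: x[1], reverse=True))
# prints: [('expression', 19), ('gene', 12), ('level', 7), ('cell', 3)]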
Example #3
def run_random_baseline(word2vec_model, jate_terms_file, stopwords,
                        jate_terms_folder, out_folder):
    model = None
    if word2vec_model is not None:
        model = Word2Vec.load(word2vec_model)
    jate_term_base_scores = {
        c[0]: c[1]
        for c in tr.jate_terms_iterator(jate_terms_file)
    }
    term_unigrams = set()
    for term in jate_term_base_scores.keys():
        norm_parts = utils.normalize_string(term)
        for part in norm_parts:
            part = re.sub(r'[^a-zA-Z0-9,/\-\+\s_]', ' ', part).strip()
            if part in stopwords or len(part) < 2:
                continue
            term_unigrams.add(part)

    sum_unigram_scores = {}
    for tu in term_unigrams:
        sum_unigram_scores[tu] = 0.0
    sum_unigram_scores = utils.randomize(sum_unigram_scores)

    jate_terms_components = td.generate_term_component_map(
        jate_term_base_scores, 5, model)

    for file in os.listdir(jate_terms_folder):
        jate_term_base_scores = {
            c[0]: c[1]
            for c in tr.jate_terms_iterator(jate_terms_folder + "/" + file)
        }
        term_rank_scores = ts.SemReRankScorer(sum_unigram_scores,
                                              jate_terms_components,
                                              jate_term_base_scores)
        out_file = out_folder + "/" + file + "-random"
        # sorted_term_rank_scores = sorted(list(term_rank_scores), key=lambda k: k['score'])
        with open(out_file, 'w') as outfile:
            json.dump(list(term_rank_scores), outfile)
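
run_random_baseline zeroes every unigram score and then randomizes it via utils.randomize, whose implementation is not shown in this snippet; a plausible stand-in (an assumption, not the project's code) would be:

import random

# Assumed behaviour of utils.randomize(): give each unigram a random score in [0, 1).
def randomize(scores: dict) -> dict:
    return {k: random.random() for k in scores}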
def find_terms_with_infrequent_words(ttf_term_json,
                                     ttf_word_csv_file,
                                     min_ttf_word,
                                     out_file,
                                     gs_file=None):
    term_ttf_scores = {
        c[0]: c[1]
        for c in srr.jate_terms_iterator(ttf_term_json)
    }
    word_ttf_scores = {}
    with open(ttf_word_csv_file, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            word_ttf_scores[row[0]] = row[1]
    gs_terms = None
    if gs_file is not None:
        gs_terms = []
        with open(gs_file) as f:
            terms = f.readlines()
            for t in terms:
                t = exp_loader_doc_graph.lemmatizer.lemmatize(
                    t).strip().lower()
                if len(t) >= 2:
                    gs_terms.append(t)

    selected_terms = []
    for t, ttf in term_ttf_scores.items():
        parts = t.split(" ")

        all_words_infrequent = True
        for p in parts:
            p = exp_loader_doc_graph.lemmatizer.lemmatize(p).strip().lower()
            if p in stop or len(p) < 2:
                continue

            if p in word_ttf_scores:
                wtf = word_ttf_scores[p]
                if float(wtf) >= min_ttf_word:
                    all_words_infrequent = False
                    break

        if all_words_infrequent:
            # keep the term when no gold-standard list is given, or when it is in the list
            if gs_terms is None or t in gs_terms:
                selected_terms.append(t)

    selected_terms = sorted(selected_terms)
    with open(out_file, 'w') as the_file:
        for t in selected_terms:
            the_file.write(t + '\n')
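
The selection rule keeps a term only when none of its content words reaches min_ttf_word; a self-contained illustration on made-up counts (lowercasing stands in for the lemmatizer):

# Toy illustration of the "all words infrequent" filter.
word_ttf = {"gene": 250.0, "expression": 180.0, "plastid": 3.0, "operon": 4.0}
stopwords = {"the", "of"}
min_ttf_word = 10

def all_words_infrequent(term):
    for w in term.lower().split():
        if w in stopwords or len(w) < 2:
            continue
        if word_ttf.get(w, 0.0) >= min_ttf_word:
            return False
    return True

print(all_words_infrequent("plastid operon"))   # True: every content word is rare
print(all_words_infrequent("gene expression"))  # False: both words are frequent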
Example #5
def analyze_node_degree(folder, jate_terms_file, stopwords, out_folder, folder_base_pattern):
    simTs = set()
    topNs = set()
    jate_term_base_scores = {c[0]: c[1] for c in tr.jate_terms_iterator(jate_terms_file)}
    term_unigrams = set()
    for term in jate_term_base_scores.keys():
        norm_parts = utils.normalize_string(term)
        for part in norm_parts:
            part = re.sub(r'[^a-zA-Z0-9,/\-\+\s_]', ' ', part).strip()
            if part in stopwords or len(part) < 2:
                continue
            term_unigrams.add(part)

    for file in os.listdir(folder):
        graph_type_folder = os.path.basename(file)
        parts = graph_type_folder.split("-")
        topN = parts[-2]
        simT = parts[-1]
        simTs.add(simT)
        topNs.add(topN)

    simTs = sorted(simTs)
    topNs = sorted(topNs)

    for simT in simTs:
        print(simT)

        for topN in topNs:
            target_folder = folder + "/" + folder_base_pattern + "-" + topN + "-" + simT
            print(target_folder)
            node_stats = calculate_node_stats(target_folder, term_unigrams)

            anynode_degrees = node_stats[0]
            termnode_degrees = node_stats[1]

            out_file = out_folder + "/" + folder_base_pattern + "-" + topN + "-" + simT + ".csv"
            with open(out_file, 'w') as f:
                for i in range(len(anynode_degrees)):
                    line = str(anynode_degrees[i])
                    if len(termnode_degrees) > i:
                        line += "," + str(termnode_degrees[i])
                    line += "\n"
                    f.write(line)
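
analyze_node_degree assumes each graph folder name ends with "-<topN>-<simT>"; a minimal standalone parse of that convention, on a made-up name:

# The trailing two dash-separated fields carry the topN and simT settings.
name = "words-graph-100-0.8"  # made-up folder name
parts = name.split("-")
topN, simT = parts[-2], parts[-1]
print(topN, simT)  # prints: 100 0.8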
Example #6
def analyse_threshold(word2vec_model, jate_terms_file, stopwords, out_file, term_only):
    model = Word2Vec.load(word2vec_model)
    jate_term_base_scores = {c[0]: c[1] for c in tr.jate_terms_iterator(jate_terms_file)}
    term_unigrams = set()
    for term in jate_term_base_scores.keys():
        norm_parts = utils.normalize_string(term)
        for part in norm_parts:
            part = re.sub(r'[^a-zA-Z0-9,/\-\+\s_]', ' ', part).strip()
            if part in stopwords or len(part) < 2:
                continue
            term_unigrams.add(part)

    print(len(term_unigrams))
    simTs = [0.9, 0.8, 0.7, 0.6, 0.5]
    with open(out_file, 'w') as f:
        f.write("UNIGRAM, 0.9, 0.8, 0.7, 0.6, 0.5\n")
        for unigram in term_unigrams:
            if unigram not in model.wv.vocab:
                continue
            # neighbours come back sorted by similarity, highest first
            similar = model.wv.most_similar(positive=unigram, topn=100000)
            line = "\"" + unigram + "\","
            for simT in simTs:
                count = 0
                for item in similar:
                    if term_only and item[0] not in term_unigrams:
                        continue
                    if item[1] < simT:
                        break
                    count += 1
                line += str(count) + ","
            f.write(line + "\n")
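
Because most_similar returns neighbours in descending similarity order, the per-threshold count can stop at the first score below the threshold; a standalone version of that inner loop on made-up (word, similarity) pairs:

# Count neighbours at or above each threshold, assuming the list is already
# sorted by similarity in descending order (as gensim's most_similar returns it).
similar = [("genome", 0.91), ("protein", 0.83), ("sequence", 0.74), ("cell", 0.52)]
for simT in [0.9, 0.8, 0.7, 0.6, 0.5]:
    count = 0
    for word, sim in similar:
        if sim < simT:
            break
        count += 1
    print(simT, count)
# prints: 0.9 1 / 0.8 2 / 0.7 3 / 0.6 3 / 0.5 4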
def calc_movement_stats(base_ate_outlist_json, srr_ate_outlist_json,
                        term_filter_list_file, word_ttf_scores: dict):
    with open(term_filter_list_file) as f:
        filter_terms = f.readlines()
    base_ate_scores = {
        c[0]: c[1]
        for c in srr.jate_terms_iterator(base_ate_outlist_json)
    }
    sorted_base_ate = sorted(base_ate_scores,
                             key=base_ate_scores.get,
                             reverse=True)
    ssr_ate_scores = {
        c[0]: c[1]
        for c in ut.semrerank_json_reader(srr_ate_outlist_json)
    }
    sorted_ssr_ate = sorted(ssr_ate_scores,
                            key=ssr_ate_scores.get,
                            reverse=True)

    # key: lower bound of a 0.05-wide movement range (-1.00 .. 0.95);
    # value: number of terms whose rank movement falls in that range
    distribution = {}
    # key: same range; value: sum of the average word freq of the terms in that range
    sum_avg_word_freq = {}
    for i in range(-20, 20):
        key = format(i * 0.05, '.2f')
        distribution[key] = 0
        sum_avg_word_freq[key] = 0

    count_advances = 0
    count_advanced_percentages = 0
    count_drops = 0
    count_drop_percentages = 0
    for f_t in filter_terms:
        f_t = f_t.strip()
        if f_t not in sorted_base_ate:
            continue

        avg_word_freq = calc_avg_word_freq(f_t, word_ttf_scores)
        ate_index = sorted_base_ate.index(f_t)
        ssr_index = sorted_ssr_ate.index(f_t)

        # >0 means the term moved up after re-ranking; <0 means it dropped
        advance = (ate_index - ssr_index) / len(sorted_base_ate)

        if advance > 0:
            count_advanced_percentages += advance
            count_advances += 1
        elif advance < 0:
            count_drop_percentages += advance
            count_drops += 1

        # Bucket the movement into the 0.05-wide range it falls in, keyed by the
        # lower bound of that range (e.g. 0.07 -> '0.05', -0.12 -> '-0.15').
        # The small epsilon guards against values such as 0.35 that are stored
        # just below a bucket boundary; advance == 1.0 folds into '0.95'.
        bucket = min(max(int(np.floor(advance * 20 + 1e-9)), -20), 19)
        key = format(bucket * 0.05, '.2f')
        distribution[key] += 1
        sum_avg_word_freq[key] += avg_word_freq

    avg_adv = 0
    avg_drop = 0
    if count_advances > 0:
        avg_adv = count_advanced_percentages / count_advances
    if count_drops > 0:
        avg_drop = count_drop_percentages / count_drops
    print("avg advance,{},avg drop,{}".format(avg_adv, avg_drop))

    avg_word_freq = {}
    for mv_range, sum_avg_w_f in sum_avg_word_freq.items():
        samples = distribution[mv_range]
        if samples > 0:
            avg_word_freq[mv_range] = sum_avg_w_f / samples
        else:
            avg_word_freq[mv_range] = 0

    # sort the movement ranges numerically (a plain string sort would order '-0.05' before '-1.00')
    return (OrderedDict(sorted(distribution.items(), key=lambda kv: float(kv[0]))),
            OrderedDict(sorted(avg_word_freq.items(), key=lambda kv: float(kv[0]))))
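
The bucketing used above maps a movement value to the lower bound of its 0.05-wide range; a few worked values, runnable on their own:

import numpy as np

# Worked examples of the 0.05-wide bucket assignment from calc_movement_stats.
for advance in (-1.0, -0.12, 0.0, 0.07, 0.35, 1.0):
    bucket = min(max(int(np.floor(advance * 20 + 1e-9)), -20), 19)
    print(advance, format(bucket * 0.05, '.2f'))
# prints: -1.0 -1.00 / -0.12 -0.15 / 0.0 0.00 / 0.07 0.05 / 0.35 0.35 / 1.0 0.95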