def build_series_corpus(corpus: Corpus, annotated_series_corpus_path: str,
                        number_of_subparts: int):
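    """Filter out too-small documents, split the remaining ones into fake series
    parts stored at `annotated_series_corpus_path`, and return the reloaded corpus."""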
    corpus = Preprocesser.filter_too_small_docs_from_corpus(corpus)
    corpus.fake_series(series_corpus_dir=annotated_series_corpus_path,
                       number_of_sub_parts=number_of_subparts)
    return Corpus.fast_load(path=annotated_series_corpus_path,
                            load_entities=False)
    @classmethod
    def filter_thresholds(cls, dir_path: str, parallel: bool = False):
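        """For each configured data set, load (or annotate and cache) the corpus and
        compute vocabulary sizes for every filter threshold, optionally in parallel."""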
        data_set_bar = tqdm(cls.data_sets, total=len(cls.data_sets), desc="2 Operate on dataset!!")
        for data_set in data_set_bar:
            data_set_bar.set_description(f'2 Operate on dataset >{data_set}<')
            data_set_bar.refresh()
            annotated_corpus_path = os.path.join(cls.config["system_storage"]["corpora"], data_set)
            try:
                corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)
            except FileNotFoundError:
                corpus = DataHandler.load_corpus(data_set)
                print('corpus loaded')
                # corpus = Preprocesser.annotate_corpus(corpus, without_spacy=False)
                # corpus.save_corpus_adv(annotated_corpus_path)
                Preprocesser.annotate_and_save(corpus,  corpus_dir=annotated_corpus_path, without_spacy=False)
                print('annotated corpus')
                del corpus
                corpus = Corpus.fast_load(path=annotated_corpus_path, load_entities=False)

                # print('saved corpus')

            if cls.absolute:
                thresholds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                              11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                              25, 50, 100, #1000, 2000, 3000,
                              len(corpus)
                              ]
            else:
                thresholds = cls.thresholds

            threshold_bar = tqdm(thresholds, total=len(thresholds), desc="3 Calculate filter_mode results")
            if parallel:
                Parallel(n_jobs=cls.num_cores)(
                    delayed(CommonWordsExperiment.calculate_vocab_sizes)(corpus, t, data_set=data_set,
                                                                         dir_path=dir_path)
                    for t in threshold_bar)
            else:
                res = {t: CommonWordsExperiment.calculate_vocab_sizes(corpus, t, data_set=data_set,
                                                                      dir_path=dir_path)
                       for t in threshold_bar}

                with open(os.path.join(dir_path, 'all.json'), 'w', encoding='utf-8') as outfile:
                    json.dump(res, outfile, indent=1)
Example #3
def get_corpus_summary_sentence_list(corpus: Corpus, lemma: bool, lower: bool):
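    """Collect the summary sentence list of every document in the corpus, based on
    the corpus-wide summary dictionary from Summarizer.get_summary."""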
    corpus_summary = []
    corpus_summary_dict = Summarizer.get_summary(corpus)
    _, doc_ids = corpus.get_texts_and_doc_ids()
    for doc_id in doc_ids:
        document = corpus.documents[doc_id]
        corpus_summary.append(Summarizer.document_summary_list(document,
                                                               corpus_summary_dict,
                                                               lemma=lemma,
                                                               lower=lower))
    return corpus_summary
def corpus2plain_text_dir(source_path: str):
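    """Export each document of an annotated corpus as a plain-text file
    (one sentence per line) into a `<corpus>_plain` directory."""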
    corpus = Corpus.fast_load(path=source_path, load_entities=False)

    new_dir = os.path.join(config["system_storage"]["corpora"], 'plain_text',
                           f'{os.path.basename(source_path)}_plain')
    print(new_dir)
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    for doc_id, d in corpus.documents.items():
        doc_path = os.path.join(new_dir, f'{doc_id}_{d.language}.txt')
        with open(doc_path, 'w', encoding="utf-8") as writer:
            writer.write('\n'.join([
                ' '.join(sent.representation())
                for sent in d.get_sentences_from_disk()
            ]))
    @staticmethod
    def most_similar_documents(model: Union[Doc2Vec, DocumentKeyedVectors],
                               corpus: Corpus,
                               positives: Union[List[str], str],
                               negatives: Union[List[str], str] = None,
                               topn: int = 10,
                               restrict_to_same: bool = True,
                               feature_to_use: str = None,
                               print_results: bool = True,
                               series: bool = False):
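        """Return the top-n documents most similar to the given positives (and optional
        negatives), keeping only results whose base doc id occurs in the corpus."""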

        positive_list = Vectorization.get_list(positives, model,
                                               feature_to_use)
        negative_list = Vectorization.get_list(negatives, model,
                                               feature_to_use)

        if restrict_to_same:
            results = Vectorization.get_ordered_results_of_same_type(
                model, positives, positive_list, negative_list, feature_to_use,
                series)
        else:
            results = model.docvecs.most_similar(positive=positive_list,
                                                 negative=negative_list,
                                                 topn=len(
                                                     model.docvecs.doctags))
        results = [
            result for result in results
            if corpus.vector_doc_id_base_in_corpus(result[0])
        ]

        results = results[:topn]

        if print_results:
            for result in results:
                index, sim = result
                print(index, corpus.id2desc(index), sim)
        return results
Example #6
    @classmethod
    def run_experiment(cls, parallel: bool = False):
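        """Evaluate all vectorization algorithms per data set and filter mode and write
        the Spearman correlation results (full/short/medium/long texts) to CSV and LaTeX."""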
        # res = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: dict())))
        for data_set in tqdm(cls.data_sets,
                             total=len(cls.data_sets),
                             desc=f"Evaluate datasets"):
            for filter_mode in tqdm(cls.filters,
                                    total=len(cls.filters),
                                    desc=f"Evaluate filters"):
                corpus = Corpus.fast_load("all",
                                          "no_limit",
                                          data_set,
                                          filter_mode,
                                          "real",
                                          load_entities=False)

                vec_bar = tqdm(cls.vectorization_algorithms,
                               total=len(cls.vectorization_algorithms),
                               desc=f"Evaluate algorithm")
                if parallel:
                    tuple_list_results = Parallel(n_jobs=cls.num_cores)(
                        delayed(TextLengthExperiment.eval_vec_loop_eff)(
                            corpus, "all", "no_limit", data_set, filter_mode,
                            vectorization_algorithm)
                        for vectorization_algorithm in vec_bar)
                else:
                    tuple_list_results = [
                        TextLengthExperiment.eval_vec_loop_eff(
                            corpus, "all", "no_limit", data_set, filter_mode,
                            vectorization_algorithm)
                        for vectorization_algorithm in vec_bar
                    ]

                full_df = pd.DataFrame(tuple_list_results,
                                       columns=[
                                           'Algorithm', 'Full Spearman [p]',
                                           'Short Spearman [p]',
                                           'Medium Spearman [p]',
                                           'Long Spearman [p]'
                                       ])

                full_df.to_csv(os.path.join('../results',
                                            'text_length_experiment',
                                            'text_length_spearman.csv'),
                               index=False)
                full_df.to_latex(os.path.join('../results',
                                              'text_length_experiment',
                                              'text_length_spearman.tex'),
                                 index=False)
Example #7
def get_summary(corpus: Corpus):
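    """Load the cached sentence-id summary dictionary of the root corpus, generating
    and saving it first if it does not exist yet."""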
    if corpus.root_corpus_path is None:
        raise UserWarning("No root corpus set!")
    corpus_root_path = corpus.root_corpus_path
    summary_dict_path = os.path.join(corpus_root_path, "sent_ids.json")
    if not os.path.isfile(summary_dict_path):
        summary_dict = {}
        print("train summary")
        root_corpus = Corpus.fast_load(path=corpus_root_path, load_entities=False)
        for doc_id, doc in root_corpus.documents.items():
            sents, ids = Summarizer.generate_summary_of_corpus_doc(doc, 20)
            # print(doc_id, ":", ids, [' '.join(sent) for sent in sents])
            summary_dict[doc_id] = ids
        with open(summary_dict_path, 'w', encoding='utf-8') as fp:
            json.dump(summary_dict, fp, indent=1)
    else:
        with open(summary_dict_path) as json_file:
            summary_dict = json.load(json_file)
    return summary_dict
Example #8


if __name__ == '__main__':
    # c = Corpus.fast_load(path="corpora/german_series", load_entities=False)
    #
    # vec_path = Vectorizer.build_vec_file_name("all",
    #                                           "no_limit",
    #                                           "german_series",
    #                                           "no_filter",
    #                                           "book2vec",
    #                                           "real")

    c = Corpus.fast_load(path="../corpora/classic_gutenberg",
                         load_entities=False)

    vec_path = Vectorization.build_vec_file_name("",
                                                 "",
                                                 "classic_gutenberg",
                                                 "no_filter",
                                                 "book2vec_adv",
                                                 "real",
                                                 allow_combination=True)

    vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)

    Vectorization.most_similar_documents(vecs,
                                         c,
                                         positives="cb_18",
                                         feature_to_use="atm")
Example #9
    @staticmethod
    def train_lda(corpus: Corpus):
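        """Train an LDA topic model (15 topics) on the stopword- and punctuation-filtered
        corpus and return the topic words per document as dict and list."""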
        # def make_bigrams(texts):
        #     return [bigram_mod[doc] for doc in texts]

        def make_trigrams(texts):
            return [trigram_mod[bigram_mod[doc]] for doc in texts]
        # c.filter("ne")
        # c.filter("V")
        corpus = corpus.filter_on_copy("stopwords")
        corpus = corpus.filter_on_copy("punctuation")
        # data_words = [document.get_flat_document_tokens(lemma=True, lower=True)
        #               for doc_id, document in c.documents.items()]
        data_words = corpus.get_flat_document_tokens(lemma=True, lower=True)
        id2doc_id = {i: doc_id for i, doc_id in enumerate(corpus.documents.keys())}

        bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
        bigram_mod = gensim.models.phrases.Phraser(bigram)

        trigram = gensim.models.Phrases(bigram[data_words], threshold=150)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        data_lemmatized = make_trigrams(data_words)

        id2word = corpora.Dictionary(data_lemmatized)
        corpus = [id2word.doc2bow(text) for text in data_lemmatized]

        # limit = 40
        # start = 2
        # step = 6
        # lda_model, coherence, num_topics = compute_coherence_values(dictionary=id2word,
        #                                                             corpus=corpus,
        #                                                             texts=data_lemmatized,
        #                                                             start=start,
        #                                                             limit=limit,
        #                                                             step=step,
        #                                                             id2word)
        # print(coherence, num_topics)

        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=15,
                                                    random_state=100,
                                                    update_every=1,
                                                    iterations=100,
                                                    chunksize=100,
                                                    passes=50,
                                                    alpha='auto',
                                                    minimum_probability=0.0,
                                                    per_word_topics=True)

        # os.environ.update({'MALLET_HOME': r'C:/mallet_new/mallet-2.0.8'})
        # mallet_path = "bin\\mallet"
        # print(mallet_path)
        # lda_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=15, id2word=id2word,
        #                                              alpha='auto', random_seed=42)

        # print(lda_model.print_topics())

        # Compute Coherence Score
        # coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word,
        #                                      coherence='c_v')
        # coherence_lda = coherence_model_lda.get_coherence()
        # print('\nCoherence Score: ', coherence_lda)

        content_aspect_dict = TopicModeller.get_topic_words_for_docs(lda_model, corpus, id2doc_id)
        content_aspect_list = [texts for doc_id, texts in content_aspect_dict.items()]
        # print(content_aspect_list)
        # print(content_aspect_dict)
        return content_aspect_dict, content_aspect_list
Example #10
            _, _, topic_model, lda_corpus, doc_ids = TopicModeller.train_lda_mem_eff(corpus)
            topic_vectors = {}
            # print(len(lda_corpus))
            # print(doc_ids)
            for i, doc_id in enumerate(doc_ids):
                doc = lda_corpus[i]
                topic_vectors[doc_id] = np.array([score for (topic, score) in topic_model[doc][0]])

            # print(topic_vectors)
            Vectorization.my_save_doc2vec_format(fname=f'D:/models/topic_vectors/{dataset}.kv',
                                                 doctag_vec=topic_vectors)

        topic_vecs, _ = Vectorization.my_load_doc2vec_format(f'D:/models/topic_vectors/{dataset}.kv')

        # print(topic_vecs.docvecs.doctags)
        # for doctag in topic_vecs.docvecs.doctags:
        #     print(doctag, topic_vecs.docvecs.most_similar(doctag, topn=None))
        # print(topic_model[lda_corpus[0]])
        # for document in topic_model:
        #     doc_id = ...
        #     gensim_doc_id = ...
        #     topic_vectors[doc_id] = topic_model[lda_corpus[gensim_doc_id]]
        return topic_vecs


if __name__ == "__main__":
    data_set_name = "classic_gutenberg"
    c = Corpus.load_corpus_from_dir_format(os.path.join(f"corpora/{data_set_name}"))
    # d = TopicModeller.train_lda(c)
    TopicModeller.get_topic_distribution(c, data_set_name, overwrite=True)
Example #11
def corpus_stats(data_sets: List[str]):
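    """Compute token, vocabulary, author, series, and genre statistics for each data set
    and write them as a CSV table and LaTeX output."""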
    tuples = []

    for data_set_name in data_sets:
        corpus = Corpus.fast_load("all",
                                  "no_limit",
                                  data_set_name,
                                  "no_filter",
                                  "real",
                                  load_entities=False)
        if corpus.language == Language.DE:
            language = "GER"
        else:
            language = "EN"
        nr_books = human_format(len(corpus.documents))

        document_tokens = [
            document.length for document in corpus.documents.values()
        ]
        tokens_total = human_format(sum(document_tokens))
        tokens_avg = f'{np.mean(document_tokens):.0f} ± {np.std(document_tokens):.0f}'
        # tokens_median = f'{np.median(document_tokens):.0f} ± {iqr(document_tokens):.0f}'
        tokens_median = f'{human_format(np.median(document_tokens))}'
        tokens_iqr = f'{human_format(iqr(document_tokens))}'
        tokens_min = f'{human_format(np.min(document_tokens))}'
        tokens_max = f'{human_format(np.max(document_tokens))}'
        document_vocab = [
            document.vocab_size for document in corpus.documents.values()
        ]
        vocab_total = human_format(sum(document_vocab))
        vocab_avg = f'{np.mean(document_vocab):.0f} ± {np.std(document_vocab):.0f}'
        # vocab_median = f'{np.median(document_vocab):.0f} ± {iqr(document_vocab):.0f}'
        vocab_median = f'{human_format(np.median(document_vocab))}'
        vocab_iqr = f'{human_format(iqr(document_vocab))}'
        # vocab_mix = f'[{human_format(np.min(document_vocab))}, {human_format(np.max(document_vocab))}]'
        vocab_min = f'{human_format(np.min(document_vocab))}'
        vocab_max = f'{human_format(np.max(document_vocab))}'

        document_sents = [
            document.sentences_nr for document in corpus.documents.values()
        ]
        sents_total = sum(document_sents)
        sents_avg = f'{np.mean(document_sents):.0f} ± {np.std(document_sents):.0f}'
        sents_median = f'{np.median(document_sents):.0f} ± {iqr(document_sents):.0f}'

        author_dict = defaultdict(list)
        for doc_id, document in corpus.documents.items():
            author_dict[document.authors].append(doc_id)

        print({
            author: len(doc_ids)
            for author, doc_ids in author_dict.items() if author is not None
        })
        author_vals = [
            len(doc_ids) for author, doc_ids in author_dict.items()
            if author is not None
        ]

        author_median = f'{np.median(author_vals):.0f} ± {iqr(author_vals):.0f} [{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        # author_mean = f'{np.mean(author_vals):.2f} ± {np.std(author_vals):.2f} [{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        author_mean = f'{np.mean(author_vals):.2f}'
        author_std = f'{np.std(author_vals):.2f}'
        author_mix = f'[{np.min(author_vals):.0f}, {np.max(author_vals):.0f}]'
        author_max = f'{np.max(author_vals):.0f}'

        print(data_set_name, "Author median iqr / mean std", author_median,
              author_mean)
        if corpus.series_dict and len(corpus.series_dict) > 0:
            series_vals = [
                len(doc_ids)
                for series_id, doc_ids in corpus.series_dict.items()
                if series_id is not None
            ]
            series_median = f'{np.median(series_vals):.0f} ± {iqr(series_vals):.0f} [{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'
            # series_mean = f'{np.mean(series_vals):.2f} ± {np.std(series_vals):.2f} [{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'

            series_mean = f'{np.mean(series_vals):.2f}'
            series_std = f'{np.std(series_vals):.2f}'
            series_mix = f'[{np.min(series_vals):.0f}, {np.max(series_vals):.0f}]'

            series_max = f'{np.max(series_vals):.0f}'
            print(data_set_name, "Series median iqr / mean std", series_median,
                  series_mean)
        else:
            series_median = "-"
            series_mean = "-"
            series_std = "-"
            series_mix = "-"

        if corpus.shared_attributes_dict is None:
            corpus.calculate_documents_with_shared_attributes()
        if corpus.shared_attributes_dict["same_genres"] and len(
                corpus.shared_attributes_dict["same_genres"]) > 1:
            genre_vals = [
                len(doc_ids) for genre, doc_ids in
                corpus.shared_attributes_dict["same_genres"].items()
                if genre is not None
            ]
            # print(genre_vals)
            genre_median = f'{np.median(genre_vals):.0f} ± {iqr(genre_vals):.0f} [{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'
            # genre_mean = f'{np.mean(genre_vals):.2f} ± {np.std(genre_vals):.2f} [{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'
            genre_mean = f'{np.mean(genre_vals):.2f}'
            genre_std = f'{np.std(genre_vals):.2f}'
            genre_mix = f'[{np.min(genre_vals):.0f}, {np.max(genre_vals):.0f}]'

            print(data_set_name, "Genre median iqr / mean std", genre_median,
                  genre_mean)
        else:
            genre_median = "-"
            genre_mean = "-"
            genre_std = "-"
            genre_mix = "-"

        # if corpus and len(corpus.series_dict) > 0:
        #     series_median = np.median([len(doc_ids) for series_id, doc_ids in corpus.series_dict.items()])

        tuples.append((
            data_set_name,
            nr_books,
            language,
            tokens_total,
            tokens_median,
            tokens_iqr,
            tokens_min,
            tokens_max,
            vocab_total,
            vocab_median,
            vocab_iqr,
            vocab_min,
            vocab_max,
            author_mean,
            author_std,
            author_mix,
            series_mean,
            series_std,
            series_mix,
            genre_mean,
            genre_std,
            genre_mix,
        ))
    df = pd.DataFrame(
        tuples,
        columns=[
            "Data set", "Amount of Books", "Language", "Total Tokens",
            "Tokens Median", "Tokens IQR", "Tokens Min", "Tokens Max",
            "Total Vocabulary", "Vocabulary Median", "Vocabulary IQR",
            "Vocabulary Min", "Vocabulary Max", "Author Mean", "Author STD",
            "Author [Min, Max]", "Series Mean", "Series STD",
            "Series [Min, Max]", "Genre Mean", "Genre STD", "Genre [Min, Max]"
            # "Books by Same Author ± STD [Min, Max]",
            # "Books by Same Series ± STD [Min, Max]",
            # "Books by Same Genre ± STD [Min, Max]",
            # "Total Sentences", "Sentences Mean [STD]", "Sentences Median [IQR]",
        ],
        index=data_sets)
    df = df.transpose()
    print(df)
    df.to_csv("results/dataset_stats/sizes.csv", index=True)
    print(df.to_latex(index=True))


if __name__ == '__main__':
    # data_set_name = "classic_gutenberg"
    # data_set_name = "german_books"
    data_set_name = "goodreads_genres"
    vectorization_algorithm = "book2vec"
    filter = "no_filter"  # "specific_words_strict"  # "no_filter"
    vec_path = Vectorization.build_vec_file_name("all",
                                                 "no_limit",
                                                 data_set_name,
                                                 filter,
                                                 vectorization_algorithm,
                                                 "real",
                                                 allow_combination=True)
    vecs, summation_method = Vectorization.my_load_doc2vec_format(vec_path)

    c = Corpus.fast_load("all",
                         "no_limit",
                         data_set_name,
                         filter,
                         "real",
                         load_entities=False)

    tsne_plot(vecs, c)
    # neighbor_plot(vecs, c)
    force_directed_graph(vecs, c)
Example #13
def get_neighbors(data_sets: List[str], vector_names: List[str]):
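    """For every document and facet, look up the most similar documents per embedding
    type and collect them into a neighbor table written to CSV."""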
    doc_top_n = 3
    facet_names = [
        #     "loc",
        #     "time",
        #     "atm",
        #     "sty",
        "cont",
        # "plot"
    ]
    is_series_corpus = False
    tuples = []
    columns = None
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set),
                                  load_entities=False)
        for vector_name in tqdm(vector_names,
                                desc="Iterate through embedding types",
                                total=len(vector_names)):
            vec_path = Vectorization.build_vec_file_name(
                "all",
                "no_limit",
                data_set,
                "no_filter",
                vector_name,
                "real",
                allow_combination=True)

            vectors, _ = Vectorization.my_load_doc2vec_format(vec_path)

            for doc_id in corpus.documents.keys():
                for facet_name in facet_names:
                    sim_docs = Vectorization.most_similar_documents(
                        vectors,
                        corpus,
                        positives=doc_id,
                        topn=doc_top_n,
                        feature_to_use=facet_name,
                        print_results=False,
                        series=is_series_corpus)[1:]
                    if len(sim_docs) == 2:
                        tuples.append(
                            (data_set, vector_name, facet_name,
                             table_format(corpus.documents[doc_id]), 1,
                             table_format(corpus.documents[replace_sim_id(
                                 sim_docs[0][0])]),
                             table_format(corpus.documents[replace_sim_id(
                                 sim_docs[1][0])])))
                        columns = [
                            "Dataset", "Algorithm", "Facet", "Book", "Rank",
                            "First Neighbor", "Second Neighbor"
                        ]
                    else:
                        for i, (sim_doc_id, sim) in enumerate(sim_docs):
                            tuples.append(
                                (data_set, vector_name, facet_name,
                                 table_format(corpus.documents[doc_id]), i,
                                 table_format(corpus.documents[replace_sim_id(
                                     sim_doc_id)]), sim))
                        columns = [
                            "Dataset", "Algorithm", "Facet", "Book", "Rank",
                            "Similar Book", "Similarity"
                        ]
    df = pd.DataFrame(tuples, columns=columns)
    df.to_csv("results/neighbors/neighbors.csv")

    print(df)
Example #14
def calculate_facet_scores(data_sets: List[str],
                           vector_names: List[str],
                           facets: List[str],
                           use_topic_vecs: bool = False):
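    """Score facet-specific embeddings per data set with FacetEfficientEvaluation and
    export relaxed, strict, and facet-only results as CSV and LaTeX."""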
    results = []
    for data_set in data_sets:
        corpus = Corpus.fast_load(path=os.path.join('../corpora', data_set),
                                  load_entities=False)
        topic_dict = None
        summary_dict = None
        if "cont" in facets:
            topic_dict = TopicModeller.topic_modelling(corpus)
        if "plot" in facets:
            summary_dict = Summarizer.get_summary(corpus)
        start_time = time.time()
        if use_topic_vecs:
            topic_vecs = TopicModeller.get_topic_distribution(corpus, data_set)
        else:
            topic_vecs = None
        for vector_name in tqdm(vector_names,
                                desc="Iterate through embedding types",
                                total=len(vector_names)):
            # print('---')
            vec_path = Vectorization.build_vec_file_name(
                "all",
                "no_limit",
                data_set,
                "no_filter",
                vector_name,
                "real",
                allow_combination=True)

            vecs, _ = Vectorization.my_load_doc2vec_format(vec_path)
            adv_mode = False
            if "_adv" in vector_name:
                adv_mode = True
            fee = FacetEfficientEvaluation(vectors=vecs,
                                           corpus=corpus,
                                           data_set_name=data_set,
                                           facet_names=facets,
                                           topic_vectors=topic_vecs)
            fac_relaxed_scores, fac_strict_scores, fac_strict_fac_only = fee.evaluate(
                word_top_n=100,
                topic_dict=topic_dict,
                summary_dict=summary_dict,
                adv_mode=adv_mode)

            for fac_name in facets:
                results.append(
                    (data_set, vector_name, fac_name,
                     fac_relaxed_scores[fac_name], fac_strict_scores[fac_name],
                     fac_strict_fac_only[fac_name]))

        tuples = []
        for result in results:
            data_set, vector_name, fac_name, relaxed_scores, strict_scores, fac_only_scores = result
            tuples.append((data_set, fac_name, vector_name,
                           sum(relaxed_scores) / len(relaxed_scores),
                           sum(strict_scores) / len(strict_scores),
                           sum(fac_only_scores) / len(fac_only_scores)))

        df = pd.DataFrame(tuples,
                          columns=[
                              'Corpus', 'Facet', 'Algorithm', 'Relaxed Score',
                              'Strict Score', 'Facet Only Score'
                          ])
        df = df.sort_values([
            'Corpus', 'Facet', 'Algorithm', 'Relaxed Score', 'Strict Score',
            'Facet Only Score'
        ])
        print(df)
        df.to_csv('results/facet_evaluation/facet_task_results.csv',
                  index=False)
        print(df.to_latex(index=False))
        results = []
        a_time = time.time() - start_time
        start_time = time.time()

        # for vector_name in tqdm(vector_names, desc="Iterate through embedding types", total=len(vector_names)):
        #     print('---')
        #     vec_path = Vectorizer.build_vec_file_name("all",
        #                                               "no_limit",
        #                                               data_set,
        #                                               "no_filter",
        #                                               vector_name,
        #                                               "real")
        #
        #     vecs = Vectorizer.my_load_doc2vec_format(vec_path)
        #
        #     for fac_name in tqdm(facets, total=len(facets), desc="Iterate through facetes"):
        #         fe = FacetEvaluation(fac_name, vecs, c, data_set)
        #         relaxed_scores, strict_scores = fe.evaluate()
        #         results.append((data_set, vector_name, fac_name, relaxed_scores, strict_scores))
        #
        # tuples = []
        # for result in results:
        #     data_set, vector_name, fac_name, relaxed_scores, strict_scores = result
        #     tuples.append((data_set, vector_name, fac_name,
        #                    sum(relaxed_scores) / len(relaxed_scores), sum(strict_scores) / len(strict_scores)))
        #
        # df = pd.DataFrame(tuples, columns=['Corpus', 'Algorithm', 'Facet', 'Relaxed Score', 'Strict Score'])
        # print(df)
        # df.to_csv('results/facet_evaluation/facet_task_results.csv', index=False)

        b_time = time.time() - start_time
        print(a_time, b_time)
    @classmethod
    def calculate_vocab_sizes(cls, corpus: Corpus, threshold, data_set: str, dir_path: str):
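        """Filter the corpus with the given too-specific-word threshold (or reuse the
        cached result) and record global and per-document vocabulary sizes as JSON."""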
        filtered_corpus_dir = Corpus.build_corpus_dir("",
                                                      "",
                                                      data_set,
                                                      f'specific_words_{threshold}',
                                                      "None").replace('__', '_')

        # print(os.path.isfile(os.path.join(dir_path.replace('.txt', ''), f'{threshold}.json')))

        if os.path.isfile(os.path.join(dir_path.replace('.txt', ''), f'{threshold}.json')):
            with open(os.path.join(dir_path.replace('.txt', ''), f'{threshold}.json'), 'r', encoding='utf-8') as file:
                data = json.load(file)
                print(threshold, data['global_vocab_size'])
                return data
        print('>|0', threshold)
        if not os.path.isdir(filtered_corpus_dir):
            if cls.absolute:
                to_specific_words = CommonWords.global_too_specific_words_doc_frequency(
                    corpus, percentage_share=threshold, absolute_share=threshold)
            else:
                to_specific_words = CommonWords.global_too_specific_words_doc_frequency(
                    corpus,
                    percentage_share=threshold)
            print('>|1 with len', len(to_specific_words))
            # filtered_corpus = corpus.common_words_corpus_copy(to_specific_words, masking=False)

            filtered_corpus = corpus.common_words_corpus_copy_mem_eff(to_specific_words, masking=False,
                                                                      corpus_dir=filtered_corpus_dir,
                                                                      through_no_sentences_error=False)
        else:
            filtered_corpus = Corpus.load_corpus_from_dir_format(filtered_corpus_dir)
        # corpus.common_words_corpus_filtered(to_specfic_words, masking=False)
        # filtered_corpus = corpus
        # del corpus
        print('>|2')

        author_dict = defaultdict(list)
        for doc_id, document in filtered_corpus.documents.items():
            author_dict[document.authors].append(doc_id)
        author_median = np.median([len(doc_ids) for author, doc_ids in author_dict.items()])
        series_median = np.median([len(doc_ids) for series_id, doc_ids in filtered_corpus.series_dict.items()])

        corpus_vocab_size = len(filtered_corpus.get_corpus_vocab())
        print('>|3 vocab size', corpus_vocab_size)
        document_sizes = {document_id:  {'vocab_size': document.vocab_size,
                                         'document_length': document.length}
                          for document_id, document in tqdm(filtered_corpus.documents.items(),
                                                            total=len(filtered_corpus),
                                                            desc="Calculate Corpus Sizes")}
        # for document_id, document in filtered_corpus.documents.items():
        #     print([token for token in document.get_flat_document_tokens() if token != 'del'][:100])

        # for document_id, words in common_words.items():
        #     print(document_id, len(words), document_sizes[document_id]['vocab_size'])
        vocab_sizes = []
        document_lengths = []
        for doc_id, document_size in document_sizes.items():
            vocab_sizes.append(document_size['vocab_size'])
            document_lengths.append(document_size['document_length'])

        print(threshold, corpus_vocab_size, np.mean(vocab_sizes), np.mean(document_lengths))

        result_dict = {'global_vocab_size': corpus_vocab_size,
                       'avg_vocab_size': np.mean(vocab_sizes),
                       'std_vocab_size': np.std(vocab_sizes),
                       'avg_document_length': np.mean(document_lengths),
                       'std_document_length': np.std(document_lengths),
                       'document_sizes': document_sizes}

        with open(os.path.join(dir_path, f'{threshold}.json'), 'w', encoding='utf-8') as outfile:
            json.dump(result_dict, outfile, indent=1)

        # print(filtered_corpus.get_corpus_vocab())
        # print(filtered_corpus.get_flat_document_tokens())
        return result_dict
Example #16
import json
import os

from lib2vec.corpus_structure import Corpus, ConfigLoader

if __name__ == "__main__":
    config = ConfigLoader.get_config()
    corpus_to_annotate = "dta"
    time_dict_path = os.path.join(config["system_storage"]["corpora"], 'plain_text', f'{corpus_to_annotate}_plain',
                                  'out', 'time_dict.json')
    with open(time_dict_path, encoding='utf-8') as json_file:
        data = json.load(json_file)
    x = Corpus.load_corpus_from_dir_format(os.path.join(config["system_storage"]["corpora"], corpus_to_annotate))
    x.update_time_entities(data)
    # x.save_corpus_adv(os.path.join(config["system_storage"]["corpora"], corpus_to_annotate))
Example #17
def chunk_documents(data_set: str, number_of_subparts: int,
                    corpus_size: Union[int, str]):
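    """Load (or build and annotate) the corpus for a data set, optionally sampled to
    `corpus_size` and, for "_fake_series" data sets, chunked into `number_of_subparts` parts."""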
    annotated_series_corpus_path = None
    if "_fake_series" in data_set:
        annotated_series_corpus_path = os.path.join(
            config["system_storage"]["corpora"],
            f'{data_set}_{number_of_subparts}_'
            f'{corpus_size}')
        data_set = data_set.replace("_fake_series", "")

    annotated_corpus_path = os.path.join(config["system_storage"]["corpora"],
                                         f'{data_set}')

    # print(annotated_series_corpus_path, annotated_corpus_path)
    if annotated_series_corpus_path:
        try:
            # check if series corpus exists
            # corpus = Corpus(annotated_series_corpus_path)
            corpus = Corpus.fast_load(path=annotated_series_corpus_path,
                                      load_entities=False)
        except FileNotFoundError:
            try:
                # check if general corpus exists
                corpus = Corpus.fast_load(path=annotated_corpus_path,
                                          load_entities=False)
                if corpus_size != "no_limit":
                    corpus = corpus.sample(corpus_size, seed=42)
                corpus = build_series_corpus(corpus,
                                             annotated_series_corpus_path,
                                             number_of_subparts)

                # corpus.save_corpus_adv(annotated_series_corpus_path)
            except FileNotFoundError:
                # load from raw data
                corpus = DataHandler.load_corpus(data_set)
                if corpus_size != "no_limit":
                    corpus = corpus.sample(corpus_size, seed=42)

                Preprocesser.annotate_and_save(
                    corpus,
                    corpus_dir=annotated_corpus_path,
                    without_spacy=False)
                # corpus = Preprocesser.annotate_corpus(corpus)
                # corpus.save_corpus_adv(annotated_corpus_path)

                corpus = build_series_corpus(
                    Corpus.fast_load(path=annotated_corpus_path,
                                     load_entities=False),
                    annotated_series_corpus_path, number_of_subparts)
    else:
        try:
            # check if general corpus exists
            corpus = Corpus.fast_load(path=annotated_corpus_path,
                                      load_entities=False)
            if corpus_size != "no_limit":
                corpus = corpus.sample(corpus_size, seed=42)

            # corpus.save_corpus_adv(annotated_series_corpus_path)
        except FileNotFoundError:
            # load from raw data
            corpus = DataHandler.load_corpus(data_set)
            if corpus_size != "no_limit":
                corpus = corpus.sample(corpus_size, seed=42)

            Preprocesser.annotate_and_save(corpus,
                                           corpus_dir=annotated_corpus_path,
                                           without_spacy=False)
            corpus = Corpus.fast_load(path=annotated_corpus_path,
                                      load_entities=False)

    return corpus