Example #1
    def test_trigram_count(self):
        df = build_corpus(pd.DataFrame(data=self.raw_data))

        ngram_extractor = NGram()
        ngram_extractor.extract_features(df, trigram_threshold=2)

        self.assertEqual([0, 0, 0, 0], df["hate_speech_trigrams"].tolist())
    def test_corpus(self):
        corpus = build_corpus(pd.DataFrame(data=self.raw_data))
        # check correct punctuation removal
        self.assertIn("p.m.", corpus.iloc[0]["tokens"])
        # check correct lemmatization
        self.assertIn("will", corpus.iloc[1]["tokens"])
        # check that URLs survive tokenization as single tokens
        self.assertIn("http://www.spamlaws.com/state/summary.html",
                      corpus.iloc[2]["tokens"])
    def test_pattern_count(self):
        df = build_corpus(pd.DataFrame(data=self.raw_data))

        pattern_extractor = Pattern(min_pattern_size=2,
                                    max_pattern_size=2,
                                    threshold=2)
        pattern_extractor.extract_features(df)

        self.assertEqual([2, 2, 2, 1], df["pattern_count"].tolist())
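
All three tests above rely on a self.raw_data fixture that is not shown. A minimal sketch of what such a setUp might look like, assuming a single text column; the class name, the column name "tweet", and the four sentences are assumptions chosen only so the assertions in test_corpus can hold:

import unittest


class FeatureExtractionTest(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixture: four short texts, one per expected count in
        # test_pattern_count. Row 0 contains "p.m.", row 1 contains "will",
        # and row 2 contains the URL checked in test_corpus. The column
        # name "tweet" is an assumption, not taken from the project.
        self.raw_data = {
            "tweet": [
                "See you at 5 p.m. sharp.",
                "They will never change, will they?",
                "Spam rules: http://www.spamlaws.com/state/summary.html",
                "Just another ordinary day.",
            ]
        }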
Example #4
def run_feature_extraction_create_corpus(run_from_scratch, df_preprocessed):
    """ Build the corpus if run_from_scratch=True, else load the cached copy """
    corpus_path = str(get_project_root()) + "/data/extracted_features/corpus.csv"
    if run_from_scratch:
        df_corpus = build_corpus(df_preprocessed)
        df_corpus.to_csv(corpus_path)
        return df_corpus
    # index_col=0 restores the index that to_csv wrote out above
    return pd.read_csv(corpus_path, index_col=0)
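
A hypothetical driver for this helper: build the corpus once, then reuse the cached CSV on later runs. The preprocessed-dataset path mirrors the one in Example #5 and is an assumption:

# First run: build the corpus and cache it under data/extracted_features/.
df_preprocessed = pd.read_csv(
    str(get_project_root()) + "/data/preprocessed/dataset.csv", index_col=0)
df_corpus = run_feature_extraction_create_corpus(
    run_from_scratch=True, df_preprocessed=df_preprocessed)

# Later runs: skip the expensive rebuild and load the cached file instead.
df_corpus = run_feature_extraction_create_corpus(
    run_from_scratch=False, df_preprocessed=df_preprocessed)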
Example #5
from gensim import corpora
from gensim.models import LdaMulticore
import pandas as pd

# build_corpus is provided by the project's corpus-building module


class LDATopic:
    def __init__(self, num_topics=2, num_workers=4):
        self.num_topics = num_topics
        self.num_workers = num_workers

    def extract_features(self, df):
        """ Extracts LDA topics and assigns each document its dominant topic """
        id2word = corpora.Dictionary(df["tokens"])
        corpus = [id2word.doc2bow(doc) for doc in df["tokens"]]
        lda_model = LdaMulticore(
            corpus=corpus,
            id2word=id2word,
            num_topics=self.num_topics,
            workers=self.num_workers,
        )
        # Take the topic id with the highest probability. np.argmax over the
        # bare probability list would return a list index, which only matches
        # the topic id when gensim reports every topic for the document.
        df["topic"] = df["tokens"].apply(
            lambda tokens: max(
                lda_model[id2word.doc2bow(tokens)],
                key=lambda topic_prob: topic_prob[1],
            )[0]
        )
        return df


if __name__ == "__main__":
    df_dataset = pd.read_csv("../../data/preprocessed/dataset.csv", index_col=0)
    df_dataset = build_corpus(df_dataset)
    lda_topic_extractor = LDATopic(num_topics=2)
    lda_topic_extractor.extract_features(df_dataset)
    print(df_dataset)
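
To make the topic-assignment step concrete, here is a self-contained sketch of the same doc2bow -> LdaMulticore pipeline on a toy corpus; the documents, topic count, and worker count are invented for illustration:

from gensim import corpora
from gensim.models import LdaMulticore

# Two deliberately disjoint vocabularies, so two topics separate cleanly.
docs = [
    ["cat", "dog", "pet", "cat"],
    ["stock", "market", "trade", "stock"],
    ["dog", "pet", "cat"],
    ["market", "trade", "stock"],
]
id2word = corpora.Dictionary(docs)
bows = [id2word.doc2bow(doc) for doc in docs]
lda_model = LdaMulticore(corpus=bows, id2word=id2word, num_topics=2, workers=2)

# minimum_probability=0.0 makes gensim report every topic, so each document
# yields a complete list of (topic_id, probability) pairs.
for doc, bow in zip(docs, bows):
    topic_id, prob = max(
        lda_model.get_document_topics(bow, minimum_probability=0.0),
        key=lambda pair: pair[1],
    )
    print(doc, "->", "topic", topic_id, round(float(prob), 3))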