Example No. 1
    def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
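A minimal usage sketch (the bundled test-corpus path is taken from the docstring example in Example No. 3 below; nothing else is assumed):

from gensim.corpora.ucicorpus import UciCorpus
from gensim.test.utils import datapath

# build a gensim Dictionary straight from a UCI bag-of-words corpus
corpus = UciCorpus(datapath('testcorpus.uci'))
dictionary = corpus.create_dictionary()
print(dictionary.num_docs, dictionary.num_nnz)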
Example No. 2
    def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i' %
                            (docno, self.num_docs))

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
Example No. 3
    def create_dictionary(self):
        """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data.

        Returns
        -------
        :class:`gensim.corpora.dictionary.Dictionary`
            Dictionary, based on corpus.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.corpora.ucicorpus import UciCorpus
            >>> from gensim.test.utils import datapath
            >>> ucc = UciCorpus(datapath('testcorpus.uci'))
            >>> dictionary = ucc.create_dictionary()

        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = utils.revdict(self.id2word)

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i', docno,
                            self.num_docs)

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
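This variant uses gensim's utils.revdict instead of building the inverse mapping by hand; for a one-to-one id2word mapping the two are equivalent. A tiny sketch:

from gensim import utils

id2word = {0: 'human', 1: 'interface', 2: 'computer'}
# revdict swaps keys and values, turning id -> token into token -> id
assert utils.revdict(id2word) == {'human': 0, 'interface': 1, 'computer': 2}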
Example No. 4
    def create_dictionary(self):
        """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data.

        Returns
        -------
        :class:`gensim.corpora.dictionary.Dictionary`
            Dictionary, based on corpus.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.corpora.ucicorpus import UciCorpus
            >>> from gensim.test.utils import datapath
            >>> ucc = UciCorpus(datapath('testcorpus.uci'))
            >>> dictionary = ucc.create_dictionary()

        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = utils.revdict(self.id2word)

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
Example No. 5
                             f"{args.config}.{sp_key}.dic"))
    else:
        special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3}
        word_dic = Dictionary()
        word_dic.token2id = special_token_dict
        #char_dic = Dictionary()
        #char_dic.token2id = special_token_dict
        sw_dicts = {}
        for sp_key, sp in sps.items():
            _dic = Dictionary()
            _dic.token2id = special_token_dict
            sw_dicts[sp_key] = _dic
    label_dic = Dictionary(train_label_documents)
    label_dic.patch_with_special_tokens({PADDING: 0})
    label_dic.id2token = {
        _id: label
        for label, _id in label_dic.token2id.items()
    }

    # add vocabulary
    word_dic.add_documents(train_word_documents)
    #char_dic.add_documents(list(chain.from_iterable(train_char_documents)))
    for sp_key, train_sw_documents in train_sw_documents_dicts.items():
        sw_dicts[sp_key].add_documents(train_sw_documents)

    # load GloVe
    if config_dic.get("glove_path"):
        print("========= Load Pretrain Word Embeddings ==========")
        word2vec = load_pretrain_embeddings(
            config_dic.get("glove_path"),
            emb_dim=config_dic.get("word_emb_dim"))
        pretrain_embeddings = build_pretrain_embeddings(
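The word and subword dictionaries above start from a fixed special-token mapping, and the label dictionary is patched so that PADDING keeps id 0. A standalone sketch of Dictionary.patch_with_special_tokens (the label strings here are hypothetical):

from gensim.corpora import Dictionary

label_dic = Dictionary([['B-PER', 'O'], ['B-LOC', 'O']])
label_dic.patch_with_special_tokens({'<PAD>': 0})
# '<PAD>' now owns id 0; whichever label previously held id 0 was moved to a free id
print(label_dic.token2id)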
Example No. 6
def abandon():
    stopWords = set(stopwords.words('english'))

    for w in string.punctuation:
        stopWords.add(w)

    stops_words = [
        "rt", "…", "...", "URL", "http", "https", "“", "”", "‘", "’", "get",
        "2", "new", "one", "i'm", "make", "go", "good", "say", "says", "know",
        "day", "..", "take", "got", "1", "going", "4", "3", "two", "n", "like",
        "via", "u", "would", "still", "first", "really", "watch", "see",
        "even", "that's", "look", "way", "last", "said", "let", "twitter",
        "ever", "always", "another", "many", "things", "may", "big", "come",
        "keep", "5", "time", "much", "want", "think", "us", "love", "people",
        "need"
    ]

    for w in stops_words:
        stopWords.add(w)

    tokenizer = CustomTweetTokenizer(preserve_case=False,
                                     reduce_len=True,
                                     strip_handles=False,
                                     normalize_usernames=False,
                                     normalize_urls=True,
                                     keep_allupper=False)

    cnt = Counter()
    texts = []
    # comm = json.load(open("data/louvain_rst.json"))
    # users_comm = {str(u) for u in comm if comm[u] == 0}
    # print(len(users_comm))

    # loading data
    data = pd.read_csv("data/ira-tweets-ele.csv",
                       usecols=["tweet_text", "userid"])
    for i, row in tqdm(data.iterrows()):
        # if row["userid"] not in users_comm:
        #     continue
        words = tokenizer.tokenize(row["tweet_text"])
        words = [w for w in words if w not in stopWords and w]
        # if words[0] == "RT":
        #     continue
        for w in words:
            cnt[w] += 1
        texts.append(words)
    print(len(texts))
    json.dump(cnt.most_common(), open("data/word_cloud.json", "w"), indent=2)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]

    def average_distance(v_tops):
        _sum = 0
        _cnt = 0
        for i in range(len(v_tops)):
            for j in range(i + 1, len(v_tops)):
                _sum += scipy.spatial.distance.cosine(v_tops[i], v_tops[j])
                _cnt += 1
        return _sum / _cnt

    with open("data/IRA_topics.txt", "w") as f:
        for n in range(2, 12):
            print(f"N = {n}")
            lda = LdaModel(corpus, num_topics=n, random_state=42)
            v_topics = lda.get_topics()
            lda.save(f"model/lda-ira-{n}.mod")
            # pprint(lda.print_topics())

            f.write(f"Perplexity: {lda.log_perplexity(corpus)}"
                    )  # a measure of how good the model is. lower the better.

            # Compute Coherence Score: c_v coherence needs the tokenized texts
            # (not the BoW corpus) and the dictionary that maps ids back to tokens
            coherence_model_lda = CoherenceModel(model=lda,
                                                 texts=texts,
                                                 dictionary=dictionary,
                                                 coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()
            f.write(f"Coherence Score: {coherence_lda}")
            f.write(f"~Average distance: {average_distance(v_topics)}\n")
            # show
            x = lda.show_topics(num_topics=n, num_words=20, formatted=False)
            topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
            dictionary.id2token = {
                v: k
                for k, v in dictionary.token2id.items()
            }
            # Below Code Prints Topics and Words
            for topic, words in topics_words:
                f.write(
                    str(topic) + " :: " +
                    str([dictionary.id2token[int(w)] for w in words]) + "\n")
            f.write("\n")
Example No. 7
                             f"{args.config}.{sp_key}.dic"))
    else:
        special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3}
        word_dic = Dictionary()
        word_dic.token2id = special_token_dict
        char_dic = Dictionary()
        char_dic.token2id = special_token_dict
        sw_dicts = {}
        for sp_key, sp in sps.items():
            _dic = Dictionary()
            _dic.token2id = special_token_dict
            sw_dicts[sp_key] = _dic
    label_dic = Dictionary(train_label_documents)
    label_dic.patch_with_special_tokens({PADDING: 0})
    label_dic.id2token = {
        _id: label
        for label, _id in label_dic.token2id.items()
    }

    # add vocabulary
    word_dic.add_documents(train_word_documents)
    char_dic.add_documents(list(chain.from_iterable(train_char_documents)))
    for sp_key, train_sw_documents in train_sw_documents_dicts.items():
        sw_dicts[sp_key].add_documents(train_sw_documents)

    # load GloVe
    if config_dic.get("glove_path"):
        print("============== Load Pretrain Word Embeddings ================")
        word2vec = load_pretrain_embeddings(
            config_dic.get("glove_path"),
            emb_dim=config_dic.get("word_emb_dim"))
        pretrain_embeddings = build_pretrain_embeddings(
Example No. 8
## Vectorize the corpus
cv = CountVectorizer(stop_words="english",
                     min_df=5,
                     max_df=0.4,
                     max_features=5000,
                     ngram_range=(1, 1))
dtm = cv.fit_transform(corpus)

features = np.array(cv.get_feature_names())
id2token = dict(zip(range(len(features)), features))
token2id = dict(zip(features, range(len(features))))

## Create a gensim dictionary
dictionary = Dictionary()
dictionary.id2token = id2token
dictionary.token2id = token2id

## Train LDA models with different count of topics
topic_counts = [20, 30, 40, 50, 70, 100, 120, 150]


def get_topn_words(lda_model, features, topn=20):
    topics = lda_model.components_
    topic_words = []
    for topic_num, topic_weights in enumerate(topics):
        top_words = topic_weights.argsort()[::-1][:topn]
        topic_words.append(list(features[top_words]))
    return topic_words
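get_topn_words reads the top-weighted terms for each topic straight off the sklearn topic-word matrix (lda_model.components_). A sketch, not part of the original, of fitting sklearn's LatentDirichletAllocation on the dtm above for each candidate topic count and printing the top terms:

from sklearn.decomposition import LatentDirichletAllocation

for n in topic_counts:
    sk_lda = LatentDirichletAllocation(n_components=n, random_state=42)
    sk_lda.fit(dtm)
    for topic_num, top_words in enumerate(get_topn_words(sk_lda, features, topn=10)):
        print(n, topic_num, " ".join(top_words))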

    words = tokenizer.tokenize(line.strip())
    # if words[0] == "RT":
    #     continue
    texts.append(words)

print("loaded!")
# conn.close()

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lda = LdaModel(corpus, num_topics=10)

x = lda.show_topics(num_topics=10, num_words=20, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}
#Below Code Prints Topics and Words
for topic, words in topics_words:
    print(
        str(topic) + "::" + str([dictionary.id2token[int(w)] for w in words]))
print()

#Below Code Prints Only Words
# for topic, words in topics_words:
#     print(" ".join(words))

# conn = sqlite3.connect(
#     "/home/alex/network_workdir/elections/databases_ssd/complete_trump_vs_hillary_sep-nov_db.sqlite")
# c = conn.cursor()
# c.execute('''SELECT text FROM tweet''')
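The commented-out lines above and the orphaned tokenize call near the top of this snippet suggest the texts originally came from a sqlite database of tweets. A sketch of how that loading loop might look, with nltk's TweetTokenizer standing in for the CustomTweetTokenizer used in Example No. 6 and the database path assumed from the comment above:

import sqlite3
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)

conn = sqlite3.connect("complete_trump_vs_hillary_sep-nov_db.sqlite")  # path assumed from the comment above
c = conn.cursor()
c.execute('''SELECT text FROM tweet''')

texts = []
for (line,) in c:
    words = tokenizer.tokenize(line.strip())
    texts.append(words)
conn.close()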