Example #1
from gensim import corpora


def save_dictionary(
    dic: corpora.Dictionary,
    filename: str
) -> None:
    dic.save(filename)
    print("saved dictionary: {} items to {}".format(
        len(dic.values()), filename
    ))
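
A minimal usage sketch, assuming a small made-up tokenized corpus named texts; the saved file can be loaded back with corpora.Dictionary.load:

# Illustrative only; `texts` and the filename are placeholders.
texts = [["human", "interface", "computer"], ["graph", "trees"]]
dic = corpora.Dictionary(texts)
save_dictionary(dic, "example.dict")
loaded = corpora.Dictionary.load("example.dict")
assert loaded.token2id == dic.token2id  # round-trip sanity check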

Example #2

    def test_dict_interface(self):
        """Test Python 2 dict-like interface in both Python 2 and 3."""
        d = Dictionary(self.texts)

        self.assertTrue(isinstance(d, Mapping))

        self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))

        # Even in Py3, we want the iter* members.
        self.assertEqual(list(d.items()), list(d.iteritems()))
        self.assertEqual(list(d.keys()), list(d.iterkeys()))
        self.assertEqual(list(d.values()), list(d.itervalues()))

        # XXX Do we want list results from the dict members in Py3 too?
        if not PY3:
            self.assertTrue(isinstance(d.items(), list))
            self.assertTrue(isinstance(d.keys(), list))
            self.assertTrue(isinstance(d.values(), list))
Example #3
    thing = d[x]["text"].lower().translate(
        str.maketrans('', '', string.punctuation))
    # s here is the stopword set defined earlier in the original script (not shown)
    tfiltered = list(filter(lambda w: w not in s, thing.split()))

    #tfiltered = map(lambda x: lemmatizer.lemmatize(x), tfiltered)
    #tfiltered = list(tfiltered)
    corpus.append(tfiltered)

dct = Dictionary(corpus)
bow_corpus = [dct.doc2bow(line) for line in corpus]
term_doc_mat = corpus2csc(bow_corpus)

from collections import OrderedDict

document = corpus
names = list(dct.values())

occurrences = OrderedDict(
    (name, OrderedDict((other, 0) for other in names)) for name in names)

# Find the co-occurrences within a window of `word_window` words on each side
# (`word_window` is defined earlier in the original script):
for l in document:
    for i in range(len(l)):
        # clamp the left edge so a negative index does not wrap around the list
        context = l[max(0, i - word_window):i] + l[i + 1:i + 1 + word_window]
        print(context)
        for item in context:
            occurrences[l[i]][item] += 1

# Print the matrix

wcounts = dict()
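
The example breaks off at this point. One hedged way to finish the "Print the matrix" step, assuming pandas is available, is to turn the nested OrderedDict into a DataFrame so that rows and columns line up by token:

# Sketch only, not part of the original snippet.
import pandas as pd

cooc = pd.DataFrame(occurrences).T  # rows: focus word, columns: co-occurring word
print(cooc)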
Example #4
import pandas as pd
from gensim.corpora import Dictionary

metadata = pd.read_csv("..\\data\\absrecord.csv")
print(len(metadata['filename'].values))
fullvocab = []

from preprocessor import preprocess, flatten

for record in range(len(metadata)):
    # print(100*record/len(metadata))
    fullvocab.append(preprocess(str(metadata.iloc[record]['body']))[0])
print(fullvocab)
maindict = Dictionary(fullvocab)
i = 0
fulldict = []
for document in fullvocab:
    temp = []
    print(100 * i / len(fullvocab))
    i += 1
    document = list(sorted(set(document)))
    for token in document:
        # look the token up directly in token2id instead of scanning the
        # whole vocabulary for every token
        key = maindict.token2id.get(token)
        if key is not None:
            temp.append({"id": key, "name": token})
            # print({"id": key, "name": token})
    fulldict.append(temp)

b = metadata['filename'].values
print(fulldict)
a = pd.DataFrame({'keywords': fulldict})
# join adds the keyword column; DataFrame.append would stack rows and return a
# new frame without modifying metadata
metadata = metadata.join(a)
metadata.to_csv("..\\data\\keywords.csv")
Example #5
from gensim.corpora import Dictionary


def get_headers(df, attr):
    documents = df[attr]
    dictionary = Dictionary(documents)
    return list(dictionary.values())
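
A hedged usage sketch; the sample frame and the "tokens" column name are made up for illustration. get_headers simply returns the vocabulary collected from the tokenized column:

# Illustrative only; any DataFrame column holding lists of tokens works the same way.
import pandas as pd

df = pd.DataFrame({"tokens": [["alpha", "beta"], ["beta", "gamma"]]})
print(get_headers(df, "tokens"))  # e.g. ['alpha', 'beta', 'gamma'] (order may differ)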
Example #6
class CMVCorpus(object):
    logger = logging.getLogger(__name__)

    def __init__(self, config):
        self.config = config
        self._path = config.data_dir[0]
        self.max_data_size = config.max_data_size
        self.max_utt_len = config.max_utt_len
        self.tokenize = get_chat_tokenize()
        self.train_corpus, self.test_corpus = self._read_file(
            os.path.join(self._path))
        self._build_vocab(config.max_vocab_cnt)
        print("Done loading corpus")

    def _process_dialog(self, data):
        new_dialog = []
        all_lens = []
        all_dialog_lens = []

        for raw_dialog in data:
            dialog = {
                "title": self.tokenize(raw_dialog['title'].lower()),
                "op": self.tokenize(raw_dialog["content"].lower()),
                "pos_conv_lst": [],
                "neg_conv_lst": []
            }
            for i, turns in enumerate(
                    raw_dialog['comments']):  # for each comment lst
                if turns["win"]:
                    conv_lst = dialog["pos_conv_lst"]
                else:
                    conv_lst = dialog["neg_conv_lst"]
                new_utt_lst = []
                for turn in turns["utt_lst"]:
                    argument = self.tokenize(turn.lower())
                    all_lens.append(len(argument))
                    new_utt_lst.append(argument)
                conv_lst.append(new_utt_lst)
                all_dialog_lens.append(len(new_utt_lst))
            new_dialog.append(dialog)
            # cut for the max data size
            if len(new_dialog) >= self.max_data_size:
                break

        print("Max utt len %d, mean utt len %.2f" %
              (np.max(all_lens), float(np.mean(all_lens))))
        print("Max dialog len %d, mean dialog len %.2f" %
              (np.max(all_dialog_lens), float(np.mean(all_dialog_lens))))
        return new_dialog

    def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for dialog in self.train_corpus:
            all_words.append(dialog["op"] + dialog["title"])
            for turns in dialog["pos_conv_lst"] + dialog["neg_conv_lst"]:
                for turn in turns:
                    all_words.append(turn)

        self.vocab_bow = Dictionary(all_words)
        raw_vocab_size = len(self.vocab_bow)
        raw_wc = np.sum(list(self.vocab_bow.dfs.values()))

        # drop low-value tokens: very rare words, single ASCII characters,
        # and punctuation such as , . " '
        self.vocab_bow.filter_extremes(no_below=10, keep_n=max_vocab_cnt)
        bad_ids = HTML_STOPWORDS + ['cmv']
        self.vocab_bow.filter_tokens(
            list(map(self.vocab_bow.token2id.get, bad_ids)))
        self.vocab_bow.compactify()
        self.vocab_seq = copy.deepcopy(self.vocab_bow)  # for sequence model
        # move the token currently at id 0 to a new id at the end, reserve id 0
        # for PAD, and append UNK to the vocabulary
        self.vocab_seq.token2id[self.vocab_seq[0]] = len(self.vocab_seq)
        self.vocab_seq.token2id[PAD] = 0
        self.vocab_seq.token2id[UNK] = len(self.vocab_seq)
        self.vocab_seq.compactify()
        self.pad_wid = self.vocab_seq.token2id.get(PAD)

        len_1_words = list(
            filter(
                lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and
                w not in ["[", "]", "$", "?", "!", "\"", "'", "i", "a"],
                self.vocab_bow.values()))
        self.vocab_bow.filter_tokens(
            list(map(self.vocab_bow.token2id.get, len_1_words)))
        # some makeup words
        # makeup_lst = [PAD]
        # for w in makeup_lst:
        #     self.vocab_bow.token2id[w] = len(self.vocab_bow)
        # self.vocab_bow.compactify()
        # self.pad_wid = self.vocab_bow.token2id.get(PAD)
        # here we keep stopwords and some meaningful punctuations
        non_stopwords = filter(
            lambda w: re.match(r"^[\w\d_-]*$", w) and w not in STOPWORDS,
            self.vocab_bow.values())
        self.vocab_bow_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_stopwords.filter_tokens(
            map(self.vocab_bow_stopwords.token2id.get, non_stopwords))
        self.vocab_bow_stopwords.compactify()
        self.vocab_bow_non_stopwords = copy.deepcopy(self.vocab_bow)
        self.vocab_bow_non_stopwords.filter_tokens(
            map(self.vocab_bow_non_stopwords.token2id.get,
                self.vocab_bow_stopwords.values()))
        self.vocab_bow_non_stopwords.compactify()
        remain_wc = np.sum(list(self.vocab_bow.dfs.values()))
        min_count = np.min(list(self.vocab_bow.dfs.values()))
        # report corpus statistics
        print(
            "Load corpus with train size %d, "
            "test size %d, raw vocab size %d, vocab size %d at cut_off %d, OOV rate %f"
            % (len(self.train_corpus), len(self.test_corpus), raw_vocab_size,
               len(self.vocab_bow), min_count, 1 - float(remain_wc) / raw_wc))

    def _read_file(self, path):
        with open(path, 'r') as f:
            data = json.load(f)
        return self._process_dialog(data["train"]), self._process_dialog(
            data["test"])

    def _sent2id_seq(self, sent, vocab):
        return list(
            filter(lambda x: x is not None,
                   [vocab.token2id.get(t) for t in sent]))

    def _sent2id_bow(self, sent, vocab):
        if sent:
            return vocab.doc2bow(sent)
        else:
            return []

    def _to_id_corpus(self, data, vocab_seq, vocab_bow):
        results = []
        word_cnt = 0
        msg_cnt = 0

        for dialog in data:
            # convert utterance and feature into numeric numbers
            id_dialog = Pack(title=self._sent2id_seq(dialog["title"],
                                                     vocab_seq),
                             op=self._sent2id_seq(dialog["op"], vocab_seq),
                             pos_conv_seq_lst=[],
                             pos_conv_bow_lst=[],
                             neg_conv_seq_lst=[],
                             neg_conv_bow_lst=[])
            for turns in dialog["pos_conv_lst"]:
                new_turns_bow = []
                new_turns_seq = []
                for turn in turns:
                    id_turn_seq = self._sent2id_seq(turn, vocab_seq)
                    id_turn_bow = self._sent2id_bow(turn, vocab_bow)
                    if id_turn_seq and id_turn_bow:  # filter empty utt
                        new_turns_bow.append(id_turn_bow)
                        new_turns_seq.append(id_turn_seq)
                        word_cnt += len(id_turn_seq)
                        msg_cnt += 1
                if new_turns_seq and new_turns_bow:
                    id_dialog["pos_conv_bow_lst"].append(new_turns_bow)
                    id_dialog["pos_conv_seq_lst"].append(new_turns_seq)
            for turns in dialog["neg_conv_lst"]:
                new_turns_bow = []
                new_turns_seq = []
                for turn in turns:
                    id_turn_seq = self._sent2id_seq(turn, vocab_seq)
                    id_turn_bow = self._sent2id_bow(turn, vocab_bow)
                    if id_turn_seq and id_turn_bow:  # filter empty utt
                        new_turns_bow.append(id_turn_bow)
                        new_turns_seq.append(id_turn_seq)
                        word_cnt += len(id_turn_seq)
                        msg_cnt += 1
                if new_turns_seq and new_turns_bow:
                    id_dialog["neg_conv_bow_lst"].append(new_turns_bow)
                    id_dialog["neg_conv_seq_lst"].append(new_turns_seq)
            if id_dialog.pos_conv_bow_lst and id_dialog.neg_conv_bow_lst:
                results.append(id_dialog)
        print("Load seq with %d msgs, %d words" % (msg_cnt, word_cnt))
        return results, msg_cnt, word_cnt

    def _to_id_corpus_bow(self, data, vocab):
        results = []
        word_cnt = 0
        msg_cnt = 0

        for dialog in data:
            # convert utterance and feature into numeric numbers
            id_dialog = Pack(title=self._sent2id_bow(dialog["title"], vocab),
                             op=self._sent2id_bow(dialog["op"], vocab),
                             pos_conv_bow_lst=[],
                             neg_conv_bow_lst=[])
            for turns in dialog["pos_conv_lst"]:
                new_turns = []
                for turn in turns:
                    id_turn = self._sent2id_bow(turn, vocab)
                    if id_turn:  # filter empty utt
                        new_turns.append(id_turn)
                        word_cnt += np.sum([j for i, j in id_turn])
                        msg_cnt += 1
                if new_turns:
                    id_dialog["pos_conv_bow_lst"].append(new_turns)
            for turns in dialog["neg_conv_lst"]:
                new_turns = []
                for turn in turns:
                    id_turn = self._sent2id_bow(turn, vocab)
                    if id_turn:  # filter empty utt
                        new_turns.append(id_turn)
                        word_cnt += np.sum([j for i, j in id_turn])
                        msg_cnt += 1
                if new_turns:
                    id_dialog["neg_conv_bow_lst"].append(new_turns)
            if id_dialog.pos_conv_bow_lst and id_dialog.neg_conv_bow_lst:
                results.append(id_dialog)
        print("Load bow with %d msgs, %d words" % (msg_cnt, word_cnt))
        return results, msg_cnt, word_cnt

    def get_corpus_bow(self, keep_stopwords=True):
        if keep_stopwords:
            vocab = self.vocab_bow
        else:
            vocab = self.vocab_bow_non_stopwords
        id_train = self._to_id_corpus_bow(self.train_corpus, vocab)
        id_test = self._to_id_corpus_bow(self.test_corpus, vocab)
        return Pack(train=id_train, test=id_test, vocab_size=len(vocab))

    def get_corpus_seq(self):
        vocab = self.vocab_seq

        id_train = self._to_id_corpus_seq(self.train_corpus, vocab)
        id_test = self._to_id_corpus_seq(self.test_corpus, vocab)
        return Pack(train=id_train, test=id_test, vocab_size=len(vocab))

    def get_corpus(self):
        id_train = self._to_id_corpus(self.train_corpus, self.vocab_seq,
                                      self.vocab_bow)
        id_test = self._to_id_corpus(self.test_corpus, self.vocab_seq,
                                     self.vocab_bow)
        # id_valid = self._to_id_corpus(self.valid_corpus, self.vocab_seq, self.vocab_bow)
        return Pack(train=id_train,
                    test=id_test,
                    vocab_size=len(self.vocab_bow))
Example #7
        df['lemmatized_text'] = list(
            map(lambda sentence: list(map(lemm.lemmatize, sentence)),
                df.stopwords_removed))

        p_stemmer = nltk.stem.porter.PorterStemmer()
        df['stemmed_text'] = list(
            map(lambda sentence: list(map(p_stemmer.stem, sentence)),
                df.lemmatized_text))

    stem_words(train_data)

    # Vectorize words

    dictionary = Dictionary(documents=train_data.stemmed_text.values)
    dictionary.save('model/dictionary.txtdic')
    print("Found {} words.".format(len(dictionary.values())))

    #dictionary.filter_extremes(no_above=0.8, no_below=3)

    dictionary.compactify()  # Reindexes the remaining words after filtering
    print("Left with {} words.".format(len(dictionary.values())))

    # Make a bag-of-words (BOW) representation for every document
    def document_to_bow(df):
        df['bow'] = list(
            map(lambda doc: dictionary.doc2bow(doc), df.stemmed_text))

    document_to_bow(train_data)
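
    # Hedged continuation (an assumption, not part of the original snippet): the bow
    # column and the dictionary built above are exactly what gensim's LdaModel
    # consumes, so a topic model over the training documents could be fitted like
    # this (num_topics is an arbitrary illustrative value).
    from gensim.models import LdaModel
    lda = LdaModel(corpus=list(train_data.bow), id2word=dictionary, num_topics=10)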

    # define a helper so the same preprocessing can be reused later on the submission data
    def lda_preprocessing(df):
Example #8
class FasttextTfIdfTransformer:
    def __init__(self,
                 model=None,
                 dictionary=None,
                 corpus_file=None,
                 size=256,
                 window=7,
                 min_count=4,
                 iter=30,
                 min_n=4,
                 max_n=5,
                 word_ngrams=1,
                 no_above=0.5,
                 filter_n_most_frequent=100,
                 do_filter_tokens=True,
                 workers=multiprocessing.cpu_count() - 1,
                 ft_prefix="ft_",
                 token_column=None,
                 inplace=True,
                 store_train_data=False,
                 skip_fit=False,
                 skip_transform=False,
                 normalize_word_vectors=True):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams
        self.workers = workers
        self.token_column = token_column
        self.model = None
        assert type(self.token_column) == str
        self.ft_prefix = ft_prefix
        self.skip_fit = skip_fit
        self.skip_transform = skip_transform
        self.inplace = inplace
        self.normalize_word_vectors = normalize_word_vectors
        self.store_train_data = store_train_data
        self.train = None
        self.model = model
        self.no_above = no_above
        self.word_set = None
        self.filter_n_most_frequent = filter_n_most_frequent
        self.do_filter_tokens = do_filter_tokens
        self.dictionary = dictionary
        if model is None and corpus_file is not None:
            self.dictionary = Dictionary(
                map(lambda s: s.split(), load_list_per_line(corpus_file)))
            print("Total Unique Tokens = %s" % (len(self.dictionary)))
            self.dictionary.filter_extremes(no_below=self.min_count,
                                            no_above=self.no_above,
                                            keep_n=1000000)
            self.dictionary.filter_n_most_frequent(self.filter_n_most_frequent)
            print("Total Unique Tokens after filtering = %s" %
                  (len(self.dictionary)))
            self.word_set = set(self.dictionary.values())
            self.model = FastText(corpus_file=corpus_file,
                                  size=self.size,
                                  window=self.window,
                                  min_count=self.min_count,
                                  iter=self.iter,
                                  min_n=self.min_n,
                                  max_n=self.max_n,
                                  word_ngrams=self.word_ngrams,
                                  workers=self.workers,
                                  bucket=8000000,
                                  alpha=0.03,
                                  negative=10,
                                  ns_exponent=0.5)

        if (model is None or dictionary is None) and corpus_file is None:
            raise ValueError("No data given to initialise FastText Model")
        assert self.dictionary is not None and self.model is not None

    def fit(self, X, y='ignored'):
        gc.collect()
        if self.store_train_data:
            self.train = (X, y)
        if self.skip_fit:
            return self
        if type(X) == pd.DataFrame:
            X = X[self.token_column].values
        else:
            raise ValueError()

        assert self.dictionary is not None and self.model is not None

        self.dictionary.add_documents(X)
        dct = self.dictionary
        print("Total Unique Tokens = %s" % (len(dct)))
        dct.filter_extremes(no_below=self.min_count,
                            no_above=self.no_above,
                            keep_n=1000000)
        dct.filter_n_most_frequent(self.filter_n_most_frequent)
        print("Total Unique Tokens after filtering = %s" % (len(dct)))
        self.word_set = set(dct.values())

        print("FastText Modelling Started at %s" % (str(pd.datetime.now())))
        self.model.build_vocab(X, update=True)
        self.model.train(X,
                         total_examples=self.model.corpus_count,
                         epochs=self.model.epochs)
        print("FastText Modelling done at %s" % (str(pd.datetime.now())))
        print("FastText Vocab Length = %s, Ngrams length = %s" % (len(
            self.model.wv.vectors_ngrams), len(self.model.wv.vectors_vocab)))

        gc.collect()
        return self

    def fit_stored(self):
        X, y = self.train
        return self.fit(X, y)

    def partial_fit(self, X, y=None):
        self.fit(X, y='ignored')

    def transform_one(self, token_array):
        tokens2vec = [
            self.model.wv[token] if token in self.model.wv else np.full(
                self.size, 0) for token in token_array
        ]
        if np.sum(tokens2vec) == 0:
            return np.full(self.size, 0)
        return np.average(tokens2vec, axis=0)

    def transform(self, X, y='ignored'):
        print("Fasttext Transforms start at: %s" % (str(pd.datetime.now())))
        if self.skip_transform:
            return X
        if type(X) == pd.DataFrame:
            Input = X[self.token_column].values
        else:
            raise ValueError()
        if not self.inplace:
            X = X.copy()

        uniq_tokens = set(more_itertools.flatten(Input))
        print("Number of Unique Test Tokens for Fasttext transform %s" %
              len(uniq_tokens))
        if self.do_filter_tokens:
            uniq_tokens = uniq_tokens.intersection(self.word_set)
        print(
            "Number of Unique Test Tokens after filtering for Fasttext transform %s"
            % len(uniq_tokens))
        empty = np.full(self.size, 0)
        token2vec = {
            k: self.model.wv[k] if k in self.model.wv else empty
            for k in uniq_tokens
        }
        token2vec = {k: v / np.linalg.norm(v) for k, v in token2vec.items()}

        def tokens2vec(token_array):
            empty = np.full(self.size, 0)
            if len(token_array) == 0:
                return empty
            return [
                token2vec[token] if token in uniq_tokens else empty
                for token in token_array
            ]

        ft_vecs = list(map(tokens2vec, Input))

        results = list(
            map(
                lambda x: np.average(x, axis=0)
                if np.sum(x) != 0 else np.full(self.size, 0), ft_vecs))

        text_df = pd.DataFrame(list(map(list, results)))
        text_df.columns = [
            self.ft_prefix + str(i) for i in range(0, self.size)
        ]
        text_df.index = X.index
        X[list(text_df.columns)] = text_df
        gc.collect()
        print("Fasttext Transforms done at: %s" % (str(pd.datetime.now())))
        return X

    def inverse_transform(self, X, copy=None):
        raise NotImplementedError()

    def fit_transform(self, X, y='ignored'):
        self.fit(X)
        return self.transform(X)
Example #9
def save_dictionary(dic: corpora.Dictionary, filename: str) -> None:
    dic.save(filename)
    print("saved dictionary: {} items to {}".format(len(dic.values()),
                                                    filename))

# coherencemodel2 = CoherenceModel(
# model=lda, texts=data, dictionary=id2word, coherence='c_v')
# coherence_arr.append(coherencemodel2.get_coherence())

f2 = open('models_online.pkl', 'wb')

count = 0
# The loop simulates arrival of new documents from Google Alerts in batches of STEP_SIZE
for i in range(INITIAL_DOC_SIZE, len(data_lemmatized)-STEP_SIZE, STEP_SIZE):
    # new_docs is the list of STEP_SIZE new documents which have arrived
    new_docs = data_lemmatized[i:i+STEP_SIZE]
    pruned_docs = []
    for doc in new_docs:
        pruned_data = []
        for x in doc:
            if x in id2word.token2id:
                pruned_data.append(x)
        pruned_docs.append(pruned_data)

    new_docs = pruned_docs
    print('Pruning Done')
    # Updating Dictionary
    # id2word.add_documents(new_docs)
    # id2word.filter_extremes(no_below=5, no_above=0.95,
    #                         keep_n=1800)

    prev_corpus = copy.deepcopy(corpus)

    # Converting Documents to doc2bow format so that they can be fed to models
    corpus = [id2word.doc2bow(doc) for doc in new_docs]
    count += 1
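
    # Hedged sketch (an assumption, not part of the original snippet): `lda` is taken
    # to be a gensim LdaModel already fitted on the initial INITIAL_DOC_SIZE documents,
    # as the commented-out CoherenceModel call above suggests. Each arriving batch can
    # then be folded into the model with gensim's online update:
    lda.update(corpus)
    print('Updated online LDA model with batch %d' % count)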
Example #11
    p_stemmer = nltk.stem.porter.PorterStemmer()
    df_fon['stemmed_text'] = list(map(lambda sentence:
                                  list(map(p_stemmer.stem, sentence)),
                                  df_fon.lemmatized_text))

stem_words(df_Moive_train)



from gensim.corpora import Dictionary

# Vectorize the words
dictionary = Dictionary(documents=df_Moive_train.stemmed_text.values)

print("Bulunan kelimeler: {}".format(len(dictionary.values())))

dictionary.filter_extremes(no_above=0.8, no_below=3)

dictionary.compactify()  # Reindexes the remaining words after filtering
print("Words remaining: {}".format(len(dictionary.values())))



# one BOW (bag of words) per document

def document_to_bow(df_fon):
    df_fon['bow'] = list(map(lambda doc: dictionary.doc2bow(doc), df_fon.stemmed_text))
    
document_to_bow(df_Moive_train)