Example No. 1
def process_text(corpus, stoplist=None, bigrams=None, trigrams=None, keep_all=False, no_below=10, no_above=0.8):
    """
    Extracts text data from the corpus
    Cleans and tokenizes text data
    Computes most frequent phrases, creates a dictionary and converts the corpus to a BOW model
    :param corpus:
    :return: processed corpus with phrases, dictionary and BOW corpus
    """

    logging.info("Cleaned and tokenzed dataset")
    text_dataset = clean_and_tokenize(corpus, stoplist=stoplist, keep_all=keep_all)

    if bigrams is not None:
        bi_grams = Phrases(text_dataset, threshold=bigrams, min_count=no_below)
        text_dataset = bi_grams[text_dataset]
    elif trigrams is not None:
        # `bigrams` is None in this branch, so build the bigram pass with the default threshold
        bi_grams = Phrases(text_dataset, min_count=no_below)
        tri_grams = Phrases(bi_grams[text_dataset], threshold=trigrams)
        text_dataset = tri_grams[bi_grams[text_dataset]]

    dictionary = Dictionary(text_dataset)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    bow_corpus = [dictionary.doc2bow(text) for text in text_dataset]

    return text_dataset, dictionary, bow_corpus
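For reference, a minimal self-contained sketch of the same Phrases -> Dictionary -> BOW pipeline; the toy documents and thresholds below are illustrative and not part of the original code:

from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases

toy_docs = [["new", "york", "city"], ["new", "york", "taxi"], ["london", "city"]]
bi_grams = Phrases(toy_docs, min_count=1, threshold=1.0)    # detect frequent bigrams such as new_york
toy_docs = [bi_grams[doc] for doc in toy_docs]
dictionary = Dictionary(toy_docs)
dictionary.filter_extremes(no_below=1, no_above=0.8)        # same filtering call used by process_text
bow_corpus = [dictionary.doc2bow(doc) for doc in toy_docs]
print(bow_corpus)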
Example No. 2
    def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for data in self.valid + self.non_valid:
            all_words.append(data["title"] + data["content"])
        vocab = Dictionary(all_words)
        raw_vocab_size = len(vocab)

        vocab.filter_extremes(no_below=5)
        vocab.filter_extremes(keep_n=max_vocab_cnt)
        len_1_words = list(
            filter(
                lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w
                not in ["a", "i"], vocab.values()))
        vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words)))
        if self.config.use_dict == "seq" and self.config.enable_pad:
            vocab.token2id[PAD] = len(vocab)
            vocab.compactify()
            self.pad_wid = vocab.token2id.get(PAD)
        self.vocab_seq = vocab  # seq dictionary
        # build bow dictionary
        self.vocab_bow = copy.deepcopy(vocab)
        self.vocab_bow.filter_tokens(
            map(self.vocab_bow.token2id.get, STOPWORDS))  # filter stop words
        self.vocab_bow.compactify()
        if self.config.tfidf:
            tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words]
            self.tfidf_model = TfidfModel(tfidf_corpus)
        print("Load corpus with non_valid size %d, valid size %d, "
              "raw vocab size %d seq vocab size %d, bow vocab size %d" %
              (len(self.non_valid), len(self.valid), raw_vocab_size,
               len(self.vocab_seq), len(self.vocab_bow)))
Example No. 3
 def buildDictionary(self, corpus, txt2tokens, opts):
     '''
     Tokenize texts and add tokens to dictionary.
     :param corpus: Corpus-like or id
     :param txt2tokens: txt2tokens or id
     :param opts: GensimDictBuildOptions
     :param ctx: pytopia context
     :return: gensim Dictionary
     '''
     t = clock()
     corpus, txt2tokens = self.resolve(corpus, txt2tokens)
     # fill the dictionary with tokens from corpus texts
     dictionary = Dictionary(documents=None)
     numDocs = 0; numTokens = 0
     for txto in corpus:
         tokens = txt2tokens(txto.text)
         numDocs += 1; numTokens += len(tokens)
         dictionary.doc2bow(tokens, allow_update=True)
     # form filtering options and run filtering
     no_below = opts.docLowerLimit if opts.docLowerLimit is not None else 0
     if opts.docUpperLimit is None: no_above = 1.0
     elif isinstance(opts.docUpperLimit, float): no_above = opts.docUpperLimit
     else: no_above = opts.docUpperLimit/float(numDocs)
     if opts.words2keep is None: keep_n = numTokens
     else: keep_n = opts.words2keep
     dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                                keep_n=keep_n)
     dictionary.compactify()
     # force id2token map building
     someId = next(iter(dictionary.token2id.values()))
     dictionary[someId]
     return GensimDictAdapter(dictionary, corpus.id, txt2tokens.id, opts)
Example No. 4
def preprocess(tweets):
    tweet_list = [preprocess_one(tweet) for tweet in tweets]

    print("Passed initial Processing...")

    # Train bigrams/trigrams model only when there is a list of many tweets
    def n_grams(tweets):
        ngram = Phrases(tweets)
        for ind in range(len(tweets)):
            for word in ngram[tweets[ind]]:
                if '_' in word:
                    tweets[ind].append(word)

        return tweets

    tweet_list = n_grams(tweet_list)
    print("Passed ngram Processing...")
    # Use to create Bag-of-Words when possessing a list of tweets
    dictionary = Dictionary(tweet_list)
    print("Passed dictionary creation...")
    # Filter out words that occur in fewer than 10 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=10, no_above=0.5)
    corpus = [dictionary.doc2bow(tweet) for tweet in tweet_list]

    print("Number of Unique Words:", str(len(dictionary)))
    print("Number of documents:", str(len(corpus)))
    return tweet_list, dictionary, corpus
    def clean_docs(self, docs):
        """Removes uneccessary words (noise) or in this case words
        that will bring our models to the worse case scenario"""

        # Remove numbers, but not words that contain numbers.
        docs = [[token for token in doc if not token.isnumeric()]
                for doc in docs]

        # Remove words that are only one character.
        docs = [[
            token for token in doc
            if len(token) > 1 and token not in stop_words
        ] for doc in docs]

        # lemmatizer = WordNetLemmatizer()
        # docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

        # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
        bigram = Phrases(docs, min_count=20)
        for idx in range(len(docs)):
            for token in bigram[docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)

        # Create a dictionary representation of the documents.
        dictionary = Dictionary(docs)

        # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
        dictionary.filter_extremes(no_below=20, no_above=0.5)

        # Bag-of-words representation of the documents.
        corpus = [dictionary.doc2bow(doc) for doc in docs]

        return docs, dictionary, corpus
Example No. 6
def vectorize(corpus):
    tokenized = [Tokenizer.tokenize(doc) for doc in corpus]

    dictionary = Dictionary(tokenized)
    dictionary.filter_extremes(no_below=10, no_above=0.66)
    bows = [dictionary.doc2bow(doc) for doc in tokenized]
    return dictionary, bows
Example No. 7
    def __prep_texts(self, include_bigrams=False):
        print("--- Preparing Texts for Model ---\n")
        cleaned_text = str(self.text_column) + "_clean"
        if self.algo == 'gensim':
            doc_lst = self.processed_df[cleaned_text].tolist()
            doc_lst = [word_tokenize(str(doc)) for doc in doc_lst]

            if include_bigrams:
                # Compute bigrams.
                # Add bigrams to docs (as per the linked NPMI paper).
                bigram = Phrases(doc_lst, threshold=10e-5, scoring='npmi')
                for idx in range(len(doc_lst)):
                    for token in bigram[doc_lst[idx]]:
                        if '_' in token:
                            # Token is a bigram, add it to the document it came from.
                            doc_lst[idx].append(token)

            # Create Corpus
            dictionary = Dictionary(doc_lst)
            dictionary.filter_extremes(no_above=0.9)
            corpus = [dictionary.doc2bow(text) for text in doc_lst]
            self.texts = doc_lst
            self.dictionary = dictionary
            self.corpus = corpus
        else:
            doc_lst = self.processed_df[cleaned_text].tolist()
            self.texts = doc_lst
            vectorizer = CountVectorizer(strip_accents='unicode',
                                         max_df=0.9,
                                         lowercase=True)
            data_vectorized = vectorizer.fit_transform(self.texts)
            self.lda_vectorizer = vectorizer
            self.lda_dtm = data_vectorized
Example No. 8
class ExtraWordFilter(object):
    def __init__(self):
        self.dct = None
        self.stopwords = None

    def fit(self, docs, no_above, **kwargs):
        segmented_docs = [doc.lower().split() for item in docs for doc in item]
        self.dct = Dictionary(segmented_docs)
        self.dct.filter_extremes(no_above=no_above, **kwargs)
        print("Extra Dct size:{}".format(len(self.dct.token2id)))
        # print("Dct keys: {}".format(self.dct.token2id.keys()))
        return self.dct.token2id

    def transform(self, docs):
        segmented_docs = [[doc.split() for doc in item] for item in docs]
        transformed_docs = [[
            " ".join([
                word for word in doc
                if word.lower() in self.dct.token2id.keys()
                or word in string.punctuation
            ]) for doc in item
        ] for item in segmented_docs]
        return transformed_docs

    def fit_transform(self, docs, no_above, **kwargs):
        self.fit(docs, no_above, **kwargs)
        return self.transform(docs)
Example No. 9
def doc_embed_charity_notfidf(processed_docs, word_min=5, word_max_perc=.8):
    """Takes a list of preprocessed texts and returns an embedding vector for each document,
    a dictionary of the words within the corpus, and the GloVe vectors for each word in the corpus."""

    # Create dictionary from corpus
    docs_dict = Dictionary(processed_docs)
    docs_dict.filter_extremes(no_below=word_min, no_above=word_max_perc)
    docs_dict.compactify()

    # Convert docs into a sparse matrix (N_docs x N_words in dictionary) where each cell holds the number of times that word appeared in that document
    docs_corpus = [docs_dict.doc2bow(doc) for doc in processed_docs]
    docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_corpus])

    #Count number of documents and words in dictionary
    num_docs = np.shape(docs_vecs)[0]
    num_words = np.shape(docs_vecs)[1]

    print("Total # of docs: {}".format(num_docs))
    print("Total # of words in dict: {}".format(num_words))

    # For each word in dict extract embedding vector (Glove vectors)
    glove_vecs = np.vstack([nlp(docs_dict[i]).vector for i in range(len(docs_dict))])

    # Sum glove vectors over words in doc
    docs_emb = np.dot(docs_vecs, glove_vecs)

    return docs_emb, docs_dict, glove_vecs
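The same BOW-matrix-times-word-vectors idea in a self-contained form; random vectors stand in here for the GloVe embeddings that the original code pulls from the spaCy `nlp` object:

import numpy as np
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

docs = [["cat", "sat", "mat"], ["dog", "sat", "log"], ["cat", "dog"]]
docs_dict = Dictionary(docs)
docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_corpus])  # docs x vocabulary counts

rng = np.random.default_rng(0)
word_vecs = rng.normal(size=(len(docs_dict), 50))   # stand-in for the GloVe vectors, one row per token
docs_emb = docs_vecs.dot(word_vecs)                 # each document = count-weighted sum of word vectors
print(docs_emb.shape)                               # (3, 50)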
 class MyCorpus(object):
     def __init__(self, input_file, K):
         self.K = K
         self.input_file = input_file
         self.dictionary = Dictionary()
         with open(input_file, "rt") as f:
             for line in f:
                 self.dictionary.add_documents([line.split()])
         self.dictionary.filter_extremes(no_below = 2, no_above = 0.5, keep_n = K)
                 
     def __iter__(self):
         with open(self.input_file, "rt") as f:
             for line in f:
                 yield self.dictionary.doc2bow(line.rstrip().split())
                 
     def __str__(self):
         s = "MyCorpus(" + str(self.dictionary.num_docs) + " documents, "
         s += str(len(self.dictionary.keys())) + " features, "
         s += str(self.dictionary.num_nnz) + " non-zero entries)"
         return s
         
     def __repr__(self):
         return "MyCorpus('" + self.input_file + "', " + str(self.K) + ")"
Example No. 11
def main():
    doc = get_doc()
    print('doc len:', len(doc))

    train_texts = list(build_texts(doc))
    print('train len:', len(train_texts))

    bigram = gensim.models.Phrases(
        train_texts, min_count=10)  # for bigram collocation detection
    stops = set(stopwords.words('english'))  # nltk stopwords list

    train_texts = process_texts(train_texts, bigram, stops)
    print('bigramed train_texts', len(train_texts))
    vocabulary = Dictionary(train_texts)
    print('vocab size:', len(vocabulary))
    # remove extremes
    vocabulary.filter_extremes(
        no_below=3, no_above=0.3
    )  # remove words that appear in fewer than 3 documents or in more than 30% of documents
    #vocabulary.filter_n_most_frequent(50)  # optionally filter out the 50 most common tokens
    # filter_tokens(bad_ids=None, good_ids=None)
    corpus = [vocabulary.doc2bow(text) for text in train_texts]
    print('corpus size:', len(corpus))
    lda = LdaModel(corpus=corpus,
                   id2word=vocabulary,
                   num_topics=10,
                   chunksize=1500,
                   iterations=200,
                   alpha='auto')
    print(
        pd.DataFrame([[word for rank, (word, prob) in enumerate(words)]
                      for topic_id, words in lda.show_topics(
                          formatted=False, num_words=6, num_topics=35)]))
Example No. 12
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # get ids for short words len(word)<=3
    shortword_ids = [tokenid for tokenid, word in dictionary.items() if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()
    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
Example No. 13
def make_dict_and_corpus(tweets, upper_limit, lower_limit):
    twitter_wakati_texts = wakati_tweets(tweets)
    dictionary = Dictionary(twitter_wakati_texts)
    if upper_limit is not None and lower_limit is not None:
        dictionary.filter_extremes(no_below=lower_limit, no_above=upper_limit)
    corpus = [dictionary.doc2bow(t) for t in twitter_wakati_texts]
    return dictionary, corpus
Example No. 14
class LdaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, dim = 2, column = 'whole'):
        self.dim = dim
        self.column = column
    def fit(self, X, y=None):     
        lda_tokens = X[self.column].apply(lambda x: x.split())
        # create Dictionary and train it on text corpus
        self.lda_dic = Dictionary(lda_tokens)
        self.lda_dic.filter_extremes(no_below=10, no_above=0.6, keep_n=8000)
        lda_corpus = [self.lda_dic.doc2bow(doc) for doc in lda_tokens]
        # create TfidfModel and train it on text corpus
        self.lda_tfidf = TfidfModel(lda_corpus)
        lda_corpus = self.lda_tfidf[lda_corpus]
        # create LDA Model and train it on text corpus
        self.lda_model = LdaMulticore(
            lda_corpus, num_topics=self.dim, id2word=self.lda_dic, workers=4,
            passes=20, chunksize=1000, random_state=0
        )
        return self
    
    def transform(self, X, y=None):
        lda_emb_len = len(self.lda_model[[]])
        lda_corpus = [self.lda_dic.doc2bow(doc) for doc in X[self.column].apply(lambda x: x.split())]
        lda_corpus = self.lda_tfidf[lda_corpus]
        lda_que_embs = self.lda_model.inference(lda_corpus)[0]
        # append lda question embeddings
        out = np.zeros((len(X), lda_emb_len))
        for i in range(lda_emb_len):
            out[:, i] = lda_que_embs[:, i]
        return out
Example No. 15
def prepare_corpus(tweets_file, corpus_file, dictionary_file, author_topic):
    stop_words = set(stopwords.words('english'))
    stop_words.add(u'rt')

    print('Loading tweets from ' + tweets_file)
    tweets = pd.read_pickle(tweets_file)

    if author_topic:
        tweets = tweets.groupby('user').agg({'text': 'sum'})

    print('%d tweets loaded' % len(tweets.index))

    dictionary = Dictionary(tweets['text'])
    stopword_ids = map(dictionary.token2id.get, stop_words)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc) for doc in tweets['text']]

    # print(corpus)
    print("Writing corpus to " + corpus_file)
    MmCorpus.serialize(corpus_file, corpus)
    # print(dictionary)
    print("Writing dictionary to " + dictionary_file)

    dictionary.save(dictionary_file)
Example No. 16
def filtrar_extremos(docs, max_freq=0.5, min_wordcount=2, n_top=3):
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)
    dictionary.filter_n_most_frequent(n_top)
    _ = dictionary[0]

    return dictionary
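A short aside on the `_ = dictionary[0]` line above: gensim builds the reverse `id2token` map lazily, and any indexing call forces it to be populated. A tiny sketch of that behaviour:

from gensim.corpora import Dictionary

d = Dictionary([["a", "b"], ["b", "c"]])
print(d.id2token)   # {} since the reverse map starts empty
_ = d[0]            # any __getitem__ call populates it
print(d.id2token)   # now maps ids back to tokens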
Example No. 17
 def _prepare(self, dataset):
     docs = dataset
     dictionary = Dictionary(docs)
     dictionary.filter_extremes(no_below=2, no_above=0.5)
     corpus = [dictionary.doc2bow(doc) for doc in docs]
     _ = dictionary[0]
     return corpus, dictionary
Example No. 18
class TFIDF():
    def __init__(self):
        pass

    def preprocess_tfidf(self):
        return [process_text(r) for r in get_db_records()]

    def create_tfidf_model(self):
        self.dataset = self.preprocess_tfidf()
        self.dct = Dictionary(self.dataset)
        self.dct.filter_extremes(no_below=50)
        corpus = [self.dct.doc2bow(line) for line in self.dataset]
        self.model = TfidfModel(corpus)

    def infer_tfidf(self):
        def infer(vector):
            dim = max(self.dct.keys()) + 1
            text1 = self.model[self.dct.doc2bow(vector)]
            t1 = []
            for d in range(dim):
                t1_val = [i[1] for i in text1 if i[0] == d]
                if len(t1_val) == 1:
                    t1.append(t1_val[0])
                else:
                    t1.append(0)
            return t1

        return infer

    @staticmethod
    def load(filename):
        with open(filename, "rb") as f:
            return pickle.load(f)
Example No. 19
def texts2corpus(documents,
                 tfidf=False,
                 stopwords=None,
                 filter_below=5,
                 filter_above=0.5,
                 keep_n=100000,
                 logg=print):
    logg(f'generating {"tfidf" if tfidf else "bow"} corpus and dictionary')

    dictionary = Dictionary(documents, prune_at=None)
    dictionary.filter_extremes(no_below=filter_below,
                               no_above=filter_above,
                               keep_n=keep_n)

    # filter some noise (e.g. special characters)
    if stopwords:
        stopword_ids = [dictionary.token2id[token] for token in stopwords if token in dictionary.token2id]
        dictionary.filter_tokens(bad_ids=stopword_ids, good_ids=None)

    bow_corpus = [dictionary.doc2bow(text) for text in documents]
    if tfidf:
        tfidf_model = TfidfModel(bow_corpus)
        corpus = tfidf_model[bow_corpus]
    else:
        corpus = bow_corpus

    return corpus, dictionary
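A self-contained sketch of the same bow-vs-tfidf switch on a toy corpus (documents and thresholds are illustrative only):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

documents = [["data", "science", "python"], ["python", "code"], ["data", "code", "review"]]
dictionary = Dictionary(documents)
dictionary.filter_extremes(no_below=1, no_above=0.9, keep_n=100000)
bow_corpus = [dictionary.doc2bow(text) for text in documents]
tfidf_corpus = TfidfModel(bow_corpus)[bow_corpus]    # lazily re-weights each BOW vector
print(list(tfidf_corpus)[0])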
Example No. 20
def preprocess(docs, no_below=20, no_above=0.7):
    # input is an array of docs; each one is a single string
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
    # Remove words that are less than three characters
    docs = [[token for token in doc if len(token) > 2] for doc in docs]

    # Remove short words that are not in the dictionary
    docs = [[token for token in doc if len(token) > 4 or enchantdict.check(token)] for doc in docs]

    # Lemmatize all words in documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    # Delete words based on their frequency in the whole corpus
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    #set_trace()
    # Filter out words that occur in fewer than 20 documents, or in more than 70% of the documents.
    dictionary.filter_extremes(no_below, no_above)

    # According to the filtered dictionary, reconstruct the corpus
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return corpus, dictionary
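To see what `filter_extremes` actually removes, the document frequencies in `dictionary.dfs` can be inspected before and after filtering; a toy sketch with illustrative thresholds:

from gensim.corpora import Dictionary

docs = [["apple", "pie"], ["apple", "cake"], ["apple", "tart"], ["banana", "pie"]]
dictionary = Dictionary(docs)
print({dictionary[i]: df for i, df in dictionary.dfs.items()})  # apple appears in 3 of 4 docs
dictionary.filter_extremes(no_below=2, no_above=0.7)            # drop rare words and words in >70% of docs
print(list(dictionary.token2id))                                # only 'pie' survives these thresholds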
Example No. 21
    def _create_from_texts(cls,
                           tokenized_texts,
                           name,
                           dataset,
                           settings,
                           minimum_frequency=2):
        from gensim.corpora import Dictionary as GensimDictionary

        # build a dictionary of features
        logger.info("Creating features (including n-grams) from texts")
        gemsim_dictionary = GensimDictionary(tokenized_texts)

        # Remove extremely rare features
        logger.info("Features dictionary contains %d features. Filtering..." %
                    len(gemsim_dictionary.token2id))
        gemsim_dictionary.filter_extremes(no_below=minimum_frequency,
                                          no_above=1,
                                          keep_n=None)
        gemsim_dictionary.compactify()
        logger.info("Features Dictionary contains %d features." %
                    len(gemsim_dictionary.token2id))

        dict_model = cls(name=name, dataset=dataset, settings=settings)
        dict_model.save()

        dict_model._populate_from_gensim_dictionary(gemsim_dictionary)

        return dict_model
Example No. 22
def preprocess(documents,
               stem=False,
               vocab_size=10000,
               oov_token="<OOV>",
               oov_id=-1):
    """Preprocess documents.

    Args:
        documents: An array of strings, each string representing a document.
        stem: (bool) Whether to use a stemmer. Defaults to False.


    Returns:
        (gensim Dictionary, tokenized documents)
    """
    porter_stemmer = PorterStemmer()

    def process_document(doc):
        tokens = word_tokenize(doc)
        tokens = [token.lower() for token in tokens if token.isalpha()]
        if stem:
            tokens = [porter_stemmer.stem(token) for token in tokens]
        return tokens

    tokenized_docs = list(map(process_document, documents))

    dictionary = Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=5, no_above=0.8, keep_n=vocab_size)

    # Add the OOV token to the dictionary
    dictionary.add_documents([[oov_token]])

    return dictionary, tokenized_docs
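A sketch of how the returned dictionary, `oov_token` and `oov_id` are typically combined when converting tokens to ids (this mapping step is not part of the snippet above):

from gensim.corpora import Dictionary

oov_token, oov_id = "<OOV>", -1
tokenized_docs = [["solar", "power", "storage"], ["wind", "power"]]
dictionary = Dictionary(tokenized_docs)
dictionary.add_documents([[oov_token]])              # reserve an explicit OOV entry, as above
ids = [[dictionary.token2id.get(tok, oov_id) for tok in doc + ["unseen"]]
       for doc in tokenized_docs]
print(ids)                                           # unknown tokens map to -1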
Example No. 23
def parse_processed_amazon_dataset(task_files, max_words=10000):
    """
    Code inspired by:
    https://github.com/sclincha/xrce_msda_da_regularization
    """
    datasets = {}
    dico = GensimDict()
    print("Parsing", task_files)

    # First pass on document to build dictionary
    for fname in task_files:
        with open(fname, 'r') as f:
            for l in f:
                tokens = l.split(' ')
                tokens_list = []
                for tok in tokens[:-1]:
                    ts, tfreq = tok.split(':')
                    freq = int(tfreq)
                    tokens_list += [ts] * freq
                dico.doc2bow(tokens_list, allow_update=True)

    # Preprocessing_options
    dico.filter_extremes(no_below=2, keep_n=max_words)
    dico.compactify()

    for fname in task_files:
        X, Y = [], []

        with open(fname, 'r') as f:
            for docid, l in enumerate(f):
                tokens = l.split(' ')
                label_string = tokens[-1]
                tokens_list = []
                for tok in tokens[:-1]:
                    ts, tfreq = tok.split(':')
                    freq = int(tfreq)
                    tokens_list += [ts] * freq
                count_list = dico.doc2bow(tokens_list, allow_update=False)

                idx, freqs = list(zip(*count_list))
                one_hot = np.zeros(max_words)
                one_hot[list(idx)] = np.array(freqs)

                X.append((docid, one_hot))

                #Preprocess Label
                ls, lvalue = label_string.split(':')
                if ls == "#label#":
                    if lvalue.rstrip() == 'positive':
                        Y.append(1)
                    elif lvalue.rstrip() == 'negative':
                        Y.append(0)
                    else:
                        raise Exception("Invalid Label Value")
                else:
                    raise Exception('Invalid Format')

        datasets[os.path.split(os.path.split(fname)[0])[-1]] = (X, Y)

    return datasets, dico
Example No. 24
def prepare_LDA_input(corpus, LDA_model):
    # Prepare input to LDA model
    corpus = [clean_text(text).split() for text in corpus]
    dict_corpus = Dictionary(corpus)
    dict_corpus.filter_extremes(no_below=5, no_above=0.3, keep_n=None)
    bow_corpus = [dict_corpus.doc2bow(c) for c in corpus]
    
    # Get topic-doc vector
    LDA_input = []
    for doc in bow_corpus:
        LDA_input.append(LDA_model.get_document_topics(doc))
    
    # Add missing probabilities
    for doc in LDA_input:
        index = []
        true_index = set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
        for i in range(len(doc)):
            index.append(doc[i][0])
        new_index = true_index - set(index)
        for j in new_index:
            doc.extend([(j, 0.0)])
        doc.sort() 
        
    # Create input matrix
    LDA_doc = []
    for doc in LDA_input:
        LDA_doc.append(np.asarray([prob for _, prob in doc], dtype='float32'))
    LDA_doc = np.array(LDA_doc)
    return LDA_doc
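The manual padding above can usually be avoided, since `get_document_topics` takes a `minimum_probability` argument; passing `minimum_probability=0.0` returns (effectively) one probability per topic. A small self-contained sketch:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["tax", "policy", "budget"], ["match", "goal", "league"], ["budget", "league"]]
dictionary = Dictionary(docs)
bow = [dictionary.doc2bow(d) for d in docs]
lda = LdaModel(bow, id2word=dictionary, num_topics=3, passes=10, random_state=0)

vec = lda.get_document_topics(bow[0], minimum_probability=0.0)  # one (topic, prob) pair per topic
print(vec)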
Example No. 25
class LDATransformer:
    """Preps data for LDA.
    TODO: add options to slim down vocab and filter words. Also make the methods more efficient.
    """
    def fit(self, texts):
        docs = [preprocess_string(d) for d in texts]
        self.vocab = Dictionary(docs)
        self.vocab.filter_extremes()
        return self

    def transform(self, docs):
        """TODO: speed up for loop."""
        all_docs = []
        i = 0
        for d in docs:
            words = preprocess_string(d)
            id_ct = self.vocab.doc2bow(words)
            if len(id_ct) < 1:
                continue
            else:
                id, ct = zip(*id_ct)
                all_docs.extend([(i, j) for j in id])
                i += 1
        return all_docs
Example No. 26
File: util.py Project: Badodon/FFNN
def load_data(fname):
    
    print 'input file name:', fname

    target = [] # labels
    source = [] # document vectors

    # build the document list
    document_list = []
    word_list = []
    for l in open(fname, 'r').readlines():
        sample = l.strip().split(' ',  1)
        label = sample[0]
        target.append([label]) # label
        word_list = preprocess_string(sample[1]) # remove stopwords, stemming
        document_list.append(word_list) # word list for each document
    
    # build the dictionary
    # drop low- and high-frequency words
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # vectorize each document as a BOW vector
    for doc in document_list:
        tmp = dct.doc2bow(doc) # ex.[(4, 1), (23,1),..., (119,2)] 
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)    
    dataset['source'] = np.array(source)    

    return dataset #, max_len, width
Example No. 27
class vectorizer:
    num_topics = 5000

    def __init__(self):
        pass

    def fit_transform(self, data):
        data = [simple_preprocess(x, deacc=True) for x in data]

        phrases = Phrases(data, threshold=10)
        self.phraser = Phraser(phrases)
        data = self.phraser[data]

        self.dct = Dictionary(data)
        self.dct.filter_extremes(keep_n=self.num_topics)
        docs_bow = [self.dct.doc2bow(line) for line in data]

        #self.tfidf = TfidfModel(docs_bow)
        #vectors = list(self.tfidf[docs_bow])

        self.lsimodel = None
        #		self.lsimodel = LsiModel(corpus=vectors, num_topics=self.num_topics)

        retorno = [convert2dense(x, self.num_topics) for x in docs_bow]

        return retorno

    def transform(self, text):
        text = simple_preprocess(text, deacc=True)
        palavras = self.phraser[text]
        bow = self.dct.doc2bow(palavras)
        return convert2dense(bow, self.num_topics)
Example No. 28
def indexing(Corpus, keep_n, length,
             samples):  #samples => number of samples in the corpus to take
    sentences = []  #list of list of words
    for i in range(len(Corpus)):
        sentences.append(
            Corpus['Output'][i]
        )  #puts all the lists of words(Output[i]) into the "sentences" list
    dct = Dictionary(sentences)
    dct.filter_extremes(keep_n=keep_n)  #keeps the top n words
    dictionary = (dct.token2id
                  )  #dictionary now has all the words mapped to a number
    #newsentences will be a list of lists that only includes the top n indexes
    newsentences = []
    for sentence in sentences:  #for each list in the sentences list
        newsentence = [keep_n]  #pad the sentence with the </s> token
        for item in sentence:  #for each word in the sentence
            if (item in dictionary):  #check if the word is top n frequent word
                newsentence.append(dictionary[item])  #append
            else:
                pass  #otherwise do nothing
        newsentences.append(
            newsentence
        )  #append the sentence after each word has been iterated through
    #all the sentences in X_train are of length 10, ie if longer - truncate, if shorter - pad
    X_train = sequence.pad_sequences(newsentences,
                                     maxlen=length,
                                     value=3001,
                                     padding="post",
                                     truncating="post")
    temp = []
    for i in range(samples):
        temp.append(one_hot(X_train[i]))
    return temp
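gensim's `Dictionary.doc2idx` performs the word-to-index mapping in one call, with an explicit index for out-of-vocabulary words, which could replace the manual lookup loop above (padding/truncation would still be a separate step):

from gensim.corpora import Dictionary

sentences = [["to", "be", "or", "not", "to", "be"], ["be", "happy"]]
dct = Dictionary(sentences)
dct.filter_extremes(no_below=1, no_above=1.0, keep_n=3)     # keep only the 3 most frequent tokens
indexed = [dct.doc2idx(s, unknown_word_index=-1) for s in sentences]
print(indexed)                                              # out-of-vocabulary words become -1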
Example No. 29
def generate_tfidf_commit(
        repository: Repository,
        stopwords_: Set[str],
        min_len,
        cache=None) -> Tuple[tfidfmodel.TfidfModel, Dictionary, Dict]:
    if cache is None:
        cache = dict()

    texts = list()
    for commit in repository.commits:
        if commit.c_hash in cache.keys():
            texts.append(cache[commit.c_hash])
        else:
            text = text_pipeline(commit, stopwords_, min_len)
            texts.append(text)
            cache[commit.c_hash] = text
    for issue_ in repository.issues:
        if issue_.id_ in cache.keys():
            texts.append(cache[issue_.id_])
        else:
            text = text_pipeline(issue_, stopwords_, min_len)
            texts.append(text)
            cache[issue_.id_] = text

    dictionary_ = Dictionary(texts)
    dictionary_.filter_extremes(no_below=3, no_above=0.95)
    working_corpus = [
        dictionary_.doc2bow(text, return_missing=True) for text in texts
    ]
    # Convert UNK from explicit dictionary to UNK token (id = -1)
    working_corpus = [
        val[0] + [(-1, sum(val[1].values()))] for val in working_corpus
    ]
    return tfidfmodel.TfidfModel(working_corpus,
                                 id2word=dictionary_), dictionary_, cache
def embed(sent_words, path_word_ind, path_word_vec, path_embed):
    model = Dictionary(sent_words)
    model.filter_extremes(no_below=min_freq, no_above=1.0, keep_n=max_vocab)
    word_inds = model.token2id
    #print (word_inds)
    # random arrangement of the indices
    word_inds = tran_dict(word_inds, off=2)

    with open(path_word_ind, 'wb') as f:
        pk.dump(word_inds, f)
    # output
    #print (word_inds)
    with open(path_word_vec, 'rb') as f:
        word_vecs = pk.load(f)
    #print (word_vecs)
    vocab = word_vecs.vocab
    print (word_vecs['A'].shape)
    #200
    vocab_num = min(max_vocab + 2, len(word_inds) + 2)
    embed_mat = np.zeros((vocab_num, embed_len))
    for word, ind in word_inds.items():
        if word in vocab:
            if ind < max_vocab:
                embed_mat[ind] = word_vecs[word]
                # embedding values are taken from word_vecs

    print (embed_mat.shape)
    #(3571,200)
    with open(path_embed, 'wb') as f:
        pk.dump(embed_mat, f)
Example No. 31
 def testFilter(self):
     d = Dictionary(self.texts)
     d.filter_extremes(no_below=2, no_above=1.0, keep_n=4)
     dfs_expected = {0: 3, 1: 3, 2: 3, 3: 3}
     cfs_expected = {0: 4, 1: 3, 2: 3, 3: 3}
     self.assertEqual(d.dfs, dfs_expected)
     self.assertEqual(d.cfs, cfs_expected)
Example No. 32
def pipeline_lda(que: pd.DataFrame,
                 dim: int) -> (Dictionary, TfidfModel, LdaMulticore):
    """
    Pipeline for training embeddings for questions via LDA algorithm
    on question titles and bodies

    :param que: raw questions.csv dataset
    :param dim: number of LDA topics (the dimension of the question embeddings)
    :return: trained Dictionary, TfidfModel and LdaMulticore model
    """
    lda_tokens = que['questions_whole'].apply(lambda x: x.split())

    # create Dictionary and train it on text corpus
    lda_dic = Dictionary(lda_tokens)
    lda_dic.filter_extremes(no_below=10, no_above=0.6, keep_n=8000)
    lda_corpus = [lda_dic.doc2bow(doc) for doc in lda_tokens]

    # create TfidfModel and train it on text corpus
    lda_tfidf = TfidfModel(lda_corpus)
    lda_corpus = lda_tfidf[lda_corpus]

    # create LDA Model and train it on text corpus
    lda_model = LdaMulticore(lda_corpus,
                             num_topics=dim,
                             id2word=lda_dic,
                             workers=4,
                             passes=20,
                             chunksize=1000,
                             random_state=0)

    return lda_dic, lda_tfidf, lda_model
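Once trained, the three returned objects are chained to embed a new question; a usage sketch assuming `pipeline_lda` has already been run on the questions DataFrame (the question text below is a placeholder):

lda_dic, lda_tfidf, lda_model = pipeline_lda(que, dim=10)   # `que` as described in the docstring
new_tokens = "how do i prepare for a software interview".split()
bow = lda_dic.doc2bow(new_tokens)
topic_vec = lda_model[lda_tfidf[bow]]                       # list of (topic_id, probability) pairs
print(topic_vec)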
Example No. 33
    def _create_from_texts(cls,
                           tokenized_texts,
                           name,
                           dataset,
                           settings,
                           minimum_frequency=2):
        from gensim.corpora import Dictionary as GensimDictionary

        # build a dictionary
        logger.info("Building a dictionary from texts")
        dictionary = GensimDictionary(tokenized_texts)

        # Remove extremely rare words
        logger.info("Dictionary contains %d words. Filtering..." %
                    len(dictionary.token2id))
        dictionary.filter_extremes(no_below=minimum_frequency,
                                   no_above=1,
                                   keep_n=None)
        dictionary.compactify()
        logger.info("Dictionary contains %d words." % len(dictionary.token2id))

        dict_model = cls(name=name, dataset=dataset, settings=settings)
        dict_model.save()

        dict_model._populate_from_gensim_dictionary(dictionary)

        return dict_model
Example No. 34
def topic_model(docs):
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    # Set training parameters.
    num_topics = 10
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    print("Training LDA Model ...")
    model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     chunksize=chunksize,
                     alpha='auto',
                     eta='auto',
                     iterations=iterations,
                     num_topics=num_topics,
                     passes=passes,
                     eval_every=eval_every)

    return model.top_topics(corpus)
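`top_topics` returns each topic together with its coherence score, so a common follow-up (as in the gensim LDA tutorial) is to average those scores; assuming `docs` is the tokenized corpus passed in above:

top_topics = topic_model(docs)
avg_topic_coherence = sum(coherence for _, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f' % avg_topic_coherence)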
Example No. 35
 def test_run(self, data):
     dictionary = Dictionary(data)
     dictionary.filter_extremes(no_above=0.5)
     bags_of_words = [ dictionary.doc2bow(t) for t in data]
     #This can take a while to run:
     lda = LdaModel(bags_of_words, id2word = dictionary, num_topics=30, passes=2)
     results = self.assemble_topics(lda)
     return results
Example No. 36
def small_word_conv(dataset_path):
    docs, y, test_docs, test_y = nli2013_train_test_split(dataset_path)

    logging.info('preprocessing, padding and binarizing data ...')
    docs = [flatten([sent.split() for sent in doc.split('\n') if sent.strip() != '']) for doc in docs]
    test_docs = [flatten([sent.split() for sent in doc.split('\n') if sent.strip() != '']) for doc in test_docs]

    vocab = Dictionary(docs)
    vocab.filter_extremes(keep_n=5000)
    bin = LabelBinarizer()

    x = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                for s in docs],
                               max_length=100, padding_word=0))
    y = bin.fit_transform(y)

    test_x = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                     for s in test_docs],
                                    max_length=100, padding_word=0))
    test_y = bin.transform(test_y)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(11, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x, y, batch_size=32, nb_epoch=10, validation_data=[test_x, test_y])

    print(accuracy_score(np.argwhere(test_y)[:, 1], model.predict_classes(test_x)))
def build_dictionary():
    corpus = CorpusIterator(dir_list=dir_list)

    dictionary = Dictionary(corpus)

    dictionary.save_as_text(
        '/home/andre/Develop/corpora/lsamodel_wordids.txt.bz2')

    dictionary.filter_extremes(no_below=10, no_above=0.1, keep_n=500000)

    dictionary.save_as_text(
        '/home/andre/Develop/corpora/lsamodel_wordids_filtered.txt.bz2')
 def testFilterKeepTokens_keepn(self):
     # keep_tokens should also work if the keep_n parameter is used, but only
     # up to a maximum of n tokens (so if keep_n is smaller than the number of
     # tokens to keep, tokens are still removed to reduce the size to keep_n!)
     d = Dictionary(self.texts)
     # Note: there are four tokens with frequency 3; all the others have frequency 2
     # in self.texts. In order to make the test result deterministic, we add
     # 2 tokens of frequency one
     d.add_documents([['worda'], ['wordb']])
     # this should keep the 3 tokens with freq 3 and the one we want to keep
     d.filter_extremes(keep_n=5, no_below=0, no_above=1.0, keep_tokens=['worda'])
     expected = {'graph', 'trees', 'system', 'user', 'worda'}
     self.assertEqual(set(d.token2id.keys()), expected)
Example No. 39
def build_corpora(db):
    dictionary = Dictionary()
    corpus = []
    for article in db.articles.find():
        text = article['clean_text']
        dictionary.doc2bow(text, allow_update=True)
    dictionary.filter_extremes()
    for article in db.articles.find():
        text = article['clean_text']
        corpus.append(dictionary.doc2bow(text))
    gensim.corpora.MmCorpus.serialize('data/corpus.mm', corpus)
    dictionary.save('data/cnn.dict')
    return corpus, dictionary
def dbpedia_convgemb(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    bin = LabelBinarizer()

    x_train = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in train_docs],
                                     max_length=100, padding_word=0))
    y_train = bin.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in test_docs],
                                     max_length=100, padding_word=0))
    y_test = bin.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100, dropout=.2, weights=[emb_weights], trainable=False))
    model.add(Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train, y_train)

    print(accuracy_score(np.argwhere(y_test)[:,1], model.predict_classes(x_test)))
Example No. 41
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
  print('Building dictionary...')
  dictionary = Dictionary(docs)
  stopwords = nltk_stopwords().union(additional_stopwords)
  stopword_ids = map(dictionary.token2id.get, stopwords)
  dictionary.filter_tokens(stopword_ids)
  dictionary.compactify()
  dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
  dictionary.compactify()

  print('Building corpus...')
  corpus = [dictionary.doc2bow(doc) for doc in docs]

  return dictionary, corpus
Example No. 42
def main():
    global dictionary
    try:
        dictionary = Dictionary.load_from_text("persist/reuters_dictionary.txt")
        #dictionary = Dictionary.load_from_text("persist/wiki_stem-False_keep-100000_nobelow-20_noabove-0.1_wordids.txt.bz2")

    except:
        dictionary = Dictionary(ReutersCorpus())
        dictionary.filter_extremes()
        dictionary.save_as_text("persist/reuters_dictionary.txt")

    models = train_models()

    if settings["models"]["bow"]:
        bowmodel = BOWmodel()
        bowmodel.__out_size = len(dictionary)
        models["bow"] = bowmodel

    if settings["models"]["noise"]:
        noisemodel = NoiseModel(1000)
        noisemodel.__out_size = 1000
        models["noise"] = noisemodel

    num_train_samples = 21578 - settings["held_out_docs"]
    test_samples = []


    class generate_train_samples(object):
        first_iteration = True

        def __iter__(self):
            count = 0
            for document in stream_reuters_documents():
                sample = document["content"], "acq" in document["topics"]  # todo: maybe try "usa" or "earn"
                if count > num_train_samples:
                    if self.first_iteration:
                        test_samples.append(sample)
                else:
                    yield sample
                count += 1
            self.first_iteration = False

    classifiers = train_classifiers(models, generate_train_samples())

    classifications = run_evaluation(classifiers, models, test_samples)
    #output_results(classifications)

    return classifications
Example No. 43
    def create(pathtomapping, pathtocorpus, corpusname, window, numtokeep=50000, save_raw=True, shifts=(1, 5, 10)):
        """
        Creates a Shifted Positive Pointwise Mutual Information (SPPMI) matrix.

        :param pathtomapping: The path to the id2word mapping. If this is left empty, the id2word mapping gets
        recreated. Warning: this takes a long time.
        :param pathtocorpus: The path to the corpus folder. The corpus can be spread out over multiple files or folders,
        and is read iteratively.
        :param corpusname: The name of the corpus. Used for saving the files.
        :param window: The window used to consider co-occurrences.
        :param numtokeep: The number of most frequent words to keep. Note that the matrix is non-sparse.
        Because of this, the memory requirements of the code are quadratic.
        :param save_raw: Whether to save the raw co-occurrence matrix as a numpy matrix.
        :param shifts: The shifts to apply to the co-occurrence matrix. Each shifted matrix
        gets saved as a separate model.
        """

        start = time.time()

        if not pathtomapping:
            id2word = Dictionary(SentenceIter(pathtocorpus), prune_at=None)
            id2word.filter_extremes(no_below=5, keep_n=numtokeep)
            id2word.compactify()
            logger.info("Creating the word2id took {0} seconds".format(time.time() - start))
        else:
            id2word = Dictionary.load(pathtomapping)

        inter = time.time()

        word2id = gensim.utils.revdict(id2word)

        corpus = SentenceIter(pathtocorpus)
        raw = get_cooccur(corpus, word2id, window=window)

        logger.info("Creating raw co-occurrence matrix took {0} seconds".format(time.time() - inter))

        if save_raw:
            np.save('{0}-cooccur.npy'.format(corpusname), raw)

        SPPMIFactory._save_word2id(word2id, "{0}mapping.json".format(corpusname))
        SPPMIFactory._save_freqs(id2word, "{0}freqs.json".format(corpusname))

        raw = SPPMIFactory.raw2pmi(raw)

        for k in shifts:
            sparse = SPPMIFactory.shift_clip_pmi(np.copy(raw), k_shift=k)
            SPPMIFactory._save_sparse_mtr(sparse, "{0}-SPPMI-sparse-{1}-shift.npz".format(corpusname, k))
            del sparse
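For context, the shift-and-clip step referenced above is just max(PMI - log k, 0) applied element-wise; a small numpy sketch of that transform (SPPMIFactory's own raw2pmi/shift_clip_pmi implementations are not reproduced here):

import numpy as np

def shift_clip_pmi(pmi, k_shift=5):
    """Shifted Positive PMI: subtract log(k) and clip negative values to zero."""
    return np.maximum(pmi - np.log(k_shift), 0.0)

pmi = np.array([[2.3, -0.4], [0.1, 4.2]])
print(shift_clip_pmi(pmi, k_shift=5))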
Example No. 44
def train_lda_model(articles, num_topics=10):
    docs = [article_to_bow(a) for a in articles]

    dict = Dictionary(docs)
    dict.filter_extremes()
    dict.compactify()

    corpus = [dict.doc2bow(article_to_bow(a)) for a in articles]

    tfidf = TfidfModel(corpus=corpus, id2word=dict)

    w_corpus = [tfidf[doc] for doc in corpus]

    lda = LdaModel(corpus=w_corpus, num_topics=num_topics,
                   update_every=0, passes=20, id2word=dict)

    return lda, tfidf, dict
Example No. 45
def prepare_data():
    # returns the corpus object required by learn
    # skips datasets/dspace/2481.json
    base = 'datasets/dspace'
    documents = []
    for filename in tqdm(os.listdir(base)):
        path = os.path.join(base, filename)
        with open(path) as f:
            d = json.load(f)
            abstract = d['abstract']
            if abstract is not None:
                words = tokenize(abstract.split())
                documents.append(words)

    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=5, no_above=0.3)
    dictionary.save('lda.dict')
    corpus = map(dictionary.doc2bow, documents)
    return corpus
Example No. 46
class DictionaryLearner(object):
    '''Learn a gensim dictionary from all available documents.'''
    
    def __init__(self, n=4):
        '''Initialize a DictionaryLearner instance using vocabulary of ngrams of size `n`.'''
        self._ngram = NgramTransformer(n)
        self._dictionary = Dictionary()
    
    def fit(self, documentstorage, filter_extremes=True):
        '''Fit a dictionary using documents from the given documentstorage.'''
        for document in documentstorage.load_iterator(u''):
            text_document = document.text
            ngrams = self._ngram.transform([text_document])
            self._dictionary.add_documents(ngrams)
        if filter_extremes:
            self._dictionary.filter_extremes()

    def get(self):
        return self._dictionary
Example No. 47
    def produce(self):
        doc_n = 0
        docs = []
        doctokens = [] # AKA gensim "text"
        stopwords = nltk.corpus.stopwords.words('english')

        NOALPHA = re.compile('[^a-z]+')
        def prep_string(my_string,pattern = NOALPHA):
            return re.sub(pattern, ' ', my_string.strip().lower())

        print('Getting src docs')
        for doc in self.src_doc_generator():
            content = re.sub(NOALPHA, ' ', doc) # Do this in the corpus generator?
            docs.append(content)
            doctokens.append([token for token in nltk.word_tokenize(content) if token not in stopwords])
            doc_n += 1
            if doc_n % 1000 == 0: print(doc_n)
                
        print('Creating the dictionary')
        dictionary = Dictionary(doctokens)
        dictionary.compactify()
        dictionary.filter_extremes(keep_n=None)
        if self.dictfile:
            dictionary.save_as_text(self.dictfile+'.dict', sort_by_word=True)

        with self.dbi as db:

            print('Creating DOC')
            db.create_table('doc')
            for i, doc in enumerate(docs):
                db.cur.execute('INSERT INTO doc VALUES (?,?)',(i,doc))

            print('Creating WORD')
            db.create_table('word')
            for item in dictionary.iteritems():
                db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)',item)

            print('Creating DOCWORD')
            db.create_table('docword')
            for i, tokens in enumerate(doctokens):
                for item in (dictionary.doc2bow(tokens)):
                    db.cur.execute('INSERT INTO docword (doc_id,word_id,word_count) VALUES (?,?,?)',[i,item[0],item[1]])
Example No. 48
    def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2):
        from gensim.corpora import Dictionary as GensimDictionary

        # build a dictionary of features
        logger.info("Creating features (including n-grams) from texts")
        gemsim_dictionary = GensimDictionary(tokenized_texts)

        # Remove extremely rare features
        logger.info("Features dictionary contains %d features. Filtering..." % len(gemsim_dictionary.token2id))
        gemsim_dictionary.filter_extremes(no_below=minimum_frequency, no_above=1, keep_n=None)
        gemsim_dictionary.compactify()
        logger.info("Features Dictionary contains %d features." % len(gemsim_dictionary.token2id))

        dict_model = cls(name=name,
                         dataset=dataset,
                         settings=settings)
        dict_model.save()

        dict_model._populate_from_gensim_dictionary(gemsim_dictionary)

        return dict_model
Example No. 49
    def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2):
        from gensim.corpora import Dictionary as GensimDictionary

        # build a dictionary
        logger.info("Building a dictionary from texts")
        dictionary = GensimDictionary(tokenized_texts)

        # Remove extremely rare words
        logger.info("Dictionary contains %d words. Filtering..." % len(dictionary.token2id))
        dictionary.filter_extremes(no_below=minimum_frequency, no_above=0.5, keep_n=None)
        dictionary.compactify()
        logger.info("Dictionary contains %d words." % len(dictionary.token2id))

        dict_model = cls(name=name,
                         dataset=dataset,
                         settings=settings)
        dict_model.save()

        dict_model._populate_from_gensim_dictionary(dictionary)

        return dict_model
Example No. 50
class TfidfVectorizer():
    """
    Transform text to tf-idf representation
    """

    def __init__(self):

        self.base_path = os.path.dirname(__file__)
        self.dictionary_path = os.path.join(self.base_path, "dictionary")
        self.tf_idf_model_path = os.path.join(self.base_path, "tfidf")

        self.stemmer = NepStemmer()
        self.tf_idf_model = None

    def get_tokens(self, document):
        if not self.stemmer:
            raise Exception("Stemmer not available")

        return self.stemmer.get_stems(document)

    def construct_model(self, documents):
        logging.basicConfig(
            format='%(asctime)s:%(levelname)s:%(message)s',
            level=logging.INFO
        )

        logging.info("Obtaining word tokens")
        tokens = [self.get_tokens(document) for document in documents]
        # self.tf_idf_model = TfidfModel(tokens)

        logging.info("Constructing dictionary")
        self.dictionary = Dictionary(tokens)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
        self.dictionary.compactify()
        self.dictionary.save(self.dictionary_path)

        logging.info("Constructing TF-IDF model")
        self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
        self.tf_idf_model.save(self.tf_idf_model_path)

    def load_data(self):

        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')

            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)

    def doc2vector(self, document):
        """ Returns the sparse tf-idf vector for given document """

        tokens = self.get_tokens(document)
        bag_of_words = self.dictionary.doc2bow(tokens)

        return (self.tf_idf_model[bag_of_words])

    def obtain_feature_vector(self, document):
        """
        Returns a single dense tf-idf vector for a given document
        """

        self.load_data()

        tf_idf_vector = matutils.sparse2full(
            self.doc2vector(document),
            len(self.tf_idf_model.idfs)
        ).reshape(1, -1)

        return tf_idf_vector

    def obtain_feature_matrix(self, documents):
        """
        Returns the tf-idf dense matrix for the given documents
        """

        self.load_data()

        input_matrix_sparse = [
            self.doc2vector(x)
            for x in documents
        ]

        no_of_features = len(self.tf_idf_model.idfs)

        input_matrix = matutils.corpus2dense(
            input_matrix_sparse,
            no_of_features
        ).transpose()

        return input_matrix
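A compact, self-contained version of the dense tf-idf feature matrix this class produces, using the same gensim utilities (`TfidfModel(dictionary=...)` and `matutils.corpus2dense`); the toy documents are illustrative only:

from gensim import matutils
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

token_docs = [["good", "food", "service"], ["bad", "food"], ["good", "service"]]
dictionary = Dictionary(token_docs)
tfidf = TfidfModel(dictionary=dictionary)                     # idfs are built straight from the dictionary
sparse = [tfidf[dictionary.doc2bow(doc)] for doc in token_docs]
dense = matutils.corpus2dense(sparse, num_terms=len(tfidf.idfs)).T
print(dense.shape)                                            # (3 documents, vocabulary size)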
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in stop_words]

def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens


wiki_stream = (tokens for _, tokens in iter_wiki('enwiki-latest-pages-articles.xml.bz2'))

print "making of dictionary started"
wiki_dictionary = Dictionary(wiki_stream)
print "wikipedia dictionary made"

wiki_dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=200000)

print "...... saving the dictionary"
wiki_dictionary.save('WikiDictionary200k.dict')
print "dictionary saved ........"

# wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')  # make a corpus from wiki dump

# MmCorpus.save_corpus('WikiCorpus.mm', wiki) # Saving the corpus
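Once saved, the dictionary can be reloaded and used to vectorize new text; a short sketch (the sample sentence is only a placeholder):

loaded_dictionary = Dictionary.load('WikiDictionary200k.dict')

sample_tokens = tokenize("a placeholder article text about history and science")
bow = loaded_dictionary.doc2bow(sample_tokens)   # list of (token_id, count) pairs
print(bow)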


    if opts.scaling == 'tfidf':
        scaling = 'tfidf'
    elif not opts.scaling:
        scaling = None
    else:
        raise ValueError("Only tfidf scaling is supported")

    word_model = opts.word_model

    if word_model:
        logging.info("Building word model")
        corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
    else:
        corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

    voc = Dictionary(corpus)
    voc.filter_extremes(no_below=cutoff)
    voc.compactify()

    bow_corpus = (voc.doc2bow(art) for art in corpus)

    tfidf = None

    if scaling == 'tfidf':
        tfidf = TfidfModel(bow_corpus)
        bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

    model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
    model.save(model_fn)

    if tfidf:
        tfidf.save(model_fn + '.tfidf')
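A hedged sketch of projecting an unseen document with the artifacts saved above; model_fn refers to the same path used in the function, and the token list is a placeholder:

lsi = LsiModel.load(model_fn)
tfidf_weights = TfidfModel.load(model_fn + '.tfidf')   # only written when scaling == 'tfidf'

new_doc_tokens = ["placeholder", "tokens"]
bow = lsi.id2word.doc2bow(new_doc_tokens)
topic_vector = lsi[tfidf_weights[bow]]                 # list of (topic_id, weight) pairs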
class MyCorpus(object):
    '''
    Corpus class for streaming review documents
    '''
    def __init__(self, file_list, file_dir, dictionary = None, mindf = MINDF, maxdf = MAXDF, \
                 maxwords = MAXWORDS, cluster_words = CLUSTER_WORDS, cluster_ul = CLUSTER_UL):
        self.file_list = file_list           # list of cuisine text files
        self.file_dir = file_dir             # directory of cuisine text files
        self.maxwords = maxwords             # maximum number of words to keep after building dictionary from clusters
        self.cluster_words = cluster_words   # maximum number of words to keep from each cluster
        self.cluster_ul = cluster_ul         # upper proportion of reviews to limit for cluster processing
        self.mindf = mindf                   # minimum number of documents to keep word
        self.maxdf = maxdf                   # max proportion of documents to keep word
        self.agglomerate = True              # return clusters as single documents (True) or return single reviews (False)
        if dictionary:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            self._build_dict()
            
    def __str__(self):
        return "<MyCorpus at " + str(hex(id(self))) + ">"
        
    def __repr__(self):
        return self.__str__()
    
    def _build_dict(self):
        for filename in self.file_list:
            dictionary = dict()
            num_reviews = 0
            with open(os.path.join(self.file_dir, filename), "rt") as f:
                for line in f:
                    num_reviews += 1
                    words = line[REVIEW_INDEX:].split()
                    for word in set(words):
                        if word not in dictionary:
                            dictionary[word] = 1
                        else:
                            dictionary[word] += 1
                doc = [item for item in dictionary.items()
                       if dictionary[item[0]] > 2
                       and dictionary[item[0]] / num_reviews < self.cluster_ul]
                doc.sort(key=lambda x: -x[1])
                doc = [word for word, f in doc]
                self.dictionary.add_documents([doc[:self.cluster_words]])
                print("%s added to corpus dictionary!" % (filename,))
        self.dictionary.filter_extremes(self.mindf, self.maxdf, self.maxwords)
        self.dictionary.save("cuisine_dictionary.gensimDict")
        
    def __iter__(self):
        '''
        Iterates through cuisines by combining all reviews for each cuisine into a single
        processed document.  Also stores the length of each processed document
        '''
        if self.agglomerate:
            for filename in self.file_list:
                with open(os.path.join(self.file_dir, filename), "rt") as f:
                    doc = " ".join([line[REVIEW_INDEX:].rstrip() for line in f])
                    yield self.dictionary.doc2bow(doc.split())
        else:
            reviewIDs = set()
            for filename in self.file_list:
                with open(os.path.join(self.file_dir, filename), "rt") as f:
                    for line in f:
                        id = line[:RATING_INDEX - 1]
                        if id not in reviewIDs:
                            reviewIDs.update([id])
                            doc = line[REVIEW_INDEX:].rstrip()
                            yield self.dictionary.doc2bow(doc.split())
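A minimal sketch of feeding the corpus class above to an LDA model; the file names and directory are placeholders for the cuisine review files the class expects:

from gensim.models import LdaModel

file_list = ["italian.txt", "mexican.txt", "thai.txt"]   # hypothetical cuisine files
file_dir = "reviews/"

corpus = MyCorpus(file_list, file_dir)                   # builds and saves the dictionary
lda = LdaModel(corpus=corpus, id2word=corpus.dictionary, num_topics=10, passes=5)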
Exemplo n.º 54
0
 def testFilterKeepTokens_keepTokens(self):
     # provide keep_tokens argument, keep the tokens given
     d = Dictionary(self.texts)
     d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['human', 'survey'])
     expected = set(['graph', 'trees', 'human', 'system', 'user', 'survey'])
     self.assertEqual(set(d.token2id.keys()), expected)
Exemplo n.º 55
0
 def testFilterKeepTokens_unseenToken(self):
     # do provide keep_tokens argument with unseen tokens, filter_extremes functionality is unchanged
     d = Dictionary(self.texts)
     d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['unknown_token'])
     expected = set(['graph', 'trees', 'system', 'user'])
     self.assertEqual(set(d.token2id.keys()), expected)
def main():
    parser = ArgumentParser(
        description="wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information"
    )
    parser.add_argument("-ds", "--dataset", default="wiki", help="What kind of dataset to use. (wiki,es,file)")
    parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it")
    parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki")
    parser.add_argument("--model-id", default="model", help="Filename for created model.")
    parser.add_argument("--model-type", default="lsi", help="Model type (lsi, lda, word2vec, hdp, vocabulary).")
    parser.add_argument("--n-topics", default=10, help="Number of topics to model.")
    parser.add_argument("--n-passes", default=1, help="Number of passes for LDA  model.")
    parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.")
    parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.")
    parser.add_argument("-q", "--query", default=None, help="Elasticsearch: Query to use to fetch documents")
    parser.add_argument("--index", help="Elasticsearch: index to read from.")
    parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.")
    parser.add_argument("--data-dir", help="Directory to save the generated models and vocabularies into.")
    parser.add_argument("--vocab", help="Prebuilt Vocabulary file. Use this to avoid having to generate one.")

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ["es", "wiki", "file"]:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    if not dump_fn and data_type in ["wiki"]:
        logging.error("--dump-file required for wiki dataset")
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == "es" and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = "%s_%s_%d" % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = "%s/%s" % (data_dir, model_fn)
    if model_type == "word2vec":
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == "es":
        logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(
            read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es
        )
    elif data_type == "wiki":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki)
    elif data_type == "file":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words("norwegian"))
    if not vocab_file or model_type == "vocabulary":
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + ".vocab")
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == "vocabulary":
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == "lsi":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == "lda":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab)

    elif model_type == "word2vec":
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == "hdp":
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
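With the default arguments, the script above writes its artifacts under names like model_lsi_10 and model_lsi_10.vocab; a hedged sketch of reloading them later (note the TfidfModel used during training is not persisted by the script):

from gensim.corpora import Dictionary
from gensim.models import LsiModel

vocab = Dictionary.load("model_lsi_10.vocab")   # model_fn + ".vocab" with default options
lsi = LsiModel.load("model_lsi_10")             # model_fn with default options
print(lsi.show_topics(num_topics=10))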
Exemplo n.º 57
0
 def testFilter(self):
     d = Dictionary(self.texts)
     d.filter_extremes(no_below=2, no_above=1.0, keep_n=4)
     expected = {0: 3, 1: 3, 2: 3, 3: 3}
     self.assertEqual(d.dfs, expected)
Exemplo n.º 58
0
 def testFilterKeepTokens_unchangedFunctionality(self):
     # do not provide keep_tokens argument, filter_extremes functionality is unchanged
     d = Dictionary(self.texts)
     d.filter_extremes(no_below=3, no_above=1.0)
     expected = {'graph', 'trees', 'system', 'user'}
     self.assertEqual(set(d.token2id.keys()), expected)
def dbpedia_smallwordconv(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    bin = LabelBinarizer()

    x_train = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in train_docs],
                                     max_length=100, padding_word=0))
    y_train = bin.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                      for s in test_docs],
                                     max_length=100, padding_word=0))
    y_test = bin.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x_train, y_train, batch_size=32, nb_epoch=5, validation_data=[x_test, y_test])

    print(accuracy_score(np.argwhere(y_test)[:,1], model.predict_classes(x_test)))
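pad_sentences is used above but not defined in this example; a minimal sketch of the behaviour the calls appear to assume (truncate or right-pad each token-id sequence to a fixed length):

def pad_sentences(sequences, max_length=100, padding_word=0):
    """Truncate or right-pad each sequence of token ids to max_length (assumed behaviour)."""
    padded = []
    for seq in sequences:
        seq = seq[:max_length]
        padded.append(seq + [padding_word] * (max_length - len(seq)))
    return padded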
Exemplo n.º 60
0
def calculate_lda(dataset_raw, n_topics=10, lda_model_name="",
                  mallet=True, mallet_path="/Users/verasazonova/no-backup/JARS/mallet-2.0.7/bin/mallet",
                  dataname="none"):

    with open(dataname+"_log.txt", 'a') as fout:

        if dataset_raw.include_date:
            dates = [text[1] for text in dataset_raw]
            dataset = [normalize_words(text[0].split(), dataset_raw.stoplist) for text in dataset_raw]
        else:
            dates = ["" for _ in dataset_raw]
            dataset = dataset_raw

        bi_grams = Phrases(dataset, threshold=3)
        dataset = bi_grams[dataset]


        dictionary = Dictionary(dataset)
        dictionary.filter_extremes(no_below=1, no_above=0.9)

        bow_corpus = [dictionary.doc2bow(text) for text in dataset]

        fout.write("# Topics: %s\n" % n_topics)

        if not os.path.isfile(lda_model_name):

            if mallet:
                lda_model = LdaMallet(mallet_path, corpus=bow_corpus, num_topics=n_topics, id2word=dictionary, workers=4,
                                     optimize_interval=10, iterations=1000)
                lda_model_name = "lda_model_mallet_%s_%i" % (dataname, n_topics)
            else:
                lda_model = LdaModel(bow_corpus, id2word=dictionary, num_topics=n_topics, distributed=False,
                                    chunksize=2000, passes=5, update_every=10, alpha='asymmetric',
                                    eta=0.1, decay=0.5, eval_every=10, iterations=1000, gamma_threshold=0.001)

                lda_model_name = "lda_model_%s_%i" % (dataname, n_topics)

            lda_model.save(lda_model_name)

        else:
            if mallet:
                lda_model = LdaMallet.load(lda_model_name)
            else:
                lda_model = LdaModel.load(lda_model_name)

        topic_definition = []

        for i, topic in enumerate(lda_model.show_topics(n_topics, num_words=20, formatted=False)):
            fout.write("%i \n" % i)
            topic_list = []
            freq_list = []
            a_list = []
            for tup in topic:
                topic_list.append(tup[1])
                freq_list.append(dictionary.dfs[ dictionary.token2id[tup[1]] ] )
                a_list.append(tup[0])


            fout.write( "%s\n\n" % repr((sorted(zip(topic_list, freq_list), key=itemgetter(1) ))))

            topic_definition.append("%i, %s" %(i, repr(" ".join(sorted(topic_list)))[2:-1]))

        fout.write("Total number of documents: %i\n" % dictionary.num_docs )



        earliest_date = dateutil.parser.parse("Sun Jun 08 00:00:00 +0000 2014")

        a = [tup for tup in  sorted(zip(bow_corpus, dates), key=get_date )
             if dateutil.parser.parse(tup[1]) > earliest_date]

        print(len(a))
        print(a[len(a)-1])
        latest_date = dateutil.parser.parse(a[len(a)-1][1])

        num_bins = 100

        time_span = latest_date - earliest_date
        print(time_span)
        time_bin = time_span / num_bins
        print(time_bin)

        bin_lows = [earliest_date]
        bin_high = earliest_date + time_bin
        counts = [[0 for _ in range(n_topics)] for _ in range(num_bins+1)]
        i=0
        for text in a:
            topic_assignments = lda_model[text[0]]
            date_str = text[1]
            if date_str is not None:
                cur_date = dateutil.parser.parse(date_str)
                if cur_date >= bin_high:
                    i+=1
                    bin_lows.append(bin_high)
                    bin_high = bin_lows[len(bin_lows)-1] + time_bin
                #counts[i][max(topic_assignments, key=itemgetter(1))[0]] += 1
                for tup in topic_assignments:
                    counts[i][tup[0]] += tup[1]

        fout.write("Number of documents assigned mostly to the topic: \n")
        fout.write("%s\n" % counts)

        a = 1.*np.array(counts)

        np.savetxt("mpeketoni_cnts.txt", a)
        with open("mpeketoni_bins.txt", 'w') as fout:
            for date in bin_lows:
                fout.write("%s\n" % date)
        with open("mpeketoni_labels.txt", 'w') as fout:
            for label in topic_definition:
                fout.write("%s\n" % label)

        return a, bin_lows, topic_definition
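A hedged sketch of invoking the function above; dataset_raw must supply .include_date and .stoplist and, when include_date is true, yield (text, date) pairs. The argument values are placeholders:

counts, bin_lows, topic_definition = calculate_lda(
    dataset_raw,                 # hypothetical corpus object as described above
    n_topics=10,
    mallet=False,                # use gensim's LdaModel instead of Mallet
    dataname="mpeketoni"
)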