Example #1
class process_corpus(object):

    def __init__(self, sql=None,lemmatize=False,first_sentences=False,n_sentences=10):

        self.sql=sql
        self.first_sentences=first_sentences
        self.n_sentences=n_sentences
        self.wordnet=WordNetLemmatizer()
        self.pstemmer=PorterStemmer()
        self.lemmatize=lemmatize
        self.dictionary = Dictionary(self.iterrecords())

        print('dictionary before:', self.dictionary.token2id)


        once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.items() if docfreq == 1]
        self.dictionary.filter_tokens(once_ids)
        self.dictionary.compactify()
        print('dictionary after filtering:', self.dictionary.token2id)

    def __iter__(self):
        self.cl=0
        for tokens in self.iterrecords():  # generates the document tokens and creates bow using dictionary
            self.cl+=1

            yield self.dictionary.doc2bow(tokens)






    def iterrecords(self): # generates document tokens for the dictionary

        self.index=[]
        cursor.execute(self.sql)
        ct=0


        for doc in cursor:
                print(ct)
                self.index.append(str(doc[0]).strip())

                doc=doc[1]
#                print to_beautiful(doc[1])


                if self.first_sentences:

                    doc=get_first_n_sentences_from_document(doc,self.n_sentences)


                tokens=clean_text_by_word(doc)

                ct+=1
                yield  tokens # or whatever tokenization suits you

    def __len__(self):

        return self.cl
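For orientation, a minimal self-contained sketch of the same drop-hapaxes-then-compactify pattern this class relies on (the toy texts below are purely illustrative):

from gensim.corpora import Dictionary

texts = [["human", "interface", "computer"],
         ["survey", "user", "computer", "system"],
         ["graph", "trees"]]
dictionary = Dictionary(texts)
# drop tokens that occur in only one document, then re-pack the ids
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(once_ids)
dictionary.compactify()
bows = [dictionary.doc2bow(t) for t in texts]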
Example #2
def build_tag_vectors(tag_directory_path):
    """Loads tag files, builds sparse vectors for each song
        Parameters
        ----------
            tag_directory_path : String, path of directory containing tags
        Returns
        -------
            id_vec_mapping : dict (song id => list[tuple(tagId, count)])
            dictionary : gensim Dictionary containing all tags and ids
    """
    dictionary = Dictionary()
    for f in listdir(tag_directory_path):
        with open(tag_directory_path+"/"+f, 'r') as tags:
            tokens = tags.read().split(sep=' ')
            dictionary.add_documents([tokens])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.compactify()
    id_vec_mapping = {}
    for f in listdir(tag_directory_path):
        song_id = f[0:-4]
        with open(tag_directory_path+"/"+f, 'r') as tags:
            tokens = tags.read().split(sep=' ')
        sparse_vec = dictionary.doc2bow(tokens)
        add_to_dictionary(id_vec_mapping, (song_id, sparse_vec))
    return id_vec_mapping, dictionary
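A hypothetical call, assuming a directory of whitespace-separated tag files (one file per song) and the add_to_dictionary helper from the same module:

id_vec_mapping, tag_dict = build_tag_vectors("data/tags")   # path is illustrative
print(len(tag_dict), "distinct tags kept after filtering")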
Example #3
    def _create_from_texts(cls,
                           tokenized_texts,
                           name,
                           dataset,
                           settings,
                           minimum_frequency=2):
        from gensim.corpora import Dictionary as GensimDictionary

        # build a dictionary
        logger.info("Building a dictionary from texts")
        dictionary = GensimDictionary(tokenized_texts)

        # Remove extremely rare words
        logger.info("Dictionary contains %d words. Filtering..." %
                    len(dictionary.token2id))
        dictionary.filter_extremes(no_below=minimum_frequency,
                                   no_above=1,
                                   keep_n=None)
        dictionary.compactify()
        logger.info("Dictionary contains %d words." % len(dictionary.token2id))

        dict_model = cls(name=name, dataset=dataset, settings=settings)
        dict_model.save()

        dict_model._populate_from_gensim_dictionary(dictionary)

        return dict_model
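As a reminder of the filter_extremes semantics used above: no_below is an absolute document count, no_above is a fraction of the corpus, and keep_n=None disables the size cap. A minimal check:

from gensim.corpora import Dictionary

d = Dictionary([["a", "b"], ["a", "c"], ["a", "d"]])
d.filter_extremes(no_below=2, no_above=1.0, keep_n=None)   # only "a" appears in >= 2 docs
print(d.token2id)   # {'a': 0}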
Example #4
 def buildDictionary(self, corpus, txt2tokens, opts):
     '''
     Tokenize texts and add tokens to dictionary.
     :param corpus: Corpus-like or id
     :param txt2tokens: txt2tokens or id
     :param opts: GensimDictBuildOptions
     :param ctx: pytopia context
     :return: gensim Dictionary
     '''
     t = clock()
     corpus, txt2tokens = self.resolve(corpus, txt2tokens)
     # fill the dictionary with tokens from corpus texts
     dictionary = Dictionary(documents=None)
     numDocs = 0; numTokens = 0
     for txto in corpus:
         tokens = txt2tokens(txto.text)
         numDocs += 1; numTokens += len(tokens)
         dictionary.doc2bow(tokens, allow_update=True)
     # form filtering options and run filtering
     no_below = opts.docLowerLimit if opts.docLowerLimit is not None else 0
     if opts.docUpperLimit is None: no_above = 1.0
     elif isinstance(opts.docUpperLimit, float): no_above = opts.docUpperLimit
     else: no_above = opts.docUpperLimit/float(numDocs)
     if opts.words2keep is None: keep_n = numTokens
     else: keep_n = opts.words2keep
     dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                                keep_n=keep_n)
     dictionary.compactify()
     # force id2token map building
     someId = next(iter(dictionary.token2id.values()))
     dictionary[someId]
     return GensimDictAdapter(dictionary, corpus.id, txt2tokens.id, opts)
Example #5
    def fit(self, corpus):

        self._verify_corpus(corpus)
        self.N = len(corpus)

        tokens = self.preprocessor.transform(corpus)
        self.observed_tokens = tokens.apply(len).sum()

        vocab = Dictionary(tokens)
        vocab.filter_extremes(no_above=self.max_df,
                              no_below=self.min_df,
                              keep_n=self.vocab_size)
        vocab.compactify()

        self.vocab = vocab
        self.corpus_as_tokens = tokens
        self.corpus_as_bow = [self.vocab.doc2bow(doc) for doc in tokens]
        self.corpus_as_csr = corpus2csc(self.corpus_as_bow,
                                        num_terms=len(self.vocab)).T

        self.lengths = [len(d) for d in self.corpus_as_bow]
        self.num_empty_docs = self.lengths.count(0)

        time_now = time.localtime()
        self.created_on = time.strftime("%d %b %Y %H:%M:%S", time_now)

        return self
Example #6
def tf_idf_weight(spacy_contexts):
    """
    @param spacy_contexts Spacy-fied contexts

    Returns list of Dicts, each dictionary corresponds to one document and
    contains words and their tf-idf weights
    """
    docs_dict = Dictionary(spacy_contexts)
    docs_dict.compactify()

    docs_corpus = [docs_dict.doc2bow(doc) for doc in spacy_contexts]

    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]

    # Now generate a list of dicts with k,v = "word": tfidf_frequency
    # each dict contains words from one document (sentence)
    doc_tfidf_dicts = []

    for doc in docs_tfidf:
        d = dict()
        for term, freq in doc:
            d[docs_dict[term]] = freq

        doc_tfidf_dicts.append(d)

    return doc_tfidf_dicts
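A self-contained variant of the same computation, with plain token lists standing in for the spaCy-processed contexts:

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

contexts = [["cat", "sat", "mat"], ["cat", "ate", "fish"], ["dog", "sat"]]
d = Dictionary(contexts)
bows = [d.doc2bow(c) for c in contexts]
tfidf = TfidfModel(bows, id2word=d)
weights = [{d[term_id]: w for term_id, w in tfidf[bow]} for bow in bows]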
Example #7
def doc_embed_charity_notfidf(processed_docs, word_min=5, word_max_perc=.8):
    'Takes a list of preprocessed texts and returns an embedding vector for each document, a dictionary of the words within the corpus, and the glove vectors for each word in the corpus'

    # Create dictionary from corpus
    docs_dict = Dictionary(processed_docs)
    docs_dict.filter_extremes(no_below=word_min, no_above=word_max_perc)
    docs_dict.compactify()

    # Convert docs into a sparse matrix (N_docs x N_words in dictionary) where the number in each cell indicates the number of times that word appeared in that document
    docs_corpus = [docs_dict.doc2bow(doc) for doc in processed_docs]
    docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_corpus])

    #Count number of documents and words in dictionary
    num_docs = np.shape(docs_vecs)[0]
    num_words = np.shape(docs_vecs)[1]

    print("Total # of docs: {}".format(num_docs))
    print("Total # of words in dict: {}".format(num_words))

    # For each word in dict extract embedding vector (Glove vectors)
    glove_vecs = np.vstack([nlp(docs_dict[i]).vector for i in range(len(docs_dict))])

    # Sum glove vectors over words in doc
    docs_emb = np.dot(docs_vecs, glove_vecs)

    return docs_emb, docs_dict, glove_vecs
Example #8
    def get_corpus_dict(self, recalculate=False, from_scratch=True):

        if not os.path.isfile(
                self.paths.trigram_dictionary_filepath) or recalculate:

            if not from_scratch:
                raise ValueError(
                    'No corpus Dictionary file exists but from_scratch is False'
                )

            print('Building trigram dict...')
            trigram_docs = LineSentence(self.paths.trigram_corpus_filepath)

            # learn the dictionary by iterating over all of the docs
            trigram_dictionary = Dictionary(trigram_docs)

            # filter tokens that are very rare or too common from
            # the dictionary (filter_extremes) and reassign integer ids (compactify)
            trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
            trigram_dictionary.compactify()

            trigram_dictionary.save(self.paths.trigram_dictionary_filepath)
            print('Done!')
        else:
            print('Loading trigram dict...')
            trigram_dictionary = Dictionary.load(
                self.paths.trigram_dictionary_filepath)

        return trigram_dictionary
Example #9
    def _build_vocab(self, max_vocab_cnt):
        all_words = []
        for data in self.valid + self.non_valid:
            all_words.append(data["title"] + data["content"])
        vocab = Dictionary(all_words)
        raw_vocab_size = len(vocab)

        vocab.filter_extremes(no_below=5)
        vocab.filter_extremes(keep_n=max_vocab_cnt)
        len_1_words = list(
            filter(
                lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w
                not in ["a", "i"] and True or False, vocab.values()))
        vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words)))
        if self.config.use_dict == "seq" and self.config.enable_pad:
            vocab.token2id[PAD] = len(vocab)
            vocab.compactify()
            self.pad_wid = vocab.token2id.get(PAD)
        self.vocab_seq = vocab  # seq dictionary
        # build bow dictionary
        self.vocab_bow = copy.deepcopy(vocab)
        self.vocab_bow.filter_tokens(
            map(self.vocab_bow.token2id.get, STOPWORDS))  # filter stop words
        self.vocab_bow.compactify()
        if self.config.tfidf:
            tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words]
            self.tfidf_model = TfidfModel(tfidf_corpus)
        print("Load corpus with non_valid size %d, valid size %d, "
              "raw vocab size %d seq vocab size %d, bow vocab size %d" %
              (len(self.non_valid), len(self.valid), raw_vocab_size,
               len(self.vocab_seq), len(self.vocab_bow)))
Example #10
def create_LDA_dict():
    #ONE TIME USE, to create and save LDA model
    trigram_dictionary_filepath = '../Dataset/trigram_dict_all.dict'
    trigram_reviews = LineSentence(
        '../Dataset/trigram_transformed_reviews_all.txt')
    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)
    print('LDA dict saved.')
    trigram_bow_filepath = '../Models/trigram_bow_corpus_all.mm'
    MmCorpus.serialize(
        trigram_bow_filepath,
        trigram_bow_generator(
            '../Dataset/trigram_transformed_reviews_all.txt'))
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    lda_model_filepath = '../Models/lda_model_all'  #lda_model_all_30, lda_model_10topic
    # created LDA model with 10, 30, 50 topics, found 30 has best result
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(
            trigram_bow_corpus,
            num_topics=30,  #10, 30, 50
            id2word=trigram_dictionary,
            workers=8)
    lda.save(lda_model_filepath)
    print('LDA model saved.')
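trigram_bow_generator is defined elsewhere in that project; a plausible minimal version (an assumption, not the original) streams one bag-of-words per review using the dictionary built above:

def trigram_bow_generator(filepath):
    # assumes trigram_dictionary is accessible, as in create_LDA_dict above
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)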
Example #11
class MiCorpus:
    """
    Iterable: each iteration returns bag-of-words vectors, one per document.
    Processes one document at a time using generators. The full corpus is never loaded into RAM.
    """
    def __init__(self, directorio, lenguaje, otros=None):
        self.directorio = directorio
        self.lenguaje = lenguaje
        self.otros = otros

        self.ngramas = model_ngrams(
            iter_sentences(self.directorio, self.lenguaje, self.otros))

        self.diccionario = Dictionary(
            iter_documents(self.ngramas, self.directorio, self.lenguaje,
                           self.otros))
        self.diccionario.filter_extremes(no_above=0.8)
        self.diccionario.filter_tokens(
            bad_ids=(tokid for tokid, freq in self.diccionario.dfs.items()
                     if freq == 1))
        self.diccionario.compactify()

    def __iter__(self):
        """
        CorpusConsultivos is a streamed iterable.
        """
        for tokens in iter_documents(self.ngramas, self.directorio,
                                     self.lenguaje, self.otros):
            yield self.diccionario.doc2bow(tokens)
Example #12
def prepare_corpus(tweets_file, corpus_file, dictionary_file, author_topic):
    stop_words = set(stopwords.words('english'))
    stop_words.add(u'rt')

    print('Loading tweets from ' + tweets_file)
    tweets = pd.read_pickle(tweets_file)

    if author_topic:
        tweets = tweets.groupby('user').agg({'text': 'sum'})

    print('%d tweets loaded' % len(tweets.index))

    dictionary = Dictionary(tweets['text'])
    stopword_ids = map(dictionary.token2id.get, stop_words)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc) for doc in tweets['text']]

    # print(corpus)
    print("Writing corpus to " + corpus_file)
    MmCorpus.serialize(corpus_file, corpus)
    # print(dictionary)
    print("Writing dictionary to " + dictionary_file)

    dictionary.save(dictionary_file)
Example #13
def parse_processed_amazon_dataset(task_files, max_words=10000):
    """
    Code inspired by:
    https://github.com/sclincha/xrce_msda_da_regularization
    """
    datasets = {}
    dico = GensimDict()
    print("Parsing", task_files)

    # First pass on document to build dictionary
    for fname in task_files:
        with open(fname, 'r') as f:
            for l in f:
                tokens = l.split(' ')
                tokens_list = []
                for tok in tokens[:-1]:
                    ts, tfreq = tok.split(':')
                    freq = int(tfreq)
                    tokens_list += [ts] * freq
                dico.doc2bow(tokens_list, allow_update=True)

    # Preprocessing_options
    dico.filter_extremes(no_below=2, keep_n=max_words)
    dico.compactify()

    for fname in task_files:
        X, Y = [], []

        with open(fname, 'r') as f:
            for docid, l in enumerate(f):
                tokens = l.split(' ')
                label_string = tokens[-1]
                tokens_list = []
                for tok in tokens[:-1]:
                    ts, tfreq = tok.split(':')
                    freq = int(tfreq)
                    tokens_list += [ts] * freq
                count_list = dico.doc2bow(tokens_list, allow_update=False)

                idx, freqs = list(zip(*count_list))
                one_hot = np.zeros(max_words)
                one_hot[list(idx)] = np.array(freqs)

                X.append((docid, one_hot))

                #Preprocess Label
                ls, lvalue = label_string.split(':')
                if ls == "#label#":
                    if lvalue.rstrip() == 'positive':
                        Y.append(1)
                    elif lvalue.rstrip() == 'negative':
                        Y.append(0)
                    else:
                        raise Exception("Invalid Label Value")
                else:
                    raise Exception('Invalid Format')

        datasets[os.path.split(os.path.split(fname)[0])[-1]] = (X, Y)

    return datasets, dico
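A hedged illustration of the expected input: each line holds token:count pairs followed by a #label# field, and the file path below is made up:

# example line:  great:2 product:1 fast:1 #label#:positive
datasets, dico = parse_processed_amazon_dataset(["books/train.review"], max_words=5000)
X, Y = datasets["books"]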
Example #14
    def _create_from_texts(cls,
                           tokenized_texts,
                           name,
                           dataset,
                           settings,
                           minimum_frequency=2):
        from gensim.corpora import Dictionary as GensimDictionary

        # build a dictionary of features
        logger.info("Creating features (including n-grams) from texts")
        gemsim_dictionary = GensimDictionary(tokenized_texts)

        # Remove extremely rare features
        logger.info("Features dictionary contains %d features. Filtering..." %
                    len(gemsim_dictionary.token2id))
        gemsim_dictionary.filter_extremes(no_below=minimum_frequency,
                                          no_above=1,
                                          keep_n=None)
        gemsim_dictionary.compactify()
        logger.info("Features Dictionary contains %d features." %
                    len(gemsim_dictionary.token2id))

        dict_model = cls(name=name, dataset=dataset, settings=settings)
        dict_model.save()

        dict_model._populate_from_gensim_dictionary(gemsim_dictionary)

        return dict_model
Example #15
def dic_tr(clean_revs_file):

    tri_rv = LineSentence(clean_revs_file)
    tri_dict = Dictionary(tri_rv)

    tri_dict.filter_extremes(no_below=5, no_above=0.3)
    tri_dict.compactify()
    tri_dict.save(trigram_dict_path)
Example #16
def fetch_dict():
    global dictionary
    dictionary = Dictionary([i for i in my_dictionary])
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    dictionary.save("Topic/dic.loc")
    return dictionary
Example #17
def build_vocabulary_and_corpus():
    '''
    Build the vocabularies and stem sequences for each type of entity.
    '''

    # Vocabulary (same for question and answers)
    v = Dictionary()

    # Stemmer.
    stemmer = PorterStemmer()

    # Tokenizer.
    tokenizer = TweetTokenizer()

    # Read indexes
    user_index, question_index, answer_index, comment_index = read_indexes()

    # Question, answer
    q = {}
    a = {}

    # Read entities.
    with open(entity_path, 'rb') as obj:
        entities = pickle.load(obj)

    # Browse question and answers to first build vocabulary.
    for e in entities:
        # Question or answer.
        if e['type'] == 'Q' or e['type'] == 'A':
            # String content.
            title = str(e['title']).encode('utf-8').lower()
            content = str(e['content']).encode('utf-8').lower()
            # Tokenize
            d = tokenizer.tokenize(title + content)
            # Stem word
            d = [stemmer.stem(s) for s in d]
            # Process vocabulary.
            v.add_documents([d])
            # Question
            if e['type'] == 'Q':
                q[question_index[e['id']]] = d
            # Answer
            if e['type'] == 'A':
                a[answer_index[e['id']]] = d

    # Write question corpus.
    with open(os.path.join(data_path, 'q.corpus'), 'wb') as f:
        pickle.dump(q, f)

    # Write answer corpus.
    with open(os.path.join(data_path, 'a.corpus'), 'wb') as f:
        pickle.dump(a, f)

    # Write to analyse.
    v.filter_extremes(no_below=1000, keep_n=10000)
    v.compactify()
    v.save(os.path.join(data_path, "raw_vocabulary.gensim"))
Example #18
def docs_to_dict(docs, **kw):
    """Convert docs to Dictionary and BOW, filtering common/rare words.
    Returns (dictionary, BOW)"""
    no_below = kw.pop("no_below", .02)
    no_above = kw.pop("no_above", 0.9)
    d = Dictionary(docs)
    d.filter_extremes(no_below=no_below, no_above=no_above, **kw)
    d.compactify()
    return d, docs.apply(d.doc2bow)
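Since the last line calls docs.apply, docs is expected to be something like a pandas Series of token lists; a hypothetical call:

import pandas as pd

docs = pd.Series([["spam", "eggs"], ["spam", "ham"], ["ham", "eggs"]])
d, bow = docs_to_dict(docs, no_below=1, no_above=1.0)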
Example #19
def get_corpus(df):
    words = clean_text(df["combined_text"].values)
    bigram = bigrams(words)
    bigram = [bigram[tweet] for tweet in words]
    id2word = Dictionary(bigram)
    id2word.filter_extremes(no_below=50, no_above=0.40)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram
Example #20
 def _get_docs_dict(self, docs):
     docs_dict = Dictionary(docs)
     # CAREFUL: For small corpus
     docs_dict.filter_extremes(no_below=5, no_above=0.2)
     # docs_dict.filter_extremes(no_below=5)
     # after some tokens have been removed remove the gaps
     docs_dict.compactify()
     print('docs_dict', docs_dict)
     return docs_dict
Example #21
def generate_dictionary(input_file_path,
                        applyExtreem=True,
                        no_below=5,
                        no_above=0.4):
    lineSentence = LineSentence(input_file_path)
    dictionary = Dictionary(lineSentence)
    if applyExtreem:
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    dictionary.compactify()
    return dictionary
Example #22
def main(subreddit):
    const = get_constants(subreddit)

    if os.path.exists(const['CORPUS']):
        print("Loading preexisting corpus...")
        corpus = util.load_pickle(const['CORPUS'])
    else:
        print("Getting and writing dictionary...")

        with open(const['OUTPUTS'], "r") as f:
            num_lines = sum(1 for line in f)


        with open(const['OUTPUTS'], "r") as f:
            dicts = (json.loads(comment) for comment in tqdm(f, total=num_lines))

            if const["INTERVAL"] is not None:
                corpuses = [[] for interval in const["ALL_INTERVALS"]]

                for comment in dicts:
                    i = get_interval_idx(comment["score"])
                    corpuses[i].append(normalize_text(comment["body"], const['STEMMING']))

                for i, interval in enumerate(const["ALL_INTERVALS"]):
                    util.write_pickle(corpuses[i], get_interval_fname(subreddit, interval))

                corpus = corpuses[0]
            else:
                corpus = [normalize_text(comment["body"], const['STEMMING']) for comment in dicts]

                
    gdict = Dictionary(
        corpus
    )

    gdict.filter_extremes(no_above=const['NO_ABOVE_1'], no_below=const['NO_BELOW'])
    gdict.compactify()

    util.write_pickle(gdict.token2id, const['INDICES'])
    util.write_pickle(gdict, const['DICTS'])

    print("Generating word co-occurrences...")
    cooccurgen.run(
        word_gen(corpus, gdict, subreddit, len(corpus)),
        gdict.token2id,
        4,
        const['COUNTS']
    )
    print("Generating PPMI vectors...")
    ppmigen.run(subreddit, cds=True)
    print("Generating SVD vectors...")
    makelowdim.run(const['INDICES'], const['PPMI'], const['VECS'])
Example #23
 def fetch_dict():
     global dictionary
     dictionary = Dictionary([i for i in my_dictionary])
     once_ids = [
         tokenid for tokenid, docfreq in dictionary.dfs.items()
         if docfreq == 1
     ]
     dictionary.filter_tokens(once_ids)
     dictionary.compactify()
     dictionary.save("Topic/dic.loc")
     return dictionary
Example #24
        def prep_corpus(docs, additional_stopwords=set(), no_below=2, no_above=0.05):

            dictionary = Dictionary(docs)
            stopwords = nltk_stopwords().union(additional_stopwords)
            stopword_ids = map(dictionary.token2id.get, stopwords)
            dictionary.filter_tokens(stopword_ids)
            dictionary.compactify()
            dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
            dictionary.compactify()
            corpus = [dictionary.doc2bow(doc) for doc in docs]

            return dictionary, corpus
Example #25
def get_dictionary(documents: Dict[int, List[str]]) -> Dictionary:
	if os.path.exists(DICTIONARY_FILE_NAME):
		print(f"loading dictionary from {DICTIONARY_FILE_NAME}")
		gensim_dict = Dictionary.load(DICTIONARY_FILE_NAME)
	else:
		print("creating dictionary")
		gensim_dict = Dictionary()
		gensim_dict.add_documents(documents.values())
		gensim_dict.compactify()
		print(f"saving dictionary to {DICTIONARY_FILE_NAME}")
		gensim_dict.save(DICTIONARY_FILE_NAME)
	return gensim_dict
Example #26
class SentenceCorpus(TextCorpus):

    def __init__(self, sentences, max_size=None):
        self.metadata = False
        self.sentences = sentences
        self.dictionary = Dictionary(self.get_texts(), prune_at=max_size)
        self.dictionary.compactify()
        self.bows = [self.dictionary.doc2bow(tokens) for tokens in self.get_texts()]

    def get_texts(self):
        for sentence in self.sentences:
            yield sentence.tokens
Example #27
class BaseWordFilter:
    def __init__(self,
                 documents: List[str],
                 labels: List,
                 stopwords=None,
                 **vocab_options):
        self._filtered_words = []
        self._labels = labels
        self.__generate_vocab(documents, **vocab_options)
        self.__vectorize_documents(documents, stopwords)

    def __generate_vocab(self,
                         docs,
                         vocab_size=2000,
                         no_below=100,
                         no_above=0.9):
        doc_tokens = [simple_preprocess(d) for d in docs]
        self._words = Dictionary(doc_tokens)
        self.words.filter_extremes(no_below=no_below,
                                   no_above=no_above,
                                   keep_n=vocab_size)
        self._words.compactify()

    def __vectorize_documents(self, docs, stopwords):
        vocab = {w: i for i, w in enumerate(self._words.values())}
        vectorizer = CountVectorizer(stop_words=stopwords, vocabulary=vocab)
        self._doc_vecs = vectorizer.fit_transform(docs)

    def fit(self):
        pass

    def save_filter(self, file='models/filter.txt'):
        with open(file, 'wt') as f:
            for word in self.filtered_words:
                f.write('%s\n' % word)
            f.close()

    @property
    def words(self):
        return self._words

    @property
    def filtered_words(self):
        return self._filtered_words

    @property
    def doc_vecs(self):
        return self._doc_vecs

    @property
    def labels(self):
        return self._labels
Example #28
def lsa(corpus, size=8):
    dic = Dictionary(corpus)
    dic.filter_extremes(
        no_below=5,
        no_above=0.8,
    )
    dic.filter_n_most_frequent(remove_n=10)
    dic.compactify()
    index_corpus = [dic.doc2bow(sent) for sent in corpus]
    tfidf = TfidfModel(index_corpus, dictionary=dic)
    normed_corpus = [tfidf[sent] for sent in index_corpus]
    lsi = LsiModel(normed_corpus, num_topics=size)
    return [[x[1] for x in lsi[sent]] for sent in normed_corpus]
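A hypothetical call, assuming corpus is a list of token lists large enough to survive the no_below=5 filter and the removal of the 10 most frequent terms:

sentence_vectors = lsa(tokenized_sentences, size=8)   # roughly one 8-dimensional vector per sentence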
Example #29
def get_dict(pro_texts):
    # create a dictionary
    dictionary = Dictionary(pro_texts)

    # filter out words that appear in fewer than 5 documents or in more than 30% of the documents
    dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=None)
    #remove gaps in ids after the filter
    dictionary.compactify()

    #create bow of the data
    corpus = [dictionary.doc2bow(text) for text in pro_texts]

    return dictionary, corpus
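A hypothetical call, assuming pro_texts is a list of token lists:

dictionary, corpus = get_dict(pro_texts)
print(len(dictionary), "terms survive the frequency filters")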
Example #30
def Gensim_Dic(sentences, tmp_fname):
    dct = Dictionary(sentences)

    a = []
    for w in stopwords:
        if w in dct.token2id.keys():
            a.append(dct.token2id[w])

    dct.filter_extremes(no_below=10)

    dct.filter_tokens(bad_ids=a)
    dct.compactify()
    dct.save_as_text(tmp_fname)
Example #31
class SentenceCorpus(TextCorpus):

    def __init__(self, sentences, no_below=3, no_above=0.8, max_size=None):
        self.metadata = False
        self.sentences = sentences
        self.dictionary = Dictionary(self.get_texts(), prune_at=max_size)
        self.dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=max_size)
        self.dictionary.compactify()
        self.bows = [self.dictionary.doc2bow(tokens) for tokens in self.get_texts()]

    def get_texts(self):
        for sentence in self.sentences:
            yield sentence.tokens
Example #32
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # get ids for short words: len(word) <= 3
    shortword_ids = [tokenid for tokenid, word in dictionary.items() if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()
    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
Example #33
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
  print('Building dictionary...')
  dictionary = Dictionary(docs)
  stopwords = nltk_stopwords().union(additional_stopwords)
  stopword_ids = map(dictionary.token2id.get, stopwords)
  dictionary.filter_tokens(stopword_ids)
  dictionary.compactify()
  dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
  dictionary.compactify()

  print('Building corpus...')
  corpus = [dictionary.doc2bow(doc) for doc in docs]

  return dictionary, corpus
Example #34
def fetch_dict():
    print("Fetching Dictionary...", end=" ")
    try:
        dictionary = Dictionary().load("Topic/dic.tm")
        print("Dictionary loaded!")
    except IOError:
        print("Dictionary not found, building Dictionary...")
        dictionary = Dictionary(i for i in MyDictionary())
        once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
        dictionary.filter_tokens(once_ids)
        dictionary.compactify()
        print("\rDictionary Built!")
        print(dictionary)
        dictionary.save("Topic/dic.tm")
    return dictionary
Example #35
def extract_topics(words):
    word_id_map=Dictionary([words])
    word_id_map.filter_tokens([id for id, occurrence in word_id_map.dfs.items() if occurrence == 2])
    word_id_map.compactify()
    deals_corpus=[word_id_map.doc2bow(words)]
    lda=LdaModel(corpus=deals_corpus, id2word=word_id_map, num_topics=15, update_every=1, chunksize=1000,passes=1)
    topics=[]
    for i in range(15):
        tokens=lda.print_topic(i).split('+')
        topic_scores=[]
        for token in tokens:
            score,token_val=token.split('*')
            topic_scores.append((token_val,score))
        topics.append(topic_scores)
    return topics
Example #36
    def create(pathtomapping, pathtocorpus, corpusname, window, numtokeep=50000, save_raw=True, shifts=(1, 5, 10)):
        """
        Creates a Shifted Positive Pointwise Mutual Information matrix.

        :param pathtomapping: The path to the id2word mapping. If this is left empty, the id2word mapping gets
        recreated. Warning: this takes a long time.
        :param pathtocorpus: The path to the corpus folder. The corpus can be spread out over multiple files or folders,
        and is read iteratively.
        :param corpusname: The name of the corpus. Used for saving the files.
        :param window: The window used to consider co-occurrences.
        :param numtokeep: The number of most frequent words to keep. Note that the matrix is non-sparse.
        Because of this, the memory requirements of the code are quadratic.
        :param save_raw: Whether to save the raw co-occurrence matrix as a numpy matrix.
        :param shifts: The shifts to apply to the co-occurrence matrix. Each shifted matrix
        gets saved as a separate model.
        """

        start = time.time()

        if not pathtomapping:
            id2word = Dictionary(SentenceIter(pathtocorpus), prune_at=None)
            id2word.filter_extremes(no_below=5, keep_n=numtokeep)
            id2word.compactify()
            logger.info("Creating the word2id took {0} seconds".format(time.time() - start))
        else:
            id2word = Dictionary.load(pathtomapping)

        inter = time.time()

        word2id = gensim.utils.revdict(id2word)

        corpus = SentenceIter(pathtocorpus)
        raw = get_cooccur(corpus, word2id, window=window)

        logger.info("Creating raw co-occurrence matrix took {0} seconds".format(time.time() - inter))

        if save_raw:
            np.save('{0}-cooccur.npy'.format(corpusname), raw)

        SPPMIFactory._save_word2id(word2id, "{0}mapping.json".format(corpusname))
        SPPMIFactory._save_freqs(id2word, "{0}freqs.json".format(corpusname))

        raw = SPPMIFactory.raw2pmi(raw)

        for k in shifts:
            sparse = SPPMIFactory.shift_clip_pmi(np.copy(raw), k_shift=k)
            SPPMIFactory._save_sparse_mtr(sparse, "{0}-SPPMI-sparse-{1}-shift.npz".format(corpusname, k))
            del sparse
Example #37
def train_lda_model(articles, num_topics=10):
    docs = [article_to_bow(a) for a in articles]

    dict = Dictionary(docs)
    dict.filter_extremes()
    dict.compactify()

    corpus = [dict.doc2bow(article_to_bow(a)) for a in articles]

    tfidf = TfidfModel(corpus=corpus, id2word=dict)

    w_corpus = [tfidf[doc] for doc in corpus]

    lda = LdaModel(corpus=w_corpus, num_topics=num_topics,
                   update_every=0, passes=20, id2word=dict)

    return lda, tfidf, dict
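A hypothetical call; articles and article_to_bow come from the snippet's surrounding module:

lda, tfidf, vocab = train_lda_model(articles, num_topics=10)
for topic_id, topic in lda.show_topics(num_topics=5):
    print(topic_id, topic)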
Example #38
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA. 
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only, so for a particular topic modeling task
    it is a lighter option to install and run. It can also be run distributed and updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print "Dictionary file does not exist. Creating one"
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.iteritems() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print "Corpus file does not exist. Creating one"
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000,
                   passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
Example #39
    def produce(self):
        doc_n = 0
        docs = []
        doctokens = [] # AKA gensim "text"
        stopwords = nltk.corpus.stopwords.words('english')

        NOALPHA = re.compile('[^a-z]+')
        def prep_string(my_string,pattern = NOALPHA):
            return re.sub(pattern, ' ', my_string.strip().lower())

        print('Getting src docs')
        for doc in self.src_doc_generator():
            content = re.sub(NOALPHA, ' ', doc) # Do this in the corpus generator?
            docs.append(content)
            doctokens.append([token for token in nltk.word_tokenize(content) if token not in stopwords])
            doc_n += 1
            if doc_n % 1000 == 0: print(doc_n)
                
        print('Creating the dictionary')
        dictionary = Dictionary(doctokens)
        dictionary.compactify()
        dictionary.filter_extremes(keep_n=None)
        if self.dictfile:
            dictionary.save_as_text(self.dictfile+'.dict', sort_by_word=True)

        with self.dbi as db:

            print('Creating DOC')
            db.create_table('doc')
            for i, doc in enumerate(docs):
                db.cur.execute('INSERT INTO doc VALUES (?,?)',(i,doc))

            print('Creating WORD')
            db.create_table('word')
            for item in dictionary.iteritems():
                db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)',item)

            print('Creating DOCWORD')
            db.create_table('docword')
            for i, tokens in enumerate(doctokens):
                for item in (dictionary.doc2bow(tokens)):
                    db.cur.execute('INSERT INTO docword (doc_id,word_id,word_count) VALUES (?,?,?)',[i,item[0],item[1]])
Example #40
    def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2):
        from gensim.corpora import Dictionary as GensimDictionary

        # build a dictionary of features
        logger.info("Creating features (including n-grams) from texts")
        gemsim_dictionary = GensimDictionary(tokenized_texts)

        # Remove extremely rare features
        logger.info("Features dictionary contains %d features. Filtering..." % len(gemsim_dictionary.token2id))
        gemsim_dictionary.filter_extremes(no_below=minimum_frequency, no_above=1, keep_n=None)
        gemsim_dictionary.compactify()
        logger.info("Features Dictionary contains %d features." % len(gemsim_dictionary.token2id))

        dict_model = cls(name=name,
                         dataset=dataset,
                         settings=settings)
        dict_model.save()

        dict_model._populate_from_gensim_dictionary(gemsim_dictionary)

        return dict_model
Example #41
    def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2):
        from gensim.corpora import Dictionary as GensimDictionary

        # build a dictionary
        logger.info("Building a dictionary from texts")
        dictionary = GensimDictionary(tokenized_texts)

        # Remove extremely rare words
        logger.info("Dictionary contains %d words. Filtering..." % len(dictionary.token2id))
        dictionary.filter_extremes(no_below=minimum_frequency, no_above=0.5, keep_n=None)
        dictionary.compactify()
        logger.info("Dictionary contains %d words." % len(dictionary.token2id))

        dict_model = cls(name=name,
                         dataset=dataset,
                         settings=settings)
        dict_model.save()

        dict_model._populate_from_gensim_dictionary(dictionary)

        return dict_model
Example #42
class ArticlesCollection:
    """Class which holds all articles (perhaps over several years)
       -- with ability to perform LDA on it."""
    
    def __init__(self, year_range, text_output_dirpath, lang=DE_LANG):
        self.year_range = year_range
        self.text_output_dirpath = text_output_dirpath
        self.lang = lang
        self.articles = []
        self.bow_corpus = None
        self.identifier = ''
        self.wordsids_filepath = ''
        self.bowmm_filepath = ''
        self.tfidf_filepath = ''
        self.number_of_docs = 0
        self.number_of_tokens = 0
        self.number_of_types = 0
        
        # gensim data structures
        self.dictionary = None
        
        # Read in collection & clean it & start LDA process
        self._read_collection()
        self._collection_identifier()
        self._set_filepaths()
        self._create_dictionary()
        self._create_bow_representation()
        self._set_number_of_docs()
        self._set_number_of_tokens()
        self._set_number_of_types()
        
        # Create tf*idf matrix if requested.
        if USE_TFIDF:
            self._create_tfidf_matrix()
    
    def show_lda(self):
        """Show latent topics found."""
        
        model = None
        
        # Only use tf*idf input if requested.
        corpus = self.bow_corpus
        if USE_TFIDF:
            corpus = MmCorpus(self.tfidf_filepath)
        
        # k = number of documents = number of topics (for now)
        num_topics = self.number_of_docs
        if NUM_TOPICS != -1:
            num_topics = NUM_TOPICS
        
        print('Number of docs presented: ' + str(self.number_of_docs))
        print('Number of origin. tokens: ' + str(self.number_of_tokens))
        print('Number of original types: ' + str(self.number_of_types))
        print('Number of types at usage: ' + str(len(self.dictionary.\
                                                     keys())))
        print('Number of topics to find: ' + str(num_topics))
        print('Number of topics to show: ' + str(TOPICS_DISPLAY))
        
        if MODEL == 'LdaMallet':
            model = LdaMallet(PATH_TO_MALLET_BIN,
                            corpus=corpus,
                            num_topics=num_topics,
                            id2word=self.dictionary,
                            iterations=ITERATIONS)
                            
        elif MODEL == 'HdpModel':
            model = HdpModel(corpus, self.dictionary)
        else:
            model = LdaModel(corpus=corpus,
                           id2word=self.dictionary,
                           num_topics=num_topics,
                           iterations=ITERATIONS,
                           update_every=1,
                           chunksize=10,
                           passes=1,
                           distributed=False)
                           
            '''
            More possible options above:
                           chunksize=1,
                           update_every=1,
                           decay=0.5,
            '''
        
        if MODEL == 'LdaModel' or MODEL == 'LdaMallet':               
            topic_number = 0
            for topic in model.show_topics(topics=TOPICS_DISPLAY, 
                                         topn=WORDS_DISPLAY,
                                         formatted=True):
                topic_number += 1
                print('Topic#' + str(topic_number) + ': ', topic)
        else: # For MODEL 'HdpModel'
            for topic in model.print_topics(topics=TOPICS_DISPLAY, \
                               topn=WORDS_DISPLAY):
                print(topic)

    def _set_number_of_types(self):
        """Set number of types (from tokens)."""
        self.number_of_types = len(set(list(itertools.\
                                    chain(*self.articles))))
        
    def _set_number_of_tokens(self):
        """Set number of tokens gotten in all documents."""
        self.number_of_tokens = sum(len(article) \
                                    for article in self.articles)
        
    def _set_number_of_docs(self):
        """Set number of docs found in collection read in."""
        self.number_of_docs = len(self.articles)
        
    def _set_filepaths(self):
        """Sets filepaths for intermediate data."""

        # Filepaths necessary for topic modeling
        self.wordsids_filepath = WORDSIDS_DIR + self.identifier + \
                                 '_' + 'wordsids.txt'
        self.bowmm_filepath = BOWMM_DIR + self.identifier + '_' + \
                              'bow.mm'
        self.tfidf_filepath = TFIDF_DIR + self.identifier + '_' + \
                              'tfidf.mm'

    def _create_dictionary(self):
        """Create a mapping of ids and surface froms (=words)."""
        
        print('Create dictionary of collection.')
        self.dictionary = Dictionary(self.articles)
        self.dictionary.filter_extremes(no_below=NO_BELOW,
                                        no_above=NO_ABOVE)
        self.dictionary.save_as_text(self.wordsids_filepath)
        self.dictionary.compactify()
        print(self.dictionary)
    
    def _create_bow_representation(self):
        """Create bag-of-words representation of collection, and save it 
           in Matrix Matrix format to disk."""
        
        print('Create bag-of-words matrix representation.')
        self.bow_corpus = [self.dictionary.doc2bow(article) 
                           for article in self.articles]
        MmCorpus.serialize(self.bowmm_filepath, self.bow_corpus)

    def _create_tfidf_matrix(self):
        """Create TF-IDF matrix and save it in Matrix Matrix format to 
           disk"""
        
        print('Create TF-IDF matrix of collection.')
        tfidf = TfidfModel(self.bow_corpus, 
                           id2word=self.dictionary, 
                           normalize=True)
        MmCorpus.serialize(self.tfidf_filepath, 
                           tfidf[self.bow_corpus])
        print('Number of documents:', tfidf.num_docs)

    def _collection_identifier(self):
        """Collection id is important for the caching files and the
           file naming of the corresponding files."""
           
        start_year = self.year_range[0]
        end_year = self.year_range[-1]
        
        if start_year == end_year:
            self.identifier = str(start_year) + '_' + self.lang
        else:
            self.identifier = str(start_year) + '-' + str(end_year) + \
                              '_' + self.lang 
        
    def _read_collection(self):
        """Iterate through all years in order to get all articles read
           in."""
        for year in self.year_range:
            # Not every single yearbook is available.
            try:
                self._read_book(year)
            except:
                print('Skip (inexistent) yearbook ' + str(year) + '.')
        
    def _read_book(self, year):
        """Read in a a single book and save its articles."""
        filepath = sac_filepath(year, lang=self.lang)
        
        print('Read in yearbook ' + str(year) + '.')
        sac_xml = etree.parse(SAC_XML_DIR + filepath)
        sac_xml_articles_list = sac_xml.xpath('.//article')
        
        # For each article
        for sac_xml_article in sac_xml_articles_list:
            
            # Prepare file to write out words
            sac_xml_article_no = sac_xml_article.attrib['n']
            out_filename = str(year) + '-' + str(self.lang) + '-' \
                           + sac_xml_article_no + '.txt'
            out_filepath = self.text_output_dirpath + sep + out_filename
            print(out_filepath)
            out_filehdl = open(out_filepath, 'w')
                               
            article_word_list = []
            sac_xml_sentences_list = \
                sac_xml_article.xpath('.//s[@lang=\'' + \
                                      self.lang + '\']')
            # For each sentence (in the article)
            for sac_xml_sentence in sac_xml_sentences_list:
                sac_xml_words_list = sac_xml_sentence.xpath('.//w')
                # For each word (in the sentence of the article)
                for sac_xml_word in sac_xml_words_list:
                    word = None
                    try:
                        if WITH_POS_FILTER is False:
                            if WITH_LEMMATA:
                                word = sac_xml_word.attrib['lemma'].lower()
                                if self._is_lemma_bogus(word):
                                    word = sac_xml_word.text.lower()
                            if WITH_LEMMATA is False:
                                word = sac_xml_word.text.lower()
                        elif WITH_POS_FILTER:
                            word = self._get_pos_filtered_word(sac_xml_word)
                    except:
                        pass
                        
                    # Don't add stop words, in any case
                    if not word in STOPWORDS[self.lang] \
                    and word is not None and len(word) >= MIN_WORDLEN:
                        article_word_list.append(self.\
                                                 _normalize_word(word).\
                                                 encode(ENCODING))
            # Save article as bag-of-words (of the sentences)
            self.articles.append(article_word_list)
            out_filehdl.write(' '.join(article_word_list))
            out_filehdl.close()
    
    def _get_pos_filtered_word(self, sac_xml_word):
        """ Get word by PoS filter
        """
        # There are words without PoS tags, i. e. try
        try:
            if sac_xml_word.attrib['pos'] \
            in POS_FILTER[self.lang]:
                if WITH_LEMMATA:
                    word = sac_xml_word.attrib['lemma'].lower()
                    if self._is_lemma_bogus(word):
                        return sac_xml_word.text.lower()
                    else:
                        return sac_xml_word.attrib['lemma'].lower()
                else:
                    return sac_xml_word.text.lower()
            else:
                return None
        except:
            return None
    
    def _is_lemma_bogus(self, lemma):
        """ Return true if the lemma is not useful for LDA, otherwise
            false.
        """
        
        for bogus_symbol in SURFACE_TRIGGERS:
            if bogus_symbol in lemma:
                return True
        
        # That's the last resort
        return False
    
    def _normalize_word(self, word_to_normalize):
        """
        This function helps to normalize words, because of encoding
        issues of some LDA tools ...
        @return: Normalized word as str type
        """
        
        # Transform umlauts to ASCII friendly form
        word = word_to_normalize.replace(u"ä","ae").replace(u"ö","oe"). \
            replace(u"ü","ue").replace(u"ß","ss")
        return word
                
    def __str__(self):
        """ Return a string which shows document number, number of
            words and number of types.
        """
        ret_string = ''
        art_number = 0
        
        for article in self.articles:
            art_number += 1
            ret_string += 'Doc#' + str(art_number) + ': '
            ret_string += str(len(article)) + ' [' + \
                          str(len(set((article)))) + ']'
            ret_string += '\n'
            
        return ret_string
Example #43
class TfidfVectorizer():
    """
    Transform text to tf-idf representation
    """

    def __init__(self):

        self.base_path = os.path.dirname(__file__)
        self.dictionary_path = os.path.join(self.base_path, "dictionary")
        self.tf_idf_model_path = os.path.join(self.base_path, "tfidf")

        self.stemmer = NepStemmer()
        self.tf_idf_model = None

    def get_tokens(self, document):
        if not self.stemmer:
            raise Exception("Stemmer not available")

        return self.stemmer.get_stems(document)

    def construct_model(self, documents):
        logging.basicConfig(
            format='%(asctime)s:%(levelname)s:%(message)s',
            level=logging.INFO
        )

        logging.info("Obtaining word tokens")
        tokens = [self.get_tokens(document) for document in documents]
        # self.tf_idf_model = TfidfModel(tokens)

        logging.info("Constructing dictionary")
        self.dictionary = Dictionary(tokens)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
        self.dictionary.compactify()
        self.dictionary.save(self.dictionary_path)

        logging.info("Constructing TF-IDF model")
        self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
        self.tf_idf_model.save(self.tf_idf_model_path)

    def load_data(self):

        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')

            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)

    def doc2vector(self, document):
        """ Returns the sparse tf-idf vector for given document """

        tokens = self.get_tokens(document)
        bag_of_words = self.dictionary.doc2bow(tokens)

        return (self.tf_idf_model[bag_of_words])

    def obtain_feature_vector(self, document):
        """
        Returns a single dense tf-idf vector for a given document
        """

        self.load_data()

        tf_idf_vector = matutils.sparse2full(
            self.doc2vector(document),
            len(self.tf_idf_model.idfs)  # total number of terms in the tf-idf model
        ).reshape(1, -1)

        return tf_idf_vector

    def obtain_feature_matrix(self, documents):
        """
        Returns the tf-idf dense matrix for the given documents
        """

        self.load_data()

        input_matrix_sparse = [
            self.doc2vector(x)
            for x in documents
        ]

        no_of_features = len(self.tf_idf_model.idfs)

        input_matrix = matutils.corpus2dense(
            input_matrix_sparse,
            no_of_features
        ).transpose()

        return input_matrix
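A hypothetical end-to-end use of the class above; NepStemmer and the documents come from the snippet's own package:

vectorizer = TfidfVectorizer()
vectorizer.construct_model(documents)            # builds and saves the dictionary and tf-idf model
matrix = vectorizer.obtain_feature_matrix(documents)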
    elif not opts.scaling:
        scaling = None
    else:
        raise ValueError("Only tfidf scaling is supported")

    word_model = opts.word_model

    if word_model:
        logging.info("Building word model")
        corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
    else:
        corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

    voc = Dictionary(corpus)
    voc.filter_extremes(no_below=cutoff)
    voc.compactify()

    bow_corpus = (voc.doc2bow(art) for art in corpus)

    tfidf = None

    if scaling == 'tfidf':
        tfidf = TfidfModel(bow_corpus)
        bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

    model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
    model.save(model_fn)

    if tfidf:
        tfidf.save(model_fn + '.tfidf')
def main():
    parser = ArgumentParser(
        description="wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information"
    )
    parser.add_argument("-ds", "--dataset", default="wiki", help="What kind of dataset to use. (wiki,es,file)")
    parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it")
    parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki")
    parser.add_argument("--model-id", default="model", help="Filename for created model.")
    parser.add_argument("--model-type", default="lsi", help="Model type (lsi, lda, word2vec, hdp, vocabulary).")
    parser.add_argument("--n-topics", default=10, help="Number of topics to model.")
    parser.add_argument("--n-passes", default=1, help="Number of passes for LDA  model.")
    parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.")
    parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.")
    parser.add_argument("-q", "--query", default=None, help="Elasticsearch: Query to use to fetch documents")
    parser.add_argument("--index", help="Elasticsearch: index to read from.")
    parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.")
    parser.add_argument("--data-dir", help="Directory to save the generated models and vocabularies into.")
    parser.add_argument("--vocab", help="Prebuilt Vocabulary file. Use this to avoid having to generate one.")

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ["es", "wiki", "file"]:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    limit = None
    if opts.limit:
        limit = int(opts.limit)
    if not dump_fn and data_type in ["wiki"]:
        logging.error("--dump-file required for wiki dataset")
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == "es" and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = "%s_%s_%d" % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = "%s/%s" % (data_dir, model_fn)
    if model_type == "word2vec":
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == "es":
        logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(
            read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es
        )
    elif data_type == "wiki":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki)
    elif data_type == "file":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words("norwegian"))
    if not vocab_file or model_type == "vocabulary":
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + ".vocab")
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == "vocabulary":
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == "lsi":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == "lda":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab)

    elif model_type == "word2vec":
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == "hdp":
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)