Example #1
File: ucicorpus.py Project: darindf/gensim
    def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = utils.revdict(self.id2word)

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i', docno,
                            self.num_docs)

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
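All of these examples hinge on `utils.revdict`, which inverts a dict's key/value pairs. A minimal sketch of that behavior, using a made-up two-word vocabulary:

from gensim import utils

id2word = {0: 'cat', 1: 'dog'}     # toy id -> token mapping
token2id = utils.revdict(id2word)  # inverted to token -> id
assert token2id == {'cat': 0, 'dog': 1}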
Example #2
    def load_word_topics(self):
        """Load words X topics matrix from :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fstate` file.

        Returns
        -------
        numpy.ndarray
            Matrix words X topics.

        """
        logger.info("loading assigned topics from %s", self.fstate())
        word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64)
        if hasattr(self.id2word, 'token2id'):
            word2id = self.id2word.token2id
        else:
            word2id = revdict(self.id2word)

        with utils.smart_open(self.fstate()) as fin:
            _ = next(fin)  # header
            self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
            assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
            _ = next(fin)  # noqa:F841 beta
            for lineno, line in enumerate(fin):
                line = utils.to_unicode(line)
                doc, source, pos, typeindex, token, topic = line.split(" ")
                if token not in word2id:
                    continue
                tokenid = word2id[token]
                word_topics[int(topic), tokenid] += 1.0
        return word_topics
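For context, the loop above assumes the MALLET state-file layout: a header line, an alpha line, a beta line, then one whitespace-separated token assignment per line. A sketch of that layout, with invented values:

# Assumed contents of the fstate() file read by load_word_topics():
#
#   #doc source pos typeindex type topic    <- header, skipped
#   #alpha : 0.5 0.5 0.5                    <- parsed into self.alpha
#   #beta : 0.01                            <- skipped
#   0 NA 0 42 apple 3                       <- doc source pos typeindex token topic
#
# Each data line bumps word_topics[topic, word2id[token]] by one.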
Example #3
    def load_word_topics(self):
        logger.info("loading assigned topics from %s", self.fstate())
        word_topics = numpy.zeros((self.num_topics, self.num_terms),
                                  dtype=numpy.float64)
        if hasattr(self.id2word, 'token2id'):
            word2id = self.id2word.token2id
        else:
            word2id = revdict(self.id2word)

        with utils.smart_open(self.fstate()) as fin:
            _ = next(fin)  # header
            self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
            assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
            _ = next(fin)  # noqa:F841 beta
            for lineno, line in enumerate(fin):
                line = utils.to_unicode(line)
                doc, source, pos, typeindex, token, topic = line.split(" ")
                if token not in word2id:
                    continue
                tokenid = word2id[token]
                word_topics[int(topic), tokenid] += 1.0
        return word_topics
Example #4
    def load_word_topics(self):
        """Load words X topics matrix from :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fstate` file.

        Returns
        -------
        numpy.ndarray
            Matrix words X topics.

        """
        logger.info("loading assigned topics from %s", self.fstate())
        word_topics = numpy.zeros((self.num_topics, self.num_terms),
                                  dtype=numpy.float64)
        if hasattr(self.id2word, 'token2id'):
            word2id = self.id2word.token2id
        else:
            word2id = revdict(self.id2word)

        with utils.smart_open(self.fstate()) as fin:
            _ = next(fin)  # header
            self.alpha = numpy.fromiter(next(fin).split()[2:], dtype=float)
            assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
            _ = next(fin)  # noqa:F841 beta
            for lineno, line in enumerate(fin):
                line = utils.to_unicode(line)
                doc, source, pos, typeindex, token, topic = line.split(" ")
                if token not in word2id:
                    continue
                tokenid = word2id[token]
                word_topics[int(topic), tokenid] += 1.0
        return word_topics
Example #5
    def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = utils.revdict(self.id2word)

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
Example #6
    def reverse(self, documents_idx, unknown_word="UNKNOWN"):
        """Map documents of token ids back to lists of token strings."""
        if len(self.id2token) != len(self.token2id):
            self.id2token = utils.revdict(self.token2id)
        rst = []
        for doc_idx in documents_idx:
            rst_i = []
            for idx in doc_idx:
                rst_i.append(self.id2token.get(idx, unknown_word))
            rst.append(rst_i)
        return rst
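A self-contained sketch of how `reverse` behaves, using a hypothetical minimal class that mirrors the `token2id`/`id2token` attributes the method relies on:

from gensim import utils

class TinyVocab:
    # hypothetical stand-in, just enough state to run reverse()
    def __init__(self, token2id):
        self.token2id = token2id
        self.id2token = {}  # left stale on purpose; reverse() rebuilds it

    def reverse(self, documents_idx, unknown_word="UNKNOWN"):
        if len(self.id2token) != len(self.token2id):
            self.id2token = utils.revdict(self.token2id)
        return [[self.id2token.get(idx, unknown_word) for idx in doc]
                for doc in documents_idx]

vocab = TinyVocab({'cat': 0, 'dog': 1})
print(vocab.reverse([[0, 1, 7]]))  # [['cat', 'dog', 'UNKNOWN']] -- id 7 is unknown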
Example #7
def saveLDACorpus(train_data_path, test_data_path, model_file, dictionary_file, corpus_file):
    """Score the corpus with a trained LDA model and save train/test splits."""
    lda = LdaModel.load(model_file)
    dictionary = Dictionary.load_from_text(dictionary_file)
    dictionary.id2token = utils.revdict(dictionary.token2id)
    src_df = pd.read_csv(corpus_file)
    src_df = parallelize(src_df, data_fram_proc1, dictionary, lda)  # add LDA features
    train_data, test_data = train_test_split(src_df[['label', 'multiLabels', 'item']], test_size=0.2, random_state=42)
    train_data.to_csv(train_data_path, index=None)  # , header=None
    test_data.to_csv(test_data_path, index=None)  # , header=None
Example #8
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """


    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: 
    :param stopwords: 
    :param allowed_pos: 
    :param max_doc: 
    :return: 
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print('\r', count, '/', corpus_num, end='')
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print('\n')

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
Example #9
    def create_dictionary(self):
        """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data.

        Return
        ------
        :class:`gensim.corpora.dictionary.Dictionary`
            Dictionary, based on corpus.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.corpora.ucicorpus import UciCorpus
            >>> from gensim.test.utils import datapath
            >>> ucc = UciCorpus(datapath('testcorpus.uci'))
            >>> dictionary = ucc.create_dictionary()

        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = utils.revdict(self.id2word)

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i', docno,
                            self.num_docs)

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
Example #10
File: ldamallet.py Project: JKamlah/gensim
    def load_word_topics(self):
        logger.info("loading assigned topics from %s", self.fstate())
        word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
        if hasattr(self.id2word, 'token2id'):
            word2id = self.id2word.token2id
        else:
            word2id = revdict(self.id2word)

        with utils.smart_open(self.fstate()) as fin:
            _ = next(fin)  # header
            self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
            assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
            _ = next(fin)  # beta
            for lineno, line in enumerate(fin):
                line = utils.to_unicode(line)
                doc, source, pos, typeindex, token, topic = line.split(" ")
                if token not in word2id:
                    continue
                tokenid = word2id[token]
                word_topics[int(topic), tokenid] += 1.0
        return word_topics
Example #11
    def create_dictionary(self):
        """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data.

        Return
        ------
        :class:`gensim.corpora.dictionary.Dictionary`
            Dictionary, based on corpus.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.corpora.ucicorpus import UciCorpus
            >>> from gensim.test.utils import datapath
            >>> ucc = UciCorpus(datapath('testcorpus.uci'))
            >>> dictionary = ucc.create_dictionary()

        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = utils.revdict(self.id2word)

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary
Example #12
    def __getitem__(self, tokenid):
        """Get the string token that corresponds to `tokenid`.

        Parameters
        ----------
        tokenid : int
            Id of token.

        Returns
        -------
        str
            Token corresponding to `tokenid`.

        Raises
        ------
        KeyError
            If this Dictionary doesn't contain such `tokenid`.

        """
        if len(self.id2token) != len(self.token2id):
            # the word->id mapping has changed (presumably via add_documents);
            # recompute id->word accordingly
            self.id2token = utils.revdict(self.token2id)
        return self.id2token[tokenid]  # will throw for non-existent ids
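A quick demonstration of the lookup (and the lazy id2token rebuild) on a real Dictionary; the toy document is invented:

from gensim.corpora import Dictionary

dictionary = Dictionary([['human', 'computer', 'interface']])
tokenid = dictionary.token2id['computer']
print(dictionary[tokenid])  # 'computer' -- first access triggers the revdict rebuild
# dictionary[999]           # would raise KeyError for a non-existent id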
Example #13
    def __getitem__(self, tokenid):
        """Get token by provided `tokenid`.

        Parameters
        ----------
        tokenid : int
            Id of token

        Returns
        -------
        str
            Token corresponding to `tokenid`.

        Raises
        ------
        KeyError
            If `tokenid` isn't contained in :class:`~gensim.corpora.dictionary.Dictionary`.

        """
        if len(self.id2token) != len(self.token2id):
            # the word->id mapping has changed (presumably via add_documents);
            # recompute id->word accordingly
            self.id2token = utils.revdict(self.token2id)
        return self.id2token[tokenid]  # will throw for non-existent ids
Example #14
    @id2word.setter  # presumably a property setter, given the _id2word backing attribute
    def id2word(self, val):
        self._id2word = val
        self.word2id = utils.revdict(val)  # keep the reverse mapping in sync
Example #15
    if 'word2vec' in program:
        if os.path.exists(outf('w2v')):
            logger.info("word2vec model found, loading")
            model = utils.unpickle(outf('w2v'))
        else:
            logger.info("word2vec model not found, creating")
            if NEGATIVE:
                model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS, hs=0, negative=NEGATIVE)
            else:
                model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS)
            model.build_vocab(corpus())
            model.train(corpus())  # train with 1 epoch
            model.init_sims(replace=True)
            model.word2id = dict((w, v.index) for w, v in model.vocab.items())
            model.id2word = utils.revdict(model.word2id)
            model.word_vectors = model.syn0norm
            utils.pickle(model, outf('w2v'))

    if 'glove' in program:
        if os.path.exists(outf('glove')):
            logger.info("glove model found, loading")
            model = utils.unpickle(outf('glove'))
        else:
            if os.path.exists(outf('glove_corpus')):
                logger.info("glove corpus matrix found, loading")
                cooccur = utils.unpickle(outf('glove_corpus'))
            else:
                logger.info("glove corpus matrix not found, creating")
                cooccur = glove.Corpus(dictionary=word2id)
                cooccur.fit(corpus(), window=WINDOW)
Example #16
def show_documents_bow(corpus):
    st.markdown("Bag-of-words representation of the documents:")
    tcid = utils.revdict(corpus.dictionary.token2id)
    st.dataframe([[(tcid[t], w) for (t, w) in doc] for doc in corpus.bow()])
Example #17
            if NEGATIVE:
                model = gensim.models.Word2Vec(size=DIM,
                                               min_count=0,
                                               window=WINDOW,
                                               workers=WORKERS,
                                               hs=0,
                                               negative=NEGATIVE)
            else:
                model = gensim.models.Word2Vec(size=DIM,
                                               min_count=0,
                                               window=WINDOW,
                                               workers=WORKERS)
            model.build_vocab(corpus())
            model.train(corpus())  # train with 1 epoch
            model.init_sims(replace=True)
            model.word2id = dict((w, v.index) for w, v in model.vocab.items())
            model.id2word = utils.revdict(model.word2id)
            model.word_vectors = model.syn0norm
            utils.pickle(model, outf('w2v'))

    if 'glove' in program:
        if os.path.exists(outf('glove')):
            logger.info("glove model found, loading")
            model = utils.unpickle(outf('glove'))
        else:
            if os.path.exists(outf('glove_corpus')):
                logger.info("glove corpus matrix found, loading")
                cooccur = utils.unpickle(outf('glove_corpus'))
            else:
                logger.info("glove corpus matrix not found, creating")
                cooccur = glove.Corpus(dictionary=word2id)
                cooccur.fit(corpus(), window=WINDOW)
Example #18
    def __getitem__(self, tokenid):
        if len(self.id2token) != len(self.token2id):
            # the word->id mapping has changed (presumably via add_documents);
            # recompute id->word accordingly
            self.id2token = utils.revdict(self.token2id)
        return self.id2token[tokenid]  # will throw for non-existent ids
Example #19
            word, pos = token.split('/')
            document.append(word)

        # convert compound word into one token
        document = convert_compound(document)

        # filter stop words, long words, and non-english words
        document = [w for w in document if w not in stop_words and 2 <= len(w) <= 15 and w.islower()]

        new_documents.append(document)
        titles.append(index)
        froms.append(from_name)
        dates.append(date)

    print('\n')
    logging.info('create dictionary and corpus...')
    dictionary = corpora.Dictionary(new_documents)
    dictionary.docid2title = titles
    dictionary.docid2from = froms
    dictionary.docid2date = dates

    logging.info('filter unimportant words...')
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    dictionary.compactify()

    logging.info('generate corpus...')
    dictionary.corpus = [dictionary.doc2bow(document) for document in new_documents]
    dictionary.id2token = revdict(dictionary.token2id)

    dictionary.save('data/dictionary/report_' + allowed_pos.pattern + '.dict')
Example #20
File: etl.py Project: vitillo/kaggle

def contexts(document, window):
    assert window % 2 == 0

    half_window = window // 2  # integer division so the slice indices stay ints on Python 3

    for i in range(half_window, len(document) - window):
        yield document[i:i + half_window], document[i + half_window], document[i + half_window + 1: i + window + 1]


if __name__ == "__main__":
    frame = pd.read_csv("/Users/vitillo/Downloads/labeledTrainData.tsv", sep='\t')
    sentences = extract_sentences(frame["review"])
    dictionary = corpora.Dictionary(sentences)

    dictionary.filter_extremes(no_below=5, no_above=0.6, keep_n=1000)
    rev_dictionary = revdict(dictionary)

    with open('data.csv', 'w') as f:
        for sentence in sentences:
            document = [rev_dictionary[w] for w in sentence if w in rev_dictionary]

            for context in contexts(document, 4):
                prefix, word, postfix = context

                f.write(",".join(map(str, prefix + postfix + [word])))
                f.write("\n")
 
    dictionary.save_as_text("dictionary.tsv")
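To make the sliding-window generator above concrete, a short trace with an invented token-id sequence and window=4:

doc = [10, 11, 12, 13, 14, 15, 16]  # invented token ids
for prefix, word, postfix in contexts(doc, 4):
    print(prefix, word, postfix)    # [12, 13] 14 [15, 16] -- two ids of context per side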
Example #21
    def __init__(self,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 distributed=False,
                 chunksize=2000,
                 passes=1,
                 update_every=1,
                 alpha='symmetric',
                 eta=None,
                 decay=0.5,
                 offset=1.0,
                 eval_every=10,
                 iterations=50,
                 gamma_threshold=0.001,
                 minimum_probability=0.01,
                 defined_kws=None,  # None instead of a shared mutable default
                 tfMod=None):
        """
        If given, start training from the iterable `corpus` straight away. If not given,
        the model is left untrained (presumably because you want to call `update()` manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        support special values of 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a matrix of shape num_topics x num_words, which can
        be used to impose asymmetric priors over the word distribution on a
        per-topic basis. This may be useful if you want to seed certain topics
        with particular words by boosting the priors for those words.  It also
        supports the special value 'auto', which learns an asymmetric prior
        directly from your data.

        Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
        on how to set up a cluster of machines for gensim).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates (setting this to 1 slows down training ~2x;
        default is 10 for better performance). Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        Example:

        >>> lda = LdaModel(corpus, num_topics=100)  # train model
        >>> print(lda[doc_bow]) # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print(lda[doc_bow])

        >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """
        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError(
                "cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (
            num_topics,
        ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(
            self.alpha.shape), num_topics)

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        assert (
            self.eta.shape == (num_topics, 1)
            or self.eta.shape == (num_topics, self.num_terms)
        ), ("Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)"
            % (str(self.eta.shape), num_topics, num_topics, self.num_terms))

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            if self.optimize_alpha:
                raise NotImplementedError(
                    "auto-optimizing alpha not implemented in distributed LDA")
            # set up distributed version
            try:
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
                logger.debug("looking for dispatcher at %s" %
                             str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word,
                                      num_topics=num_topics,
                                      chunksize=chunksize,
                                      alpha=alpha,
                                      eta=eta,
                                      distributed=False)
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers" %
                            self.numworkers)
            except Exception as err:
                logger.error("failed to initialize distributed LDA (%s)", err)
                raise RuntimeError(
                    "failed to initialize distributed LDA (%s)" % err)

        # Initialize the variational distribution q(beta|lambda)
        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
        self.state.sstats = numpy.random.gamma(
            100., 1. / 100., (self.num_topics, self.num_terms))

        # reassign word/topic priors for the user-defined seed words
        self.word2id = utils.revdict(self.id2word)
        sstats = self.state.sstats
        self.defined_kws = defined_kws if defined_kws is not None else {}
        self.defined_wordids = {}
        for w, t in self.defined_kws.items():
            if w in self.word2id:
                wid = self.word2id[w]
                self.defined_wordids[wid] = numpy.array(list(t))

        for wid, t in self.defined_wordids.items():
            sstats[:, wid] = numpy.random.gamma(0.1, 0.05, (self.num_topics, ))

        for wid, topics in self.defined_wordids.items():
            if tfMod is not None:
                score = self.num_topics * tfMod.idfs.get(wid, 1.0)
            else:
                score = self.num_topics
            if topics.shape[0] > 1:
                # score = self.num_topics / math.log(len(topics) + 1)
                score = 0.4 * self.num_topics
                # print 'score :{}'.format(score)
                # print self.num_topics
                # print topics

            # for t in topics:
            sstats[topics, wid] = score

        self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            use_numpy = self.dispatcher is not None
            self.update(corpus, chunks_as_numpy=use_numpy)
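A hedged sketch of how this fork's seeding hook might be invoked; `defined_kws` maps each seed word to the topic ids it should be pinned to (the corpus, mapping, and seed words below are invented):

# lda = LdaModel(
#     corpus=my_bow_corpus,      # any gensim BOW corpus (hypothetical)
#     num_topics=10,
#     id2word=my_id2word,        # id -> token mapping (hypothetical)
#     defined_kws={'economy': {0, 3}, 'football': {7}},  # seed word -> topic ids
# )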
Example #22
    outf = lambda prefix: os.path.join(output_dir, prefix)
    logger.info("output file template will be %s" % outf('PREFIX'))

    sentences = MyCorpus(corpus_path)

    if os.path.exists(outf('word2id')):
        logger.info("dictionary found, loading")
        word2id = utils.unpickle(outf('word2id'))
    else:
        logger.info("dictionary not found, creating")
        id2word = corpora.Dictionary(sentences, prune_at=10000000)
        id2word.filter_extremes(keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
        word2id = dict((v, k) for k, v in id2word.items())
        utils.pickle(word2id, outf('word2id'))
    id2word = utils.revdict(word2id)

    # Filter all wiki documents to contain only those words.
    corpus = lambda: ([word for word in sentence if word in word2id]
                      for sentence in sentences)

    if os.path.exists(outf('kw2v_%s' % GAMMA)):
        logger.info("Kernel word2vec model found, loading")
        # model = utils.unpickle(outf('kw2v'))
        model = Word2Vec.load_word2vec_format(outf('kw2v_%s' % GAMMA),
                                              binary=True)
    else:
        logger.info("Kernel word2vec model not found, creating")
        if NEGATIVE:
            model = Word2Vec(size=DIM,
                             gamma=GAMMA,