def load_varembed_format(cls, vectors, morfessor_model=None):
    """
    Load the word vectors into a matrix from the varembed output vector files.

    `vectors` is the pickle file containing the word vectors.
    `morfessor_model` is the path to the trained morfessor model; if given,
    morpheme embeddings are added to the output.
    """
    result = cls()
    if vectors is None:
        raise Exception("Please provide vectors binary to load varembed model")
    d = utils.unpickle(vectors)
    word_to_ix = d['word_to_ix']
    morpho_to_ix = d['morpho_to_ix']
    word_embeddings = d['word_embeddings']
    morpho_embeddings = d['morpheme_embeddings']
    result.load_word_embeddings(word_embeddings, word_to_ix)
    if morfessor_model:
        try:
            import morfessor
            morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
            result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix)
        except ImportError:
            # Morfessor package not found.
            logger.error('Could not import morfessor. Not using morpheme embeddings')
            raise ImportError('Could not import morfessor.')
    logger.info('Loaded varembed model vectors from %s', vectors)
    return result
def load(cls, fname, *args, **kwargs): """ Load a previously saved object from file (also see `save`). Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`: >>> LdaModel.load(fname, mmap='r') """ kwargs['mmap'] = kwargs.get('mmap', None) result = super(LdaModel, cls).load(fname, *args, **kwargs) # check if `random_state` attribute has been set after main pickle load # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so set `random_state` as the default value if not hasattr(result, 'random_state'): result.random_state = utils.get_random_state(None) # using default value `get_random_state(None)` logging.warning("random_state not set so using default value") state_fname = utils.smart_extension(fname, '.state') try: result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs) except Exception as e: logging.warning("failed to load state from %s: %s", state_fname, e) id2word_fname = utils.smart_extension(fname, '.id2word') # check if `id2word_fname` file is present on disk # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, so set `result.id2word` using the `id2word_fname` file # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickle load if (os.path.isfile(id2word_fname)): try: result.id2word = utils.unpickle(id2word_fname) except Exception as e: logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e) return result
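# A minimal, hypothetical round trip illustrating the `mmap='r'` loading path
# documented above. The tiny corpus, the temporary file name and the topic
# count are made up for illustration; only `LdaModel.save`/`LdaModel.load`
# and the `mmap` keyword come from the snippet itself.
import os
import tempfile
from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [['human', 'computer', 'interface'], ['graph', 'trees', 'minors']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus, id2word=dictionary, num_topics=2)
fname = os.path.join(tempfile.mkdtemp(), 'toy.lda')
lda.save(fname)  # writes the main pickle plus, in recent gensim versions, separate .state and .id2word files

# large separately-stored arrays come back memory-mapped, read-only
lda2 = LdaModel.load(fname, mmap='r')
print(lda2.show_topics())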
def __init__(self, model_prefix=None, num_best=None): self.model_prefix = model_prefix self.num_best = num_best if self.model_prefix is None: raise ValueError("model_prefix must be specified") logger.info("ESA: Loading word dictionary...") self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2') logger.info("ESA: Loading document name map...") self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle') logger.info("ESA: Loading TF-IDF model...") self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model') logger.info("ESA: Loading similarity index...") self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r') #logger.info("ESA: Preloading reverse indexes...") #self.similarity_index.preload_reverse_index() logger.info("ESA: Finished loading model files.")
def load_model(model, topn, positive=None, negative=None):
    # avoid mutable default arguments
    positive = positive or []
    negative = negative or []
    if model in ('glove', 'ppmi', 'svd'):
        model = utils.unpickle('./model/{}.model'.format(model))
        return most_similar_dist(model, positive=positive, negative=negative, topn=topn)
    else:
        model = Word2Vec.load('./model/{}.model'.format(model))
        return model.most_similar(positive=positive, negative=negative, topn=topn)
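# Hypothetical calls of the helper above, assuming the pickled models live
# under ./model/ as in the snippet; the model names and query words are
# placeholders, not files known to exist.
print(load_model('word2vec', topn=5, positive=['king', 'woman'], negative=['man']))
print(load_model('glove', topn=5, positive=['paris']))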
def load(cls, fname, *args, **kwargs): """ Load a previously saved object from file (also see `save`). Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`: >>> LdaModel.load(fname, mmap='r') """ kwargs['mmap'] = kwargs.get('mmap', None) result = super(LdaModel, cls).load(fname, *args, **kwargs) state_fname = utils.smart_extension(fname, '.state') try: result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs) except Exception as e: logging.warning("failed to load state from %s: %s", state_fname, e) id2word_fname = utils.smart_extension(fname, '.id2word') if (os.path.isfile(id2word_fname)): try: result.id2word = utils.unpickle(id2word_fname) except Exception as e: logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e) else: result.id2word = None return result
def main():
    parser = argparse.ArgumentParser(
        description='converts a given .metadata.cpickle file (such as one generated by gensim MmCorpus.serialize(..., metadata=True)) to a pickled frozenset of the contained pageids',
        epilog='Example: ./{} --metadata=enwiki-metadata.cpickle.bz2 --pageids=enwiki-pageids.cpickle.bz2'.format(sys.argv[0]))
    parser.add_argument(
        '--metadata',
        type=argparse.FileType('r'),
        help='path to input binary metadata file (.cpickle/.cpickle.bz2)',
        required=True)
    parser.add_argument(
        '--pageids',
        type=argparse.FileType('w'),
        help='path to output binary frozenset of pageids file (.cpickle/.cpickle.bz2)',
        required=True)
    args = parser.parse_args()
    input_metadata_path = args.metadata.name
    output_pageids_path = args.pageids.name

    logger.info('running with:\n{}'.format(
        pformat({
            'input_metadata_path': input_metadata_path,
            'output_pageids_path': output_pageids_path
        })))

    metadata = unpickle(input_metadata_path)
    logger.debug('unpickled {}'.format(metadata))
    pageids = frozenset(int(md[0]) for md in metadata.values())
    logger.info('extracted {} pageids'.format(len(pageids)))
    logger.debug('created set {}'.format(pageids))
    pickle(pageids, output_pageids_path)
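# The core transformation above, shown on a toy metadata dict. Real metadata
# comes from MmCorpus.serialize(..., metadata=True) and maps a document index
# to a (pageid, title) tuple; the two entries here are made up.
metadata = {0: ('12', 'Anarchism'), 1: ('25', 'Autism')}
pageids = frozenset(int(md[0]) for md in metadata.values())
assert pageids == frozenset({12, 25})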
def __init__(self, fname, index_fname=None):
    """
    Initialize this abstract base class by loading a previously saved index
    from `index_fname` (or `fname.index` if `index_fname` is not set).
    This index will allow subclasses to support the `corpus[docno]` syntax
    (random access to document #`docno` in O(1)).

    >>> # save corpus in SvmLightCorpus format with an index
    >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
    >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
    >>> # load back as a document stream (*not* plain Python list)
    >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight')
    >>> print(corpus_with_random_access[1])
    [(0, 1.0), (1, 2.0)]

    """
    try:
        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')
        self.index = utils.unpickle(index_fname)
        # change self.index into a numpy.ndarray to support fancy indexing
        self.index = numpy.asarray(self.index)
        logger.info("loaded corpus index from %s", index_fname)
    except Exception:
        self.index = None
        self.length = None
def load(cls, fname): """ Load a previously saved object from file (also see `save`). """ logger.debug("loading %s object from %s" % (cls.__name__, fname)) result = utils.unpickle(fname) result.index = numpy.load(fname + '.npy', mmap_mode='r') # load back as read-only return result
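# The load above implies a matching save step; this is a sketch of that
# counterpart, not code from the original project. It assumes the object keeps
# its large array in `self.index` and uses gensim's pickle helper.
import numpy
from gensim import utils

def save_with_mmap_friendly_index(obj, fname):
    """Pickle `obj` to `fname`, storing its large `index` array as `fname.npy`."""
    index = numpy.asarray(obj.index)
    numpy.save(fname + '.npy', index)   # large array goes into a separate .npy file
    obj.index, saved = None, index      # keep the pickle itself lightweight
    utils.pickle(obj, fname)
    obj.index = saved                   # restore in-memory state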
def load(cls, fname): """ Load a previously saved object from file (also see `save`). """ logger.info("loading %s object from %s" % (cls.__name__, fname)) result = utils.unpickle(fname) result.corpus = np.load(fname + '.npy', mmap_mode='r') # load back as read-only return result #endclass EsaModel
def load_y(docid2path_path): docid2path = unpickle(docid2path_path) y = list() for path in docid2path.values(): if 'ham' in path.split(os.sep): y.append(1) else: y.append(0) return np.asarray(y, dtype=int)
def __init__(self, fname):
    super(IdMmCorpus, self).__init__(fname)
    try:
        dockeys_fname = utils.smart_extension(fname, '.dockeys')
        self.dockeys = utils.unpickle(dockeys_fname)
        self.key_to_index = {k: n for (n, k) in enumerate(self.dockeys)}
        logger.info("loaded dockey index from %s", dockeys_fname)
    except Exception:
        self.dockeys = None
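# A hypothetical lookup using the mapping built above. The corpus path and
# document key are placeholders; the pattern simply combines the `.dockeys`
# list with the O(1) `corpus[docno]` access described elsewhere in this file.
corpus = IdMmCorpus('/tmp/example_corpus.mm')
if corpus.dockeys is not None:
    doc = corpus[corpus.key_to_index['some-document-key']]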
def main():
    parser = argparse.ArgumentParser(
        description='creates an id2author gensim dictionary mapping file which maps internal author ids to author names. pageids can be filtered against a whitelist',
        epilog='Example: ./{} --history-dump=enwiki-pages-meta-history.xml.bz2 --id2author=enwiki-id2author.cpickle --pageids-whitelist=enwiki-pageids.cpickle.bz2'.format(sys.argv[0]))
    parser.add_argument(
        '--history-dump',
        type=argparse.FileType('r'),
        help='path to input WikiMedia *-pages-meta-history file (.xml/.xml.bz2)',
        required=True)
    parser.add_argument(
        '--id2author',
        type=argparse.FileType('w'),
        help='path to output binary id2author dictionary file (.cpickle/.cpickle.bz2)',
        required=True)
    parser.add_argument(
        '--pageids-whitelist',
        type=argparse.FileType('r'),
        help='path to input pageids whitelist (binary pickled frozenset, .cpickle/.cpickle.bz2)')
    args = parser.parse_args()
    input_history_dump_path = args.history_dump.name
    output_id2author_path = args.id2author.name
    input_pageids_whitelist_path = args.pageids_whitelist.name if args.pageids_whitelist else None

    program, logger = init_gensim_logger()
    logger.info('running {} with:\n{}'.format(
        program,
        pformat({
            'input_history_dump_path': input_history_dump_path,
            'output_id2author_path': output_id2author_path,
            'input_pageids_whitelist_path': input_pageids_whitelist_path
        })))

    dump = xml_dump.Iterator.from_file(smart_open(input_history_dump_path))
    if input_pageids_whitelist_path:
        whitelist = unpickle(input_pageids_whitelist_path)
        logger.info('loaded pageids whitelist of {} pages'.format(len(whitelist)))
        author_iter = ((revision.contributor.user_text for revision in page)
                       for page in dump if page.id in whitelist)
    else:
        logger.info('no pageids whitelist given')
        author_iter = ((revision.contributor.user_text for revision in page) for page in dump)
    id2author = Dictionary(author_iter)
    id2author.save(output_id2author_path)
    logger.info('number of processed documents: {}'.format(id2author.num_docs))
    logger.info('number of authors found: {}'.format(len(id2author.token2id)))
def __init__(self, model, topn, alpha, tagger, complex_freq, simple_freq, freq_t, char_ngram):
    logger.info("Instantiating Simple Science Simplifier...")
    self.model = unpickle(model)
    logger.info("Loaded embeddings model from: `{}`".format(model))
    self.topn = topn
    self.alpha = alpha
    self.tagger = GeniaTagger(tagger)
    logger.info("Loaded Genia PoS tagger from: `{}`".format(tagger))
    self.complex_freq = unpickle(complex_freq)
    logger.info("Loaded Complex Word Frequencies from: `{}`".format(complex_freq))
    self.simple_freq = unpickle(simple_freq)
    logger.info("Loaded Simple Word Frequencies from: `{}`".format(simple_freq))
    self.freq_t = freq_t
    self.char_ngram = char_ngram
def load(cls, fname): """ Load a previously saved object from file (also see `save`). """ logger.info("loading %s object from %s and %s" % (cls.__name__, fname, fname + ".index")) result = utils.unpickle(fname) result.similarity_index = MatrixSimilarity.load(fname + ".index") return result
def load(cls, fname): """ Load a previously saved object from file (also see `save`). """ logger.debug("loading %s object from %s and %s.*.npy" % (cls.__name__, fname, fname)) result = utils.unpickle(fname) data = numpy.load(fname + '.data.npy', mmap_mode='r') # load back as read-only indptr = numpy.load(fname + '.indptr.npy', mmap_mode='r') indices = numpy.load(fname + '.indices.npy', mmap_mode='r') result.index.data, result.index.indptr, result.index.indices = data, indptr, indices return result
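# Sketch of the save step implied by the load above, assuming `obj.index` is a
# scipy.sparse.csr_matrix; the three .npy files mirror what the loader reads
# back with mmap_mode='r'. Not code from the original project.
import numpy

def save_csr_components(obj, fname):
    numpy.save(fname + '.data.npy', obj.index.data)
    numpy.save(fname + '.indptr.npy', obj.index.indptr)
    numpy.save(fname + '.indices.npy', obj.index.indices)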
def load(cls, fname):
    """
    Load a previously saved object from file (also see `save`).
    """
    logger.debug("loading %s object from %s", cls.__name__, fname)
    result = utils.unpickle(fname)
    ufname = fname + '.npy'
    try:
        result.projection.u = numpy.load(ufname, mmap_mode='r')  # load back as read-only
    except Exception:
        logger.debug("failed to load mmap'ed projection from %s", ufname)
    return result
def load(cls, fname):
    """
    Load a previously saved object from file (also see `save`).
    """
    logger.debug("loading %s object from %s", cls.__name__, fname)
    result = utils.unpickle(fname)
    ufname = fname + '.npy'
    try:
        result.projection.u = numpy.load(ufname, mmap_mode='r')  # load back as read-only
    except Exception:
        logger.debug("failed to load mmap'ed projection from %s", ufname)
    result.dispatcher = None  # TODO load back incl. distributed state? will require re-initialization of worker state
    return result
def load(cls, fname):
    """
    Load a previously saved object from file (also see `save`).
    """
    logger.info("loading %s object from %s", cls.__name__, fname)
    result = utils.unpickle(fname)
    ufname = fname + '.npy'
    try:
        result.projection.u = numpy.load(ufname, mmap_mode='r')  # load back as read-only
    except Exception:
        logger.info("failed to load mmap'ed projection from %s", ufname)
    result.dispatcher = None  # TODO load back incl. distributed state? will require re-initialization of worker state
    return result
def load(cls, fname, mmap=None):
    """
    Load a previously saved corpus index from file.
    """
    LOGGER.info("Loading %s object from %s", cls.__name__, fname)
    result = utils.unpickle(fname)
    LOGGER.info("Finished unpickling EsaModel")
    path = fname + '.npz'
    sc = np.load(path)
    result.sparse_corpus = sparse.coo_matrix(
        (sc['data'], (sc['row'], sc['col'])), shape=sc['shape'])
    LOGGER.info("Finished loading sparse corpus")
    return result
def load_varembed_format(cls, vectors, morfessor_model=None):
    """
    Load the word vectors into a matrix from the varembed output vector files.
    Using morphemes requires Python 2.7 or above.

    `vectors` is the pickle file containing the word vectors.
    `morfessor_model` is the path to the trained morfessor model; if given,
    morpheme embeddings are added to the output.
    """
    result = cls()
    if vectors is None:
        raise Exception("Please provide vectors binary to load varembed model")
    D = utils.unpickle(vectors)
    word_to_ix = D['word_to_ix']
    morpho_to_ix = D['morpho_to_ix']
    word_embeddings = D['word_embeddings']
    morpho_embeddings = D['morpheme_embeddings']
    result.load_word_embeddings(word_embeddings, word_to_ix)
    if morfessor_model:
        if sys.version_info >= (2, 7):  # Morfessor is only supported for Python 2.7 and above.
            try:
                import morfessor
                morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
                result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix)
            except ImportError:
                # Morfessor package not found.
                logger.error('Could not import morfessor. Not using morpheme embeddings')
                raise ImportError('Could not import morfessor.')
        else:
            # Raise an exception on Python 2.6 or earlier.
            raise Exception('Using morphemes requires Python 2.7 and above. Morfessor is not supported on Python 2.6')
    logger.info('Loaded varembed model vectors from %s', vectors)
    return result
def __init__(self, fname, index_fname=None): """ Parameters ---------- fname : str Path to corpus. index_fname : str, optional Path to index, if not provided - used `fname.index`. """ try: if index_fname is None: index_fname = utils.smart_extension(fname, '.index') self.index = utils.unpickle(index_fname) # change self.index into a numpy.ndarray to support fancy indexing self.index = numpy.asarray(self.index) logger.info("loaded corpus index from %s", index_fname) except Exception: self.index = None self.length = None
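# The same random-access behaviour shown end to end; this mirrors the doctest
# that appears in the other variant of this constructor above. Paths point to
# a temporary, hypothetical location.
import gensim

corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
gensim.corpora.SvmLightCorpus.serialize('/tmp/testfile.svmlight', corpus)  # also writes /tmp/testfile.svmlight.index
corpus_with_random_access = gensim.corpora.SvmLightCorpus('/tmp/testfile.svmlight')
print(corpus_with_random_access[1])  # [(0, 1.0), (1, 2.0)]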
def load_varembed_format(cls, vectors, morfessor_model=None): """Load the word vectors into matrix from the varembed output vector files. Parameters ---------- vectors : dict Pickle file containing the word vectors. morfessor_model : str, optional Path to the trained morfessor model. Returns ------- :class:`~gensim.models.wrappers.varembed.VarEmbed` Ready to use instance. """ result = cls() if vectors is None: raise Exception("Please provide vectors binary to load varembed model") d = utils.unpickle(vectors) word_to_ix = d['word_to_ix'] morpho_to_ix = d['morpho_to_ix'] word_embeddings = d['word_embeddings'] morpho_embeddings = d['morpheme_embeddings'] result.load_word_embeddings(word_embeddings, word_to_ix) if morfessor_model: try: import morfessor morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model) result.add_morphemes_to_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix) except ImportError: # Morfessor Package not found. logger.error('Could not import morfessor. Not using morpheme embeddings') raise ImportError('Could not import morfessor.') logger.info('Loaded varembed model vectors from %s', vectors) return result
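# Hypothetical usage of the loader above; the pickle path and the optional
# morfessor model path are placeholders. The class lives in
# gensim.models.wrappers.varembed, as referenced in the docstring above.
from gensim.models.wrappers.varembed import VarEmbed

model = VarEmbed.load_varembed_format(vectors='varembed_vectors.pkl')
# with morpheme embeddings folded in (requires the `morfessor` package):
# model = VarEmbed.load_varembed_format(vectors='varembed_vectors.pkl',
#                                       morfessor_model='morfessor.bin')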
logging.root.setLevel(level=logging.INFO) # check and process input arguments if len(sys.argv) < 2: print(inspect.cleandoc(__doc__) % locals()) sys.exit(1) model_prefix = sys.argv[1] logger.info("running %s" % ' '.join(sys.argv)) logger.info("Loading word dictionary...") dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2') logger.debug(dictionary) logger.info("Loading document name map...") article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle') logger.info("Loading tf-idf model...") tfidf = TfidfModel.load(model_prefix + '.tfidf_model') logger.info("Loading similarity index...") similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r') similarity_index.use_reverse_index = True logger.info("Finished loading model files.") mismatches = 0 for doc_idx in range(0, len(similarity_index)): logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx])) rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64) fwd_doc = similarity_index.vector_by_id(doc_idx)
def __init__(self, model): self.model = unpickle(model)
logging.root.setLevel(level=logging.INFO) # check and process input arguments if len(sys.argv) < 3: print(inspect.cleandoc(__doc__) % locals()) sys.exit(1) input_file, output_prefix = sys.argv[1:3] logger.info("running %s" % ' '.join(sys.argv)) logger.info("Loading word dictionary...") dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2') logger.debug(dictionary) logger.info("Loading document name map...") article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle') logger.info("Loading tf-idf model...") tfidf = TfidfModel.load(output_prefix + '.tfidf_model') logger.info("Loading similarity index...") similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r') similarity_index.use_reverse_index = True similarity_index.preload_reverse_index() logger.info("Finished loading model files.") logger.info("Processing input documents...") try: infile = open(input_file, 'r')
# decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size). logger.info("dictionary found, loading") with open(outf("pruned_vocab.csv")) as csvfile: reader = csv.reader(csvfile) word2id = dict((rows[0],rows[1]) for rows in reader) utils.pickle(word2id, outf('word2id')) id2word = gensim.utils.revdict(word2id) # filter sentences to contain only the dictionary words corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences()) if 'word2vec' in program: if os.path.exists(outf('w2v')): logger.info("word2vec model found, loading") model = utils.unpickle(outf('w2v')) else: logger.info("word2vec model not found, creating") if NEGATIVE: model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS, hs=0, negative=NEGATIVE) else: model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS) model.build_vocab(corpus()) model.train(corpus()) # train with 1 epoch model.init_sims(replace=True) model.word2id = dict((w, v.index) for w, v in model.vocab.iteritems()) model.id2word = utils.revdict(model.word2id) model.word_vectors = model.syn0norm utils.pickle(model, outf('w2v')) logger.info("evaluating accuracy")
sys.exit() logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) logger = logging.getLogger(program) logger.info("running %s" % " ".join(sys.argv)) outf = lambda prefix: os.path.join(output_dir, prefix) logger.info("output file template will be %s" % outf('PREFIX')) sentences = MyCorpus(corpus_path) if os.path.exists(outf('word2id')): logger.info("dictionary found, loading") word2id = utils.unpickle(outf('word2id')) else: logger.info("dictionary not found, creating") id2word = corpora.Dictionary(sentences, prune_at=10000000) id2word.filter_extremes( keep_n=TOKEN_LIMIT) # filter out too freq/infreq words word2id = dict((v, k) for k, v in id2word.iteritems()) utils.pickle(word2id, outf('word2id')) id2word = utils.revdict(word2id) # Filter all wiki documents to contain only those words. corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences) if os.path.exists(outf('kw2v_%s' % GAMMA)): logger.info("Kernel word2vec model found, loading")
if len(sys.argv) < 4: print(globals()['__doc__'] % locals()) sys.exit(1) in_file = gensim.models.word2vec.LineSentence(sys.argv[1]) # in_file = gensim.models.word2vec.Text8Corpus(sys.argv[1]) q_file = sys.argv[2] outf = lambda prefix: os.path.join(sys.argv[3], prefix) logger.info("output file template will be %s" % outf('PREFIX')) sentences = lambda: itertools.islice(in_file, DOC_LIMIT) # use only a small subset of all words; otherwise the methods based on matrix # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size). if os.path.exists(outf('word2id')): logger.info("dictionary found, loading") word2id = utils.unpickle(outf('word2id')) else: logger.info("dictionary not found, creating") id2word = gensim.corpora.Dictionary(sentences(), prune_at=10000000) id2word.filter_extremes(keep_n=TOKEN_LIMIT) # filter out too freq/infreq words word2id = dict((v, k) for k, v in id2word.iteritems()) utils.pickle(word2id, outf('word2id')) id2word = gensim.utils.revdict(word2id) # filter sentences to contain only the dictionary words corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences()) if 'word2vec' in program: if os.path.exists(outf('w2v')): logger.info("word2vec model found, loading") model = utils.unpickle(outf('w2v'))
def __init__(self, filename): self.corpus = MmCorpus(filename) self.metadata = unpickle(filename + ".metadata.cpickle")
from gensim import utils import time import sqlite3 as sql from wiki_to_esa_db.sql_statements import * import pathlib as pl del_articles_strings = ["list of", "liste der", "liste von"] TF_IDF_THRESHOLD = 50 dp = pl.Path("data_de_snowball_stemmed") tfidf_mat_path = dp / "corpus_tfidf.mm" tfidf_corpus = MmCorpus(str(tfidf_mat_path)) id_to_titles = utils.unpickle(str(dp / "bow.mm.metadata.cpickle")) titles_to_id = utils.unpickle(str(dp / "titles_to_id.pickle")) dictionary = Dictionary.load_from_text(str(dp / "dictionary.txt.bz2")) db_path = dp / "esa.db" if db_path.exists(): db_path.unlink() conn = sql.connect(str(db_path)) with conn: cursor = conn.cursor() cursor.execute(term_table) cursor.execute(article_table) cursor.execute(term_article_table) cursor.execute(term_index) cursor.execute(term_article_index)
def createSearchObjs():
    """
    Creates the SimSearch and KeySearch objects using the data structures
    created in `make_wikicorpus.py`.
    Returns (simsearch, ksearch, titles_to_id)
    """
    # Load the article titles. These have the format (pageid, article title)
    fprint('Loading Wikipedia article titles...')
    t0 = time.time()

    id_to_titles = utils.unpickle('./data/bow.mm.metadata.cpickle')
    titles_to_id = utils.unpickle('./data/titles_to_id.pickle')

    # id_to_titles is actually a map of indices to (pageid, article title).
    # The 'pageid' property is unused.
    # Convert id_to_titles into a simple list of titles.
    titles = [item[1][1] for item in id_to_titles.items()]

    fprint(' Took %.2f seconds' % (time.time() - t0))

    # Load the dictionary (830ms on my machine)
    fprint('\nLoading dictionary...')
    t0 = time.time()
    dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # Load tf-idf model (60ms on my machine).
    fprint('\nLoading tf-idf model...')
    t0 = time.time()
    tfidf_model = TfidfModel.load('./data/tfidf.tfidf_model')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # We must not use `load`--that would attempt to load the corpus into
    # memory, and it's 16.7 GB!!
    # corpus_tfidf = MmCorpus.load('./data/corpus_tfidf.mm')
    fprint('\nCreating tf-idf corpus object (leaves the vectors on disk)...')
    t0 = time.time()
    corpus_tfidf = MmCorpus('./data/corpus_tfidf.mm')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # Create the KeySearch and SimSearch objects.
    ksearch = KeySearch(dictionary, tfidf_model, corpus_tfidf, titles)
    simsearch = SimSearch(ksearch)

    # TODO - SimSearch doesn't currently have a clean way to provide the index
    # and model.
    fprint('\nLoading LSI model...')
    t0 = time.time()
    simsearch.lsi = LsiModel.load('./data/lsi.lsi_model')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # Load the Wikipedia LSI vectors into memory.
    # The matrix is 4.69GB for me, and takes ~15 seconds on my machine to load.
    fprint('\nLoading Wikipedia LSI index...')
    t0 = time.time()
    simsearch.index = MatrixSimilarity.load('./data/lsi_index.mm')
    fprint(' Took %.2f seconds' % (time.time() - t0))

    # TODO - It would be interesting to try the 'Similarity' class which
    # shards the dataset on disk for you...

    return (simsearch, ksearch, titles_to_id)
dictionary = gensim.corpora.dictionary.Dictionary.from_corpus( corpus, id2word=id2word) logger.info("calculating truncated SVD") lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=DIM) self.singular_scaled = lsi.projection.s**s_exponent # embeddings = left singular vectors scaled by the (exponentiated) singular values self.word_vectors = lsi.projection.u * self.singular_scaled if __name__ == "__main__": logging.basicConfig( format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) from svd import SvdModel word2id = utils.unpickle('./tmp/word2id') id2word = gensim.utils.revdict(word2id) logger.info("SVD model creating") corpus = gensim.corpora.MmCorpus('./tmp/pmi_matrix.mm') model = SvdModel(corpus, id2word, s_exponent=0.0) model.word2id = word2id model.id2word = id2word utils.pickle(model, './model/svd.model') logger.info("finished running svd")
def doc_with_meta(self, fname): self.doc_metadata = utils.unpickle(fname + '.metadata.cpickle')
def recommend():
    """
    When a POST request with JSON data is made to this URI, read the example
    from the JSON, compute the recommendations and send them back in the response.
    """
    # Get the value for our example that came with the request
    data = request.json

    title = data["example"].strip('\"')

    w2vTITLE = utils.unpickle(modelpath + "w2vTitle_s410_minC40pcent_window7.model")
    # w2vTITLE = utils.unpickle(modelpath + "w2vTitle_s400_minC60pcent_window7.model")
    DF2 = pd.read_pickle(modelpath + 'BBCgoodfood_TokensNLemms4word2vec.pkl')

    outlist = [[i, round(v * 1000) / 1000]
               for i, v in w2vTITLE.most_similar(positive=[title], topn=200)
               if i not in [n for m in DF2.ingredLems for n in m]
               and i not in ['BBC Children in Need cupcakes']
               and v > 0.76]

    searchedTitle = [title]
    RECrecipes = outlist[:5]

    # Put the result in a nice dict so we can send it as json
    results = {"searchedTitle": searchedTitle, "RECrecipes": RECrecipes}
    return jsonify(results)
sys.stdout.flush()

t0 = time.time()

# Generate bag-of-words vectors (term-document frequency matrix) and
# write these directly to disk.
# On my machine, this took 3.53 hrs.
# By setting metadata = True, this will also record all of the article
# titles into a separate pickle file, 'bow.mm.metadata.cpickle'
MmCorpus.serialize('./data/bow.mm', wiki, metadata=True, progress_cnt=10000)

print(' Conversion to bag-of-words took %s' % formatTime(time.time() - t0))
sys.stdout.flush()

# Load the article titles back
id_to_titles = utils.unpickle('./data/bow.mm.metadata.cpickle')

# Create the reverse mapping, from article title to index.
titles_to_id = {}

# For each article...
for at in id_to_titles.items():
    # `at` is (index, (pageid, article_title)) e.g., (0, ('12', 'Anarchism'))
    # at[1][1] is the article title.
    # The pageid property is unused.
    titles_to_id[at[1][1]] = at[0]

# Store the resulting map.
utils.pickle(titles_to_id, './data/titles_to_id.pickle')

# We're done with the article titles so free up their memory.
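# The reverse-mapping loop above can also be written as a single dict
# comprehension; same result, shown here only as an equivalent form.
titles_to_id = {title: idx for idx, (pageid, title) in id_to_titles.items()}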
if len(sys.argv) < 4: print(globals()['__doc__'] % locals()) sys.exit(1) in_file = gensim.models.word2vec.LineSentence(sys.argv[1]) # in_file = gensim.models.word2vec.Text8Corpus(sys.argv[1]) q_file = sys.argv[2] outf = lambda prefix: os.path.join(sys.argv[3], prefix) logger.info("output file template will be %s" % outf('PREFIX')) sentences = lambda: itertools.islice(in_file, DOC_LIMIT) # use only a small subset of all words; otherwise the methods based on matrix # decomposition (glove, ppmi) take too much RAM (quadratic in vocabulary size). if os.path.exists(outf('word2id')): logger.info("dictionary found, loading") word2id = utils.unpickle(outf('word2id')) else: logger.info("dictionary not found, creating") id2word = gensim.corpora.Dictionary(sentences(), prune_at=10000000) id2word.filter_extremes( keep_n=TOKEN_LIMIT) # filter out too freq/infreq words word2id = dict((v, k) for k, v in id2word.iteritems()) utils.pickle(word2id, outf('word2id')) id2word = gensim.utils.revdict(word2id) # filter sentences to contain only the dictionary words corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences()) if 'word2vec' in program: if os.path.exists(outf('w2v')):
import gensim import os from gensim import corpora from gensim import utils class DtmCorpus(corpora.textcorpus.TextCorpus): def get_texts(self): return self.input def __len__(self): return len(self.input) corpus, time_seq = utils.unpickle('gensim/test/test_data/dtm_test') dtm_home = os.environ.get('DTM_HOME', "C:/Users/Artyom/SkyDrive/TopicModels/dtm-master/") dtm_path = os.path.join(dtm_home, 'bin', 'dtm') if dtm_home else None model = gensim.models.DtmModel(dtm_path, corpus, time_seq, num_topics=2, id2word=corpus.dictionary) topics = model.show_topics(topics=2, times=2, topn=10)