def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. `fname_vocab` is the file with vocabulary; if not specified, it defaults to `fname.vocab`. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) if fname_vocab is None: fname_base, _ = path.splitext(fname) fname_dir = path.dirname(fname) for fname_vocab in [ fname + '.vocab', fname + '/vocab.txt', fname_base + '.vocab', fname_dir + '/vocab.txt', ]: if path.exists(fname_vocab): break else: raise IOError('BleiCorpus: could not find vocabulary file') self.fname = fname with utils.smart_open(fname_vocab) as fin: words = [utils.to_unicode(word).rstrip() for word in fin] self.id2word = dict(enumerate(words)) self.length = None
def __init__(self, fname, fname_vocab=None): """ Parameters ---------- fname : str Path to corpus in UCI format. fname_vocab : bool, optional Path to vocab. Examples -------- .. sourcecode:: pycon >>> from gensim.corpora import UciCorpus >>> from gensim.test.utils import datapath >>> >>> corpus = UciCorpus(datapath('testcorpus.uci')) >>> for document in corpus: ... pass """ IndexedCorpus.__init__(self, fname) UciReader.__init__(self, fname) if fname_vocab is None: fname_vocab = utils.smart_extension(fname, '.vocab') self.fname = fname with utils.open(fname_vocab, 'rb') as fin: words = [word.strip() for word in fin] self.id2word = dict(enumerate(words)) self.transposed = True
def __init__(self, fname, fname_vocab=None): """ Parameters ---------- fname : str Path to corpus in UCI format. fname_vocab : bool, optional Path to vocab. Examples -------- >>> from gensim.corpora import UciCorpus >>> from gensim.test.utils import datapath >>> >>> corpus = UciCorpus(datapath('testcorpus.uci')) >>> for document in corpus: ... pass """ IndexedCorpus.__init__(self, fname) UciReader.__init__(self, fname) if fname_vocab is None: fname_vocab = utils.smart_extension(fname, '.vocab') self.fname = fname with utils.smart_open(fname_vocab) as fin: words = [word.strip() for word in fin] self.id2word = dict(enumerate(words)) self.transposed = True
def __init__(self, fname, fname_vocab=None): """ Initialize the corpus from a file. `fname_vocab` is the file with vocabulary; if not specified, it defaults to `fname.vocab`. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) if fname_vocab is None: fname_base, _ = path.splitext(fname) fname_dir = path.dirname(fname) for fname_vocab in [ fname + '.vocab', fname + '/vocab.txt', fname_base + '.vocab', fname_dir + '/vocab.txt', ]: if path.exists(fname_vocab): break else: raise IOError('BleiCorpus: could not find vocabulary file') self.fname = fname with utils.smart_open(fname_vocab) as fin: words = [utils.to_unicode(word).rstrip() for word in fin] self.id2word = dict(enumerate(words))
def __init__(self, fname: str): """ Initialize the corpus from an existing file. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) self.fname = fname self.length = None
def __init__(self, fname): """ Initialize the corpus from a file. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) self.fname = fname # input file, see class doc for format self.length = None
def __init__(self, fname, fname_vocab=None):
    IndexedCorpus.__init__(self, fname)
    UciReader.__init__(self, fname)

    if fname_vocab is None:
        fname_vocab = fname + '.vocab'

    self.fname = fname
    words = [word.strip() for word in open(fname_vocab)]
    self.id2word = dict(enumerate(words))
    self.transposed = True

def __init__(self, fname, fname_vocab=None):
    IndexedCorpus.__init__(self, fname)
    UciReader.__init__(self, fname)

    if fname_vocab is None:
        fname_vocab = utils.smart_extension(fname, '.vocab')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [word.strip() for word in fin]
    self.id2word = dict(enumerate(words))
    self.transposed = True

def __init__(self, fname): """ Parameters ---------- fname : {str, file-like object} Path to file in MM format or a file-like object that supports `seek()` (e.g. a compressed file opened by `smart_open <https://github.com/RaRe-Technologies/smart_open>`_). """ # avoid calling super(), too confusing IndexedCorpus.__init__(self, fname) matutils.MmReader.__init__(self, fname)
def __init__(self, fname): """ Parameters ---------- fname : {str, file-like object} Path to file in MM format or a file-like object that supports `seek()` (e.g. :class:`gzip.GzipFile`, :class:`bz2.BZ2File`). """ # avoid calling super(), too confusing IndexedCorpus.__init__(self, fname) matutils.MmReader.__init__(self, fname)
def __init__(self, fname, fnameVocab=None): """ Initialize the corpus from a file. `fnameVocab` is the file with vocabulary; if not specified, it defaults to `fname.vocab`. """ IndexedCorpus.__init__(self, fname) logging.info("loading corpus from %s" % fname) if fnameVocab is None: fnameVocab = fname + '.vocab' self.fname = fname words = [word.rstrip() for word in open(fnameVocab)] self.id2word = dict(enumerate(words)) self.length = None
def __init__(self, fname, store_labels=True): """ Initialize the corpus from a file. Although vector labels (~SVM target class) are not used in gensim in any way, they are parsed and stored in `self.labels` for convenience. Set `store_labels=False` to skip storing these labels (e.g. if there are too many vectors to store the self.labels array in memory). """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) self.fname = fname # input file, see class doc for format self.length = None self.store_labels = store_labels self.labels = []
def __init__(self, fname, id2word=None, line2words=splitOnSpace): """ Initialize the corpus from a file. `id2word` and `line2words` are optional parameters. If provided, `id2word` is a dictionary mapping between wordIds (integers) and words (strings). If not provided, the mapping is constructed from the documents. `line2words` is a function which converts lines into tokens. Defaults to simple splitting on spaces. """ IndexedCorpus.__init__(self, fname) logging.info("loading corpus from %s" % fname) self.fname = fname # input file, see class doc for format self.line2words = line2words # how to translate lines into words (simply split on space by default) self.numDocs = int( open(fname).readline() ) # the first line in input data is the number of documents (integer). throws exception on bad input. if not id2word: # build a list of all word types in the corpus (distinct words) logging.info("extracting vocabulary from the corpus") allTerms = set() self.useWordIds = False # return documents as (word, wordCount) 2-tuples for doc in self: allTerms.update(word for word, wordCnt in doc) allTerms = sorted( allTerms ) # sort the list of all words; rank in that list = word's integer id self.id2word = dict(zip( xrange(len(allTerms)), allTerms)) # build a mapping of word id(int) -> word (string) else: logging.info("using provided word mapping (%i ids)" % len(id2word)) self.id2word = id2word self.word2id = dict((v, k) for k, v in self.id2word.iteritems()) self.numTerms = len(self.word2id) self.useWordIds = True # return documents as (wordIndex, wordCount) 2-tuples logging.info("loaded corpus with %i documents and %i terms from %s" % (self.numDocs, self.numTerms, fname))
def __init__(self, fname, store_labels=True): """ Parameters ---------- fname: str Path to corpus. store_labels : bool, optional Whether to store labels (~SVM target class). They currently have no application but stored in `self.labels` for convenience by default. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) self.fname = fname # input file, see class doc for format self.length = None self.store_labels = store_labels self.labels = []
def __init__(self, fname, fname_vocab=None): """ Parameters ---------- fname : str Path to corpus. fname_vocab : str, optional Vocabulary file. If `fname_vocab` is None, searching one of variants: * `fname`.vocab * `fname`/vocab.txt * `fname_without_ext`.vocab * `fname_folder`/vocab.txt Raises ------ IOError If vocabulary file doesn't exist. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) if fname_vocab is None: fname_base, _ = path.splitext(fname) fname_dir = path.dirname(fname) for fname_vocab in [ utils.smart_extension(fname, '.vocab'), utils.smart_extension(fname, '/vocab.txt'), utils.smart_extension(fname_base, '.vocab'), utils.smart_extension(fname_dir, '/vocab.txt'), ]: if path.exists(fname_vocab): break else: raise IOError('BleiCorpus: could not find vocabulary file') self.fname = fname with utils.smart_open(fname_vocab) as fin: words = [utils.to_unicode(word).rstrip() for word in fin] self.id2word = dict(enumerate(words))
def __init__(self, fname, fname_vocab=None): """ Parameters ---------- fname : str Path to corpus. fname_vocab : str, optional Vocabulary file. If `fname_vocab` is None, searching one of variants: * `fname`.vocab * `fname`/vocab.txt * `fname_without_ext`.vocab * `fname_folder`/vocab.txt Raises ------ IOError If vocabulary file doesn't exist. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) if fname_vocab is None: fname_base, _ = path.splitext(fname) fname_dir = path.dirname(fname) for fname_vocab in [ utils.smart_extension(fname, '.vocab'), utils.smart_extension(fname, '/vocab.txt'), utils.smart_extension(fname_base, '.vocab'), utils.smart_extension(fname_dir, '/vocab.txt'), ]: if path.exists(fname_vocab): break else: raise IOError('BleiCorpus: could not find vocabulary file') self.fname = fname with utils.open(fname_vocab, 'rb') as fin: words = [utils.to_unicode(word).rstrip() for word in fin] self.id2word = dict(enumerate(words))
def __init__(self, fname, id2word=None, line2words=split_on_space): """ Parameters ---------- fname : str Path to file in GibbsLda++ format. id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional Mapping between word_ids (integers) and words (strings). If not provided, the mapping is constructed directly from `fname`. line2words : callable, optional Function which converts lines(str) into tokens(list of str), using :func:`~gensim.corpora.lowcorpus.split_on_space` as default. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) self.fname = fname # input file, see class doc for format self.line2words = line2words # how to translate lines into words (simply split on space by default) self.num_docs = self._calculate_num_docs() if not id2word: # build a list of all word types in the corpus (distinct words) logger.info("extracting vocabulary from the corpus") all_terms = set() self.use_wordids = False # return documents as (word, wordCount) 2-tuples for doc in self: all_terms.update(word for word, wordCnt in doc) all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id # build a mapping of word id(int) -> word (string) self.id2word = dict(zip(range(len(all_terms)), all_terms)) else: logger.info("using provided word mapping (%i ids)", len(id2word)) self.id2word = id2word self.num_terms = len(self.word2id) self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples logger.info( "loaded corpus with %i documents and %i terms from %s", self.num_docs, self.num_terms, fname )
def __init__(self, fname, id2word=None, line2words=splitOnSpace): """ Initialize the corpus from a file. `id2word` and `line2words` are optional parameters. If provided, `id2word` is a dictionary mapping between wordIds (integers) and words (strings). If not provided, the mapping is constructed from the documents. `line2words` is a function which converts lines into tokens. Defaults to simple splitting on spaces. """ IndexedCorpus.__init__(self, fname) logging.info("loading corpus from %s" % fname) self.fname = fname # input file, see class doc for format self.line2words = line2words # how to translate lines into words (simply split on space by default) self.numDocs = int( open(fname).readline() ) # the first line in input data is the number of documents (integer). throws exception on bad input. if not id2word: # build a list of all word types in the corpus (distinct words) logging.info("extracting vocabulary from the corpus") allTerms = set() self.useWordIds = False # return documents as (word, wordCount) 2-tuples for doc in self: allTerms.update(word for word, wordCnt in doc) allTerms = sorted(allTerms) # sort the list of all words; rank in that list = word's integer id self.id2word = dict( zip(xrange(len(allTerms)), allTerms) ) # build a mapping of word id(int) -> word (string) else: logging.info("using provided word mapping (%i ids)" % len(id2word)) self.id2word = id2word self.word2id = dict((v, k) for k, v in self.id2word.iteritems()) self.numTerms = len(self.word2id) self.useWordIds = True # return documents as (wordIndex, wordCount) 2-tuples logging.info("loaded corpus with %i documents and %i terms from %s" % (self.numDocs, self.numTerms, fname))
def __init__(self, fname, id2word=None, line2words=split_on_space): """ Initialize the corpus from a file. `id2word` and `line2words` are optional parameters. If provided, `id2word` is a dictionary mapping between word_ids (integers) and words (strings). If not provided, the mapping is constructed from the documents. `line2words` is a function which converts lines into tokens. Defaults to simple splitting on spaces. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) self.fname = fname # input file, see class doc for format self.line2words = line2words # how to translate lines into words (simply split on space by default) self.num_docs = self._calculate_num_docs() if not id2word: # build a list of all word types in the corpus (distinct words) logger.info("extracting vocabulary from the corpus") all_terms = set() self.use_wordids = False # return documents as (word, wordCount) 2-tuples for doc in self: all_terms.update(word for word, wordCnt in doc) all_terms = sorted( all_terms ) # sort the list of all words; rank in that list = word's integer id self.id2word = dict(izip( xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string) else: logger.info("using provided word mapping (%i ids)", len(id2word)) self.id2word = id2word self.num_terms = len(self.word2id) self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples logger.info("loaded corpus with %i documents and %i terms from %s", self.num_docs, self.num_terms, fname)
def __init__(self, fname, id2word=None, line2words=split_on_space): """ Initialize the corpus from a file. `id2word` and `line2words` are optional parameters. If provided, `id2word` is a dictionary mapping between word_ids (integers) and words (strings). If not provided, the mapping is constructed from the documents. `line2words` is a function which converts lines into tokens. Defaults to simple splitting on spaces. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) self.fname = fname # input file, see class doc for format self.line2words = line2words # how to translate lines into words (simply split on space by default) self.num_docs = self._calculate_num_docs() if not id2word: # build a list of all word types in the corpus (distinct words) logger.info("extracting vocabulary from the corpus") all_terms = set() self.use_wordids = False # return documents as (word, wordCount) 2-tuples for doc in self: all_terms.update(word for word, wordCnt in doc) all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id # build a mapping of word id(int) -> word (string) self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) else: logger.info("using provided word mapping (%i ids)", len(id2word)) self.id2word = id2word self.num_terms = len(self.word2id) self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples logger.info( "loaded corpus with %i documents and %i terms from %s", self.num_docs, self.num_terms, fname )
def __init__(self, fname):
    # avoid calling super(), too confusing
    IndexedCorpus.__init__(self, fname)
    matutils.MmReader.__init__(self, fname)