Пример #1
0
    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname` is the path to the corpus file. `fname_vocab` is the file with
        the vocabulary, one word per line; if not specified, the first existing
        path among the following candidates is used:

        * `fname` + '.vocab'
        * 'vocab.txt' inside `fname`
        * `fname` with its extension replaced by '.vocab'
        * 'vocab.txt' in the directory part of `fname`

        Raises `IOError` if `fname_vocab` is not given and none of the
        candidates exists.
        """
        IndexedCorpus.__init__(self, fname)
        # hand `fname` over as a logging argument, so the message is only
        # formatted when the record is actually emitted
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            # path.join() keeps the last candidate relative when `fname` has no
            # directory part; plain string concatenation would produce
            # '/vocab.txt', i.e. a file in the filesystem root
            for fname_vocab in [
                    fname + '.vocab',
                    path.join(fname, 'vocab.txt'),
                    fname_base + '.vocab',
                    path.join(fname_dir, 'vocab.txt'),
            ]:
                if path.exists(fname_vocab):
                    break
            else:
                # the loop ran to completion without finding an existing file
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        # line number in the vocabulary file (0-based) -> word
        self.id2word = dict(enumerate(words))
        self.length = None
Пример #2
0
    def __init__(self, fname, fname_vocab=None):
        """
        Parameters
        ----------
        fname : str
            Path to corpus in UCI format.
        fname_vocab : str, optional
            Path to the vocabulary file. If omitted, the path is obtained from
            `utils.smart_extension(fname, '.vocab')`.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import UciCorpus
            >>> from gensim.test.utils import datapath
            >>>
            >>> corpus = UciCorpus(datapath('testcorpus.uci'))
            >>> for document in corpus:
            ...     pass

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        vocab_path = utils.smart_extension(fname, '.vocab') if fname_vocab is None else fname_vocab

        self.fname = fname
        # the vocabulary file is opened in 'rb' mode; every line, stripped of
        # surrounding whitespace, becomes the entry for its 0-based line number
        vocabulary = {}
        with utils.open(vocab_path, 'rb') as vocab_file:
            for word_id, line in enumerate(vocab_file):
                vocabulary[word_id] = line.strip()
        self.id2word = vocabulary

        self.transposed = True
Пример #3
0
    def __init__(self, fname, fname_vocab=None):
        """
        Parameters
        ----------
        fname : str
            Path to corpus in UCI format.
        fname_vocab : str, optional
            Path to the vocabulary file. If omitted, the path is obtained from
            `utils.smart_extension(fname, '.vocab')`.

        Examples
        --------
        >>> from gensim.corpora import UciCorpus
        >>> from gensim.test.utils import datapath
        >>>
        >>> corpus = UciCorpus(datapath('testcorpus.uci'))
        >>> for document in corpus:
        ...     pass

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        vocab_path = utils.smart_extension(fname, '.vocab') if fname_vocab is None else fname_vocab

        self.fname = fname
        # every line of the vocabulary file, stripped of surrounding whitespace,
        # becomes the entry for its 0-based line number
        vocabulary = {}
        with utils.smart_open(vocab_path) as vocab_file:
            for word_id, line in enumerate(vocab_file):
                vocabulary[word_id] = line.strip()
        self.id2word = vocabulary

        self.transposed = True
Пример #4
0
    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname` is the path to the corpus file. `fname_vocab` is the file with
        the vocabulary, one word per line; if not specified, the first existing
        path among the following candidates is used:

        * `fname` + '.vocab'
        * 'vocab.txt' inside `fname`
        * `fname` with its extension replaced by '.vocab'
        * 'vocab.txt' in the directory part of `fname`

        Raises `IOError` if `fname_vocab` is not given and none of the
        candidates exists.
        """
        IndexedCorpus.__init__(self, fname)
        # hand `fname` over as a logging argument, so the message is only
        # formatted when the record is actually emitted
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            # path.join() keeps the last candidate relative when `fname` has no
            # directory part; plain string concatenation would produce
            # '/vocab.txt', i.e. a file in the filesystem root
            for fname_vocab in [
                    fname + '.vocab',
                    path.join(fname, 'vocab.txt'),
                    fname_base + '.vocab',
                    path.join(fname_dir, 'vocab.txt'),
            ]:
                if path.exists(fname_vocab):
                    break
            else:
                # the loop ran to completion without finding an existing file
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        # line number in the vocabulary file (0-based) -> word
        self.id2word = dict(enumerate(words))
Пример #5
0
 def __init__(self, fname: str):
     """
     Initialize the corpus from an existing file.

     `fname` is forwarded to `IndexedCorpus.__init__` and remembered as
     `self.fname`; `self.length` starts out as None.
     """
     IndexedCorpus.__init__(self, fname)
     # pass `fname` as a logging argument so interpolation only happens when
     # the record is actually emitted
     logger.info("loading corpus from %s", fname)
     self.fname = fname
     self.length = None
Пример #6
0
    def __init__(self, fname):
        """
        Initialize the corpus from a file.

        `fname` is forwarded to `IndexedCorpus.__init__` and remembered as
        `self.fname`; `self.length` starts out as None.
        """
        IndexedCorpus.__init__(self, fname)
        # pass `fname` as a logging argument so interpolation only happens when
        # the record is actually emitted
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.length = None
Пример #7
0
    def __init__(self, fname):
        """
        Initialize the corpus from a file.

        `fname` is forwarded to `IndexedCorpus.__init__` and remembered as
        `self.fname`; `self.length` starts out as None.
        """
        IndexedCorpus.__init__(self, fname)
        # pass `fname` as a logging argument so interpolation only happens when
        # the record is actually emitted
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.length = None
Пример #8
0
    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname` is the corpus file; it is handed to both `IndexedCorpus` and
        `UciReader`. `fname_vocab` is the vocabulary file, one word per line;
        if not specified, it defaults to `fname` + '.vocab'.
        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = fname + '.vocab'

        self.fname = fname
        # read the vocabulary inside a `with` block so that the file handle is
        # closed right away instead of being left to the garbage collector
        with open(fname_vocab) as fin:
            words = [word.strip() for word in fin]
        # line number in the vocabulary file (0-based) -> word
        self.id2word = dict(enumerate(words))

        self.transposed = True
Пример #9
0
    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname` is the corpus file; it is handed to both `IndexedCorpus` and
        `UciReader`. `fname_vocab` is the vocabulary file, one word per line;
        if not specified, it defaults to `fname` + '.vocab'.
        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = fname + '.vocab'

        self.fname = fname
        # read the vocabulary inside a `with` block so that the file handle is
        # closed right away instead of being left to the garbage collector
        with open(fname_vocab) as fin:
            words = [word.strip() for word in fin]
        # line number in the vocabulary file (0-based) -> word
        self.id2word = dict(enumerate(words))

        self.transposed = True
Пример #10
0
    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname` is handed to both `IndexedCorpus` and `UciReader`.
        `fname_vocab` is the vocabulary file; if not specified, its path is
        obtained from `utils.smart_extension(fname, '.vocab')`.
        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        vocab_path = utils.smart_extension(fname, '.vocab') if fname_vocab is None else fname_vocab

        self.fname = fname
        # every line of the vocabulary file, stripped of surrounding whitespace,
        # becomes the entry for its 0-based line number
        vocabulary = {}
        with utils.smart_open(vocab_path) as vocab_file:
            for word_id, line in enumerate(vocab_file):
                vocabulary[word_id] = line.strip()
        self.id2word = vocabulary

        self.transposed = True
Пример #11
0
    def __init__(self, fname):
        """
        Set up the corpus by initializing both of its base classes with `fname`.

        Parameters
        ----------
        fname : {str, file-like object}
            Path to file in MM format or a file-like object that supports `seek()`
            (e.g. a compressed file opened by `smart_open <https://github.com/RaRe-Technologies/smart_open>`_).

        """
        # the base initializers are called explicitly, in this order, rather
        # than through super() (considered too confusing here)
        for base in (IndexedCorpus, matutils.MmReader):
            base.__init__(self, fname)
Пример #12
0
    def __init__(self, fname):
        """
        Set up the corpus by initializing both of its base classes with `fname`.

        Parameters
        ----------
        fname : {str, file-like object}
            Path to file in MM format or a file-like object that supports `seek()`
            (e.g. a compressed file opened by `smart_open <https://github.com/RaRe-Technologies/smart_open>`_).

        """
        # the base initializers are called explicitly, in this order, rather
        # than through super() (considered too confusing here)
        for base in (IndexedCorpus, matutils.MmReader):
            base.__init__(self, fname)
Пример #13
0
    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname` is handed to both `IndexedCorpus` and `UciReader`.
        `fname_vocab` is the vocabulary file; if not specified, its path is
        obtained from `utils.smart_extension(fname, '.vocab')`.
        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        vocab_path = utils.smart_extension(fname, '.vocab') if fname_vocab is None else fname_vocab

        self.fname = fname
        # every line of the vocabulary file, stripped of surrounding whitespace,
        # becomes the entry for its 0-based line number
        vocabulary = {}
        with utils.smart_open(vocab_path) as vocab_file:
            for word_id, line in enumerate(vocab_file):
                vocabulary[word_id] = line.strip()
        self.id2word = vocabulary

        self.transposed = True
Пример #14
0
    def __init__(self, fname):
        """
        Set up the corpus by initializing both of its base classes with `fname`.

        Parameters
        ----------
        fname : {str, file-like object}
            Path to file in MM format or a file-like object that supports `seek()`
            (e.g. :class:`gzip.GzipFile`, :class:`bz2.BZ2File`).

        """
        # the base initializers are called explicitly, in this order, rather
        # than through super() (considered too confusing here)
        for base in (IndexedCorpus, matutils.MmReader):
            base.__init__(self, fname)
Пример #15
0
    def __init__(self, fname):
        """
        Set up the corpus by initializing both of its base classes with `fname`.

        Parameters
        ----------
        fname : {str, file-like object}
            Path to file in MM format or a file-like object that supports `seek()`
            (e.g. :class:`gzip.GzipFile`, :class:`bz2.BZ2File`).

        """
        # the base initializers are called explicitly, in this order, rather
        # than through super() (considered too confusing here)
        for base in (IndexedCorpus, matutils.MmReader):
            base.__init__(self, fname)
Пример #16
0
    def __init__(self, fname, fnameVocab=None):
        """
        Initialize the corpus from a file.

        `fname` is the corpus file. `fnameVocab` is the file with vocabulary,
        one word per line; if not specified, it defaults to `fname.vocab`.
        """
        IndexedCorpus.__init__(self, fname)
        # pass `fname` as a logging argument so interpolation only happens when
        # the record is actually emitted
        logging.info("loading corpus from %s", fname)

        if fnameVocab is None:
            fnameVocab = fname + '.vocab'

        self.fname = fname
        # read the vocabulary inside a `with` block so that the file handle is
        # closed right away instead of being left to the garbage collector
        with open(fnameVocab) as fin:
            words = [word.rstrip() for word in fin]
        # line number in the vocabulary file (0-based) -> word
        self.id2word = dict(enumerate(words))
        self.length = None
Пример #17
0
    def __init__(self, fname, fnameVocab=None):
        """
        Initialize the corpus from a file.

        `fname` is the corpus file. `fnameVocab` is the file with vocabulary,
        one word per line; if not specified, it defaults to `fname.vocab`.
        """
        IndexedCorpus.__init__(self, fname)
        # pass `fname` as a logging argument so interpolation only happens when
        # the record is actually emitted
        logging.info("loading corpus from %s", fname)

        if fnameVocab is None:
            fnameVocab = fname + '.vocab'

        self.fname = fname
        # read the vocabulary inside a `with` block so that the file handle is
        # closed right away instead of being left to the garbage collector
        with open(fnameVocab) as fin:
            words = [word.rstrip() for word in fin]
        # line number in the vocabulary file (0-based) -> word
        self.id2word = dict(enumerate(words))
        self.length = None
Пример #18
0
    def __init__(self, fname, store_labels=True):
        """
        Initialize the corpus from a file.

        Although vector labels (~SVM target class) are not used in gensim in any way,
        they are parsed and stored in `self.labels` for convenience. Set `store_labels=False`
        to skip storing these labels (e.g. if there are too many vectors to store
        the self.labels array in memory).

        This method only records the `store_labels` flag and starts `self.labels`
        out as an empty list.
        """
        IndexedCorpus.__init__(self, fname)
        # pass `fname` as a logging argument so interpolation only happens when
        # the record is actually emitted
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.length = None
        self.store_labels = store_labels
        self.labels = []
Пример #19
0
    def __init__(self, fname, store_labels=True):
        """
        Initialize the corpus from a file.

        Although vector labels (~SVM target class) are not used in gensim in any way,
        they are parsed and stored in `self.labels` for convenience. Set `store_labels=False`
        to skip storing these labels (e.g. if there are too many vectors to store
        the self.labels array in memory).

        This method only records the `store_labels` flag and starts `self.labels`
        out as an empty list.
        """
        IndexedCorpus.__init__(self, fname)
        # pass `fname` as a logging argument so interpolation only happens when
        # the record is actually emitted
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.length = None
        self.store_labels = store_labels
        self.labels = []
Пример #20
0
    def __init__(self, fname, id2word=None, line2words=splitOnSpace):
        """
        Initialize the corpus from a file.

        `id2word` and `line2words` are optional parameters.

        If provided, `id2word` is a dictionary mapping between wordIds (integers)
        and words (strings). If not provided, the mapping is constructed from
        the documents: all distinct words are sorted and numbered by their rank.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.

        The first line of `fname` must hold the number of documents as an integer;
        `int()` raises on anything else.
        """
        IndexedCorpus.__init__(self, fname)
        # logging arguments are passed separately so that messages are only
        # formatted when the record is actually emitted
        logging.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        # the first line in input data is the number of documents (integer);
        # the `with` block makes sure the file handle is closed again
        with open(fname) as fin:
            self.numDocs = int(fin.readline())

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logging.info("extracting vocabulary from the corpus")
            allTerms = set()
            self.useWordIds = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                allTerms.update(word for word, _ in doc)
            # sort all words; rank in the sorted list = word's integer id
            self.id2word = dict(enumerate(sorted(allTerms)))
        else:
            logging.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        # reverse mapping: word (string) -> word id (int)
        self.word2id = dict((v, k) for k, v in self.id2word.iteritems())
        self.numTerms = len(self.word2id)
        self.useWordIds = True  # return documents as (wordIndex, wordCount) 2-tuples

        logging.info("loaded corpus with %i documents and %i terms from %s",
                     self.numDocs, self.numTerms, fname)
    def __init__(self, fname, store_labels=True):
        """
        Set up the corpus state for the file `fname`.

        Parameters
        ----------
        fname: str
            Path to corpus.
        store_labels : bool, optional
            Whether to store labels (~SVM target class). They currently have no application
            but are kept in `self.labels` for convenience by default.

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        # label bookkeeping: the flag is recorded and the list starts out empty
        self.labels = []
        self.store_labels = store_labels

        # input file (see class doc for format); length starts out as None
        self.fname, self.length = fname, None
Пример #22
0
    def __init__(self, fname, fname_vocab=None):
        """
        Load the vocabulary that belongs to the corpus stored in `fname`.

        Parameters
        ----------
        fname : str
            Path to corpus.
        fname_vocab : str, optional
            Vocabulary file. If `fname_vocab` is None, the first existing one of
            these variants is used:

            * `fname`.vocab
            * `fname`/vocab.txt
            * `fname_without_ext`.vocab
            * `fname_folder`/vocab.txt

        Raises
        ------
        IOError
            If `fname_vocab` is None and none of the variants exists.

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            base_name, _ = path.splitext(fname)
            parent_dir = path.dirname(fname)
            candidates = (
                utils.smart_extension(fname, '.vocab'),
                utils.smart_extension(fname, '/vocab.txt'),
                utils.smart_extension(base_name, '.vocab'),
                utils.smart_extension(parent_dir, '/vocab.txt'),
            )
            # take the first candidate that exists, checking them in order
            fname_vocab = next((candidate for candidate in candidates if path.exists(candidate)), None)
            if fname_vocab is None:
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        # 0-based line number in the vocabulary file -> right-stripped word
        with utils.smart_open(fname_vocab) as vocab_file:
            self.id2word = dict(enumerate(utils.to_unicode(line).rstrip() for line in vocab_file))
Пример #23
0
    def __init__(self, fname, fname_vocab=None):
        """
        Load the vocabulary that belongs to the corpus stored in `fname`.

        Parameters
        ----------
        fname : str
            Path to corpus.
        fname_vocab : str, optional
            Vocabulary file. If `fname_vocab` is None, the first existing one of
            these variants is used:

            * `fname`.vocab
            * `fname`/vocab.txt
            * `fname_without_ext`.vocab
            * `fname_folder`/vocab.txt

        Raises
        ------
        IOError
            If `fname_vocab` is None and none of the variants exists.

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            base_name, _ = path.splitext(fname)
            parent_dir = path.dirname(fname)
            candidates = (
                utils.smart_extension(fname, '.vocab'),
                utils.smart_extension(fname, '/vocab.txt'),
                utils.smart_extension(base_name, '.vocab'),
                utils.smart_extension(parent_dir, '/vocab.txt'),
            )
            # take the first candidate that exists, checking them in order
            fname_vocab = next((candidate for candidate in candidates if path.exists(candidate)), None)
            if fname_vocab is None:
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        # the file is opened in 'rb' mode and every line is passed through
        # utils.to_unicode(); 0-based line number -> right-stripped word
        with utils.open(fname_vocab, 'rb') as vocab_file:
            self.id2word = dict(enumerate(utils.to_unicode(line).rstrip() for line in vocab_file))
Пример #24
0
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """
        Set up the corpus and its word <-> id mapping.

        Parameters
        ----------
        fname : str
            Path to file in GibbsLda++ format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided (or empty), the mapping is built from the documents of the corpus:
            all distinct words are sorted and numbered by their rank.
        line2words : callable, optional
            Function which converts lines(str) into tokens(list of str),
            using :func:`~gensim.corpora.lowcorpus.split_on_space` as default.

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # tokenizer applied to lines (splits on space by default)
        self.num_docs = self._calculate_num_docs()

        if id2word:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        else:
            logger.info("extracting vocabulary from the corpus")
            # while the vocabulary is collected, documents come back as (word, wordCount) 2-tuples
            self.use_wordids = False
            vocabulary = set()
            for document in self:
                vocabulary.update(token for token, _ in document)
            # rank of a word in the sorted vocabulary = the word's integer id
            self.id2word = dict(enumerate(sorted(vocabulary)))
        # NOTE(review): `word2id` is not assigned in this method - presumably it is
        # maintained elsewhere in the class whenever `id2word` is set; confirm
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.num_docs, self.num_terms, fname
        )
Пример #25
0
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """
        Set up the corpus and its word <-> id mapping.

        Parameters
        ----------
        fname : str
            Path to file in GibbsLda++ format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided (or empty), the mapping is built from the documents of the corpus:
            all distinct words are sorted and numbered by their rank.
        line2words : callable, optional
            Function which converts lines(str) into tokens(list of str),
            using :func:`~gensim.corpora.lowcorpus.split_on_space` as default.

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # tokenizer applied to lines (splits on space by default)
        self.num_docs = self._calculate_num_docs()

        if id2word:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        else:
            logger.info("extracting vocabulary from the corpus")
            # while the vocabulary is collected, documents come back as (word, wordCount) 2-tuples
            self.use_wordids = False
            vocabulary = set()
            for document in self:
                vocabulary.update(token for token, _ in document)
            # rank of a word in the sorted vocabulary = the word's integer id
            self.id2word = dict(enumerate(sorted(vocabulary)))
        # NOTE(review): `word2id` is not assigned in this method - presumably it is
        # maintained elsewhere in the class whenever `id2word` is set; confirm
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.num_docs, self.num_terms, fname
        )
Пример #26
0
    def __init__(self, fname, id2word=None, line2words=splitOnSpace):
        """
        Initialize the corpus from a file.

        `id2word` and `line2words` are optional parameters.

        If provided, `id2word` is a dictionary mapping between wordIds (integers)
        and words (strings). If not provided, the mapping is constructed from
        the documents: all distinct words are sorted and numbered by their rank.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.

        The first line of `fname` must hold the number of documents as an integer;
        `int()` raises on anything else.
        """
        IndexedCorpus.__init__(self, fname)
        # logging arguments are passed separately so that messages are only
        # formatted when the record is actually emitted
        logging.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        # the first line in input data is the number of documents (integer);
        # the `with` block makes sure the file handle is closed again
        with open(fname) as fin:
            self.numDocs = int(fin.readline())

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logging.info("extracting vocabulary from the corpus")
            allTerms = set()
            self.useWordIds = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                allTerms.update(word for word, _ in doc)
            # sort all words; rank in the sorted list = word's integer id
            self.id2word = dict(enumerate(sorted(allTerms)))
        else:
            logging.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        # reverse mapping: word (string) -> word id (int)
        self.word2id = dict((v, k) for k, v in self.id2word.iteritems())
        self.numTerms = len(self.word2id)
        self.useWordIds = True  # return documents as (wordIndex, wordCount) 2-tuples

        logging.info("loaded corpus with %i documents and %i terms from %s", self.numDocs, self.numTerms, fname)
Пример #27
0
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """
        Set up the corpus stored in `fname` together with its word mapping.

        `id2word` and `line2words` are optional parameters.
        If provided, `id2word` is a dictionary mapping between word_ids (integers)
        and words (strings). If not provided (or empty), the mapping is built from
        the documents: all distinct words are sorted and numbered by their rank.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # tokenizer applied to lines (splits on space by default)
        self.num_docs = self._calculate_num_docs()

        if id2word:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        else:
            logger.info("extracting vocabulary from the corpus")
            # while the vocabulary is collected, documents come back as (word, wordCount) 2-tuples
            self.use_wordids = False
            vocabulary = set()
            for document in self:
                vocabulary.update(token for token, _ in document)
            # rank of a word in the sorted vocabulary = the word's integer id
            self.id2word = dict(enumerate(sorted(vocabulary)))
        # NOTE(review): `word2id` is not assigned in this method - presumably it is
        # maintained elsewhere in the class whenever `id2word` is set; confirm
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.num_docs, self.num_terms, fname
        )
Пример #28
0
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """
        Set up the corpus stored in `fname` together with its word mapping.

        `id2word` and `line2words` are optional parameters.
        If provided, `id2word` is a dictionary mapping between word_ids (integers)
        and words (strings). If not provided (or empty), the mapping is built from
        the documents: all distinct words are sorted and numbered by their rank.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # tokenizer applied to lines (splits on space by default)
        self.num_docs = self._calculate_num_docs()

        if id2word:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word
        else:
            logger.info("extracting vocabulary from the corpus")
            # while the vocabulary is collected, documents come back as (word, wordCount) 2-tuples
            self.use_wordids = False
            vocabulary = set()
            for document in self:
                vocabulary.update(token for token, _ in document)
            # rank of a word in the sorted vocabulary = the word's integer id
            self.id2word = dict(enumerate(sorted(vocabulary)))
        # NOTE(review): `word2id` is not assigned in this method - presumably it is
        # maintained elsewhere in the class whenever `id2word` is set; confirm
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

        logger.info("loaded corpus with %i documents and %i terms from %s", self.num_docs, self.num_terms, fname)
Пример #29
0
 def __init__(self, fname):
     """
     Set up the corpus by initializing both of its base classes with `fname`.
     """
     # the base initializers are called explicitly, in this order, rather
     # than through super() (considered too confusing here)
     for base in (IndexedCorpus, matutils.MmReader):
         base.__init__(self, fname)
Пример #30
0
 def __init__(self, fname):
     """
     Set up the corpus by initializing both of its base classes with `fname`.
     """
     # the base initializers are called explicitly, in this order, rather
     # than through super() (considered too confusing here)
     for base in (IndexedCorpus, matutils.MmReader):
         base.__init__(self, fname)