Example #1
    def __init__(self, input, transposed=True):
        """
        Initialize the matrix reader.

        The `input` refers to a file on the local filesystem, which is expected to
        be in the sparse (coordinate) Matrix Market format. Documents are assumed
        to be rows of the matrix (and document features are columns).

        `input` is either a string (file path) or a file-like object that supports
        `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
        """
        logger.info("initializing corpus reader from %s" % input)
        self.input, self.transposed = input, transposed
        with utils.file_or_filename(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                                    (self.input, header))
            except StopIteration:
                pass

            self.num_docs = self.num_terms = self.num_nnz = 0
            for lineno, line in enumerate(lines):
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" %
                     (self.num_docs, self.num_terms, self.num_nnz))
    def __iter__(self):
        """Iterate over all corpus.

        Yields
        ------
        (prev_id, document) : (int, list of (int, number))
            Document number and the document itself, in BoW format.

        Notes
        -----
        Total number of vectors returned is always equal to the
        number of rows specified in the header; empty documents are inserted and
        yielded where appropriate, even if they are not explicitly stored in the
        Matrix Market file.

        """
        with utils.file_or_filename(self.input) as lines:
            self.skip_headers(lines)

            previd = -1
            for line in lines:
                docid, termid, val = utils.to_unicode(line).split()  # needed for python3
                if not self.transposed:
                    termid, docid = docid, termid
                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is prevId)
                    if previd >= 0:
                        yield previd, document  # noqa:F821

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for previd in range(previd + 1, docid):
                        yield previd, []

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val))  # add another field to the current document

        # handle the last document, as a special case
        if previd >= 0:
            yield previd, document

        # return empty documents between the last explicit document and the number
        # of documents as specified in the header
        for previd in range(previd + 1, self.num_docs):
            yield previd, []
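
A minimal usage sketch of the reader above, assuming it is gensim's MmReader (the exact import path varies across gensim versions; 'corpus.mm' is a placeholder path):

    from gensim import matutils

    reader = matutils.MmReader('corpus.mm')  # the header is parsed eagerly in __init__
    print(reader.num_docs, reader.num_terms, reader.num_nnz)
    for docno, doc in reader:  # doc is a list of (term_id, value) pairs
        print(docno, len(doc))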
Example #3
 def __iter__(self):
     """iterate through the files"""
     for file_name in self.input_files:
         logger.info('reading file %s', file_name)
         with gensim_utils.file_or_filename(file_name) as fin:
             for line in itertools.islice(fin, self.limit):
                 line = gensim_utils.to_unicode(line,
                                                encoding='utf-8').split()
                 i = 0
                 while i < len(line):
                     yield line[i:i + self.max_sentence_length]
                     i += self.max_sentence_length
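
The iterator above mirrors gensim's LineSentence: each line is tokenized and long lines are re-chunked into pieces of at most max_sentence_length tokens. A self-contained sketch of the same chunking logic (function name and defaults are illustrative):

    import itertools

    from gensim import utils as gensim_utils

    def iter_chunked_sentences(file_names, max_sentence_length=10000, limit=None):
        """Yield token lists of at most max_sentence_length tokens each."""
        for file_name in file_names:
            with gensim_utils.file_or_filename(file_name) as fin:
                for line in itertools.islice(fin, limit):
                    tokens = gensim_utils.to_unicode(line, encoding='utf-8').split()
                    # step through the token list in fixed-size windows
                    for i in range(0, len(tokens), max_sentence_length):
                        yield tokens[i:i + max_sentence_length]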
Example #4
    def getstream(self):
        """Yield documents from the underlying plain text collection (of one or more files).
        Each item yielded from this method will be considered a document by subsequent
        preprocessing methods.
        """
        num_texts = 0
        with utils.file_or_filename(self.input) as f:
            for line in f:
                yield line
                num_texts += 1

        self.length = num_texts
Example #5
    def getstream(self):
        """Yield documents from the underlying plain text collection (of one or more files).
        Each item yielded from this method will be considered a document by subsequent
        preprocessing methods.
        """
        num_texts = 0
        with utils.file_or_filename(self.input) as f:
            for line in f:
                yield line
                num_texts += 1

        self.length = num_texts
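
Note that self.length is assigned only after the generator is fully exhausted; a partially consumed stream never sets it. A short sketch (corpus stands for any object exposing the method above):

    stream = corpus.getstream()
    docs = list(stream)                # exhaust the generator...
    assert corpus.length == len(docs)  # ...only now is self.length populated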
Example #6
    def __iter__(self):
        """Iterate over all corpus.

        Yields
        ------
        (prev_id, document) : (int, list of (int, number))
            Document number and the document itself, in BoW format.

        Notes
        -----
        Total number of vectors returned is always equal to the
        number of rows specified in the header; empty documents are inserted and
        yielded where appropriate, even if they are not explicitly stored in the
        Matrix Market file.

        """
        with utils.file_or_filename(self.input) as lines:
            self.skip_headers(lines)

            previd = -1
            for line in lines:
                docid, termid, val = utils.to_unicode(line).split()  # needed for python3
                if not self.transposed:
                    termid, docid = docid, termid
                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is prevId)
                    if previd >= 0:
                        yield previd, document  # noqa:F821

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for previd in range(previd + 1, docid):
                        yield previd, []

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val,))  # add another field to the current document

        # handle the last document, as a special case
        if previd >= 0:
            yield previd, document

        # return empty documents between the last explicit document and the number
        # of documents as specified in the header
        for previd in range(previd + 1, self.num_docs):
            yield previd, []
Example #7
    def getstream(self):
        """Generate documents from the underlying plain text collection (of one or more files).

        Yields
        ------
        str
            Document read from plain-text file.

        Notes
        -----
        Once the generator is exhausted, the `self.length` attribute is set.

        """
        num_texts = 0
        with utils.file_or_filename(self.input) as f:
            for line in f:
                yield line
                num_texts += 1

        self.length = num_texts
Example #8
    def __get_header_info(self):
        with utils.file_or_filename(self.fname) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith(
                        '%%matrixmarket matrix coordinate real general'):
                    raise ValueError(
                        "File %s not in Matrix Market format with coordinate real general; instead found: \n%s"
                        % (self.fname, header))
            except StopIteration:
                logger.error('corpus mm file header format error | %s', self.fname)

            self.num_docs = self.num_terms = self.num_nnz = 0
            for lineno, line in enumerate(lines):
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                    break
Example #9
    def getstream(self):
        """Generate documents from the underlying plain text collection (of one or more files).

        Yields
        ------
        str
            Document read from plain-text file.

        Notes
        -----
        Once the generator is exhausted, the `self.length` attribute is set.

        """
        num_texts = 0
        with utils.file_or_filename(self.input) as f:
            for line in f:
                yield line
                num_texts += 1

        self.length = num_texts
Example #10
    def get_texts(
        self, transtart=1, idpos=0, lemma=True, trunc=None, sw=stopwords.words("english"), suffix=".mde.trans.txt.idtxt"
    ):
        logger.debug("PREFIX:" + self.prefix)

        if os.path.isdir(self.input):
            logger.info("dir: " + self.input)
            filenames = glob.glob(self.input + "/" + self.prefix + "*" + suffix)
        elif os.path.isfile(self.input):
            print "isfile"
            filenames = [filename]
        else:
            filenames = []

        logger.debug("metadata:" + str(self.metadata))
        self.nitems = 0
        for filename in filenames:
            print(filename)
            currstream = utils.file_or_filename(filename)

            with currstream as lines:
                if self.sentseg:
                    for lineno, line0 in enumerate(lines):
                        line = " ".join(line0.split()[transtart:])
                        line = remove_punc(line=line)
                        sid = line0.split()[idpos]
                        yield get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw)

                        self.nitems += 1
                else:
                    line = ""
                    i = 0
                    for line0 in lines:
                        line = line + " ".join(line0.split()[transtart:])
                        if i == 0:
                            sid = line0.split()[idpos].split(".")[0]
                        i += 1  # only the first line provides the document id

                    line = remove_punc(line=line)
                    yield get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw)

                    self.nitems += 1
Example #11
    def __init__(self, input, transposed=True):
        """

        Parameters
        ----------
        input : {str, file-like object}
            Path to input file or file-like object (in Matrix Market format).
        transposed : bool, optional
            "Orientation" of the matrix. By default, documents are assumed to be
            rows of the matrix; if they are columns instead, set this to False.

        """
        logger.info("initializing corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        with utils.file_or_filename(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError(
                        "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                        (self.input, header)
                    )
            except StopIteration:
                pass

            self.num_docs = self.num_terms = self.num_nnz = 0
            for lineno, line in enumerate(lines):
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz
        )
Example #12
    def __init__(self, input, transposed=True):
        """

        Parameters
        ----------
        input : {str, file-like object}
            Path to input file or file-like object (in Matrix Market format).
        transposed : bool, optional
            "Orientation" of the matrix. By default, documents are assumed to be
            rows of the matrix; if they are columns instead, set this to False.

        """
        logger.info("initializing corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        with utils.file_or_filename(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith(
                        '%%matrixmarket matrix coordinate real general'):
                    raise ValueError(
                        "File %s not in Matrix Market format with coordinate real general; instead found: \n%s"
                        % (self.input, header))
            except StopIteration:
                pass

            self.num_docs = self.num_terms = self.num_nnz = 0
            for lineno, line in enumerate(lines):
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    self.num_docs, self.num_terms, self.num_nnz = (
                        int(x) for x in line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz)
Example #13
 def __init__(self, input):
     self.input = input
     self.length = sum(1 for _ in file_or_filename(self.input))
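
Unlike the lazy getstream variants above, this constructor pays for one full pass over the input immediately so the corpus size is known up front. A hedged sketch of the __len__ method such a class would typically pair with this (assumed, not shown in the snippet):

    def __len__(self):
        # self.length was computed once in __init__, so this is O(1)
        return self.length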
Example #14
    corpus1_dir = [
        f for f in os.listdir(args.corpus1_path)
        if not f.startswith('.') and f.endswith('.txt.gz')
    ]
    corpus2_dir = [
        f for f in os.listdir(args.corpus2_path)
        if not f.startswith('.') and f.endswith('.txt.gz')
    ]
    if args.corpus3_path:
        corpus3_dir = [
            f for f in os.listdir(args.corpus3_path)
            if not f.startswith('.') and f.endswith('.txt.gz')
        ]

    assert len(corpus1_dir) == 1
    assert len(corpus2_dir) == 1
    if args.corpus3_path:
        assert len(corpus3_dir) == 1

    with gensim_utils.file_or_filename(
            os.path.join(args.corpus1_path, corpus1_dir[0])) as f:
        corpus1 = [
            gensim_utils.to_unicode(line, encoding='utf-8') for line in f
        ]

    with gensim_utils.file_or_filename(
            os.path.join(args.corpus2_path, corpus2_dir[0])) as f:
        corpus2 = [
            gensim_utils.to_unicode(line, encoding='utf-8') for line in f
        ]

    if args.corpus3_path:
        with gensim_utils.file_or_filename(
                os.path.join(args.corpus3_path, corpus3_dir[0])) as f:
            corpus3 = [
                gensim_utils.to_unicode(line, encoding='utf-8') for line in f
            ]
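
file_or_filename opens string paths via smart_open, which decompresses files by extension in current gensim versions, so the .txt.gz corpora above can be read line by line as plain text. A minimal sketch ('corpus.txt.gz' is a placeholder path):

    from gensim import utils as gensim_utils

    with gensim_utils.file_or_filename('corpus.txt.gz') as f:  # transparently gunzipped
        first_line = gensim_utils.to_unicode(next(f), encoding='utf-8')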
Example #15
 def getstream(self):
     return utils.file_or_filename(self.input)
Example #16
 def __iter__(self):
     """Yield each document as a list of whitespace-separated words."""
     with file_or_filename(self.input) as file:
         for line in file.read().splitlines():
             yield [str(byte_word, 'utf-8') for byte_word in line.split()]
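
The manual str(byte_word, 'utf-8') decode assumes file_or_filename returns a binary stream; an equivalent sketch using gensim's to_unicode, which also lets you pick an error-handling policy:

    from gensim import utils

    def iter_token_lists(path):
        with utils.file_or_filename(path) as fin:
            for line in fin:
                # to_unicode decodes bytes (and passes str through) before splitting
                yield utils.to_unicode(line, encoding='utf-8').split()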
Example #17
 def getstream(self):
     return utils.file_or_filename(self.input)
Example #18
    def get_texts(
        self,
        transtart=5,
        idpos=3,
        lemma=True,
        trunc=None,
        sw=stopwords.words("english"),
        wsize=None,
        markboundary=False,
        convfilter=pd.DataFrame(),
        convid=False,
        tokens_only=False,
    ):
        logger.debug("PREFIX:" + self.prefix)
        if os.path.isdir(self.input):
            filenames = glob.glob(self.input + "/" + self.prefix + "*.sent.txt")
            # print(self.input)
            # print(filenames)
        elif os.path.isfile(self.input):
            logger.debug("isfile")
            filenames = [self.input]  # self.input is the single file to read
        else:
            filenames = []

        logger.debug("metadata:" + str(self.metadata))
        self.nitems = 0

        if not convfilter.empty:
            filenames = filterconvs(filenames, convfilter)
            logger.debug("no. files: %d" % len(filenames))

        for filename in filenames:
            # print filename
            # logger.info(filename)
            currstream = utils.file_or_filename(filename)
            altconv = os.path.basename(filename).split(".")[0]
            if markboundary:
                yield "#START#", (0, "0.0.0.0")
            with currstream as lines:
                ## Sentences
                if self.sentseg:
                    for lineno, line0 in enumerate(lines):
                        line = " ".join(line0.split()[transtart:])
                        line = remove_punc(line=line)
                        # print line
                        sid = line0.split()[idpos]
                        if convid:
                            yield altconv, get_trans(
                                line, sid, nitems=self.nitems, lemma=lemma, sw=sw, tokens_only=tokens_only
                            )
                        else:
                            yield get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw, tokens_only=tokens_only)

                        self.nitems += 1
                else:  ## Whole documents
                    line = ""
                    i = 0
                    for line0 in lines:
                        line = line + " ".join(line0.split()[transtart:])
                        if i == 0:
                            sid = line0.split()[idpos].split(".")[0]
                        i += 1  # only the first line provides the document id

                    line = remove_punc(line=line)
                    if convid:
                        yield altconv, get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw)
                    else:
                        yield get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw)

                    self.nitems += 1
Example #19
    target_lemmas = {nlp(w).sentences[0].words[0].lemma: w for w in targets}
    logger.warning('\nTarget lemmas:')
    logger.warning('{}.\n'.format(', '.join(targets)))

    if lang == 'en':
        pos_tags = {w: pos_tags[target_lemmas[w]] for w in target_lemmas}

    start_time = time.time()

    logger.warning('Lemmatize corpus...')
    all_forms = {lemma: {lemma} for lemma in target_lemmas}
    for path in [args.train_path, args.test_path]:
        if not path:
            continue

        with gensim_utils.file_or_filename(path) as f:
            n_lines = 0
            for line in f:
                n_lines += 1

        with gensim_utils.file_or_filename(path) as f:
            lines = ''
            for line in tqdm(f, total=n_lines):
                line = gensim_utils.to_unicode(line, encoding='utf-8')
                for sentence in nlp(line).sentences:
                    for w in sentence.words:
                        if w.lemma in target_lemmas:
                            if lang != 'en' or (lang == 'en' and pos_match(
                                    pos_tags[w.lemma], w.pos)):
                                all_forms[w.lemma].add(w.text)
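
The two with blocks above implement a two-pass pattern: the first pass only counts lines so that tqdm can display a meaningful total during the second, expensive pass. A compact sketch of that counting helper (name is illustrative):

    from gensim import utils as gensim_utils

    def count_lines(path):
        # cheap first pass: count lines so tqdm(total=...) can show progress
        with gensim_utils.file_or_filename(path) as f:
            return sum(1 for _ in f)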