def __init__(self, input, transposed=True):
    """
    Initialize the matrix reader.

    The `input` refers to a file on the local filesystem, which is expected
    to be in the sparse (coordinate) Matrix Market format. Documents are
    assumed to be rows of the matrix (and document features are columns).

    `input` is either a string (file path) or a file-like object that
    supports `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
    """
    logger.info("initializing corpus reader from %s", input)
    self.input, self.transposed = input, transposed
    with utils.file_or_filename(self.input) as lines:
        try:
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError(
                    "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                    (self.input, header)
                )
        except StopIteration:
            pass

        self.num_docs = self.num_terms = self.num_nnz = 0
        for lineno, line in enumerate(lines):
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                if not self.transposed:
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break

    logger.info(
        "accepted corpus with %i documents, %i features, %i non-zero entries",
        self.num_docs, self.num_terms, self.num_nnz
    )
def __iter__(self):
    """Iterate over the corpus.

    Yields
    ------
    (prev_id, document) : (int, list of (int, number))
        Document number and document in BoW format.

    Notes
    -----
    The total number of vectors returned is always equal to the number of rows
    specified in the header; empty documents are inserted and yielded where
    appropriate, even if they are not explicitly stored in the Matrix Market file.

    """
    with utils.file_or_filename(self.input) as lines:
        self.skip_headers(lines)

        previd = -1
        for line in lines:
            docid, termid, val = utils.to_unicode(line).split()  # needed for python3
            if not self.transposed:
                termid, docid = docid, termid
            # -1 because Matrix Market indexes are 1-based => convert to 0-based
            docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
            assert previd <= docid, "matrix columns must come in ascending order"
            if docid != previd:
                # change of document: return the document read so far (its id is previd)
                if previd >= 0:
                    yield previd, document  # noqa:F821
                # return implicit (empty) documents between previous id and new id
                # too, to keep consistent document numbering and corpus length
                for previd in range(previd + 1, docid):
                    yield previd, []
                # from now on start adding fields to a new document, with a new id
                previd = docid
                document = []
            document.append((termid, val,))  # add another field to the current document

    # handle the last document, as a special case
    if previd >= 0:
        yield previd, document
    # return empty documents between the last explicit document and the number
    # of documents as specified in the header
    for previd in range(previd + 1, self.num_docs):
        yield previd, []
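# A minimal usage sketch for the reader above, assuming it is gensim's MmReader
# (importable from gensim.matutils in recent gensim versions); the file name
# 'corpus.mm' is a hypothetical example in coordinate Matrix Market format.
from gensim.matutils import MmReader

reader = MmReader('corpus.mm', transposed=True)
print(reader.num_docs, reader.num_terms, reader.num_nnz)
for docno, doc in reader:
    # doc is a list of (term_id, value) pairs; implicit empty documents
    # are yielded as [] so docno runs over the full 0..num_docs-1 range
    print(docno, doc)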
def __iter__(self):
    """Iterate through the files."""
    for file_name in self.input_files:
        logger.info('reading file %s', file_name)
        with gensim_utils.file_or_filename(file_name) as fin:
            for line in itertools.islice(fin, self.limit):
                line = gensim_utils.to_unicode(line, encoding='utf-8').split()
                i = 0
                while i < len(line):
                    yield line[i:i + self.max_sentence_length]
                    i += self.max_sentence_length
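# A hedged usage sketch: the iterator above matches gensim's PathLineSentences,
# which streams whitespace-tokenized sentences from every file in a directory,
# chunked to at most max_sentence_length tokens. The directory path is hypothetical.
from gensim.models.word2vec import PathLineSentences

sentences = PathLineSentences('/path/to/corpus_dir', max_sentence_length=10000, limit=None)
for sentence in sentences:
    pass  # each `sentence` is a list of at most 10000 unicode tokens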
def getstream(self):
    """Yield documents from the underlying plain text collection (of one or more files).

    Each item yielded from this method will be considered a document
    by subsequent preprocessing methods.
    """
    num_texts = 0
    with utils.file_or_filename(self.input) as f:
        for line in f:
            yield line
            num_texts += 1

    self.length = num_texts
def getstream(self):
    """Generate documents from the underlying plain text collection (of one or more files).

    Yields
    ------
    str
        Document read from plain-text file.

    Notes
    -----
    The `self.length` attribute is only set once the generator has been fully consumed.

    """
    num_texts = 0
    with utils.file_or_filename(self.input) as f:
        for line in f:
            yield line
            num_texts += 1

    self.length = num_texts
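# `self.length` is only populated once the generator is fully consumed.
# A minimal illustration, assuming the method above belongs to gensim's
# TextCorpus and 'docs.txt' is a hypothetical plain-text file with one
# document per line:
from gensim.corpora.textcorpus import TextCorpus

corpus = TextCorpus('docs.txt')
stream = corpus.getstream()
for doc in stream:
    pass                  # consume the stream completely
print(corpus.length)      # only now reflects the number of documents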
def __get_header_info(self):
    """Parse the Matrix Market header and set num_docs, num_terms and num_nnz."""
    with utils.file_or_filename(self.fname) as lines:
        try:
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError(
                    "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                    (self.fname, header)
                )
        except StopIteration:
            logger.error('corpus mm file header format error | %s', self.fname)

        self.num_docs = self.num_terms = self.num_nnz = 0
        for lineno, line in enumerate(lines):
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                break
def get_texts(
    self, transtart=1, idpos=0, lemma=True, trunc=None,
    sw=stopwords.words("english"), suffix=".mde.trans.txt.idtxt"
):
    logger.debug("PREFIX:" + self.prefix)
    if os.path.isdir(self.input):
        logger.info("dir: " + self.input)
        filenames = glob.glob(self.input + "/" + self.prefix + "*" + suffix)
    elif os.path.isfile(self.input):
        logger.debug("isfile")
        filenames = [self.input]  # fixed: `filename` was undefined at this point
    else:
        filenames = []
    logger.debug("metadata:" + str(self.metadata))
    self.nitems = 0
    for filename in filenames:
        logger.debug(filename)
        currstream = utils.file_or_filename(filename)
        with currstream as lines:
            if self.sentseg:
                # one document per (sentence-segmented) line
                for lineno, line0 in enumerate(lines):
                    line = " ".join(line0.split()[transtart:])
                    line = remove_punc(line=line)
                    sid = line0.split()[idpos]
                    yield get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw)
                    self.nitems += 1
            else:
                # one document per file: concatenate all lines
                line = ""
                i = 0
                for line0 in lines:
                    line = line + " " + " ".join(line0.split()[transtart:])
                    if i == 0:
                        # take the document id from the first line only
                        sid = line0.split()[idpos].split(".")[0]
                    i += 1  # fixed: counter was never incremented
                line = remove_punc(line=line)
                yield get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw)
                self.nitems += 1
def __init__(self, input, transposed=True):
    """
    Parameters
    ----------
    input : {str, file-like object}
        Path to the input file or a file-like object (in Matrix Market format).
    transposed : bool, optional
        "Orientation" of the matrix. By default, documents are rows of the
        matrix; otherwise, set this to False.

    """
    logger.info("initializing corpus reader from %s", input)
    self.input, self.transposed = input, transposed
    with utils.file_or_filename(self.input) as lines:
        try:
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError(
                    "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                    (self.input, header)
                )
        except StopIteration:
            pass

        self.num_docs = self.num_terms = self.num_nnz = 0
        for lineno, line in enumerate(lines):
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
                if not self.transposed:
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break

    logger.info(
        "accepted corpus with %i documents, %i features, %i non-zero entries",
        self.num_docs, self.num_terms, self.num_nnz
    )
def __init__(self, input):
    self.input = input
    self.length = sum(1 for _ in file_or_filename(self.input))
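# The constructor above counts documents eagerly by streaming the input once.
# A sketch of the same idea as a standalone helper, assuming file_or_filename
# returns a context manager (as in recent gensim versions) so the underlying
# file handle is closed after counting; the helper name is hypothetical.
from gensim.utils import file_or_filename

def count_documents(path):
    with file_or_filename(path) as f:
        return sum(1 for _ in f)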
corpus2_dir = [
    f for f in os.listdir(args.corpus2_path)
    if not f.startswith('.') and f.endswith('.txt.gz')
]
if args.corpus3_path:
    corpus3_dir = [
        f for f in os.listdir(args.corpus3_path)
        if not f.startswith('.') and f.endswith('.txt.gz')
    ]

assert len(corpus1_dir) == 1
assert len(corpus2_dir) == 1
if args.corpus3_path:
    assert len(corpus3_dir) == 1

with gensim_utils.file_or_filename(os.path.join(args.corpus1_path, corpus1_dir[0])) as f:
    corpus1 = [gensim_utils.to_unicode(line, encoding='utf-8') for line in f]
with gensim_utils.file_or_filename(os.path.join(args.corpus2_path, corpus2_dir[0])) as f:
    corpus2 = [gensim_utils.to_unicode(line, encoding='utf-8') for line in f]
if args.corpus3_path:
    with gensim_utils.file_or_filename(os.path.join(args.corpus3_path, corpus3_dir[0])) as f:
        corpus3 = [gensim_utils.to_unicode(line, encoding='utf-8') for line in f]
def getstream(self):
    return utils.file_or_filename(self.input)
def __iter__(self):
    """Yield each document as a list of space-separated words."""
    with file_or_filename(self.input) as file:
        for line in file.read().splitlines():
            yield [str(byte_word, 'utf-8') for byte_word in line.split()]
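# A hedged usage sketch, assuming the method above belongs to a corpus class
# named `WordCorpus` (hypothetical) and that 'docs.txt.gz' is a gzipped file
# with one whitespace-separated document per line:
corpus = WordCorpus('docs.txt.gz')
for words in corpus:
    print(words)  # e.g. ['first', 'document', 'tokens']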
def get_texts(
    self, transtart=5, idpos=3, lemma=True, trunc=None,
    sw=stopwords.words("english"), wsize=None, markboundary=False,
    convfilter=pd.DataFrame(), convid=False, tokens_only=False,
):
    logger.debug("PREFIX:" + self.prefix)
    if os.path.isdir(self.input):
        filenames = glob.glob(self.input + "/" + self.prefix + "*.sent.txt")
    elif os.path.isfile(self.input):
        logger.debug("isfile")
        filenames = [self.input]  # fixed: `filename` was undefined at this point
    else:
        filenames = []
    logger.debug("metadata:" + str(self.metadata))
    self.nitems = 0
    if not convfilter.empty:
        filenames = filterconvs(filenames, convfilter)
    logger.debug("no. files: %d", len(filenames))
    for filename in filenames:
        currstream = utils.file_or_filename(filename)
        altconv = os.path.basename(filename).split(".")[0]
        if markboundary:
            yield "#START#", (0, "0.0.0.0")
        with currstream as lines:
            ## Sentences
            if self.sentseg:
                for lineno, line0 in enumerate(lines):
                    line = " ".join(line0.split()[transtart:])
                    line = remove_punc(line=line)
                    sid = line0.split()[idpos]
                    if convid:
                        yield altconv, get_trans(
                            line, sid, nitems=self.nitems, lemma=lemma, sw=sw, tokens_only=tokens_only
                        )
                    else:
                        yield get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw, tokens_only=tokens_only)
                    self.nitems += 1
            ## Whole documents
            else:
                line = ""
                i = 0
                for line0 in lines:
                    line = line + " " + " ".join(line0.split()[transtart:])
                    if i == 0:
                        # take the conversation id from the first line only
                        sid = line0.split()[idpos].split(".")[0]
                    i += 1  # fixed: counter was never incremented
                line = remove_punc(line=line)
                if convid:
                    yield altconv, get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw)
                else:
                    yield get_trans(line, sid, nitems=self.nitems, lemma=lemma, sw=sw)
                self.nitems += 1
target_lemmas = {nlp(w).sentences[0].words[0].lemma: w for w in targets}
logger.warning('\nTarget lemmas:')
logger.warning('{}.\n'.format(', '.join(targets)))

if lang == 'en':
    pos_tags = {w: pos_tags[target_lemmas[w]] for w in target_lemmas}

start_time = time.time()
logger.warning('Lemmatize corpus...')

all_forms = {lemma: {lemma} for lemma in target_lemmas}
for path in [args.train_path, args.test_path]:
    if not path:
        continue
    # first pass: count lines so tqdm can report progress
    with gensim_utils.file_or_filename(path) as f:
        n_lines = 0
        for line in f:
            n_lines += 1
    # second pass: collect every surface form of the target lemmas
    with gensim_utils.file_or_filename(path) as f:
        lines = ''
        for line in tqdm(f, total=n_lines):
            line = gensim_utils.to_unicode(line, encoding='utf-8')
            for sentence in nlp(line).sentences:
                for w in sentence.words:
                    if w.lemma in target_lemmas:
                        if lang != 'en' or (lang == 'en' and pos_match(pos_tags[w.lemma], w.pos)):
                            all_forms[w.lemma].add(w.text)