def __init__(self, *args, **kwargs): StreamBackedCorpusView.__init__(self, *args, **kwargs) # open self._stream self._open() # skip the heading block read_blankline_block(self._stream) # reset the start position to the current position in the stream self._filepos = [self._stream.tell()]
def _annotate_sents(self, sents, j_class, lemma_col, tag_col, outfile=None):
    """Run the external Java annotator over *sents* and return CoNLL-09 blocks.

    :param sents: sentences to annotate (passed to ``self._to_conll09``)
    :param j_class: Java class name handed to ``self._execute``
    :param lemma_col: column index holding the lemma in the CoNLL-09 input
    :param tag_col: column index holding the POS tag in the CoNLL-09 input
    :param outfile: optional path; when given, the annotated output is also
        written to this file as newline-joined blocks
    :return: list of blank-line-delimited CoNLL output blocks (strings)
    """
    from nltk.corpus.reader.util import read_blankline_block

    input_txt = self._to_conll09(sents, lemma_col, tag_col)
    with tempfile.NamedTemporaryFile(mode='w+', delete=True) as input_file, \
            tempfile.NamedTemporaryFile(mode='w+', delete=True) as output_file:
        # prepare the input file; flush so the external process sees the data
        input_file.writelines(input_txt)
        input_file.flush()
        stdout, stderr = self._execute(j_class, input_file.name, output_file.name)
        # rewind and collect every blank-line-delimited block of output
        output_file.seek(0)
        conll = []
        while True:
            block = read_blankline_block(output_file)
            if block:
                conll.extend(block)
            else:
                break
    if outfile:
        with open(outfile, 'w') as out:
            # BUG FIX: the original evaluated "\n".join(conll) and discarded
            # the result, leaving *outfile* empty — actually write it.
            out.write("\n".join(conll))
    return conll
def tag_sents(self, sents):
    """Tag *sents* with the external tagger and return (word, tag) pairs.

    The sentences are serialized to a temporary input file, the tagger is
    invoked on it, and its blank-line-delimited CoNLL output is parsed:
    column 1 is the word form, column 5 the assigned tag.

    :return: a list of sentences, each a list of ``(word, tag)`` tuples
    """
    from nltk.corpus.reader.util import read_blankline_block

    input_txt = self._prepare_input(sents)
    with tempfile.NamedTemporaryFile(mode='w+', delete=True) as input_file, \
            tempfile.NamedTemporaryFile(mode='w+', delete=True) as output_file:
        # write the serialized sentences and flush before handing the
        # filename to the external process
        input_file.writelines(input_txt)
        input_file.flush()
        stdout, stderr = self.tag(input_file.name, output_file.name)
        # rewind and drain every blank-line-delimited output block
        output_file.seek(0)
        conll = []
        block = read_blankline_block(output_file)
        while block:
            conll.extend(block)
            block = read_blankline_block(output_file)
    tagged_sents = []
    for raw_sent in conll:
        pairs = []
        for line in raw_sent.rstrip().split("\n"):
            fields = line.split("\t")
            # column 1 = word form, column 5 = POS tag
            pairs.append((fields[1], fields[5]))
        tagged_sents.append(pairs)
    return tagged_sents
def _read_block(self, stream):
    """Read one sentence from *stream* as a lazy sequence of field lists.

    Comment lines (starting with ``#``) are dropped, as are multiword-token
    rows whose first (ID) field is a range such as ``1-2``.  Returns a
    one-element list wrapping a generator of tab-split field lists, or the
    empty block at end of stream.
    """
    sent_block = read_blankline_block(stream)
    if not sent_block:
        return sent_block
    raw_lines = sent_block[0].split('\n')
    # strip comments and surrounding whitespace
    content = (ln.strip() for ln in raw_lines if ln and ln[0] != '#')
    rows = (ln.split('\t') for ln in content)
    # drop rows representing contractions: their ID field is a range
    # (e.g. "1-2") rather than a single token index
    rows = (fields for fields in rows if '-' not in fields[0])
    # a "block" is a list of sentences, so wrap the generator in a list
    return [rows]
def _read_grid_block(self, stream):
    """Read the next batch of grids (one per sentence) from *stream*.

    Each blank-line-delimited block becomes a grid: a list of rows, each a
    list of whitespace-split columns.  ``#`` comment lines are skipped and a
    leading ``-DOCSTART-`` row is discarded.

    :raises ValueError: if rows within one grid have differing column counts
    """
    grids = []
    for raw in read_blankline_block(stream):
        raw = raw.strip()
        if not raw:
            continue
        grid = [row.split() for row in raw.split('\n') if row[0] != "#"]
        if not grid:
            continue
        # If there's a docstart row, then discard. ([xx] eventually it
        # would be good to actually use it)
        if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-':
            grid.pop(0)
        # Every row must have the same number of columns as the first.
        width = len(grid[0])
        for row in grid:
            if len(row) != width:
                raise ValueError('Inconsistent number of columns:\n%s' % raw)
        grids.append(grid)
    return grids
def _read_block(self, stream):
    """Return the next block from *stream*.

    Default behavior: blocks are delimited by blank lines (or end of file),
    exactly as ``read_blankline_block`` defines them.
    """
    block = read_blankline_block(stream)
    return block