Пример #1
0
 def __init__(self, *args, **kwargs):
     """Build the view and make it start after the heading block.

     All arguments are forwarded unchanged to
     ``StreamBackedCorpusView.__init__``.  The stream is then opened, one
     blank-line-delimited block is read and discarded, and the resulting
     stream offset becomes the only entry of ``self._filepos``.
     """
     StreamBackedCorpusView.__init__(self, *args, **kwargs)
     # Nothing can be read until self._stream has been opened.
     self._open()
     # Consume the heading block; its contents are not kept.
     read_blankline_block(self._stream)
     # Whatever precedes this offset is heading, so the view starts here.
     start_offset = self._stream.tell()
     self._filepos = [start_offset]
Пример #2
0
	def __init__(self, *args, **kwargs):
		"""Initialise the view, then move its start past the heading block.

		Positional and keyword arguments are passed straight through to
		``StreamBackedCorpusView.__init__``.  After the stream is opened,
		the first blank-line-delimited block is read and thrown away, and
		``self._filepos`` is reset to a one-element list holding the
		stream position reached at that point.
		"""
		StreamBackedCorpusView.__init__(self, *args, **kwargs)
		# self._stream has to be open before the heading can be skipped.
		self._open()
		# Read the heading block only to advance the stream past it.
		read_blankline_block(self._stream)
		# The current stream position is the new start of the view.
		after_heading = self._stream.tell()
		self._filepos = [after_heading]
Пример #3
0
    def _annotate_sents(self,
                        sents,
                        j_class,
                        lemma_col,
                        tag_col,
                        outfile=None):
        """Run the external tool over ``sents`` and collect its output blocks.

        The sentences are serialised with ``self._to_conll09`` into a
        temporary input file, ``self._execute`` is called with ``j_class``
        and the names of the temporary input and output files, and the
        output file is then read back one blank-line-delimited block at a
        time.

        :param sents: sentences to annotate; passed unchanged to
            ``self._to_conll09``.
        :param j_class: forwarded to ``self._execute`` (presumably the name
            of the class the tool should run -- confirm against
            ``_execute``).
        :param lemma_col: forwarded to ``self._to_conll09``.
        :param tag_col: forwarded to ``self._to_conll09``.
        :param outfile: optional path.  When truthy, the collected blocks
            are also written to this file, joined with newlines.
        :return: list of the blocks read from the tool's output file.
        """
        from nltk.corpus.reader.util import read_blankline_block

        input_txt = self._to_conll09(sents, lemma_col, tag_col)

        with tempfile.NamedTemporaryFile(mode='w+', delete=True) as input_file, \
                tempfile.NamedTemporaryFile(mode='w+', delete=True) as output_file:

            # prepare the input file; flush so the external process sees
            # the data when it opens the file by name
            input_file.writelines(input_txt)
            input_file.flush()
            # the return value of _execute is not used here
            self._execute(j_class, input_file.name, output_file.name)

            # rewind before reading what the tool wrote
            output_file.seek(0)

            conll = []
            while True:
                block = read_blankline_block(output_file)
                if block:
                    conll.extend(block)
                else:
                    # an empty result means nothing is left to read
                    break
        if outfile:
            with open(outfile, 'w') as out:
                # Previously the joined text was built but never written,
                # which left ``outfile`` truncated and empty.
                out.write("\n".join(conll))
        return conll
Пример #4
0
    def tag_sents(self, sents):
        """Tag ``sents`` with the external tagger and return paired columns.

        The input is produced by ``self._prepare_input`` and written to a
        temporary file; ``self.tag`` is called with the names of the
        temporary input and output files.  The output file is then read
        block by block, and every line of every block is split on tabs.

        :param sents: sentences to tag; passed to ``self._prepare_input``.
        :return: one list per output block, each holding a
            ``(column 1, column 5)`` tuple for every line of that block
            (presumably word form and tag -- confirm against the tool's
            output format).
        """
        from nltk.corpus.reader.util import read_blankline_block

        input_txt = self._prepare_input(sents)

        with tempfile.NamedTemporaryFile(mode='w+', delete=True) as input_file:
            with tempfile.NamedTemporaryFile(mode='w+', delete=True) as output_file:
                # write the input and flush it so it is visible by file name
                input_file.writelines(input_txt)
                input_file.flush()
                stdout, stderr = self.tag(input_file.name, output_file.name)

                # rewind before reading the tagger's output
                output_file.seek(0)

                conll = []
                chunk = read_blankline_block(output_file)
                while chunk:
                    conll.extend(chunk)
                    chunk = read_blankline_block(output_file)

        tagged_sents = []
        for block_text in conll:
            rows = (row.split("\t") for row in block_text.rstrip().split("\n"))
            tagged_sents.append([(cols[1], cols[5]) for cols in rows])
        return tagged_sents
Пример #5
0
 def _read_block(self, stream):
     """Read one blank-line-delimited block as a lazily parsed sentence.

     When ``read_blankline_block`` returns nothing, that empty result is
     returned as is.  Otherwise the first block is split into lines and a
     one-element list is returned whose item is a generator of rows, each
     row being the tab-separated fields of one stripped line.  Empty lines,
     lines starting with ``#`` and rows whose first field contains ``-``
     are left out.
     """
     blocks = read_blankline_block(stream)
     if not blocks:
         return blocks
     # Split right away; only the per-line parsing below is deferred.
     raw_lines = blocks[0].split('\n')

     def rows():
         for raw in raw_lines:
             # drop empty lines and comment lines
             if not raw or raw[0] == '#':
                 continue
             fields = raw.strip().split('\t')
             # lines that represent contractions carry a range (e.g. 1-2)
             # in their first field; those are dropped as well
             if '-' in fields[0]:
                 continue
             yield fields

     # "blocks" are lists of sentences, so hand back the generator wrapped
     # in a list
     return [rows()]
    def _read_grid_block(self, stream):
        """Parse the next block(s) from ``stream`` into grids of columns.

        Every block returned by ``read_blankline_block`` is stripped and
        split into lines; lines whose first character is ``#`` are dropped
        and the rest are split on whitespace into rows.  A leading row
        whose ``words`` column (index taken from ``self._colmap``,
        defaulting to 0) equals ``-DOCSTART-`` is removed.

        :return: list of grids, each grid being a list of rows.
        :raises ValueError: if the rows of a grid differ in length.
        """
        parsed = []
        for raw_block in read_blankline_block(stream):
            block = raw_block.strip()
            if block:
                grid = []
                for line in block.split('\n'):
                    if line[0] != "#":
                        grid.append(line.split())

                if grid:
                    # If there's a docstart row, then discard. ([xx]
                    # eventually it would be good to actually use it)
                    words_index = self._colmap.get('words', 0)
                    if grid[0][words_index] == '-DOCSTART-':
                        grid.pop(0)

                    # Every row must be as wide as the first one.
                    if any(len(row) != len(grid[0]) for row in grid):
                        raise ValueError(
                            'Inconsistent number of columns:\n%s' % block)
                    parsed.append(grid)
        return parsed
Пример #7
0
 def _read_block(self, stream):
     """Read the next block from ``stream``.

     Delegates to ``read_blankline_block`` and returns its result
     unchanged.
     """
     # blocks are split by blankline (or EOF) - default
     return read_blankline_block(stream)
Пример #8
0
Файл: knbc.py Проект: DrDub/nltk
 def _read_block(self, stream):
     """Read the next block from ``stream``.

     Delegates to ``read_blankline_block`` and returns its result
     unchanged.
     """
     # blocks are split by blankline (or EOF) - default
     return read_blankline_block(stream)