def derivation_with_index(self, filename, index=None):
    """Parse one derivation, or the entire contents, of ``filename``.

    Args:
        filename: Path of the file to read.
        index: Passed to ``nth_occurrence`` as ``N`` to pick a single
            derivation.  Any falsy value (``None`` or ``0``) selects the
            whole file instead.

    Returns:
        Whatever ``self.parse_file`` returns for the selected text.
    """
    def opens_derivation(line):
        # Both the start and the end condition are "line begins with '('".
        return re.match(r"^\(", line)

    with open(filename, 'r') as source:
        if index:
            # Iterate the file object itself: ``xreadlines()`` is deprecated
            # and does not exist on Python 3 file objects.
            text = ''.join(nth_occurrence(source, N=index,
                                          when=opens_derivation,
                                          until=opens_derivation))
        else:
            text = source.read()

    # The text is fully materialised above, so parsing needs no open handle.
    return self.parse_file(text)
def derivation_with_index(self, filename, index=None):
    """Return an iterator over the right-stripped lines of ``filename``.

    Args:
        filename: Path of the file to read.
        index: Derivation number within the document identified by
            ``self.sec_no`` and ``self.doc_no``.  Any falsy value (``None``
            or ``0``) yields every line of the file.

    Returns:
        With a truthy ``index``, ``iter()`` over what ``nth_occurrence``
        selects for the first ``ID=wsj_SSDD.<index> `` header; otherwise a
        lazy iterator over all stripped lines.
    """
    # The handle is stored on the instance and is NOT closed here: the
    # returned iterator reads from it lazily after this method returns.
    self.file = open(filename, 'r')

    # Lazy equivalent of imap/xreadlines that also works on Python 3.
    stripped = (line.rstrip() for line in self.file)
    if not index:
        return stripped

    # Put a space after the pattern to ensure we match the whole token.
    # The dot is escaped so it only matches a literal '.', and the pattern
    # is built once rather than on every line.
    header = re.compile(
        r"^ID=wsj_%02d%02d\.%d " % (self.sec_no, self.doc_no, index))
    lines = nth_occurrence(stripped, N=1,
                           when=header.match,
                           until=lambda line: re.match(r"^ID", line))
    return iter(lines)
def derivation_with_index(self, filename, index=None):
    """Return an iterator over the right-stripped lines of ``filename``.

    Args:
        filename: Path of the file to read.
        index: Derivation number within the document identified by
            ``self.sec_no`` and ``self.doc_no``.  Any falsy value (``None``
            or ``0``) yields every line of the file.

    Returns:
        With a truthy ``index``, ``iter()`` over what ``nth_occurrence``
        selects for the first ``ID=wsj_SSDD.<index> `` header; otherwise a
        lazy iterator over all stripped lines.
    """
    # The handle is stored on the instance and is NOT closed here: the
    # returned iterator reads from it lazily after this method returns.
    self.file = open(filename, "r")

    # Lazy equivalent of imap/xreadlines that also works on Python 3.
    stripped = (line.rstrip() for line in self.file)
    if not index:
        return stripped

    # Put a space after the pattern to ensure we match the whole token.
    # The dot is escaped so it only matches a literal '.', and the pattern
    # is built once rather than on every line.
    header = re.compile(
        r"^ID=wsj_%02d%02d\.%d " % (self.sec_no, self.doc_no, index)
    )
    lines = nth_occurrence(
        stripped,
        N=1,
        when=header.match,
        until=lambda line: re.match(r"^ID", line),
    )
    return iter(lines)
def derivation_with_index(self, filename, i=None):
    """Parse the ``<S>`` content of ``filename`` into a tree.

    A fresh ``SGMLBag`` is stored in ``self.contents`` and fed either the
    text of one ``<S>`` ... ``</S>`` span or the whole file.

    Args:
        filename: Path of the SGML file to read.
        i: Passed to ``nth_occurrence`` as ``N`` to pick one ``<S>`` span.
            Any falsy value (``None`` or ``0``) feeds the entire file.

    Returns:
        The result of ``parse_tree`` with ``AugmentedPennParser``, applied
        to the newline-joined ``self.contents['s']``, or to the empty string
        when that entry is ``None``.
    """
    self.contents = SGMLBag()

    with open(filename, 'r') as source:
        if i:
            # Iterate the file object itself: ``xreadlines()`` is deprecated
            # and does not exist on Python 3 file objects.
            text = ''.join(nth_occurrence(
                source, N=i,
                when=lambda line: re.match(r'^<S', line),
                until=lambda line: re.match(r'^</S', line)))
        else:
            text = source.read()

    self.contents.feed(text)

    # HACK HACK HACK:
    # Sometimes <S>...</S> encloses more than one root (3:7 has some);
    # in which case, counting <S> will undercount the number of sentences
    sentences = self.contents['s']
    if sentences is None:
        return parse_tree('', AugmentedPennParser)
    return parse_tree('\n'.join(sentences), AugmentedPennParser)
def derivation_with_index(self, filename, i=None):
    """Parse the ``<HEADLINE>`` section of ``filename`` into a tree.

    A fresh ``SGMLBag`` is stored in ``self.contents`` before the file is
    read.  The first ``<HEADLINE>`` ... ``</HEADLINE>`` span is located with
    ``nth_occurrence`` and its opening line is dropped.

    Args:
        filename: Path of the SGML file to read.
        i: When truthy, only the entry at position ``i`` of the remaining
            headline lines is fed to the bag; otherwise all of them are,
            joined with newlines.

    Returns:
        ``None`` if no headline span was found, else the result of
        ``parse_tree`` (with ``AugmentedPennParser``) on the newline-joined
        ``self.contents['s']``.

    Raises:
        CPTBParseException: If the first selected line does not start with
            ``<HEADLINE``.
    """
    def opens_headline(line):
        return re.match(r'^<HEADLINE', line)

    def closes_headline(line):
        return re.match(r'^</HEADLINE', line)

    self.contents = SGMLBag()

    with open(filename, 'r') as handle:
        block = nth_occurrence(handle, N=1,
                               when=opens_headline,
                               until=closes_headline)
        if not block:
            return None

        if not block[0].startswith('<HEADLINE'):
            raise CPTBParseException('Expected to find a <HEADLINE> line.')

        # Drop the opening <HEADLINE> line; only the payload is of interest.
        payload = block[1:]
        text = ''.join(payload[i]) if i else '\n'.join(payload)

        self.contents.feed(text)
        return parse_tree('\n'.join(self.contents['s']), AugmentedPennParser)
def derivation_with_index(self, filename, i=None):
    """Parse the ``<S>`` content of ``filename`` into a tree.

    A fresh ``SGMLBag`` is stored in ``self.contents`` and fed either the
    text of one ``<S>`` ... ``</S>`` span or the whole file.

    Args:
        filename: Path of the SGML file to read.
        i: Passed to ``nth_occurrence`` as ``N`` to pick one ``<S>`` span.
            Any falsy value (``None`` or ``0``) feeds the entire file.

    Returns:
        The result of ``parse_tree`` with ``AugmentedPennParser``, applied
        to the newline-joined ``self.contents['s']``, or to the empty string
        when that entry is ``None``.
    """
    self.contents = SGMLBag()

    with open(filename, 'r') as source:
        if i:
            # Iterate the file object itself: ``xreadlines()`` is deprecated
            # and does not exist on Python 3 file objects.
            text = ''.join(nth_occurrence(
                source, N=i,
                when=lambda line: re.match(r'^<S', line),
                until=lambda line: re.match(r'^</S', line)))
        else:
            text = source.read()

    self.contents.feed(text)

    # HACK HACK HACK:
    # Sometimes <S>...</S> encloses more than one root (3:7 has some);
    # in which case, counting <S> will undercount the number of sentences
    sentences = self.contents['s']
    if sentences is None:
        return parse_tree('', AugmentedPennParser)
    return parse_tree('\n'.join(sentences), AugmentedPennParser)
def derivation_with_index(self, filename, i=None):
    """Parse the ``<HEADLINE>`` section of ``filename`` into a tree.

    A fresh ``SGMLBag`` is stored in ``self.contents`` before the file is
    read.  The first ``<HEADLINE>`` ... ``</HEADLINE>`` span is located with
    ``nth_occurrence`` and its opening line is dropped.

    Args:
        filename: Path of the SGML file to read.
        i: When truthy, only the entry at position ``i`` of the remaining
            headline lines is fed to the bag; otherwise all of them are,
            joined with newlines.

    Returns:
        ``None`` if no headline span was found, else the result of
        ``parse_tree`` (with ``AugmentedPennParser``) on the newline-joined
        ``self.contents['s']``.

    Raises:
        CPTBParseException: If the first selected line does not start with
            ``<HEADLINE``.
    """
    def opens_headline(line):
        return re.match(r'^<HEADLINE', line)

    def closes_headline(line):
        return re.match(r'^</HEADLINE', line)

    self.contents = SGMLBag()

    with open(filename, 'r') as handle:
        block = nth_occurrence(handle, N=1,
                               when=opens_headline,
                               until=closes_headline)
        if not block:
            return None

        if not block[0].startswith('<HEADLINE'):
            raise CPTBParseException('Expected to find a <HEADLINE> line.')

        # Drop the opening <HEADLINE> line; only the payload is of interest.
        payload = block[1:]
        text = ''.join(payload[i]) if i else '\n'.join(payload)

        self.contents.feed(text)
        return parse_tree('\n'.join(self.contents['s']), AugmentedPennParser)