def test_read_stream_until_success(self):
    FIRST_WORD = self.data_str[:5]
    data, result = read_stream_until(self.stream, FIRST_WORD)
    self.assertEqual(data, FIRST_WORD)
    self.assertTrue(result)
    self.assertEqual(self.stream.tell(), len(data))

    remaining_1st_line = self.data_str.split('\n')[0]
    remaining_1st_line = remaining_1st_line[len(data):] + '\n'
    data, result = read_stream_until(self.stream, '\n')
    self.assertEqual(data, remaining_1st_line)
    self.assertTrue(result)
    self.assertEqual(self.stream.tell(),
                     len(remaining_1st_line) + len(FIRST_WORD))
def __skip_tokens(self, tokenized, message):
    print '%s: %s' % (message, tokenized)
    for token in tokenized.split():
        unescaped = StanfordParsedSentence.unescape_token_text(token)
        _, found_token = read_stream_until(
            self._parse_file, unescaped, False, False)
        assert found_token, ('Skipped token not found: %s'
                             % unescaped).encode('ascii', 'replace')
def get_next_sentence(self):
    if not self._parse_file:
        return None

    # Read the next 3 blocks of the parse file.
    tokenized = self._parse_file.readline()
    if not tokenized:  # empty string means we've hit the end of the file
        return None
    tokenized = tokenized.strip()
    tmp = self._parse_file.readline()
    assert not tmp.strip(), (
        'Invalid parse file: expected blank line after tokens: %s'
        % tokenized).encode('ascii', 'replace')

    lemmas = self._parse_file.readline()
    lemmas = lemmas.strip()
    assert lemmas, (
        'Invalid parse file: expected lemmas line after tokens: %s'
        % tokenized).encode('ascii', 'replace')
    tmp = self._parse_file.readline()
    assert not tmp.strip(), (
        'Invalid parse file: expected blank line after lemmas: %s'
        % lemmas).encode('ascii', 'replace')

    # If the sentence was unparsed, don't return a new sentence object for
    # it, but do advance the stream past the unparsed words.
    # NOTE: This relies on the printWordsForUnparsed flag we introduced to
    # the Stanford parser.
    if lemmas == '(())':
        self.__skip_tokens(tokenized, 'Ignoring unparsed sentence')
        return self.get_next()

    # Process the constituency parse, if present.
    if peek_and_revert_unless(self._parse_file, lambda x: False)[0] == '(':
        constituency_parse, double_newline_found = read_stream_until(
            self._parse_file, '\n\n')
        assert double_newline_found, (
            'Invalid parse file: expected blank line after constituency'
            ' parse: %s' % constituency_parse).encode('ascii', 'replace')
    else:
        constituency_parse = None

    parse_lines = []
    tmp = self._parse_file.readline().strip()
    if not tmp:
        self.__skip_tokens(
            tokenized, 'Skipping sentence with empty dependency parse')
        return self.get_next()
    while tmp:
        parse_lines.append(tmp)
        tmp = self._parse_file.readline().strip()
    # Leaves the file in the state where the final blank line after the
    # edges has been read. This also means that if there's a blank line at
    # the end of a file, it won't make us think there's another entry
    # coming.

    # Now create the sentence from the read data + the text file.
    sentence = self.sentence_class(
        tokenized, lemmas, constituency_parse, parse_lines,
        self._file_stream)
    assert (len(sentence.original_text) ==
            self._file_stream.character_position
            - sentence.document_char_offset), (
        'Sentence length != offset difference: %s'
        % sentence.original_text).encode('ascii', 'replace')
    return sentence
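# A minimal sketch (illustrative data, not taken from the repo) of the
# parse-file layout get_next_sentence() expects: a tokenized line, a blank
# line, a lemmas line, a blank line, an optional constituency parse ended by
# a blank line, then one dependency edge per line followed by a blank line.
# A lemmas line of '(())' marks a sentence the parser left unparsed.
_EXAMPLE_PARSE_ENTRY = (
    'The dog barked .\n'      # tokenized sentence
    '\n'
    'the dog bark .\n'        # lemmas
    '\n'
    '(ROOT (S (NP (DT The) (NN dog)) (VP (VBD barked)) (. .)))\n'
    '\n'
    'det(dog-2, The-1)\n'     # dependency edges, one per line
    'nsubj(barked-3, dog-2)\n'
    'root(ROOT-0, barked-3)\n'
    'punct(barked-3, .-4)\n'
    '\n')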
def __align_tokens_to_text(self, document_text):
    eat_whitespace(document_text)
    self.document_char_offset = document_text.character_position

    # Root has no alignment to source.
    self.tokens[0].start_offset = None
    self.tokens[0].end_offset = None

    non_root_tokens = self.tokens[1:]
    for i, token in enumerate(non_root_tokens):
        # i is one less than the index of the current token in self.tokens,
        # because root.
        original = token.original_text
        if token.is_absent:
            # Handle case of duplicated character, which is the only type
            # of absent token that will have been detected so far.
            prev_token = self.tokens[i]
            if prev_token.original_text.endswith(original):
                # print "Found duplicated token:", (
                #     token.original_text.encode('utf-8'))
                token.start_offset = prev_token.end_offset - len(original)
                token.end_offset = prev_token.end_offset
        elif original == '.' and i == len(non_root_tokens) - 1:
            # End-of-sentence period gets special treatment: the "real"
            # original text may have been a period substitute or missing.
            # (Other things can get converted to fake end-of-sentence
            # periods to make life easier for the parser.)
            start_pos = document_text.tell()
            eaten_ws = eat_whitespace(document_text, True)
            not_at_eof = not is_at_eof(document_text)
            next_char, next_is_period_sub = peek_and_revert_unless(
                document_text,
                lambda char: self.PERIOD_SUBSTITUTES.find(char) != -1)
            if (not_at_eof and next_is_period_sub):
                # We've moved the stream over the period, so adjust offset.
                token.start_offset = (document_text.character_position
                                      - self.document_char_offset - 1)
                token.end_offset = token.start_offset + 1
                token.original_text = next_char
                self.original_text += eaten_ws + next_char
            else:
                # The period is actually not there.
                token.is_absent = True
                token.original_text = ''
                document_text.seek(start_pos)
        else:
            # Normal case: just read the next token.
            search_start = document_text.character_position
            # Our preprocessing may hallucinate periods onto the ends of
            # abbreviations, particularly "U.S." Deal with them.
            if original[-1] == '.':
                token_text_to_find = original[:-1]
            else:
                token_text_to_find = original
            text_until_token, found_token = (
                read_stream_until(document_text, token_text_to_find, True))
            self.original_text += text_until_token
            assert found_token, (
                (u'Could not find token "%s" starting at position %d '
                 '(accumulated: %s)') % (
                    original, search_start, self.original_text)
                ).encode('utf-8')

            if original[-1] == '.':
                # If it ends in a period, and the next character in the
                # stream is a period, it's a duplicated period. Advance
                # over the period and append it to the accumulated text.
                _, is_period = peek_and_revert_unless(
                    document_text, lambda char: char == '.')
                if is_period:
                    self.original_text += '.'
            token.end_offset = (document_text.character_position
                                - self.document_char_offset)
            token.start_offset = token.end_offset - len(original)
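# Worked example (hypothetical numbers) of the offset arithmetic above:
# suppose the document stream starts this sentence at character position
# 100, so document_char_offset == 100. After read_stream_until() has
# consumed through the token "bark" and character_position is 109, the
# token gets end_offset = 109 - 100 = 9 and start_offset = 9 - len('bark')
# = 5. Offsets are therefore sentence-relative, not document-absolute.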
def test_read_stream_until_failure(self):
    data, result = read_stream_until(self.stream, 'q')
    self.assertEqual(data, self.data_str)
    self.assertFalse(result)
    self.assertEqual(self.stream.tell(), len(self.data_str))
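# A minimal sketch, assuming read_stream_until() accepts any file-like
# object with read() and tell(), of the fixture the two tests above rely
# on; the real test class is not shown in this excerpt and may wrap the
# stream differently, so the names below are hypothetical.
#
# from StringIO import StringIO
#
# def setUp(self):
#     # No 'q' anywhere, so test_read_stream_until_failure reads to EOF.
#     self.data_str = 'These are some test file contents.\nSecond line.\n'
#     self.stream = StringIO(self.data_str)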