Example #1
    def test_read_stream_until_success(self):
        FIRST_WORD = self.data_str[:5]
        data, result = read_stream_until(self.stream, FIRST_WORD)
        self.assertEqual(data, FIRST_WORD)
        self.assertTrue(result)
        self.assertEqual(self.stream.tell(), len(data))

        remaining_1st_line = self.data_str.split('\n')[0]
        remaining_1st_line = remaining_1st_line[len(data):] + '\n'
        data, result = read_stream_until(self.stream, '\n')
        self.assertEqual(data, remaining_1st_line)
        self.assertTrue(result)
        self.assertEqual(self.stream.tell(),
                         len(remaining_1st_line) + len(FIRST_WORD))
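
All of these examples revolve around the read_stream_until helper. Its implementation is not shown here; a minimal sketch consistent with the tests above (the extra positional flags that appear in the later examples are not modeled) might look like this:

def read_stream_until(stream, target):
    # Hypothetical sketch: consume the stream one character at a time until
    # `target` has just been read. Returns (text_read, found); on EOF before
    # the target appears, `found` is False and `text_read` is everything that
    # was consumed. The stream is left positioned after the last character
    # read, which matches the tell() checks in the tests.
    read = []
    while True:
        char = stream.read(1)
        if not char:  # EOF reached without finding the target
            return ''.join(read), False
        read.append(char)
        if ''.join(read[-len(target):]) == target:
            return ''.join(read), True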
Example #2
    def __skip_tokens(self, tokenized, message):
        print '%s: %s' % (message, tokenized)
        for token in tokenized.split():
            unescaped = StanfordParsedSentence.unescape_token_text(token)
            _, found_token = read_stream_until(self._parse_file, unescaped,
                                               False, False)
            assert found_token, ('Skipped token not found: %s' %
                                 unescaped).encode('ascii', 'replace')
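
Here the helper fast-forwards the parse stream over each token of an unparsed sentence. The meaning of the two False arguments and the body of StanfordParsedSentence.unescape_token_text are not shown in these snippets; assuming the latter reverses Penn Treebank-style escapes produced by the Stanford tools, a standalone sketch would be:

PTB_UNESCAPES = {'-LRB-': '(', '-RRB-': ')', '-LSB-': '[',
                 '-RSB-': ']', '-LCB-': '{', '-RCB-': '}'}

def unescape_token_text(token):
    # Hypothetical sketch: map Penn Treebank bracket escapes back to the
    # original characters so the token can be matched against the raw
    # contents of the parse stream.
    return PTB_UNESCAPES.get(token, token)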
Example #3
    def get_next_sentence(self):
        if not self._parse_file:
            return None
        # Read the next 3 blocks of the parse file.
        tokenized = self._parse_file.readline()
        if not tokenized:  # empty string means we've hit the end of the file
            return None
        tokenized = tokenized.strip()
        tmp = self._parse_file.readline()
        assert not tmp.strip(), (
            'Invalid parse file: expected blank line after tokens: %s' %
            tokenized).encode('ascii', 'replace')

        lemmas = self._parse_file.readline()
        lemmas = lemmas.strip()
        assert lemmas, (
            'Invalid parse file: expected lemmas line after tokens: %s' %
            tokenized).encode('ascii', 'replace')
        tmp = self._parse_file.readline()
        assert not tmp.strip(), (
            'Invalid parse file: expected blank line after lemmas: %s' %
            lemmas).encode('ascii', 'replace')

        # If the sentence was unparsed, don't return a new sentence object for
        # it, but do advance the stream past the unparsed words.
        # NOTE: This relies on the printWordsForUnparsed flag we introduced to
        # the Stanford parser.
        if lemmas == '(())':
            self.__skip_tokens(tokenized, 'Ignoring unparsed sentence')
            return self.get_next()

        # Process the constituency parse, if present.
        if peek_and_revert_unless(self._parse_file, lambda x: False)[0] == '(':
            constituency_parse, double_newline_found = read_stream_until(
                self._parse_file, '\n\n')
            assert double_newline_found, (
                'Invalid parse file: expected blank line after constituency parse: %s'
                % constituency_parse).encode('ascii', 'replace')
        else:
            constituency_parse = None

        parse_lines = []
        tmp = self._parse_file.readline().strip()
        if not tmp:
            self.__skip_tokens(
                tokenized, 'Skipping sentence with empty dependency parse')
            return self.get_next()
        while tmp:
            parse_lines.append(tmp)
            tmp = self._parse_file.readline().strip()

        # Leaves file in the state where the final blank line after the edges
        # has been read. This also means that if there's a blank line at the end
        # of a file, it won't make us think there's another entry coming.

        # Now create the sentence from the read data + the text file.
        sentence = self.sentence_class(tokenized, lemmas, constituency_parse,
                                       parse_lines, self._file_stream)
        assert (len(sentence.original_text) ==
                self._file_stream.character_position -
                sentence.document_char_offset), (
            'Sentence length != offset difference: %s' %
            sentence.original_text).encode('ascii', 'replace')
        return sentence
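
A typical caller simply pulls sentences until the reader signals end of file by returning None; a minimal (assumed) driver loop:

def read_all_sentences(reader):
    # Hypothetical usage: drain the reader, collecting every successfully
    # parsed sentence until get_next_sentence() returns None at EOF.
    sentences = []
    while True:
        sentence = reader.get_next_sentence()
        if sentence is None:
            break
        sentences.append(sentence)
    return sentences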
Example #4
    def __align_tokens_to_text(self, document_text):
        eat_whitespace(document_text)
        self.document_char_offset = document_text.character_position

        # Root has no alignment to source.
        self.tokens[0].start_offset = None
        self.tokens[0].end_offset = None

        non_root_tokens = self.tokens[1:]
        for i, token in enumerate(non_root_tokens):
            # i is one less than the index of the current token in self.tokens,
            # because root.
            original = token.original_text
            if token.is_absent:
                # Handle case of duplicated character, which is the only type of
                # absent token that will have been detected so far.
                prev_token = self.tokens[i]
                if prev_token.original_text.endswith(original):
                    # print "Found duplicated token:", (
                    #    token.original_text.encode('utf-8'))
                    token.start_offset = prev_token.end_offset - len(original)
                    token.end_offset = prev_token.end_offset
            elif original == '.' and i == len(non_root_tokens) - 1:
                # End-of-sentence period gets special treatment: the "real"
                # original text may have been a period substitute or missing.
                # (Other things can get converted to fake end-of-sentence
                # periods to make life easier for the parser.)
                start_pos = document_text.tell()
                eaten_ws = eat_whitespace(document_text, True)
                not_at_eof = not is_at_eof(document_text)
                next_char, next_is_period_sub = peek_and_revert_unless(
                    document_text,
                    lambda char: self.PERIOD_SUBSTITUTES.find(char) != -1)
                if (not_at_eof and next_is_period_sub):
                    # We've moved the stream over the period, so adjust offset.
                    token.start_offset = (document_text.character_position
                                          - self.document_char_offset - 1)
                    token.end_offset = token.start_offset + 1
                    token.original_text = next_char
                    self.original_text += eaten_ws + next_char
                else:
                    # The period is actually not there.
                    token.is_absent = True
                    token.original_text = ''
                    document_text.seek(start_pos)
            else: # Normal case: just read the next token.
                search_start = document_text.character_position
                # Our preprocessing may hallucinate periods onto the ends of
                # abbreviations, particularly "U.S." Deal with them.
                if original[-1] == '.':
                    token_text_to_find = original[:-1]
                else:
                    token_text_to_find = original

                text_until_token, found_token = (
                    read_stream_until(document_text, token_text_to_find, True))
                self.original_text += text_until_token
                assert found_token, (
                    (u'Could not find token "%s" starting at position %d '
                     '(accumulated: %s)') % (
                    original, search_start, self.original_text)).encode('utf-8')

                if original[-1] == '.':
                    # If it ends in a period, and the next character in the
                    # stream is a period, it's a duplicated period. Advance
                    # over the period and append it to the accumulated text.
                    _, is_period = peek_and_revert_unless(
                        document_text, lambda char: char == '.')
                    if is_period:
                        self.original_text += '.'
                token.end_offset = (document_text.character_position
                                    - self.document_char_offset)
                token.start_offset = token.end_offset - len(original)

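
This alignment code leans heavily on peek_and_revert_unless. Judging from its uses above, it reads a single character, leaves the stream advanced past it only if the supplied predicate accepts it, and otherwise rewinds, returning the character and whether the predicate matched. A sketch under those assumptions:

def peek_and_revert_unless(stream, condition):
    # Hypothetical sketch: look at the next character, but keep the stream
    # advanced past it only when `condition(char)` holds; otherwise seek
    # back so the character can still be read normally later.
    position = stream.tell()
    char = stream.read(1)
    matched = bool(char) and condition(char)
    if not matched:
        stream.seek(position)
    return char, matched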
Example #5
    def test_read_stream_until_failure(self):
        data, result = read_stream_until(self.stream, 'q')
        self.assertEqual(data, self.data_str)
        self.assertFalse(result)
        self.assertEqual(self.stream.tell(), len(self.data_str))
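
For completeness, the failure-case semantics exercised here can be reproduced in isolation (using io.StringIO as a stand-in stream and the read_stream_until sketch from Example #1):

import io

stream = io.StringIO(u'first line\nsecond line\n')
data, found = read_stream_until(stream, 'q')  # 'q' never occurs in the data
assert data == u'first line\nsecond line\n'
assert not found
assert stream.tell() == len(data)  # the whole stream was consumed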