Example #1
class CausationInstance(_RelationInstance):
    Degrees = Enum(['Facilitate', 'Enable', 'Disentail', 'Inhibit'])
    CausationTypes = Enum(
        ['Consequence', 'Motivation', 'Purpose', 'Inference'])
    _types = CausationTypes
    _num_args = 3

    def __init__(self,
                 source_sentence,
                 degree=None,
                 causation_type=None,
                 connective=None,
                 cause=None,
                 effect=None,
                 means=None,
                 annotation_id=None):
        if degree is None:
            degree = len(self.Degrees)
        if causation_type is None:
            causation_type = len(self.CausationTypes)

        super(CausationInstance,
              self).__init__(source_sentence, connective, cause, effect,
                             causation_type, annotation_id)
        self.degree = degree
        self.arg2 = means

    # Map argument attribute names to arg_i attributes.
    arg_names = bidict({'arg0': 'cause', 'arg1': 'effect', 'arg2': 'means'})
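
A small self-contained sketch of the defaulting convention in __init__ above: plain lists stand in for the project's index-based Enum objects, and len(...) appears to serve as an "unspecified" sentinel one past the last valid index (an interpretation for illustration, not project code).

Degrees = ['Facilitate', 'Enable', 'Disentail', 'Inhibit']
CausationTypes = ['Consequence', 'Motivation', 'Purpose', 'Inference']

degree, causation_type = None, None
if degree is None:
    degree = len(Degrees)                  # 4: one past the valid indices
if causation_type is None:
    causation_type = len(CausationTypes)   # 4: "not annotated"

assert degree not in range(len(Degrees))   # sentinel, not a real degree value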
Example #2
class OverlappingRelationInstance(_RelationInstance):
    RelationTypes = Enum([
        'Temporal', 'Correlation', 'Hypothetical', 'Obligation_permission',
        'Creation_termination', 'Extremity_sufficiency', 'Context'
    ])
    _types = RelationTypes

    def __init__(self,
                 source_sentence,
                 rel_type=None,
                 connective=None,
                 arg0=None,
                 arg1=None,
                 annotation_id=None,
                 attached_causation=None):
        if rel_type is None:
            rel_type = set()  # overlapping rel can have multiple types

        all_args = locals().copy()
        del all_args['self']
        del all_args['attached_causation']
        super(OverlappingRelationInstance, self).__init__(**all_args)

        self.attached_causation = attached_causation

    def get_interpretable_type(self):
        if self.type:
            return set(self._types[t] for t in self.type)
        else:
            return set(['UNKNOWN'])

    def _get_type_str(self):
        return '+'.join(self.get_interpretable_type())
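
The locals().copy() trick in __init__ above is worth isolating. Below is a minimal self-contained sketch of the same argument-forwarding idiom; the Base/Derived classes are made-up stand-ins, not project code.

class Base(object):
    def __init__(self, a, b=None):
        self.a, self.b = a, b

class Derived(Base):
    def __init__(self, a, b=None, extra=None):
        all_args = locals().copy()
        del all_args['self']
        del all_args['extra']  # drop arguments the base class doesn't accept
        super(Derived, self).__init__(**all_args)
        self.extra = extra

d = Derived(1, b=2, extra=3)
assert (d.a, d.b, d.extra) == (1, 2, 3)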
Example #3
class FeatureExtractor(object):
    FeatureTypes = Enum(['Categorical', 'Numerical', 'Binary'])
    '''
    Whether extract() can return features not registered by
    extract_subfeature_names when run on the same set of instances. Should be
    overridden in extractor classes where this is true.
    '''
    _EXTRACT_PRODUCES_VALUES_TO_IGNORE = False

    def __init__(self, name, extractor_fn, feature_type=None):
        if feature_type is None:
            feature_type = self.FeatureTypes.Categorical
        self.name = name
        self.feature_type = feature_type
        self._extractor_fn = extractor_fn

    def extract_subfeature_names(self, instances):
        if self.feature_type == self.FeatureTypes.Categorical:
            values_set = set(self._extractor_fn(part) for part in instances)
            return [
                self._get_categorical_feature_name(self.name, value)
                for value in values_set
            ]
        else:  # feature_type == Numerical or feature_type == Binary
            return [self.name]

    def extract(self, part):
        '''
        Returns a dictionary of subfeature name -> subfeature value. More
        complex feature extractor classes should override this function.
        '''
        feature_value = self._extractor_fn(part)
        if self.feature_type == self.FeatureTypes.Categorical:
            feature_name = self._get_categorical_feature_name(
                self.name, feature_value)
            return {feature_name: 1.0}
        else:  # feature_type == Numerical or feature_type == Binary
            return {self.name: feature_value}

    def extract_all(self, parts):
        return [self.extract(part) for part in parts]

    @staticmethod
    def _get_categorical_feature_name(base_name, value):
        return '%s=%s' % (base_name, value)

    def __repr__(self):
        return '<Feature extractor: %s>' % self.name
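
A hedged usage sketch of the extractor API above. It assumes FeatureExtractor and its Enum dependency are importable; the Token namedtuple and the extractor function are hypothetical stand-ins.

from collections import namedtuple

Token = namedtuple('Token', ['lemma', 'pos'])  # hypothetical stand-in
tokens = [Token('because', 'IN'), Token('cause', 'NN')]

pos_extractor = FeatureExtractor('pos', lambda tok: tok.pos)
assert sorted(pos_extractor.extract_subfeature_names(tokens)) == ['pos=IN', 'pos=NN']
assert pos_extractor.extract(tokens[0]) == {'pos=IN': 1.0}
assert pos_extractor.extract_all(tokens) == [{'pos=IN': 1.0}, {'pos=NN': 1.0}]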
Example #4
class StanfordNERStage(Stage):
    NER_TYPES = Enum(['Person', 'Organization', 'Location', 'O'])

    def __init__(self, name):
        self.name = name
        self.model = Model()

    def train(self, documents, instances_by_doc):
        pass

    def _test_documents(self, documents, sentences_by_doc, writer):
        model_path = path.join(FLAGS.stanford_ner_path, 'classifiers',
                               FLAGS.stanford_ner_model_name)
        jar_path = path.join(FLAGS.stanford_ner_path, FLAGS.stanford_ner_jar)
        tagger = SentenceSplitStanfordNERTagger(model_path, jar_path)
        tokens_by_sentence = [
            [StanfordParsedSentence.escape_token_text(token.original_text)
             # Omit fictitious tokens.
             for token in sentence.tokens if token.start_offset is not None]
            for sentence in chain.from_iterable(sentences_by_doc)]

        # Batch process sentences (faster than repeatedly running Stanford NLP)
        ner_results = tagger.tag_sents(tokens_by_sentence)
        all_sentences = chain.from_iterable(sentences_by_doc)
        for sentence, sentence_result in zip(all_sentences, ner_results):
            sentence_result_iter = iter(sentence_result)
            for token in sentence.tokens:
                if token.start_offset is None: # Ignore fictitious tokens.
                    token.ner_tag = None
                else:
                    # Throws StopIteration if result is too short.
                    _token_text, tag = next(sentence_result_iter)
                    token.ner_tag = self.NER_TYPES.index(tag.title())
            # Make sure there are no extra tags for the sentence. NLTK is dumb.
            try:
                next(sentence_result_iter)
                assert(False)
            except StopIteration:
                pass

            if writer:
                writer.instance_complete(sentence)
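
A self-contained sketch of the "consume results in lockstep, then verify the iterator is exhausted" pattern used in _test_documents above; the tokens and NER results are invented.

tokens = ['ROOT', 'John', 'works', 'at', 'Google']  # 'ROOT' stands in for a fictitious token
results = [('John', 'PERSON'), ('works', 'O'), ('at', 'O'), ('Google', 'ORGANIZATION')]
result_iter = iter(results)
tags = []
for token in tokens:
    if token == 'ROOT':  # fictitious tokens get no tag
        tags.append(None)
    else:
        _text, tag = next(result_iter)
        tags.append(tag)
# Any leftover results would indicate a tokenization mismatch.
try:
    next(result_iter)
    raise AssertionError('Extra NER results for sentence')
except StopIteration:
    pass
assert tags == [None, 'PERSON', 'O', 'O', 'ORGANIZATION']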
Example #5
class RegexConnectiveModel(Model):
    def __init__(self, *args, **kwargs):
        super(RegexConnectiveModel, self).__init__(*args, **kwargs)
        self.regexes = []

    def _train_model(self, sentences):
        self.regexes = [
            (re.compile(pattern), matching_groups)
            for pattern, matching_groups in self._extract_patterns(sentences)
        ]

    def test(self, sentences):
        logging.info('Tagging possible connectives...')
        start_time = time.time()

        for sentence in sentences:
            sentence.possible_causations = []

            tokens = sentence.tokens[1:]  # skip ROOT
            if FLAGS.regex_include_pos:
                lemmas_to_match = [
                    '%s/%s' % (token.lemma, token.get_gen_pos())
                    for token in tokens
                ]
            else:
                lemmas_to_match = [token.lemma for token in tokens]
            # Remember bounds of tokens so that we can recover the correct
            # tokens from regex matches.
            token_bounds = []
            # Final space eases matching
            string_to_match = ' '.join(lemmas_to_match) + ' '
            next_start = 0
            for lemma in lemmas_to_match:
                token_bounds.append((next_start, next_start + len(lemma)))
                next_start += len(lemma) + 1

            # More than one pattern may match a given connective. We record
            # which patterns matched which sets of connective words.
            matches = defaultdict(list)
            for regex, matching_group_indices in self.regexes:
                match = regex.search(string_to_match)
                while match is not None:
                    # We need to add 1 to indices to account for root.
                    token_indices = tuple(
                        token_bounds.index(match.span(i)) + 1
                        for i in matching_group_indices)
                    matches[token_indices].append(regex.pattern)
                    # Skip past the first token that matched to start looking
                    # for the next match. This ensures that we won't match the
                    # same connective twice with this pattern.
                    # (We start from the end of the first group *after* the
                    # pattern start group.)
                    match = regex.search(string_to_match, pos=match.span(2)[1])

            for token_indices, matching_patterns in matches.items():
                connective_tokens = [sentence.tokens[i] for i in token_indices]
                true_causation_instance = None
                for causation_instance in sentence.causation_instances:
                    if causation_instance.connective == connective_tokens:
                        true_causation_instance = causation_instance

                possible_causation = PossibleCausation(
                    sentence, matching_patterns, connective_tokens,
                    true_causation_instance)
                sentence.possible_causations.append(possible_causation)

        elapsed_seconds = time.time() - start_time
        logging.info("Done tagging possible connectives in %0.2f seconds" %
                     elapsed_seconds)

    #####################################
    # Sentence preprocessing
    #####################################

    @staticmethod
    def _filter_sentences_for_pattern(sentences, pattern, connective_lemmas):
        possible_sentence_indices = []
        for i, sentence in enumerate(sentences):
            token_lemmas = [token.lemma for token in sentence.tokens]
            # TODO: Should we filter here by whether there are enough tokens in
            # the sentence to match the rest of the pattern, too?
            if all([
                    connective_lemma in token_lemmas
                    for connective_lemma in connective_lemmas
            ]):
                possible_sentence_indices.append(i)

        return possible_sentence_indices

    #####################################
    # Pattern generation
    #####################################

    CONNECTIVE_INTERJECTION_PATTERN = ARG_WORDS_PATTERN = '([\S]+ )+?'
    # Pattern can start after another word, or @ start of sentence
    PATTERN_START = '(^| )'
    TokenTypes = Enum(['Connective', 'Cause', 'Effect'])  # Also possible: None

    @staticmethod
    def _get_pattern(sentence, connective_tokens, cause_tokens, effect_tokens):
        connective_capturing_groups = []
        pattern = RegexConnectiveModel.PATTERN_START
        next_group_index = 2  # whole match is 0, and pattern start will add 1

        previous_token_type = None
        connective_tokens.sort(key=lambda token: token.index)  # just in case
        next_connective_index = 0
        for token in sentence.tokens[1:]:
            if (next_connective_index < len(connective_tokens) and token.index
                    == connective_tokens[next_connective_index].index):
                # We ensure above that every token lemma in the tested string
                # has a space after it, even the last token, so space is safe.
                if FLAGS.regex_include_pos:
                    pattern += '(%s/%s) ' % (token.lemma, token.get_gen_pos())
                else:
                    pattern += '(%s) ' % token.lemma
                previous_token_type = (
                    RegexConnectiveModel.TokenTypes.Connective)
                connective_capturing_groups.append(next_group_index)
                next_group_index += 1
                next_connective_index += 1
            else:
                if token in cause_tokens:
                    token_type = RegexConnectiveModel.TokenTypes.Cause
                elif token in effect_tokens:
                    token_type = RegexConnectiveModel.TokenTypes.Effect
                else:
                    token_type = None

                if previous_token_type != token_type:
                    if token_type is None:
                        # It's possible for a connective to be interrupted by a
                        # word that's not consistent enough to make it count as
                        # a connective token (e.g., a determiner).
                        if (token.index > connective_tokens[0].index
                                and next_connective_index <
                                len(connective_tokens)):
                            # We're in the middle of the connective
                            pattern += (RegexConnectiveModel.
                                        CONNECTIVE_INTERJECTION_PATTERN)
                            next_group_index += 1
                    else:  # we've transitioned from non-argument to argument
                        pattern += RegexConnectiveModel.ARG_WORDS_PATTERN
                        next_group_index += 1
                previous_token_type = token_type

        return pattern, connective_capturing_groups

    @staticmethod
    def _extract_patterns(sentences):
        # TODO: Extend this to work with cases of missing arguments.
        regex_patterns = []
        patterns_seen = set()

        if FLAGS.print_patterns:
            print 'Patterns:'
        for sentence in sentences:
            for instance in sentence.causation_instances:
                connective = instance.connective
                cause_tokens, effect_tokens = [
                    arg if arg is not None else []
                    for arg in [instance.cause, instance.effect]
                ]

                pattern, connective_capturing_groups = (
                    RegexConnectiveModel._get_pattern(sentence, connective,
                                                      cause_tokens,
                                                      effect_tokens))

                if pattern not in patterns_seen:
                    if FLAGS.print_patterns:
                        print ' ', pattern.encode('utf-8')
                        print '  Sentence:', sentence.original_text.encode(
                            'utf-8')
                        print
                    patterns_seen.add(pattern)
                    regex_patterns.append(
                        (pattern, connective_capturing_groups))
        return regex_patterns
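
The span-to-token mapping in test() above is the subtle part, so here is a self-contained sketch of it. The lemmas and pattern are invented, and the real code adds 1 to each index to account for ROOT.

import re

lemmas = ['this', 'lead', 'to', 'that']
string_to_match = ' '.join(lemmas) + ' '  # trailing space eases matching
token_bounds, next_start = [], 0
for lemma in lemmas:
    token_bounds.append((next_start, next_start + len(lemma)))
    next_start += len(lemma) + 1

# Group 1 is PATTERN_START; groups 2 and 3 capture the connective words.
match = re.search('(^| )(lead) (to) ', string_to_match)
token_indices = tuple(token_bounds.index(match.span(i)) for i in (2, 3))
assert token_indices == (1, 2)  # 'lead' and 'to'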
Example #6
class StanfordParsedSentence(object):
    PTB_ESCAPE_MAP = {'*': '\\*', '. . .': '...', '(': '-LRB-', ')': '-RRB-',
                      '{': '-LCB-', '}': '-RCB-', '[': '-LSB-', ']': '-RSB-'}
    PTB_UNESCAPE_MAP = {} # filled in later from PTB_ESCAPE_MAP below
    # TODO: Should we be allowing the parser to PTB-escape more things?
    PERIOD_SUBSTITUTES = '.:'
    SUBJECT_EDGE_LABELS = ['nsubj', 'csubj', 'nsubjpass', 'csubjpass']
    INCOMING_CLAUSE_EDGES = ['ccomp', 'xcomp', 'csubj', 'csubjpass', 'advcl',
                             'acl', 'acl:relcl'] # TODO: allow conj/parataxis?
    EDGE_REGEX = re.compile(
        "([A-Za-z_\\-/\\.':]+)\\((.+)-(\\d+)('*), (.+)-(\\d+)('*)\\)")
    DEPTH_EXCLUDED_EDGE_LABELS = ['ref']

    def __init__(self, tokenized_text, tagged_lemmas, penn_tree, edges,
                 document_text):
        '''
        `tokenized_text` and `tagged_lemmas` are the token and lemma strings
        from the parser.
        `edges` is a list of edge strings from the parser.
        `document_text` is an instance of
        `util.streams.CharacterTrackingStreamWrapper`. (Built-in stream types
        will *not* work.)
        '''
        self.next_sentence = None
        self.previous_sentence = None
        # TODO: move much of the initialization functionality, particularly
        # aligning tokens to text, into the reader class.
        self.tokens = []
        self.edge_labels = {} # maps (n1_index, n2_index) tuples to labels
        try:
            self.source_file_path = document_text.name
        except AttributeError:
            self.source_file_path = None

        # Declare a few variables that will be overwritten later, just so that
        # it's easy to tell what's in an instance of this class.
        self.edge_graph = csr_matrix((0, 0), dtype='float')
        self.document_char_offset = 0
        self.original_text = ''
        self.__depths = np.array([])
        self.path_predecessors = np.array([[]])
        self.path_costs = np.array([[]])

        token_strings, tag_strings = self.__get_token_strings(tokenized_text,
                                                              tagged_lemmas)

        copy_node_indices = self.__create_tokens(token_strings, tag_strings)
        self.__align_tokens_to_text(document_text)
        self.__create_edges(edges, copy_node_indices)

        if FLAGS.use_constituency_parse:
            self.constituency_tree = ImmutableParentedTree.fromstring(penn_tree)
            self.constituency_graph = nltk_tree_to_graph(self.constituency_tree)
            self.constituent_heads = collins_find_heads(self.constituency_tree)
        else:
            self.constituency_tree = None
            self.constituency_graph = None
            self.constituent_heads = None

    def __deepcopy__(self, memo): # Avoid massive memory and stack demands
        next_sent, prev_sent = self.next_sentence, self.previous_sentence
        self.next_sentence, self.previous_sentence = None, None
        copied = self.__class__.__new__(self.__class__)
        memo[id(self)] = copied
        for k, v in self.__dict__.items():
            setattr(copied, k, deepcopy(v, memo))
        self.next_sentence, self.previous_sentence = next_sent, prev_sent
        return copied

    @staticmethod
    def unescape_token_text(token_text):
        token_text = token_text.replace(u'\xa0', ' ')
        return StanfordParsedSentence.PTB_UNESCAPE_MAP.get(token_text,
                                                           token_text)

    @staticmethod
    def escape_token_text(token_text):
        token_text = token_text.replace(' ', u'\xa0')
        return StanfordParsedSentence.PTB_ESCAPE_MAP.get(token_text, token_text)

    @staticmethod
    def get_text_for_tokens(annotation_tokens):
        try:
            return ' '.join([token.original_text
                             for token in annotation_tokens])
        except TypeError: # Happens if None is passed
            return ''

    def get_depth(self, token):
        return self.__depths[token.index]

    def _token_is_preferred_for_head_to(self, new_token, old_token):
        # If the depths are equal, prefer verbs/copulas over nouns, and
        # nouns over others. This helps to get the correct heads for
        # fragmented spans, such as spans that consist of an xcomp and its
        # subject, as well as a few other edge cases.
        if self.is_clause_head(old_token):
            return False
        elif self.is_clause_head(new_token):
            return True
        elif old_token.pos in Token.NOUN_TAGS:
            return False
        elif new_token.pos in Token.NOUN_TAGS:
            return True
        else:
            return False

    def get_head(self, tokens):
        # TODO: Update to match SEMAFOR's heuristic algorithm?
        min_depth = np.inf
        head = None
        for token in tokens:
            # Ignore annotation tokens from outside this sentence.
            # Really this check should be an is, but we use != to make it work
            # on Frozen sentences.
            if token.parent_sentence != self:
                continue
            depth = self.get_depth(token)
            parent_of_current_head = head in self.get_children(token, '*')
            child_of_current_head = (head is not None and
                                     token in self.get_children(head, '*'))
            if ((depth < min_depth or parent_of_current_head)
                and not child_of_current_head):
                head = token
                min_depth = depth
            elif (depth == min_depth and
                  head is not None and
                  self._token_is_preferred_for_head_to(token, head)):
                logging.debug(
                    u"Preferring %s over %s as head of '%s' in '%s'" %
                    (token, head,
                     u' '.join([t.original_text for t in tokens]),
                     tokens[0].parent_sentence.original_text))
                head = token
                min_depth = depth

        if head is None:
            logging.warn('Returning null head for tokens %s' % tokens)
        return head

    def count_words_between(self, token1, token2):
        ''' Counts words between tokens based purely on the token IDs,
            discounting punctuation tokens. '''
        assert (self.tokens[token1.index] == token1 and
                self.tokens[token2.index] == token2), "Tokens not in sentence"
        switch = token1.index > token2.index
        if switch:
            token1, token2 = token2, token1
        words_between = -1
        for token in self.tokens[token1.index : token2.index + 1]:
            if token.pos[0].isalnum():
                words_between += 1
        # return -words_between if switch else words_between
        return words_between

    def get_most_direct_parent(self, token):
        '''
        Returns a tuple (e, p), where p is the parent of the given token along
        the shortest path to root and e is the label of the edge from p to
        token.
        '''
        if token.parent_sentence is not self:
            return (None, None) # not in the parse tree

        # We can't use self.path_predecessors because it was computed in an
        # essentially undirected fashion. Instead, we find all parents, and
        # select the one whose directed depth is lowest (i.e., with the shortest
        # directed path to root).
        incoming = self.edge_graph[:, token.index]
        nonzero = incoming.nonzero()[0]
        if not nonzero.any():
            return (None, None)

        min_depth = np.inf
        for edge_start_index in nonzero:
            next_depth = self.__depths[edge_start_index]
            if next_depth < min_depth:
                min_depth = next_depth
                parent_index = edge_start_index
        edge_label = self.edge_labels[(parent_index, token.index)]
        return (edge_label, self.tokens[parent_index])

    def get_children(self, token, edge_type=None):
        '''
        If `edge_type` is given, returns a list of children of token related by
        an edge with label edge_type. Otherwise, returns a list of
        (edge_label, child_token) tuples.

        `edge_type` may be a single type or a list of types. The special value
        '*' indicates that all children should be returned, without edge labels.
        '''
        if token.parent_sentence is not self:
            return [] # not in the parse tree

        # Grab the sparse column of the edge matrix with the edges of this
        # token. Iterate over the edge end indices therein.
        if edge_type:
            if edge_type == '*':
                return [self.tokens[edge_end_index] for edge_end_index
                        in self.edge_graph[token.index].indices]
            else:
                edge_type = listify(edge_type)
                return [self.tokens[edge_end_index] for edge_end_index
                        in self.edge_graph[token.index].indices
                        if (self.edge_labels[(token.index, edge_end_index)]
                            in edge_type)]
        else:
            return [(self.edge_labels[(token.index, edge_end_index)],
                     self.tokens[edge_end_index])
                    for edge_end_index in self.edge_graph[token.index].indices]

    def is_copula_head(self, token):
        if token.parent_sentence is not self:
            return False

        # Grab the sparse column of the edge matrix with the edges of this
        # token, and check the labels on each non-zero edge.
        for edge_end_index in self.edge_graph[token.index].indices:
            # A copula edge to a child also indicates a clause.
            if self.edge_labels[(token.index, edge_end_index)] == 'cop':
                return True
        return False

    def is_clause_head(self, token):
        if token.parent_sentence is not self:
            return False

        if token.pos == 'ROOT':
            return False
        if token.pos in Token.VERB_TAGS:
            # Modals, though verbs, aren't clause heads.
            return token.pos != 'MD'

        if self.is_copula_head(token):
            return True

        incoming = self.edge_graph[:, token.index]
        for edge_start_index in incoming.nonzero()[0]:
            # An incoming clause edge also indicates a clause.
            if (self.edge_labels[(edge_start_index, token.index)]
                    in self.INCOMING_CLAUSE_EDGES):
                return True

        return False

    def extract_dependency_path(self, source, target, include_conj=True):
        assert source.parent_sentence is self and target.parent_sentence is self
        edges = []
        while target is not source:
            predecessor_index = self.path_predecessors[source.index,
                                                         target.index]
            if predecessor_index == -9999:
                raise DependencyPathError(source, target)
            predecessor = self.tokens[predecessor_index]

            try:
                # Normal case: the predecessor is the source of the edge.
                label = self.edge_labels[(predecessor_index, target.index)]
                start, end = predecessor, target
            except KeyError:
                # Back edge case: the predecessor is the target of the edge.
                label = self.edge_labels[(target.index, predecessor_index)]
                start, end = target, predecessor
            if label != 'conj' or include_conj:
                edges.append((start, end, label))
            target = predecessor
        return DependencyPath(source, reversed(edges))

    def get_closest_of_tokens(self, source, possible_targets, use_tree=True):
        '''
        Finds the token among possible_targets closest to source. If use_tree
        is True, distance is determined by distance in the parse tree;
        otherwise, distance is simple lexical distance (which may be negative).
        Returns the token, along with its distance. If none of the possible
        targets is reachable, returns (None, np.inf).
        '''
        if source.parent_sentence is not self:
            return (None, np.inf)

        if not possible_targets:
            raise ValueError("Can't find closest of 0 tokens")

        min_distance = np.inf
        for target in possible_targets:
            if target.parent_sentence is not self:
                continue
            if use_tree:
                next_distance = self.path_costs[source.index, target.index]
            else:
                next_distance = source.index - target.index
            if next_distance < min_distance:
                closest = target
                min_distance = next_distance
        if min_distance == np.inf: # source or all targets aren't in tree
            closest = None

        return closest, min_distance

    def get_constituency_node_for_tokens(self, tokens):
        # Token indices include ROOT, so we subtract 1 to get indices that will
        # match NLTK's leaf indices.
        indices = [token.index - 1 for token in tokens
                   if token.parent_sentence is self]
        try:
            treeposition = self.constituency_tree.treeposition_spanning_leaves(
                min(indices), max(indices) + 1) # +1 b/c of Python-style ranges
        except AttributeError: # self.constituency_tree is None
            if not FLAGS.use_constituency_parse:
                raise ValueError('Constituency parses not in use')
            else:
                raise

        node = self.constituency_tree[treeposition]
        if not isinstance(node, Tree): # We got a treeposition of a leaf string
            node = self.constituency_tree[treeposition[:-1]]
        return node

    def get_token_for_constituency_node(self, node):
        if not is_parent_of_leaf(node):
            raise ValueError("Node is not a parent of a leaf: %s" % node)
        node_leaf = node[0]
        for i, leaf in enumerate(node.root().leaves()):
            if leaf is node_leaf: # identity, not equality
                return self.tokens[i]
        if not FLAGS.use_constituency_parse:
            raise ValueError('Constituency parses not in use')
        else:
            raise ValueError("Somehow you passed a node whose leaf isn't under"
                             " its root. Wow.")

    DOMINATION_DIRECTION = Enum(['Dominates', 'DominatedBy', 'Independent'])
    def get_domination_relation(self, token1, token2):
        # TODO: do we need to worry about conj's here?
        path = self.extract_dependency_path(token1, token2, True)
        last_node = path.start
        all_forward = True
        all_backward = True
        for source, target, _dep_name in path:
            if source is last_node: # forward edge
                all_backward = False
                if not all_forward:
                    break
                last_node = target
            else: # back edge
                all_forward = False
                if not all_backward:
                    break
                last_node = source
        if all_forward:
            return self.DOMINATION_DIRECTION.Dominates
        elif all_backward:
            return self.DOMINATION_DIRECTION.DominatedBy
        else:
            return self.DOMINATION_DIRECTION.Independent

    @staticmethod
    def is_contiguous(tokens):
        last_index = tokens[0].index
        sentence = tokens[0].parent_sentence
        for token in tokens[1:]:
            if sentence is token.parent_sentence and (
                token.pos in Token.PUNCT_TAGS or token.index == last_index + 1):
                last_index = token.index
                sentence = token.parent_sentence
            else:
                return False

        return True

    ###########################################
    # Private initialization support functions
    ###########################################

    @staticmethod
    def __get_token_strings(tokenized_text, tagged_lemmas):
        '''
        This is basically a wrapper for the string split function, which also
        combines adjacent tokens if there are spaces within tokens. This is
        detected by looking for a lack of a '/' in the tagged lemma.
        '''
        token_strings = tokenized_text.split(' ')
        lemma_strings = tagged_lemmas.split(' ')
        assert len(token_strings) == len(lemma_strings), (
            "Tokens do not match tags")

        if all('/' in lemma for lemma in lemma_strings):
            return token_strings, lemma_strings

        final_token_strings = []
        final_lemma_strings = []
        tokens_to_accumulate = []
        lemmas_to_accumulate = []
        for token, lemma in zip(token_strings, lemma_strings):
            tokens_to_accumulate.append(token)
            lemmas_to_accumulate.append(lemma)
            if '/' in lemma:
                final_token_strings.append(' '.join(tokens_to_accumulate))
                final_lemma_strings.append(' '.join(lemmas_to_accumulate))
                tokens_to_accumulate = []
                lemmas_to_accumulate = []
        return final_token_strings, final_lemma_strings

    def __create_tokens(self, token_strings, tag_strings):
        # We need one more node than we have token strings (for root).
        copy_node_indices = [None for _ in range(len(token_strings) + 1)]
        root = self.__add_new_token('', 'ROOT', 'ROOT')
        copy_node_indices[0] = [root.index]

        for i, (token_str, tag_str) in (
                enumerate(zip(token_strings, tag_strings))):
            # Can't use str.partition because there may be a '/' in the token.
            slash_index = tag_str.rindex('/')
            lemma = tag_str[:slash_index]
            pos = tag_str[slash_index + 1:]
            new_token = self.__add_new_token(
                self.unescape_token_text(token_str), pos, lemma)
            # Detect duplicated tokens.
            if (lemma == '.' and pos == '.'
                    # Previous token is in self.tokens[i], not i-1: root is 0.
                    and self.tokens[i].original_text.endswith('.')):
                new_token.is_absent = True

            copy_node_indices[i + 1] = [new_token.index]

        return copy_node_indices

    def __add_new_token(self, *args, **kwargs):
        new_token = Token(len(self.tokens), self, *args, **kwargs)
        self.tokens.append(new_token)
        return new_token

    def __align_tokens_to_text(self, document_text):
        eat_whitespace(document_text)
        self.document_char_offset = document_text.character_position

        # Root has no alignment to source.
        self.tokens[0].start_offset = None
        self.tokens[0].end_offset = None

        non_root_tokens = self.tokens[1:]
        for i, token in enumerate(non_root_tokens):
            # i is one less than the index of the current token in self.tokens,
            # because root.
            original = token.original_text
            if token.is_absent:
                # Handle case of duplicated character, which is the only type of
                # absent token that will have been detected so far.
                prev_token = self.tokens[i]
                if prev_token.original_text.endswith(original):
                    # print "Found duplicated token:", (
                    #    token.original_text.encode('utf-8'))
                    token.start_offset = prev_token.end_offset - len(original)
                    token.end_offset = prev_token.end_offset
            elif original == '.' and i == len(non_root_tokens) - 1:
                # End-of-sentence period gets special treatment: the "real"
                # original text may have been a period substitute or missing.
                # (Other things can get converted to fake end-of-sentence
                # periods to make life easier for the parser.)
                start_pos = document_text.tell()
                eaten_ws = eat_whitespace(document_text, True)
                not_at_eof = not is_at_eof(document_text)
                next_char, next_is_period_sub = peek_and_revert_unless(
                    document_text,
                    lambda char: self.PERIOD_SUBSTITUTES.find(char) != -1)
                if (not_at_eof and next_is_period_sub):
                    # We've moved the stream over the period, so adjust offset.
                    token.start_offset = (document_text.character_position
                                          - self.document_char_offset - 1)
                    token.end_offset = token.start_offset + 1
                    token.original_text = next_char
                    self.original_text += eaten_ws + next_char
                else:
                    # The period is actually not there.
                    token.is_absent = True
                    token.original_text = ''
                    document_text.seek(start_pos)
            else: # Normal case: just read the next token.
                search_start = document_text.character_position
                # Our preprocessing may hallucinate periods onto the ends of
                # abbreviations, particularly "U.S." Deal with them.
                if original[-1] == '.':
                    token_text_to_find = original[:-1]
                else:
                    token_text_to_find = original

                text_until_token, found_token = (
                    read_stream_until(document_text, token_text_to_find, True))
                self.original_text += text_until_token
                assert found_token, (
                    (u'Could not find token "%s" starting at position %d '
                     '(accumulated: %s)') % (
                    original, search_start, self.original_text)).encode('utf-8')

                if original[-1] == '.':
                    # If it ends in a period, and the next character in the
                    # stream is a period, it's a duplicated period. Advance
                    # over the period and append it to the accumulated text.
                    _, is_period = peek_and_revert_unless(
                        document_text, lambda char: char == '.')
                    if is_period:
                        self.original_text += '.'
                token.end_offset = (document_text.character_position
                                    - self.document_char_offset)
                token.start_offset = token.end_offset - len(original)

            '''
            if not token.is_absent:
                print "Annotated token span: ", token.start_offset, ",", \
                    token.end_offset, 'for', \
                    token.original_text.encode('utf-8') + '. Annotated text:',\
                    (self.original_text[token.start_offset:token.end_offset]
                    ).encode('utf-8')
            '''

    def __make_token_copy(self, token_index, copy_num, copy_node_indices):
        copies = copy_node_indices[token_index]
        token = self.tokens[token_index]
        while copy_num >= len(copies):
            self.__add_new_token(token.original_text, token.pos, token.lemma,
                                 token.start_offset, token.end_offset,
                                 token.is_absent, token)
            copies.append(len(self.tokens) - 1)

    def __create_edges(self, edges, copy_node_indices):
        edge_lines = [line for line in edges if line] # skip blanks
        matches = [StanfordParsedSentence.EDGE_REGEX.match(edge_line)
                   for edge_line in edge_lines]

        # First, we need to create tokens for all the copy nodes so that we have
        # the right size matrix for the graph.
        for match_result, edge_line in zip(matches, edge_lines):
            assert match_result, \
                'Improperly constructed edge line: %s' % edge_line
            arg1_index, arg1_copy, arg2_index, arg2_copy = \
                match_result.group(3, 4, 6, 7)
            self.__make_token_copy(int(arg1_index), len(arg1_copy),
                                   copy_node_indices)
            self.__make_token_copy(int(arg2_index), len(arg2_copy),
                                   copy_node_indices)

        # Now, we can actually create the matrix and insert all the edges.
        num_nodes = len(self.tokens)
        self.edge_graph = lil_matrix((num_nodes, num_nodes), dtype='float')
        graph_excluded_edges = [] # edges that shouldn't be used for graph algs
        for match_result in matches:
            (relation, _arg1_lemma, arg1_index, arg1_copy, _arg2_lemma,
             arg2_index, arg2_copy) = match_result.group(*range(1,8))
            arg1_index = int(arg1_index)
            arg2_index = int(arg2_index)

            token_1_idx = copy_node_indices[arg1_index][len(arg1_copy)]
            token_2_idx = copy_node_indices[arg2_index][len(arg2_copy)]
            # TODO: What should we do about the cases where there are
            # multiple labels for the same edge? (e.g., conj and ccomp)
            self.edge_labels[(token_1_idx, token_2_idx)] = relation
            if relation in self.DEPTH_EXCLUDED_EDGE_LABELS:
                graph_excluded_edges.append((token_1_idx, token_2_idx))
            else:
                self.edge_graph[token_1_idx, token_2_idx] = 1.0
        self._initialize_graph(graph_excluded_edges)

    def _initialize_graph(self, graph_excluded_edges):
        # Convert to CSR for shortest path (which would do it anyway) and to
        # make self.get_children() below work.
        self.edge_graph = self.edge_graph.tocsr()
        self.__depths = bfs_shortest_path_costs(self.edge_graph, 0)

        '''
        For the undirected shortest paths we save, we'll want to:
         a) prefer xcomp-> __ ->nsubj paths over nsubj-> __ <-nsubj and
            nsubj<- __ ->xcomp paths.
         b) disprefer paths that rely on expletives and acls.
         c) treat the graph as undirected, EXCEPT for edges where we already
            have a reverse edge, in which case that edge's weight should be
            left alone.
        We adjust the graph accordingly.
        '''
        # Adjust edge weights to make better paths preferred.
        for edge, label in self.edge_labels.iteritems():
            if label == 'xcomp':
                self.edge_graph[edge] = 0.98
                edge_end_token = self.tokens[edge[1]]
                subj_children = self.get_children(
                    edge_end_token, self.SUBJECT_EDGE_LABELS)
                for child in subj_children:
                    self.edge_graph[edge[1], child.index] = 0.985
            elif label == 'expl' or label.startswith('acl'):
                self.edge_graph[edge] = 1.01

        # Create duplicate edges to simulate undirectedness, EXCEPT where we
        # already have an edge in the opposite direction. For this we use a
        # copy of the graph, since we don't actually want to pollute edge_graph
        # with the reverse arcs.
        pseudo_unweighted_graph = self.edge_graph.tolil()

        nonzero = set([(i, j) for (i, j) in zip(
            *pseudo_unweighted_graph.nonzero())])
        for (i, j) in nonzero:
            if (j, i) not in nonzero:
                pseudo_unweighted_graph[j, i] = pseudo_unweighted_graph[i, j]

        self.path_costs, self.path_predecessors = csgraph.shortest_path(
            pseudo_unweighted_graph, return_predecessors=True, directed=True)

        # Add in edges that we didn't want to use for distances/shortest path,
        # ignoring all the changes made to undirected_graph.
        # (Originally we were converting to LIL for efficiency, but it turned
        # out to hurt performance more than it helped.)
        # TODO: Should we convert if there were excluded edges?
        # self.edge_graph = self.edge_graph.tolil()
        for start, end in graph_excluded_edges:
            self.edge_graph[start, end] = 1.0
        self.edge_graph = self.edge_graph.tocsr()

    def __unicode__(self):
        parse_lines = [u'%s(%s-%d, %s-%d)'
                       % (label, self.tokens[edge[0]].lemma, edge[0],
                          self.tokens[edge[1]].lemma, edge[1])
                       for edge, label in sorted(self.edge_labels.iteritems())]
        return u'%s\n\n%s' % (self.original_text, u'\n'.join(parse_lines))

    def __str__(self):
        return self.__unicode__().encode('utf-8')
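
A minimal sketch of the sparse dependency-graph representation this class builds, using a four-token toy graph. The edge labels are invented, and the project computes depths with its own bfs_shortest_path_costs helper rather than the csgraph call shown here.

from scipy.sparse import lil_matrix
from scipy.sparse import csgraph

# Nodes are token indices, with 0 reserved for ROOT.
edge_labels = {(0, 2): 'root', (2, 1): 'nsubj', (2, 3): 'dobj'}
edge_graph = lil_matrix((4, 4), dtype='float')
for start, end in edge_labels:
    edge_graph[start, end] = 1.0
edge_graph = edge_graph.tocsr()

# Depth of each token = unweighted shortest-path cost from ROOT.
depths = csgraph.shortest_path(edge_graph, indices=0, unweighted=True)
assert list(depths) == [0.0, 2.0, 1.0, 2.0]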
Example #7
            0]  # ndarray-like nonzero produces only 1 dim
    else:
        incoming = graph.getrow(node)
        return incoming.nonzero()[1]


class CycleError(Exception):
    def __init__(self):
        super(CycleError, self).__init__("Cycle detected; graph is not a DAG")


def topological_sort(tree, algorithm='tarjan'):
    return tarjan_topological_sort(tree)


_SORT_MARKS = Enum(['Unvisited', 'Visiting', 'Visited'])


def tarjan_topological_sort(tree):
    sorted_nodes = []
    # assumes a square matrix, which a graph should be
    marks = [_SORT_MARKS.Unvisited] * tree.shape[0]

    def visit(node):
        if isinstance(node, np.ndarray):
            raise Exception()
        if marks[node] == _SORT_MARKS.Visiting:
            raise CycleError()
        if marks[node] != _SORT_MARKS.Visited:
            marks[node] = _SORT_MARKS.Visiting
            for child in get_outgoing_indices(tree, node):
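
The excerpt above cuts off inside visit(); the following is a self-contained sketch of the same mark-based DFS topological sort over a plain adjacency dict rather than the project's sparse-matrix graphs (it raises a plain ValueError where the project uses CycleError).

UNVISITED, VISITING, VISITED = 0, 1, 2

def toposort(adjacency):
    order = []
    marks = dict((node, UNVISITED) for node in adjacency)

    def visit(node):
        if marks[node] == VISITING:
            raise ValueError('Cycle detected; graph is not a DAG')
        if marks[node] == UNVISITED:
            marks[node] = VISITING
            for child in adjacency[node]:
                visit(child)
            marks[node] = VISITED
            order.append(node)

    for node in adjacency:
        visit(node)
    return list(reversed(order))

assert toposort({'a': ['b', 'c'], 'b': ['c'], 'c': []}) == ['a', 'b', 'c']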
Example #8
    causations_to_keep = []
    # Process connectives biggest to smallest, discarding any that reuse tokens.
    # If we have two connectives of the same length competing for a token, this
    # will arbitrarily choose the first one we find.
    for connective_length in sorted(causations_by_size.keys(), reverse=True):
        for causation in causations_by_size[connective_length]:
            for conn_token in causation.connective:
                if tokens_used[conn_token.index]:
                    break
            else: # Executes only if loop over tokens didn't break
                causations_to_keep.append(causation)
                for conn_token in causation.connective:
                    tokens_used[conn_token.index] = True

    sentence.causation_instances = causations_to_keep
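
A self-contained sketch of the greedy filter implemented above, including the for/else idiom; the candidate data is invented.

candidates = [((3,), 'a'), ((1, 2), 'b'), ((2, 5), 'c')]  # (token indices, payload)
by_size = {}
for indices, payload in candidates:
    by_size.setdefault(len(indices), []).append((indices, payload))

tokens_used = {}
kept = []
for size in sorted(by_size, reverse=True):  # biggest connectives first
    for indices, payload in by_size[size]:
        for index in indices:
            if tokens_used.get(index):
                break
        else:  # runs only if no token of this candidate was already claimed
            kept.append(payload)
            for index in indices:
                tokens_used[index] = True

assert kept == ['b', 'a']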


RELATIVE_POSITIONS = Enum(['Before', 'Overlapping', 'After'])


def get_causation_tuple(connective_tokens, cause_head, effect_head):
    return (tuple(t.index for t in connective_tokens),
            cause_head.index if cause_head else None,
            effect_head.index if effect_head else None)


# Add some Colorama functionality.
for style, code in [('UNDERLINE', 4), ('BLINK', 5)]:
    setattr(colorama.Style, style, '\033[%dm' % code)