def _check_position(self, position: str, candidate: ArgumentCandidate,
                        matched_argument: ExtractedArgument):
        """
        Checks whether the given candidate and this argument are compatible with the given position
        :param position: a possible position for the candidate (a constant position name, like PREFIX, DOBJ and more)
        :param candidate: a candidate for this argument
        :param matched_argument: the extracted-argument object that corresponds to this lexical argument
        :return: True if the three (this argument, the given position and the candidate) are compatible
                 with regard to the linked argument, and False otherwise
        """

        candidate_token = candidate.get_token()
        matched_position = position

        # Check that this argument is compatible with that position
        if position not in self.positions:
            return False

        if position == POS_PREFIX:
            # Empty pattern means that this argument isn't compatible with any prefix position
            #if self.prefix_pattern == "":
            #	return False

            # Check whether the candidate is compatible with the prefix pattern
            # (the matched position becomes the concrete prefix string that was found)
            #matched_position = re.search(self.prefix_pattern, candidate_token._.subtree_text + " ", re.M)
            matched_position = self._check_prefix(candidate_token,
                                                  self.prefixes)
            if matched_position is None:
                return False

            #matched_position = matched_position.group().strip()
            #if not candidate.check_position(matched_position):
            #	return False

            # The prefix cannot be the entire argument
            if len(matched_position.split()) == len(
                    candidate_token._.subtree_text.split()):
                return False

        # A complement without a standard prefix position may also require a specific prefix constraint
        elif ARG_CONSTRAINT_REQUIRED_PREFIX in self.constraints and self._check_prefix(
                candidate_token, self.prefixes) is None:
            #					and re.search(self.prefix_pattern, candidate_token._.subtree_text + " ", re.M) is None:
            return False

        # Check whether the candidate is compatible with an *illegal* prefix pattern (if so — reject it)
        #if self.illegal_prefix_pattern != "" and \
        #		re.search(self.illegal_prefix_pattern, candidate_token._.subtree_text + " ", re.M) is not None:
        #	return False

        if self.illegal_prefixes != [] and self._check_prefix(
                candidate_token, self.illegal_prefixes):
            return False

        # Update the matched position of the given matched argument
        matched_argument.set_matched_position(matched_position)

        # Check the compatibility between the candidate and this argument
        return self._check_constraints(candidate_token, matched_argument)
    def determine_args_type2(self,
                             candidates_args,
                             predicate: ExtractedArgument,
                             verb,
                             default_subcat=False):
        """
        Determines the most appropriate complement type of each candidate, using a model
        :param candidates_args: a mapping from each candidate to its possible extracted arguments
        :param predicate: the predicate object of the arguments that we are after
        :param verb: the suitable verb for the predicate
        :param default_subcat: whether the relevant subcat is the default one (optional)
        :return: the determined arguments for each candidate (a dict of candidate to list of arguments)
        """

        # Complement types whose assignment must be resolved by the model
        uncertain_types = list(self.tagset.keys()) + (
            [COMP_PP1, COMP_PP2] if COMP_PP in self.tagset else [])
        uncertain_candidates = defaultdict(list)
        determined_dict = defaultdict(
            list)  # The determined arguments for each candidate
        predicate_token = predicate.get_token()

        # Each candidate should take one appropriate type, determined by the model
        for candidate, args in candidates_args.items():
            types = set([a.get_real_type() for a in args])
            candidate_span = args[0].as_span(trim_argument=False)

            # The candidate is compatible with some "certain" complements
            if not types.issubset(uncertain_types):
                determined_dict[candidate] = [
                    a for a in args if a.get_real_type() not in uncertain_types
                ]
                continue

            # Find the appropriate type and add this candidate to its list of options
            predicted_type, entropy, args = self.choose_arg_type(
                candidate_span, args, types, predicate_token, verb,
                default_subcat)
            if args:
                uncertain_candidates[predicted_type].append(
                    (candidate, entropy, args))

        # The predicate might also take an argument role
        predicate_type = predicate.get_name()
        if predicate_type:
            uncertain_candidates[predicate_type].append(
                (predicate_token, predicate.get_entropy(), [predicate]))

        # Now, determine for each complement type, the most appropriate candidates (using entropy)
        for arg_type, candidates_info in uncertain_candidates.items():
            # Lower entropy means a more confident model prediction
            candidates_info.sort(key=lambda c_info: c_info[1])

            # Choose the best candidate
            candidate, _, args = candidates_info[0]
            determined_dict[candidate] = args

            # The maximum number of compatible PP is 2 for the default subcat
            if arg_type == COMP_PP and len(
                    candidates_info) > 1 and default_subcat:
                candidate, _, args = candidates_info[1]
                determined_dict[candidate] = args

        return determined_dict
# Example #3
# 0
    def _update_unused_candidates(self,
                                  token_candidates: list,
                                  predicate_token: Token,
                                  used_tokens: list,
                                  extraction: dict,
                                  specify_none=False,
                                  trim_arguments=True):
        """
        Adds every unused candidate that looks like an argument to the NONE complement of the extraction
        :param token_candidates: all the candidates for the arguments
        :param predicate_token: the token of the predicate
        :param used_tokens: the tokens that were already used as arguments
        :param extraction: the extraction to update (a mapping of complement type to spans)
        :param specify_none: whether the NONE complement should be specified at all (optional)
        :param trim_arguments: whether to trim the resulting argument spans (optional)
        :return: None
        """

        if not specify_none:
            return

        extraction[COMP_NONE] = []

        # All the prefixes of the default-subcat arguments (used to filter NMOD/ACL candidates)
        default_args = self.entries[DEFAULT_ENTRY].subcats[DEFAULT_SUBCAT].arguments
        prepositions = list(
            itertools.chain.from_iterable(
                default_args[arg_type].prefixes
                for arg_type in [COMP_PP, COMP_IND_OBJ, COMP_SUBJ, COMP_OBJ]))

        # The dependency relations that may link a potential argument, per predicate kind
        if self.is_verb:
            relevant_links = [
                URELATION_NSUBJ, URELATION_IOBJ, URELATION_DOBJ,
                URELATION_NMOD_POSS, URELATION_NSUBJPASS, URELATION_NMOD
            ]
        else:
            relevant_links = [URELATION_NMOD, URELATION_COMPOUND, URELATION_ACL]

        # Any candidate that wasn't used as an argument may belong to the NONE complement
        for candidate in difference_list(token_candidates, used_tokens):
            token = candidate.get_token()

            # The candidate must be linked by a relevant relation and differ from the predicate
            if token.dep_ not in relevant_links or token.i == predicate_token.i:
                continue

            # Only noun-like tokens are considered
            if not token.pos_.startswith("N"):
                continue

            # NMOD/ACL candidates must start with one of the known prepositions
            if token.dep_ in [URELATION_NMOD, URELATION_ACL]:
                text_with_space = token._.subtree_text + " "

                if not any(text_with_space.startswith(prep) for prep in prepositions):
                    continue

            none_argument = ExtractedArgument(token, COMP_NONE)
            extraction[COMP_NONE].append(
                none_argument.as_span(trim_argument=trim_arguments))
    def _check_arguments_compatibility(self,
                                       args_per_candidate: dict,
                                       argument_types: list,
                                       argument_candidates: list,
                                       predicate: ExtractedArgument,
                                       is_required=False):
        """
        Checks the compatibility of the given argument types with each argument candidate
        :param args_per_candidate: the possible extracted arguments for each candidate (updated in place)
        :param argument_types: a list of argument types
        :param argument_candidates: the candidates for the arguments of this subcat (as list of tokens)
        :param predicate: the predicate object of the arguments that we are after
        :param is_required: whether every given argument type must match at least one candidate
        :return: None
        """

        predicate_token = predicate.get_token()

        for complement_type in argument_types:
            argument = self.arguments[complement_type]
            matched_any = False

            # Collect every candidate that matches this argument
            for candidate in argument_candidates:
                extracted = argument.check_match(candidate, predicate_token)

                if extracted is None:
                    continue

                args_per_candidate[candidate.get_token()].append(extracted)
                matched_any = True

            # A required argument type with no matching candidate aborts the check
            if is_required and not matched_any:
                return
    def check_match(self, candidate: ArgumentCandidate,
                    referenced_token: Token):
        """
        Checks whether the given candidate matches this argument
        :param candidate: a candidate for this argument
        :param referenced_token: the predicate of the arguments that we are after
        :return: the matched argument (ExtractedArgument object) if a match was found, or None otherwise
        """

        candidate_token = candidate.get_token()

        # Avoid arguments whose subtree contains the referenced token
        # This can happen when the dependency tree isn't projective
        subtree_indices = candidate_token._.subtree_indices
        start_i, end_i = subtree_indices[0], subtree_indices[-1]
        if candidate_token != referenced_token and start_i <= referenced_token.i <= end_i:
            #@TODO-check that only a fair amount of arguments are being cut here
            return None

        matched_argument = ExtractedArgument(candidate_token,
                                             self.get_complement_type())

        # The possible "position" types for the candidate (like DET-POSS, PREFIX and so on)
        # are based on the dependency relation that connects it to the rest of the tree
        for position in candidate.get_possible_positions():
            # The first position compatible with both this argument and the candidate wins
            if self._check_position(position, candidate, matched_argument):
                return matched_argument

        return None
    def _check_root(self, candidate_token: Token,
                    matched_argument: ExtractedArgument):
        """
        Checks that the root constraints of this argument hold for the given root word
        :param candidate_token: a token candidate for this argument
        :param matched_argument: the extracted-argument object for this lexical argument
        :return: True if the root doesn't contradict the root constraints of this argument, and False otherwise
        """

        # Check whether the matched position is a multi-word preposition and the candidate token is part of the preposition prefix
        # If so, then the "root" of the candidate should be the nearest connected token *after the preposition*, for the purpose of the next tests
        # Example- "... with regard to the man". The candidate token will be "regard". But we must check the constraints over "man"
        matched_position = matched_argument.get_position()
        # NOTE(review): lowercase positions appear to be concrete preposition strings
        # (constant positions like PREFIX/DOBJ are uppercase) — confirm against the position constants
        if matched_position.islower():
            candidate_index_in_arg = candidate_token.i - candidate_token._.subtree_indices[
                0]
            prep_length = len(matched_position.split(" "))

            if prep_length > 1 and candidate_index_in_arg < prep_length:
                #@TODO- is it right to use "wild card" relation here?
                end_of_preposition_idx = candidate_token._.subtree_indices[
                    0] + prep_length
                # Re-root the candidate at the nearest related token after the preposition
                candidate_token = get_word_in_relation(
                    candidate_token,
                    URELATION_ANY,
                    start_index=end_of_preposition_idx)

                # No related token exists after the preposition — the constraint cannot hold
                if candidate_token is None:
                    return False

        # The root must obey the universal-relation constraints of this argument
        if not check_relations(candidate_token, self.root_urelations):
            return False

        # ING and TO-INF complements may include be-form verb instead of the main verb of those complements
        # In such cases the "be" verb isn't the root, and the real root doesn't obey its contraints (cause it isn't the verb)
        if COMP_TO_INF in self.complement_type or "ING" in self.complement_type:
            needed_be_form = "be" if COMP_TO_INF in self.complement_type else "being"
            if check_relations(candidate_token,
                               [URELATION_COP + "_" + needed_be_form]):
                return True

        # The root POS tag must be one of the allowed tags (when such a constraint exists)
        if self.root_upostags != [] and candidate_token.pos_ not in self.root_upostags:
            return False

        #@TODO- can a determiner that isn't possessive pronoun be an NP argument?
        # Determiner roots of NP-like complements must be possessive pronouns
        if candidate_token.pos_ == UPOS_DET and self.complement_type in [
                COMP_SUBJ, COMP_OBJ, COMP_IND_OBJ, COMP_NP
        ] and candidate_token.orth_.lower() not in POSSESIVE_OPTIONS:
            return False

        # if self.root_pattern != "" and not re.search(self.root_pattern, candidate_token.orth_.lower(), re.M):
        # 	return False

        # An ING root pattern requires the root word to actually end with "ing"
        if PATTERN_ING in self.root_patterns and not candidate_token.orth_.lower(
        ).endswith("ing"):
            return False

        return True
    def _check_constraints(self, candidate_token: Token,
                           matched_argument: ExtractedArgument):
        """
        Checks whether the given candidate is compatible with the constraints of this argument
        :param candidate_token: a token candidate for this argument
        :param matched_argument: the extracted-argument object for this lexical argument
        :return: True if the candidate doesn't contradict the constraints, and False otherwise
        """

        # The root constraints must hold first
        if not self._check_root(candidate_token, matched_argument):
            return False

        ####################################
        # Check the boolean constraints

        # Handle optional possessive sub-argument
        if ARG_CONSTRAINT_OPTIONAL_POSSESSIVE in self.constraints:
            has_possessive = False

            # Look for a possessive token directly related to the candidate
            for relation in [URELATION_NMOD_POSS, URELATION_NSUBJ]:
                related_token = get_word_in_relation(candidate_token, relation)

                if related_token is not None and self._is_possessive(related_token):
                    has_possessive = True
                    break

            # Rename the matched argument, if needed, so its name agrees with the possessive finding
            if has_possessive and "POSSING" not in self.complement_type:
                # Possessive should be included
                matched_argument.set_name(
                    self.complement_type.replace("ING", "POSSING"))
            elif not has_possessive and "POSSING" in self.complement_type:
                # Possessive should be excluded
                matched_argument.set_name(
                    self.complement_type.replace("POSSING", "ING-ARBC"))

        return True
# Example #8
# 0
    def _wrap_predicate(self,
                        word: Token,
                        word_entry: Entry,
                        arguments_predictor=None):
        """
        Wraps the given predicate word as an extracted argument
        :param word: the predicate token
        :param word_entry: the lexicon entry of the predicate word
        :param arguments_predictor: the model-based extractor used to type nominal predicates (optional)
        :return: an ExtractedArgument object that represents the predicate
        """

        noun_type, entropy = None, None

        # For nominal predicates with a default entry, the predicate type is predicted by the model
        if not self.is_verb and arguments_predictor:
            if word_entry.is_default_entry():
                noun_type, entropy = arguments_predictor.determine_noun_type(word)

        return ExtractedArgument(word,
                                 noun_type,
                                 matched_position=POS_NOM,
                                 entropy=entropy)
    def _get_extractions(self,
                         args_per_candidate: dict,
                         predicate: ExtractedArgument,
                         suitable_verb: str,
                         arguments_predictor=None):
        """
        Generates all the possible extractions of arguments and candidates, based on the possible arguments per candidate
        :param args_per_candidate: the possible argument types for each candidate
        :param predicate: the predicate object of the arguments that we are after
        :param suitable_verb: the appropriate verb for the given reference token
        :param arguments_predictor: the model-based extractor object to determine the argument type of a span (optional)
        :return: all the possible extractions for this subcat
        """

        # Determine the arguments type of candidates with uncertainty about their complement type
        if arguments_predictor is not None and self.subcat_type == DEFAULT_SUBCAT:
            args_per_candidate = arguments_predictor.determine_args_type(
                args_per_candidate,
                predicate,
                suitable_verb,
                default_subcat=True)

        # Add a "None" option for each candidate, cause any candidate may not be an argument
        for candidate_args in args_per_candidate.values():
            candidate_args.append(None)

        # Every choice of one argument (or None) per candidate is a possible assignment
        relevant_extractions = []
        predicate_token = predicate.get_token()

        for assignment in product(*args_per_candidate.values()):
            extraction = Extraction(self, list(assignment))

            # Keep only extractions that satisfy the constraints of this subcat
            if self.check_constraints(extraction, predicate_token):
                relevant_extractions.append(extraction)

        return relevant_extractions
# Example #10
# 0
 def add_argument(self, argument: ExtractedArgument):
     self.match[argument.get_real_type()] = argument
    def determine_args_type(self,
                            candidates_args,
                            predicate: ExtractedArgument,
                            verb,
                            default_subcat=False):
        """
        Determines the most appropriate complement type of each candidate span, using a model
        :param candidates_args: a mapping from each candidate span to its possible complement types
        :param predicate: the predicate object of the arguments that we are after
        :param verb: the suitable verb for the predicate
        :param default_subcat: whether the relevant subcat is the default one (optional)
        :return: a mapping from each candidate span to its determined complement types
        """

        # Complement types whose assignment must be resolved by the model
        uncertain_types = list(self.tagset.keys()) + (
            [COMP_PP1, COMP_PP2] if COMP_PP in self.tagset else [])
        uncertain_candidates = {}  # candidate span -> model logits over the role types
        predicate_token = predicate.get_token()
        determined_dict = {}  # The determined complement types for each candidate
        none_spans = []  # spans the model predicted to be non-arguments

        # Each candidate should take one appropriate type, determined by the model
        for candidate_span, role_types in candidates_args.items():
            role_types = set(role_types)

            # The predicate span itself, and candidates with only "certain" types, need no model decision
            if predicate.get_token().i == candidate_span[
                    0].i or role_types.isdisjoint(uncertain_types):
                determined_dict[candidate_span] = role_types
                continue

            # Personal pronouns keep all their possible types (the model isn't consulted)
            if candidate_span.lemma_ in [
                    "i", "he", "she", "it", "they", "we", "-PRON-"
            ]:
                determined_dict[candidate_span] = role_types
                continue

            # Ask the model for a distribution over the possible role types, plus NONE
            role_types.add(COMP_NONE)
            logits = self.get_types_distribution(candidate_span, role_types,
                                                 predicate_token, verb,
                                                 default_subcat)

            # A NONE argmax means the span is most likely not an argument at all
            if logits.argmax().item() == self.tagset[COMP_NONE]:
                none_spans.append(candidate_span)
            else:
                uncertain_candidates[candidate_span] = logits

        if len(uncertain_candidates) == 0:
            return determined_dict

        #print(dict(candidates_args))

        # if uncertain_candidates == {}:
        # 	return {}

        # Pad the candidate list with None so roles may stay unassigned in a combination
        # NOTE(review): the pad size (len(uncertain_types) - 2) presumably keeps enough
        # slots for every role excluding NONE and one PP duplicate — TODO confirm
        u = list(uncertain_candidates.keys())
        u += [None] * (len(uncertain_types) - 2
                       )  #(len(self.tagset) - 1 - len(uncertain_candidates))

        certain_types = [
        ]  #[list(types)[0] for types in determined_dict.values() if len(types) == 1]
        # The roles to assign: every uncertain type except NONE (and any already-certain type)
        role_types = difference_list(uncertain_types,
                                     [COMP_NONE] + certain_types)

        #if len(predicate_types) == 1:
        #	role_types = difference_list(role_types, predicate_types)

        # Each combination assigns at most one candidate (or None) to every role type;
        # the all-None combination (no argument assigned) is always considered
        types_combinations = list(permutations(u, len(role_types)))
        empty_comb = tuple([None] * len(role_types))
        if empty_comb not in types_combinations:
            types_combinations.append(empty_comb)

        #print(predicate.get_token(), types_combinations)

        args_sum_logits = []

        # Score each combination: the logit of every assigned (candidate, role) pair,
        # plus the NONE logit of every unassigned candidate
        for comb in types_combinations:
            # sum_logits = 0
            # for i, arg in enumerate(comb):
            # 	if arg:
            # 		print(i, role_types[i], uncertain_candidates[arg][self.tagset[role_types[i]]])
            # 		sum_logits += uncertain_candidates[arg][self.tagset[role_types[i]]].item()

            sum_logits = sum([
                uncertain_candidates[arg][self.tagset[role_types[i]]]
                for i, arg in enumerate(comb) if arg
            ])
            sum_logits += sum([
                uncertain_candidates[arg][self.tagset[COMP_NONE]].item()
                for arg in set(u).difference(comb) if arg
            ])
            args_sum_logits.append(sum_logits)

        # Choose the combination with the highest total score
        #print(predicate.get_token(), args_sum_logits)
        max_idx = int(np.argmax(args_sum_logits))
        best = types_combinations[max_idx]

        # Record the chosen role (as a single-element list) for every assigned candidate
        determined_dict.update(
            {arg: [role_types[i]]
             for i, arg in enumerate(best) if arg})

        # Any remaining candidate keeps only its "certain" types (the uncertain ones are dropped)
        for arg in difference_list(candidates_args.keys(),
                                   determined_dict.keys()):
            determined_dict[arg] = difference_list(candidates_args[arg],
                                                   uncertain_types)

        #if predicate_span:
        #	determined_dict[predicate_span] = predicate_types

        #assert all([set(determined_dict[s]).isdisjoint(uncertain_types) for s in none_spans])

        #print(predicate.get_token(), len(types_combinations), determined_dict)
        return determined_dict