def _check_position(self, position: str, candidate: ArgumentCandidate, matched_argument: ExtractedArgument):
    """
    Checks whether the given candidate word and this argument are compatible and can occur as the given position
    :param position: a possible position for the candidate (from constant position names, like PREFIX, DOBJ and more)
    :param candidate: a candidate object for this argument
    :param matched_argument: the appropriate argument object for this lexical argument
    :return: True if the three (this argument, the given position and the candidate) are compatible
             with regard to the linked argument, and False otherwise
    """
    candidate_token = candidate.get_token()
    matched_position = position

    # Check that this argument is compatible with that position
    if position not in self.positions:
        return False

    if position == POS_PREFIX:
        # The matched position becomes the concrete prefix that was found (None means no compatible prefix)
        matched_position = self._check_prefix(candidate_token, self.prefixes)
        if matched_position is None:
            return False

        # The prefix cannot be the entire argument
        if len(matched_position.split()) == len(candidate_token._.subtree_text.split()):
            return False

    # A complement without a standard prefix position may also require a specific prefix constraint
    elif ARG_CONSTRAINT_REQUIRED_PREFIX in self.constraints and self._check_prefix(candidate_token, self.prefixes) is None:
        return False

    # Check that the candidate isn't compatible with any *illegal* prefix
    if self.illegal_prefixes != [] and self._check_prefix(candidate_token, self.illegal_prefixes):
        return False

    # Update the matched position of the given matched argument
    matched_argument.set_matched_position(matched_position)

    # Check the compatibility between the candidate and this argument
    return self._check_constraints(candidate_token, matched_argument)
def determine_args_type2(self, candidates_args, predicate: ExtractedArgument, verb, default_subcat=False):
    """
    Determines the most appropriate complement type of each candidate, using a model.
    :param candidates_args: a mapping from candidate to its possible extracted arguments
    :param predicate: the predicate object of the arguments that we are after
    :param verb: the appropriate verb for the predicate
    :param default_subcat: whether the relevant subcat is the default one
    :return: a dict of the determined arguments for each candidate
    """
    uncertain_types = list(self.tagset.keys()) + (
        [COMP_PP1, COMP_PP2] if COMP_PP in self.tagset else [])
    uncertain_candidates = defaultdict(list)
    determined_dict = defaultdict(list)  # The determined arguments for each candidate
    predicate_token = predicate.get_token()

    # Each candidate should take one appropriate type, determined by the model
    for candidate, args in candidates_args.items():
        types = set([a.get_real_type() for a in args])
        candidate_span = args[0].as_span(trim_argument=False)

        # The candidate is compatible with some "certain" complements — no model needed
        if not types.issubset(uncertain_types):
            determined_dict[candidate] = [
                a for a in args if a.get_real_type() not in uncertain_types
            ]
            continue

        # Find the appropriate type and add this candidate to its list of options
        predicted_type, entropy, args = self.choose_arg_type(
            candidate_span, args, types, predicate_token, verb, default_subcat)
        if args != []:
            uncertain_candidates[predicted_type].append((candidate, entropy, args))

    # The predicate might also take an argument role
    predicate_type = predicate.get_name()
    if predicate_type:
        uncertain_candidates[predicate_type].append(
            (predicate_token, predicate.get_entropy(), [predicate]))

    # Now, determine for each complement type the most appropriate candidates,
    # preferring the lowest-entropy (most confident) prediction
    for arg_type, candidates_info in uncertain_candidates.items():
        candidates_info.sort(key=lambda c_info: c_info[1])

        # Choose the best candidate
        candidate, _, args = candidates_info[0]
        determined_dict[candidate] = args

        # The maximum number of compatible PP is 2 for the default subcat
        if arg_type == COMP_PP and len(candidates_info) > 1 and default_subcat:
            candidate, _, args = candidates_info[1]
            determined_dict[candidate] = args

    return determined_dict
def _update_unused_candidates(self, token_candidates: list, predicate_token: Token, used_tokens: list, extraction: dict, specify_none=False, trim_arguments=True):
    """
    Adds every candidate that wasn't used as an argument to the extraction, under the NONE complement.
    :param token_candidates: all the argument candidates
    :param predicate_token: the token of the predicate
    :param used_tokens: the tokens that were already used as arguments
    :param extraction: the extraction dict to update (in place)
    :param specify_none: whether to add the NONE complement at all
    :param trim_arguments: whether to trim the resulting argument spans
    :return: None
    """
    if not specify_none:
        return

    extraction[COMP_NONE] = []
    prepositions = tuple(
        itertools.chain.from_iterable(
            self.entries[DEFAULT_ENTRY].subcats[DEFAULT_SUBCAT].
            arguments[arg_type].prefixes
            for arg_type in [COMP_PP, COMP_IND_OBJ, COMP_SUBJ, COMP_OBJ]))

    # Dependency links under which an unused token may still be relevant
    # (hoisted out of the loop — they don't depend on the candidate)
    nom_links = [URELATION_NMOD, URELATION_COMPOUND, URELATION_ACL]
    verb_links = [
        URELATION_NSUBJ, URELATION_IOBJ, URELATION_DOBJ,
        URELATION_NMOD_POSS, URELATION_NSUBJPASS, URELATION_NMOD
    ]
    relevant_links = verb_links if self.is_verb else nom_links

    # Add any candidate that isn't in the used tokens to the NONE complement
    for unused_candidate in difference_list(token_candidates, used_tokens):
        unused_token = unused_candidate.get_token()

        if unused_token.dep_ not in relevant_links or unused_token.i == predicate_token.i:
            continue

        # Only noun-like tokens are relevant
        if not unused_token.pos_.startswith("N"):
            continue

        # NMOD/ACL candidates must start with a known preposition
        if unused_token.dep_ in [URELATION_NMOD, URELATION_ACL]:
            candidate_text = unused_token._.subtree_text + " "
            if not candidate_text.startswith(prepositions):
                continue

        unused_arg = ExtractedArgument(unused_token, COMP_NONE)
        arg_span = unused_arg.as_span(trim_argument=trim_arguments)
        extraction[COMP_NONE].append(arg_span)
def _check_arguments_compatibility(self, args_per_candidate: dict, argument_types: list, argument_candidates: list, predicate: ExtractedArgument, is_required=False):
    """
    Checks the compatibility of the given argument types with each argument candidate
    :param args_per_candidate: the possible extracted arguments for each candidate
    :param argument_types: a list of argument types
    :param argument_candidates: the candidates for the arguments of this subcat (as list of tokens)
    :param predicate: the predicate object of the arguments that we are after
    :param is_required: whether every given argument type must match at least one candidate
    :return: None
    """
    for complement_type in argument_types:
        argument = self.arguments[complement_type]
        matched_any = False

        # Try to match this argument against every candidate
        for candidate in argument_candidates:
            matched_argument = argument.check_match(candidate, predicate.get_token())

            if matched_argument is None:
                continue

            args_per_candidate[candidate.get_token()].append(matched_argument)
            matched_any = True

        # A required argument type with no matching candidate rules out the rest
        if is_required and not matched_any:
            return
def check_match(self, candidate: ArgumentCandidate, referenced_token: Token):
    """
    Checks whether the given candidate matches this argument
    :param candidate: a candidate object for this argument
    :param referenced_token: the predicate of the arguments that we are after
    :return: the matched argument (ExtractedArgument object) if a match was found, or None otherwise
    """
    candidate_token = candidate.get_token()

    # Avoid arguments whose subtree contains the referenced token;
    # this can happen when the dependency tree isn't projective
    subtree = candidate_token._.subtree_indices
    contains_referenced = subtree[0] <= referenced_token.i <= subtree[-1]
    if candidate_token != referenced_token and contains_referenced:
        #@TODO-check that only a fair amount of arguments are being cut here
        return None

    matched_argument = ExtractedArgument(candidate_token, self.get_complement_type())

    # Each possible "position" type (like DET-POSS, PREFIX and so on) is derived from
    # the dependency relation that connects the candidate to the rest of the tree
    for position in candidate.get_possible_positions():
        # The first position compatible with both this argument and the candidate wins
        if self._check_position(position, candidate, matched_argument):
            return matched_argument

    return None
def _check_root(self, candidate_token: Token, matched_argument: ExtractedArgument):
    """
    Checks that the constraints on the root according to this argument works for the given root word
    :param candidate_token: a token candidate for this argument
    :param matched_argument: The appropriate argument object for this lexical argument
    :return: True if the root doesn't contradict the root constraints of this argument, and False otherwise
    """
    # Check whether the matched position is a multi-word preposition and the candidate token is part of the preposition prefix
    # If so, then the "root" of the candidate should be the nearest connected token *after the preposition*, for the purpose of the next tests
    # Example- "... with regard to the man". The candidate token will be "regard". But we must check the constraints over "man"
    matched_position = matched_argument.get_position()

    # Lowercase matched positions are concrete prefix strings (as opposed to constant position names)
    if matched_position.islower():
        # Position of the candidate token within its own subtree span
        candidate_index_in_arg = candidate_token.i - candidate_token._.subtree_indices[0]
        prep_length = len(matched_position.split(" "))

        if prep_length > 1 and candidate_index_in_arg < prep_length:
            #@TODO- is it right to use "wild card" relation here?
            end_of_preposition_idx = candidate_token._.subtree_indices[0] + prep_length
            # Re-root: take the nearest token connected after the multi-word preposition
            candidate_token = get_word_in_relation(candidate_token, URELATION_ANY, start_index=end_of_preposition_idx)

            if candidate_token is None:
                return False

    # The (possibly re-rooted) candidate must satisfy this argument's root dependency relations
    if not check_relations(candidate_token, self.root_urelations):
        return False

    # ING and TO-INF complements may include be-form verb instead of the main verb of those complements
    # In such cases the "be" verb isn't the root, and the real root doesn't obey its contraints (cause it isn't the verb)
    if COMP_TO_INF in self.complement_type or "ING" in self.complement_type:
        needed_be_form = "be" if COMP_TO_INF in self.complement_type else "being"

        if check_relations(candidate_token, [URELATION_COP + "_" + needed_be_form]):
            return True

    # Root POS tag must be one of the allowed tags (when any are specified)
    if self.root_upostags != [] and candidate_token.pos_ not in self.root_upostags:
        return False

    #@TODO- can a determiner that isn't possessive pronoun be an NP argument?
    if candidate_token.pos_ == UPOS_DET and self.complement_type in [
            COMP_SUBJ, COMP_OBJ, COMP_IND_OBJ, COMP_NP
    ] and candidate_token.orth_.lower() not in POSSESIVE_OPTIONS:
        return False

    # ING-pattern arguments must actually end with "ing"
    if PATTERN_ING in self.root_patterns and not candidate_token.orth_.lower().endswith("ing"):
        return False

    return True
def _check_constraints(self, candidate_token: Token, matched_argument: ExtractedArgument):
    """
    Checks whether the given candidate is compatible with the constraints of this argument
    :param candidate_token: a token candidate for this argument
    :param matched_argument: The appropriate argument object for this lexical argument
    :return: True if the candidate doesn't contradict the constraints, and False otherwise
    """
    # The root constraints must hold first
    if not self._check_root(candidate_token, matched_argument):
        return False

    #################################### 
    # Check the boolean constraints

    # Handle optional possessive sub-argument
    if ARG_CONSTRAINT_OPTIONAL_POSSESSIVE in self.constraints:
        has_possessive = False

        for relation in [URELATION_NMOD_POSS, URELATION_NSUBJ]:
            related_token = get_word_in_relation(candidate_token, relation)

            if related_token is not None and self._is_possessive(related_token):
                has_possessive = True
                break

        # Rename the matched argument so its type reflects the possessive status
        if has_possessive and "POSSING" not in self.complement_type:
            # Possessive should be included
            matched_argument.set_name(self.complement_type.replace("ING", "POSSING"))
        elif not has_possessive and "POSSING" in self.complement_type:
            # Possessive should be excluded
            matched_argument.set_name(self.complement_type.replace("POSSING", "ING-ARBC"))

    return True
def _wrap_predicate(self, word: Token, word_entry: Entry, arguments_predictor=None):
    """
    Wraps the given predicate word as an ExtractedArgument object
    :param word: the predicate token
    :param word_entry: the lexicon entry of the predicate
    :param arguments_predictor: the model-based predictor for noun types (optional)
    :return: the resulting ExtractedArgument object for the predicate
    """
    noun_type, entropy = None, None

    # Only nominal predicates with a default entry get a model-predicted type
    should_predict = not self.is_verb and arguments_predictor and word_entry.is_default_entry()
    if should_predict:
        noun_type, entropy = arguments_predictor.determine_noun_type(word)

    return ExtractedArgument(word,
                             noun_type,
                             matched_position=POS_NOM,
                             entropy=entropy)
def _get_extractions(self, args_per_candidate: dict, predicate: ExtractedArgument, suitable_verb: str, arguments_predictor=None):
    """
    Generates all the possible extractions of arguments and candidates, based on the possible arguments per candidate
    :param args_per_candidate: the possible argument types for each candidate
    :param predicate: the predicate object of the arguments that we are after
    :param suitable_verb: the appropriate verb for the given reference token
    :param arguments_predictor: the model-based extractor object to determine the argument type of a span (optional)
    :return: all the possible extractions for this subcat
    """
    # Resolve candidates with uncertain complement types via the model (default subcat only)
    if arguments_predictor is not None and self.subcat_type == DEFAULT_SUBCAT:
        args_per_candidate = arguments_predictor.determine_args_type(
            args_per_candidate, predicate, suitable_verb, default_subcat=True)

    # Every candidate may also end up as no argument at all — represented by None
    for argument_options in args_per_candidate.values():
        argument_options.append(None)

    relevant_extractions = []

    # Every combination of one argument choice per candidate is a potential extraction
    # (candidates are unique dict keys, so each combination fixes one choice per candidate)
    for assignment in product(*args_per_candidate.values()):
        extraction = Extraction(self, list(assignment))

        # Keep only the extractions that satisfy the subcat constraints
        if self.check_constraints(extraction, predicate.get_token()):
            relevant_extractions.append(extraction)

    return relevant_extractions
def add_argument(self, argument: ExtractedArgument):
    """Registers the given argument in the match mapping, keyed by its real complement type."""
    real_type = argument.get_real_type()
    self.match[real_type] = argument
def determine_args_type(self, candidates_args, predicate: ExtractedArgument, verb, default_subcat=False):
    """
    Determines the most appropriate type of each candidate, using a model.
    :param candidates_args: a mapping from candidate span to its possible role types
    :param predicate: the predicate object of the arguments that we are after
    :param verb: the appropriate verb for the predicate
    :param default_subcat: whether the relevant subcat is the default one
    :return: a dict of the determined role types for each candidate span
    """
    # Roles whose assignment must be resolved by the model
    uncertain_types = list(self.tagset.keys()) + (
        [COMP_PP1, COMP_PP2] if COMP_PP in self.tagset else [])
    uncertain_candidates = {}
    predicate_token = predicate.get_token()
    determined_dict = {}
    none_spans = []  # spans the model judged to be no argument at all

    # Each candidate should take one appropriate type, determined by the model
    for candidate_span, role_types in candidates_args.items():
        role_types = set(role_types)

        # The predicate itself, or candidates with only "certain" roles, need no model decision
        if predicate.get_token().i == candidate_span[
                0].i or role_types.isdisjoint(uncertain_types):
            determined_dict[candidate_span] = role_types
            continue

        # Personal pronouns keep their candidate roles as-is
        if candidate_span.lemma_ in [
                "i", "he", "she", "it", "they", "we", "-PRON-"
        ]:
            determined_dict[candidate_span] = role_types
            continue

        # Let the model also consider "no argument" for this span
        role_types.add(COMP_NONE)
        logits = self.get_types_distribution(candidate_span, role_types,
                                             predicate_token, verb,
                                             default_subcat)

        if logits.argmax().item() == self.tagset[COMP_NONE]:
            none_spans.append(candidate_span)
        else:
            uncertain_candidates[candidate_span] = logits

    if len(uncertain_candidates) == 0:
        return determined_dict

    # Pad the candidate list with None placeholders so that every role may stay unassigned
    u = list(uncertain_candidates.keys())
    u += [None] * (len(uncertain_types) - 2)
    certain_types = []
    role_types = difference_list(uncertain_types, [COMP_NONE] + certain_types)

    # Every permutation assigns (at most) one candidate to each uncertain role;
    # the all-None combination (no assignment at all) is always considered
    types_combinations = list(permutations(u, len(role_types)))
    empty_comb = tuple([None] * len(role_types))
    if empty_comb not in types_combinations:
        types_combinations.append(empty_comb)

    # Score each combination: logits of assigned roles plus NONE logits of unassigned candidates
    args_sum_logits = []
    for comb in types_combinations:
        sum_logits = sum([
            uncertain_candidates[arg][self.tagset[role_types[i]]]
            for i, arg in enumerate(comb) if arg
        ])
        sum_logits += sum([
            uncertain_candidates[arg][self.tagset[COMP_NONE]].item()
            for arg in set(u).difference(comb) if arg
        ])
        args_sum_logits.append(sum_logits)

    # Keep the highest-scoring assignment of candidates to roles
    max_idx = int(np.argmax(args_sum_logits))
    best = types_combinations[max_idx]
    determined_dict.update(
        {arg: [role_types[i]]
         for i, arg in enumerate(best) if arg})

    # Any remaining candidate keeps only its certain (non-model) role types
    for arg in difference_list(candidates_args.keys(),
                               determined_dict.keys()):
        determined_dict[arg] = difference_list(candidates_args[arg],
                                               uncertain_types)

    return determined_dict