def consider_candidate(self, node, candidate):  # -> bool
    """Return True if *candidate* is an appropriate coreference candidate for *node*.

    A node is never a candidate for itself; otherwise any NOUN, PRON or
    VERB is accepted.  COULD BE CHANGED IN THE FUTURE.
    """
    if node == candidate:
        return False
    # idiom fix: return the predicate directly instead of
    # `if ...: return True / return False`
    return api.has_upostag(candidate, ["NOUN", "PRON", "VERB"])
def search_sentence_for_candidates(self, node, sentence):  # -> list of Nodes
    """Collect coreference candidates for *node* among the nodes of *sentence*."""
    return [cand
            for cand in api.get_nodes(sentence)
            if self.consider_candidate(node, cand)]
def search_candidates(self, node):  # -> list of Nodes
    """Select possible coreferents of the given node.

    Searches the node's own sentence, up to three preceding sentences
    and the single following sentence (the ...PPPAN... window).
    COULD BE CHANGED IN THE FUTURE.
    """
    actual_sentence = api.get_sentence(node)
    previous_sentence = api.previous_sentence(actual_sentence)
    next_sentence = api.next_sentence(actual_sentence)

    candidates = self.search_sentence_for_candidates(node, actual_sentence)

    backwards_distance = 3  # 3 previous sentences
    for _ in range(backwards_distance):  # idiom: index was unused
        if previous_sentence is not None:  # idiom: `is not None`, not `!= None`
            candidates += self.search_sentence_for_candidates(
                node, previous_sentence)
            previous_sentence = api.previous_sentence(previous_sentence)
    if next_sentence is not None:
        candidates += self.search_sentence_for_candidates(node, next_sentence)
    # ...PPPAN... three sentences backwards, the actual and the next sentence
    return candidates
def get_corefs(self, doc):
    # -> list of coreferring expressions (Eval_coref_record objects) in the given Document
    """Collect coreference clusters from *doc* and return the detectable coreferents.

    Every node carrying a "Coref" (or, as a fallback, "Drop_coref") misc
    value is registered in its cluster; pronouns/determiners of type
    Prs/Rel/Dem are additionally returned as Eval_coref_records.
    """
    clusters = []     # list of Eval_cluster_records
    coreferents = []  # list of Eval_coref_records
    # linear iteration through all nodes in the document
    for paragraph in api.get_paragraphs(doc):
        for sentence in api.get_sentences(paragraph):
            for node in api.get_nodes(sentence):
                # coreference cluster number (string), if the node has one
                cluster_id_string = api.get_misc_by_name(node, "Coref")
                if cluster_id_string is None:  # idiom: `is None`, not `== None`
                    # !!! DROPS ARE IGNORED FOR NOW, IF THERE IS A
                    # NON-DROPPED COREFERENT !!!
                    cluster_id_string = api.get_misc_by_name(node, "Drop_coref")
                if cluster_id_string is None:
                    continue  # guard clause: node carries no coreference info
                cluster_id = int(cluster_id_string)
                # there should be at most one cluster with this id
                appropriate_clusters = [
                    cluster for cluster in clusters
                    if cluster.cluster_id == cluster_id
                ]
                if not appropriate_clusters:  # idiom: truthiness, not `== []`
                    # first occurrence of this cluster id - create a new record
                    cluster = Eval_cluster_record(cluster_id)
                    clusters.append(cluster)
                else:
                    # already existing cluster (at most one element)
                    cluster = appropriate_clusters[0]
                coref_id = api.get_full_id(node)
                # adding coreferent to the cluster's list of coreferents
                cluster.add_coreferent(coref_id)
                # if we are supposed to detect coreference of this node
                # !!! PRO DROPS MISSING !!!
                if (api.has_upostag(node, ["PRON", "DET"])
                        and api.has_feature(node, "PronType",
                                            ["Prs", "Rel", "Dem"])):
                    coreferents.append(Eval_coref_record(coref_id, cluster))
    # output list of all pronouns for which the coreference was detected
    return coreferents
def process_document(self):
    # -> list of quadruplets (Node, Node, list of features, bool)
    """Walk the whole document and emit one training quadruplet per
    (pronoun, candidate) pair: the two nodes, their feature vector and
    the gold target value."""
    pairs_with_vectors = []
    # linear cycle through all nodes in the document
    for paragraph in api.get_paragraphs(self.document):
        for sentence in api.get_sentences(paragraph):
            for node in api.get_nodes(sentence):
                # only personal/relative/demonstrative pronouns and
                # determiners are processed
                # !!! CONDITION FOR PRO-DROPS MISSING !!!
                is_detectable_pronoun = (
                    api.has_upostag(node, ["PRON", "DET"])
                    and api.has_feature(node, "PronType",
                                        ["Prs", "Rel", "Dem"]))
                if not is_detectable_pronoun:
                    continue
                # list of candidates for coreference with this node
                for candidate in self.search_candidates(node):
                    vector = self.build_feature_vector(node, candidate)
                    target = api.are_coreferents(node, candidate)
                    pairs_with_vectors.append(
                        (node, candidate, vector, target))
    return pairs_with_vectors
def build_feature_vector(self, node, candidate):
    # -> list - feature vector for the given pair
    """Build the feature vector for a (pronoun, candidate) pair.

    For now a list of bools, but ints (distances) are also considerable.
    !!! SHOULD BE CHANGED IN THE FUTURE !!!
    """
    same_sentence = api.in_same_sentence(node, candidate)
    # the pronoun appears after its antecedent - anaphora
    anaphoric_pronoun = api.get_id(node) > api.get_id(candidate)

    # NOTE: a richer set of distance features (same paragraph,
    # surface node/sentence/paragraph distance, depth distance,
    # compound distance, ccs depth) is sketched in the original
    # source but currently disabled.
    return [
        # distances
        same_sentence and anaphoric_pronoun,
        # grammar - ? not only bool for equality, but also categories ?
        api.has_feature(node, "Case",
                        api.get_features_by_name(candidate, "Case")),      # same case
        api.has_feature(node, "Gender",
                        api.get_features_by_name(candidate, "Gender")),    # same gender
        api.has_feature(node, "Number",
                        api.get_features_by_name(candidate, "Number")),    # same number
        # pronoun
        api.has_feature(node, "PronType", ["Dem"]),  # demonstrative
        api.has_feature(node, "PronType", ["Prs"]),  # personal
        api.has_feature(node, "PronType", ["Rel"]),  # relative
        api.has_feature(node, "Reflex", ["Yes"]),    # reflexive
        api.has_feature(node, "Poss", ["Yes"]),      # possessive
        # candidate - part of speech
        api.has_upostag(candidate, ["NOUN"]),
        api.has_upostag(candidate, ["PRON"]),
        api.has_upostag(candidate, ["VERB"]),
        # candidate - function in the sentence
        api.has_deprel(candidate, ["nsubj"]),        # nominal subject
    ]
input.close()  # NOTE(review): `input` shadows the builtin; name kept because earlier script code uses it

# selecting of feature vectors with target values for evaluation
selec = CoNLL_selector(document)
vectors = selec.process_document()

# similar as for the training files
feature_vectors = [vector for (_, _, vector, _) in vectors]
target_values = [target for (_, _, _, target) in vectors]

results = list(knn.predict(feature_vectors))  # PREDICTION
print(results.count(True), target_values.count(True))

# triplets (pronoun id, candidate id, bool value - are they coreferents?)
coreference_triplets = []
for (pronoun_node, referent_node, _, _), result in zip(vectors, results):
    pronoun_id = api.get_full_id(pronoun_node)
    referent_id = api.get_full_id(referent_node)
    coreference_triplets.append((pronoun_id, referent_id, bool(result)))

# adding detected coreference to the test files WITHOUT coreference information
input = open(path + test + ".outp.conll", 'r', encoding="utf8")
output = open(path + test + ".test.conll", 'w', encoding="utf8")
coref_adder = CoNLL_add_coreference(input, output)
coref_adder.add_coreference(coreference_triplets)
input.close()
output.close()

# EVALUATION - comparing of manually (gold) and automatically marked coreference
gold_input = open(path + test + ".out.conll", 'r', encoding="utf8")  # test file with coreference
# BUG FIX: was `auto_input = output = open(..., 'r', ...)`, which rebound the
# already-closed `output` handle to a file opened for reading
auto_input = open(path + test + ".test.conll", 'r', encoding="utf8")  # output of adder - previous step
evaluator = CoNLL_evaluator(gold_input, auto_input)  # evaluating