Example #1
 def consider_candidate(self, node, candidate):  # -> bool
     """
     Decides whether "candidate" is an appropriate candidate for coreference
     with "node".
     COULD BE CHANGED IN THE FUTURE
     """
     if node == candidate:
         return False
     return api.has_upostag(candidate, ["NOUN", "PRON", "VERB"])
Example #2
 def search_sentence_for_candidates(self, node,
                                    sentence):  # -> list of Nodes
     """
     Collects coreference candidates for "node" from the given sentence.
     """
     nodes = api.get_nodes(sentence)
     candidates = []
     for candidate in nodes:
         if self.consider_candidate(node, candidate):
             candidates.append(candidate)
     return candidates
Example #3
    def search_candidates(self, node):  # -> list of Nodes
        """
        Selects possible coreferents of the given node.
        COULD BE CHANGED IN THE FUTURE
        """
        actual_sentence = api.get_sentence(node)
        previous_sentence = api.previous_sentence(actual_sentence)
        next_sentence = api.next_sentence(actual_sentence)

        candidates = self.search_sentence_for_candidates(node, actual_sentence)

        backwards_distance = 3  # search the 3 previous sentences
        for i in range(backwards_distance):
            if previous_sentence is not None:
                candidates += self.search_sentence_for_candidates(
                    node, previous_sentence)
                previous_sentence = api.previous_sentence(previous_sentence)

        if next_sentence is not None:
            candidates += self.search_sentence_for_candidates(
                node, next_sentence)
        # ...PPPAN...: the three previous sentences, the actual sentence
        # and the next sentence are searched

        return candidates
Example #4
 def get_corefs(self, doc):
     # -> list of coreferring expressions (represented by Eval_coref_record
     # objects) in the given Document
     clusters = []  # list of Eval_cluster_records
     coreferents = []  # list of Eval_coref_records
     # linear iteration through all nodes in the document
     for paragraph in api.get_paragraphs(doc):
         for sentence in api.get_sentences(paragraph):
             for node in api.get_nodes(sentence):
                 # getting the coreference cluster number (string)
                 cluster_id_string = api.get_misc_by_name(node, "Coref")
                 if cluster_id_string is None:
                     # !!! DROPS ARE IGNORED FOR NOW IF THERE IS A
                     # NON-DROPPED COREFERENT !!!
                     cluster_id_string = api.get_misc_by_name(node,
                                                              "Drop_coref")
                 if cluster_id_string is not None:
                     cluster_id = int(cluster_id_string)
                     # there should be at most one such cluster
                     appropriate_clusters = [
                         cluster for cluster in clusters
                         if cluster.cluster_id == cluster_id
                     ]
                     if not appropriate_clusters:
                         # first occurrence of this cluster id - create
                         # a new instance
                         cluster = Eval_cluster_record(cluster_id)
                         clusters.append(cluster)
                     else:
                         # already existing cluster; there is at most
                         # one element
                         cluster = appropriate_clusters[0]
                     coref_id = api.get_full_id(node)
                     # adding the coreferent to the cluster's list
                     # of coreferents
                     cluster.add_coreferent(coref_id)
                     # if we are supposed to detect coreference of this node
                     if (api.has_upostag(node, ["PRON", "DET"])
                             and api.has_feature(node, "PronType",
                                                 ["Prs", "Rel", "Dem"])):
                         # !!! PRO-DROPS MISSING !!!
                         # output list of all pronouns for which the
                         # coreference was detected
                         coref = Eval_coref_record(coref_id, cluster)
                         coreferents.append(coref)
     return coreferents
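
The Eval_cluster_record and Eval_coref_record classes used above are not shown in these examples. A minimal sketch of what they could look like, inferred only from how Example #4 uses them (the real definitions may differ):

 # Hypothetical minimal definitions, inferred from the usage in Example #4.
 class Eval_cluster_record:
     def __init__(self, cluster_id):
         self.cluster_id = cluster_id
         self.coreferents = []  # full ids of the nodes in this cluster

     def add_coreferent(self, coref_id):
         self.coreferents.append(coref_id)


 class Eval_coref_record:
     def __init__(self, coref_id, cluster):
         self.coref_id = coref_id  # full id of the coreferring expression
         self.cluster = cluster    # the Eval_cluster_record it belongs to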
Example #5
 def process_document(self):
     # -> list of quadruplets (Node, Node, list of features, bool)
     pairs_with_vectors = []
     # linear cycle through all nodes in the document
     for paragraph in api.get_paragraphs(self.document):
         for sentence in api.get_sentences(paragraph):
             for node in api.get_nodes(sentence):
                 # if we are supposed to detect coreference of this node
                 if (api.has_upostag(node, ["PRON", "DET"])
                         and api.has_feature(node, "PronType",
                                             ["Prs", "Rel", "Dem"])):
                     # !!! CONDITION FOR PRO-DROPS MISSING !!!
                     # list of candidates for coreference
                     candidates = self.search_candidates(node)
                     for candidate in candidates:
                         feature_vector = self.build_feature_vector(
                             node, candidate)
                         target_value = api.are_coreferents(node, candidate)
                         pairs_with_vectors.append(
                             (node, candidate, feature_vector, target_value))
     return pairs_with_vectors
Example #6
    def build_feature_vector(self, node, candidate):
        # -> list - feature vector for the given pair
        """
        For now a list of bools, but ints (distances) would also be worth
        considering.
        !!! SHOULD BE CHANGED IN THE FUTURE !!!
        """
        feature_vector = []
        same_sentence = api.in_same_sentence(node, candidate)

        # distances
        # feature_vector.append( same_sentence)
        # if ( same_sentence ):
        #     feature_vector.append( True) # same paragraph
        #     feature_vector.append( api.surface_node_distance( node, candidate))
        # else:
        #     same_paragraph = api.in_same_paragraph( node, candidate)
        #     feature_vector.append( same_paragraph)
        #     if ( same_paragraph ):
        #         feature_vector.append( api.surface_sentence_distance( node, candidate))
        #     else:
        #         feature_vector.append( api.surface_paragraph_distance( node, candidate))
        # feature_vector.append( api.depth_distance( node, candidate))
        # feature_vector.append( api.compound_distance( node, candidate))
        # feature_vector.append( api.ccs_depth( node, candidate))

        # the pronoun is after its antecedent - anaphora
        anaphoric_pronoun = api.get_id(node) > api.get_id(candidate)
        feature_vector.append(same_sentence and anaphoric_pronoun)

        # grammar
        # ? not only bool for equality, but also categories ?
        feature_vector.append(
            api.has_feature(node, "Case",
                            api.get_features_by_name(candidate,
                                                     "Case")))  # same case
        feature_vector.append(
            api.has_feature(node, "Gender",
                            api.get_features_by_name(candidate,
                                                     "Gender")))  # same gender
        feature_vector.append(
            api.has_feature(node, "Number",
                            api.get_features_by_name(candidate,
                                                     "Number")))  # same number

        # pronoun
        feature_vector.append(api.has_feature(node, "PronType",
                                              ["Dem"]))  # demonstrative
        feature_vector.append(api.has_feature(node, "PronType",
                                              ["Prs"]))  # personal
        feature_vector.append(api.has_feature(node, "PronType",
                                              ["Rel"]))  # relative
        feature_vector.append(api.has_feature(node, "Reflex",
                                              ["Yes"]))  # reflexive
        feature_vector.append(api.has_feature(node, "Poss",
                                              ["Yes"]))  # possessive

        # candidate
        # part of speech
        feature_vector.append(api.has_upostag(candidate, ["NOUN"]))
        feature_vector.append(api.has_upostag(candidate, ["PRON"]))
        feature_vector.append(api.has_upostag(candidate, ["VERB"]))
        # function in the sentence
        feature_vector.append(api.has_deprel(candidate,
                                             ["nsubj"]))  # nominal subject

        return feature_vector
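
Thirteen boolean features are appended, in a fixed order. When inspecting classifier behavior it can help to have names for the positions; the following list is purely descriptive, derived from the comments in Example #6 (the names themselves do not appear in the source):

 # Illustrative names for the features, in the order they are appended
 # in build_feature_vector (names are not from the source).
 FEATURE_NAMES = [
     "same_sentence_and_anaphoric",
     "same_case", "same_gender", "same_number",
     "pron_demonstrative", "pron_personal", "pron_relative",
     "pron_reflexive", "pron_possessive",
     "cand_is_noun", "cand_is_pron", "cand_is_verb",
     "cand_is_nsubj",
 ]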
Example #7
 input.close()

 # selection of feature vectors with target values for evaluation
 selec = CoNLL_selector(document)
 vectors = selec.process_document()

 feature_vectors = []  # the same as for the training files
 target_values = []
 for i in vectors:
     feature_vectors.append(i[2])
     target_values.append(i[3])

 results = list(knn.predict(feature_vectors))  # PREDICTION
 print(results.count(True), target_values.count(True))

 # triplets (pronoun id, candidate id, bool value - are they coreferents?)
 coreference_triplets = []
 for i in range(len(vectors)):
     pronoun_id = api.get_full_id(vectors[i][0])
     referent_id = api.get_full_id(vectors[i][1])
     triplet = (pronoun_id, referent_id, bool(results[i]))
     coreference_triplets.append(triplet)

 # test file WITHOUT coreference information
 input = open(path + test + ".outp.conll", 'r', encoding="utf8")
 # adding the detected coreference
 output = open(path + test + ".test.conll", 'w', encoding="utf8")
 coref_adder = CoNLL_add_coreference(input, output)
 coref_adder.add_coreference(coreference_triplets)
 input.close()
 output.close()

 # EVALUATION - comparing manually (gold) and automatically marked coreference
 gold_input = open(path + test + ".out.conll", 'r', encoding="utf8")  # test file with coreference
 auto_input = open(path + test + ".test.conll", 'r', encoding="utf8")  # output of the adder - previous step
 evaluator = CoNLL_evaluator(gold_input, auto_input)
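
The knn classifier used for the prediction above is never constructed in these snippets. A minimal sketch of how it could be trained, assuming scikit-learn's KNeighborsClassifier and a training document processed with process_document as in Example #5; train_document, n_neighbors=5 and the use of scikit-learn itself are assumptions, not shown in the source:

 # Hypothetical training sketch; assumes scikit-learn and a training
 # Document processed the same way as in Example #5.
 from sklearn.neighbors import KNeighborsClassifier

 train_selector = CoNLL_selector(train_document)    # assumed training Document
 train_vectors = train_selector.process_document()  # (node, candidate, features, bool)

 X = [quad[2] for quad in train_vectors]  # feature vectors (lists of bools)
 y = [quad[3] for quad in train_vectors]  # target values (are they coreferents?)

 knn = KNeighborsClassifier(n_neighbors=5)  # n_neighbors is a guess
 knn.fit(X, y)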