Пример #1
0
 def __init__(self, datapath, features):
     """Set up the stemmer, the feature set and the Megam classifier.

     datapath -- path prefix; the classifier is built with the model file
                 ``datapath + ".megam"``.
     features -- feature collection, stored unchanged on the instance.
     """
     # Dictionary loading is currently disabled:
     # self.diclist = self.load_dict(datapath)
     model_path = datapath + ".megam"
     self.stemmer = Stemmer()
     self.features = features
     self.classifier = MegamClassifier(modelfile=model_path)
Пример #2
0
class FunctionLabeller:
    """Assign function labels to the dependents found in a syntax tree.

    Candidate dependents are collected from nodes whose head is a "VN",
    turned into feature vectors and classified one by one with a Megam
    classifier; the predicted label is stored on each node's ``funlabel``
    attribute.
    """

    def __init__(self, datapath, features):
        """Set up the stemmer, the feature set and the classifier.

        datapath -- path prefix; the classifier is built with the model file
                    ``datapath + ".megam"``.
        features -- object exposing ``keys()`` (presumably a dict keyed by
                    feature name -- confirm against callers); the keys are
                    used as the feature names.
        """
        # Dictionary loading is currently disabled:
        # self.diclist = self.load_dict(datapath)
        self.stemmer = Stemmer()
        self.features = features
        self.classifier = MegamClassifier(modelfile=datapath + ".megam")

    def label_tree(self, root):
        """Decorate the tree rooted at ``root``, then label its functions."""
        decorate_tree(root)
        self.label_functions(root)

    def label_functions(self, root):
        """Recursively set ``funlabel`` on the dependents below ``root``.

        For every non-terminal node whose head (as given by ``find_head``)
        is labelled "VN", the still-unlabelled children are gathered,
        described by feature vectors and classified.  A predicted label
        equal to the string "None" is stored as the ``None`` object.
        The method then recurses into every child of ``root``.
        """
        if root.is_terminal_sym():
            return

        # Alternative trigger, kept from the original author:
        # root.label in ["SENT","Sinf","Ssub","Sint","Srel","VN","VPinf","VPpart"]
        if find_head(root).label == "VN":
            # Materialise the feature names once: indexable on Python 2 and 3
            # and guaranteed to be in the same order for every use below.
            feat_names = list(self.features.keys())

            dep_node_list = []
            for child in root.children:
                if child.funlabel is not None:
                    continue
                if child.label == "VN":
                    dep_node_list += self.do_clitics(child)
                # marie : skip nodes that are supposed to be None
                # (previous condition was simply: elif not child.head)
                elif not child.head and not (child.is_terminal_sym() or child.label == 'COORD'):
                    dep_node_list.append(child)

            lines_seq = []
            for elt in dep_node_list:
                featvector = grab_features(root, elt, feat_names)
                # call stemmer simply to escape punctuation marks etc.
                for i, (featname, value) in enumerate(zip(feat_names, featvector)):
                    if featname.endswith("WORD"):
                        featvector[i] = self.stemmer.stem(value, stem_len=100)
                lines_seq.append(featvector)

            # Returned labels keep the order of the dependents:
            # label[i] belongs to dep_node_list[i].
            funlabelsequence = self.label_sequence(lines_seq, feat_names)
            for node, funlabel in zip(dep_node_list, funlabelsequence):
                node.funlabel = None if funlabel == "None" else funlabel

        for child in root.children:
            self.label_functions(child)

    def do_clitics(self, tree):
        """Return the non-head clitic children of ``tree`` (a VN node).

        A child is kept when its label is one of CL, CLS, CLO or CLR and its
        ``head`` attribute is falsy.  An empty list is returned when there is
        no such child (including when ``tree`` has no children at all).
        """
        clitic_labels = ('CL', 'CLS', 'CLO', 'CLR')
        # The list is built over *all* children; it must be created before
        # the loop and returned after it.
        return [child for child in tree.children
                if child.label in clitic_labels and not child.head]

    def label_sequence(self, seq_list, feat_names):
        """Classify every dependent of a sequence independently.

        seq_list   -- list of feature vectors, one per dependent.
        feat_names -- names of the features, passed on to ``Sequence``.

        Returns the list of best labels, one per entry of the
        ``dependents`` attribute of the constructed ``Sequence``, in order.
        """
        labels = []
        # call constructor Sequence
        seq = Sequence(seq_list, feat_names)
        # pointwise classification of dependent
        for i in range(len(seq.dependents)):
            inst = PointwiseInstance(seq.dependents, i)
            # classify and store label for each dependent
            labels.append(self.classifier.get_best_label(inst.feature_vector()))
        return labels

    # Disabled alternative: beam-search version of label_sequence, kept for
    # reference.
    #
    # def label_sequence( self, seq_list, feat_names, beamsize=3 ):
    #     # call constructor Sequence
    #     seq = Sequence( seq_list, feat_names )
    #     dependents = seq.dependents
    #     # maintain N-best sequences of dependent assignments
    #     sequences = [([],0.0)]  # log prob.
    #     for i in range( len(dependents) ):
    #         n_best_sequences = []
    #         # compute static features (these are cached)
    #         cached_inst = PointwiseInstance( dependents, i )
    #         for j in range( len(sequences) ):
    #             seq_j,log_pr_j = sequences[j]
    #             deps_j = seq_j+dependents[i:]
    #             # add sequential features
    #             inst = PointwiseInstance( deps_j, i )
    #             inst.fv = deepcopy(cached_inst.fv)
    #             inst.add_sequential_features( deps_j, i )
    #             # get pr distrib for different classes
    #             label_pr_distrib = self.classifier.get_label_probs(inst.feature_vector())
    #             # extend sequence with dependent i
    #             for (cl,pr) in label_pr_distrib:
    #                 dep = deepcopy(dependents[i])
    #                 dep.cl = cl
    #                 n_best_sequences.append((seq_j+[dep],log_pr_j+math.log(pr)))
    #         # sort sequences
    #         n_best_sequences.sort(lambda x,y:cmp(x[1],y[1]))
    #         # keep N best
    #         sequences = n_best_sequences[-beamsize:]
    #     # best sequence is sequence with highest prob.
    #     best_sequence = sequences[-1][0]
    #     # return labels for best_sequence
    #     return [d.cl for d in best_sequence]

    def load_dict(self, datapath):
        """Load and return the pickled object stored in ``datapath + ".dict"``.

        NOTE(security): ``pickle.load`` can execute arbitrary code; only use
        this on dictionary files from a trusted source.
        """
        # Binary mode is required by pickle on Python 3 and harmless on
        # Python 2; the context manager closes the file even on error.
        with open(datapath + ".dict", "rb") as f:
            return pickle.load(f)