예제 #1
0
파일: mira.py 프로젝트: ssutee/naist-parser
 def _create_F(self, W, T):
     """Build the feature lookup table for every candidate arc of a sentence.

     For each index pair ``i < j`` over ``W`` and each direction label
     (``'L'`` then ``'R'``), the result of ``self._f_model.convert_data``
     is stored under the string key ``'<i>-<j>-<direction>'``.

     W -- sequence of tokens; only its length and the values forwarded to
          the feature model are used here.
     T -- sequence handed to ``create_between_pos_table`` and to the
          feature model (presumably the POS tags of ``W`` -- confirm
          against the callers).

     Returns the dict mapping arc keys to converted feature data.
     """
     between_table = create_between_pos_table(T)
     n_tokens = len(W)
     convert = self._f_model.convert_data

     features = {}
     for left in range(n_tokens):
         for right in range(left + 1, n_tokens):
             for direction in ('L', 'R'):
                 arc_key = '%d-%d-%s' % (left, right, direction)
                 features[arc_key] = convert(
                     left, right, W, T, direction, between_table)
     return features
예제 #2
0
파일: mira.py 프로젝트: ssutee/naist-parser
    def read_training_file(self,input_file):
        """Load training sentences from ``input_file`` and register their features.

        The file is processed as blocks of non-blank lines, each block being
        terminated by a blank line.  The first four lines of a block are
        tab-separated and are read, in order, into:

        * ``W`` -- tokens (inner spaces replaced by ``_``), prefixed with ``'<root>'``
        * ``T`` -- tags, prefixed with ``'<root-POS>'``
        * ``A`` -- type labels, prefixed with ``'<no-type>'``
        * ``H`` -- integer indices, prefixed with ``-1``

        Blocks whose first line holds fewer than two tab-separated fields are
        skipped.  Every token whose tag is ``'npn'`` is replaced by ``'<npn>'``;
        all other tokens are passed through ``encode_number``.  Each accepted
        block is appended to ``self._training_data`` as ``(W, T, A, H)`` and
        one ``insert_data`` call is made on ``self._f_model`` per entry of
        ``H`` that is not ``-1``.

        When the whole file has been consumed, ``self.features_size`` is set
        from the feature model and ``self._create_features_instance()`` is
        called.  Progress is printed to stdout.

        Raises ``IndexError`` if an accepted block has fewer than four lines
        (or ``T`` is shorter than ``W``) and ``ValueError`` if the fourth line
        contains a non-integer field.

        NOTE(review): a block is only processed when a blank line follows it,
        so a final block without a trailing blank line is ignored.  Confirm
        whether the training files always end with a blank line.
        """
        # Read inside a context manager so the handle is closed even when
        # parsing further down raises.
        with open(input_file) as training_file:
            lines = training_file.readlines()
        print('Reading Training Data...')
        total = 0
        # lines of the sentence block currently being collected
        tmp = []
        for line in lines:
            line = line.strip()
            if line != '':
                tmp.append(line)
                continue

            # A blank line closes the current block.
            block, tmp = tmp, []
            if not block:
                # Leading or repeated blank lines: there is nothing to
                # process (this used to fail with IndexError on tmp[0]).
                continue

            W = block[0].split('\t')
            if len(W) < 2:
                continue
            W = ['<root>'] + [w.strip().replace(' ','_') for w in W]
            T = ['<root-POS>'] + block[1].split('\t')
            A = ['<no-type>'] + block[2].split('\t')
            H = [-1] + [int(h) for h in block[3].split('\t')]

            # Normalise tokens in place: 'npn'-tagged tokens collapse to a
            # single placeholder, everything else goes through encode_number.
            for i,w in enumerate(W):
                if T[i] == 'npn':
                    W[i] = '<npn>'
                else:
                    W[i] = encode_number(w)

            self._training_data.append((W,T,A,H))
            b_table = create_between_pos_table(T)
            total += 1
            if total % 10 == 0:
                print(total)

            # H[j] == i pairs position j with position i (presumably i is
            # the head of j -- confirm).  insert_data always receives the
            # smaller index first; the label is 'L' when i lies to the
            # right of j and 'R' otherwise.
            for j,i in enumerate(H):
                if i == -1:
                    continue
                if i > j:
                    self._f_model.insert_data(j,i,W,T,'L',b_table)
                else:
                    self._f_model.insert_data(i,j,W,T,'R',b_table)

        self.features_size = self._f_model.get_size_of_features()
        print(total)
        print('Number of Features: %s' % (self.features_size,))

        self._create_features_instance()