def _create_F(self, W, T):
    """Collect the feature-model output for every word pair of a sentence.

    For each index pair ``(i, j)`` with ``i < j`` over ``W`` and for each
    direction label ``'L'`` and ``'R'``, ``self._f_model.convert_data`` is
    called and its result stored under the key ``'<i>-<j>-<direction>'``.

    Args:
        W: sequence of words; only its length and the pass-through to
            ``convert_data`` are used here.
        T: sequence passed to ``create_between_pos_table`` and to
            ``convert_data`` (presumably the POS tags aligned with ``W`` --
            confirm against the caller).

    Returns:
        dict mapping ``'i-j-L'`` / ``'i-j-R'`` strings to whatever
        ``convert_data`` returns for that pair and direction.
    """
    between_table = create_between_pos_table(T)
    word_count = len(W)

    # Every pair with the smaller index first, in the same order a nested
    # index loop would visit them.
    index_pairs = (
        (first, second)
        for first in range(word_count)
        for second in range(first + 1, word_count)
    )

    features = {}
    for first, second in index_pairs:
        for direction in ('L', 'R'):
            key = '%d-%d-%s' % (first, second, direction)
            features[key] = self._f_model.convert_data(
                first, second, W, T, direction, between_table)
    return features
def read_training_file(self, input_file):
    """Load training sentences from ``input_file`` and register their features.

    The file is read as blocks of non-empty lines separated by blank lines.
    Within a block, each of the first four lines is tab-separated:

    1. words (inner spaces are replaced by ``_``),
    2. tags compared against ``'npn'`` and passed on as ``T``,
    3. a per-word label list stored as ``A`` (presumably the arc/dependency
       types -- confirm against the data format),
    4. integer head indices, stored as ``H``.

    An artificial root entry is prepended to each list (``'<root>'``,
    ``'<root-POS>'``, ``'<no-type>'``, ``-1``). Words whose tag is ``'npn'``
    become ``'<npn>'``; all others go through ``encode_number``. Blocks whose
    word line has fewer than two fields are skipped.

    Every accepted sentence is appended to ``self._training_data`` as
    ``(W, T, A, H)``, and each head/dependent pair is fed to
    ``self._f_model.insert_data`` with the smaller index first: direction
    ``'L'`` when the head index is greater than the dependent index,
    ``'R'`` otherwise. Entries whose head is ``-1`` are ignored.

    Afterwards ``self.features_size`` is set from
    ``self._f_model.get_size_of_features()`` and
    ``self._create_features_instance()`` is called.

    Compared with the previous implementation this version closes the file,
    tolerates leading or consecutive blank lines, and no longer drops the
    last sentence when the file does not end with a blank line.

    Args:
        input_file: path of the training file.

    Raises:
        IOError/OSError: if the file cannot be opened.
        IndexError: if an accepted block has fewer than four lines or its
            tag line is shorter than its word line.
        ValueError: if a head index is not an integer.
    """
    # Read everything first so the handle is closed before the (potentially
    # long) feature extraction starts.
    with open(input_file) as handle:
        lines = handle.readlines()
    # Sentinel blank line: guarantees the final block is flushed even when
    # the file lacks a trailing empty line.
    lines.append('')

    print('Reading Training Data...')
    total = 0
    pending = []  # stripped lines of the sentence block being collected
    for raw_line in lines:
        stripped = raw_line.strip()
        if stripped:
            pending.append(stripped)
            continue

        block, pending = pending, []
        if not block:
            # Leading or repeated blank line: nothing collected yet.
            continue

        W = block[0].split('\t')
        if len(W) < 2:
            continue

        W = ['<root>'] + [word.strip().replace(' ', '_') for word in W]
        T = ['<root-POS>'] + block[1].split('\t')
        A = ['<no-type>'] + block[2].split('\t')
        H = [-1] + [int(head) for head in block[3].split('\t')]

        # Normalise the word forms (the root placeholder included).
        for idx, word in enumerate(W):
            if T[idx] == 'npn':
                W[idx] = '<npn>'
            else:
                W[idx] = encode_number(word)

        self._training_data.append((W, T, A, H))
        b_table = create_between_pos_table(T)

        total += 1
        if total % 10 == 0:
            print(total)  # progress indicator

        for dependent, head in enumerate(H):
            if head == -1:
                continue
            # insert_data always receives the smaller index first; the
            # direction label records on which side the head was.
            if head > dependent:
                self._f_model.insert_data(dependent, head, W, T, 'L', b_table)
            else:
                self._f_model.insert_data(head, dependent, W, T, 'R', b_table)

    self.features_size = self._f_model.get_size_of_features()
    print(total)
    # Single formatted argument keeps the output identical under both the
    # Python 2 print statement and the Python 3 print function.
    print('Number of Features: %s' % (self.features_size,))
    self._create_features_instance()