def score_it(self, bigram, incoming_char):
    """Score a (bigram, incoming_char) transition with the maxent model.

    Returns the base-10 logarithm of the model's raw score for label "T".
    """
    # The maxent model only accepts utf-8 byte strings as input.
    encoded_feature = [token.encode('utf-8')
                       for token in feature_gen(bigram, incoming_char)]
    raw_score = self.eval(encoded_feature, "T")
    return math.log(raw_score, 10)
def gen_instance_by_traversal_lattice(valid_state, backward_lattice, sent, dummy_end):
    """Walk the backward lattice and build labelled training instances.

    For every position i (1-based) and every bigram reachable there, one
    (label, feature) pair is produced: label is u"T" when the pair
    (bigram, incoming_char) is present in valid_state, u"F" otherwise.

    Defects fixed: the debug output used Python-2-only print statements
    (a syntax error under Python 3) and the j == 0 / else branches
    duplicated the label/feature logic; both now share one helper.
    """
    def _make_instance(bigram, incoming_char):
        # Shared labelling logic for both lattice entry shapes.
        label = u"T" if (bigram, incoming_char) in valid_state else u"F"
        feature = feature_gen(bigram, incoming_char)
        return label, feature

    instance = []
    display_flag = False  # flip on for verbose tracing
    if display_flag:
        print("instance gen...")
    for i in range(1, len(backward_lattice) + 1):
        incoming_char = get_incoming_char(sent, i, dummy_end)
        if display_flag:
            print('\n\ni=', i)
        cached_bigram = backward_lattice[i - 1]
        for j in cached_bigram:
            if display_flag:
                print('\tj=', j)
            if j == 0:
                # Slot 0 holds a single bigram directly.
                label, feature = _make_instance(cached_bigram[0], incoming_char)
                instance.append((label, feature))
                if display_flag:
                    print('\t## label/bigram/incoming_char=', label,
                          u"-".join(cached_bigram[0]), incoming_char,
                          'feature=', u"/".join(feature))
            else:
                # Other slots hold a nested mapping of candidate bigrams.
                for k in cached_bigram[j]:
                    bigram = cached_bigram[j][k]
                    label, feature = _make_instance(bigram, incoming_char)
                    instance.append((label, feature))
                    if display_flag:
                        print('\t\tk=', k, 'label/bigram/incoming_char=', label,
                              u"-".join(bigram), incoming_char,
                              'feature=', u"/".join(feature))
    return instance
def score_it(self, bigram, incoming_char):
    """Return log10 of the maxent model's "T" score for this transition."""
    feature = feature_gen(bigram, incoming_char)
    # The maxent model requires utf-8 encoded strings, so encode each
    # feature token before evaluation.
    utf8_feature = []
    for u_str in feature:
        utf8_feature.append(u_str.encode('utf-8'))
    raw_score = self.eval(utf8_feature, "T")
    return math.log(raw_score, 10)
def score_it(bigram, incoming_char):
    """Heuristic scorer: the negative total character length of the feature.

    Defect fixed: the debug output used the Python-2-only print statement;
    it is now a py2/py3-compatible print() call with identical output.
    """
    feature = feature_gen(bigram, incoming_char)
    # Debug trace of the generated feature (same text the old statement printed).
    print("feature: " + u" ".join(feature))
    score = -len(u"".join(feature))
    return score
def score_it(bigram, incoming_char):
    """Dummy scorer returning minus the joined feature's length.

    Defect fixed: replaced the Python-2-only print statement with a
    parenthesized print() call that emits the same line under both
    Python 2 and Python 3.
    """
    feature = feature_gen(bigram, incoming_char)
    print("feature: " + u" ".join(feature))  # debug trace
    score = -len(u"".join(feature))
    return score
# Report the best estimator found by the grid search and its accuracy on
# the held-out test set.
print(clf.best_estimator_)
print(clf.score(X_test, y_test))

l_1mer = list_1mer()
l_2mer = dict_2mer(l_1mer)

Data_Feature = []
Data_Label = []

# Defect fixed: the two training files were opened but never closed;
# `with` blocks now guarantee they are released once read.
with open(r'./temp/word_train.txt', mode='r', encoding='utf-8') as word_train:
    content1 = word_train.readlines()
# Take 2000 word samples at random for the best-parameter search.
for i in getrandom(len(content1), 2000):
    Data_Feature.append(feature_gen(l_2mer, content1[i]))
    Data_Label.append(1)  # data_label of word is 1

with open(r'./temp/pinyin_train.txt', mode='r', encoding='utf-8') as pinyin_train:
    content2 = pinyin_train.readlines()
# Take 2000 pinyin samples at random for the best-parameter search.
for j in getrandom(len(content2), 2000):
    Data_Feature.append(feature_gen(l_2mer, content2[j]))
    Data_Label.append(0)  # data_label of pinyin is 0 (comment fixed: was "word")

# Train the candidate estimators and search for the best parameters.
SVC_search(Data_Feature, Data_Label)

print('Program running time:')
end = time.process_time()
from feature_gen import feature_gen

start = time.process_time()

l_1mer = list_1mer()
l_2mer = dict_2mer(l_1mer)

# Load the trained SVM classifier clf.pkl as clf_new.
clf_new = joblib.load(r'./model/clf.pkl')

# Idiom fix: files were opened and closed manually; `with` blocks now
# guarantee they are closed even if prediction or a write raises.
with open(r'./input/tokens.txt', mode='r', encoding='utf-8') as classifile:
    content = classifile.readlines()

# Generate one feature vector per line of tokens.txt.
Data_Feature = [feature_gen(l_2mer, line) for line in content]

# Predict each string in tokens.txt as word (1) or pinyin (0).
# (clf_new.predict_proba(Data_Feature) is available if the prediction
# probabilities should be written out alongside each token.)
Data_Label = clf_new.predict(Data_Feature)

with open(r'./output/pinyin.txt', mode='w', encoding='utf-8') as pinyin, \
        open(r'./output/words.txt', mode='w', encoding='utf-8') as words:
    for i in range(len(Data_Label)):
        if Data_Label[i] == 1:
            words.write(content[i])
        if Data_Label[i] == 0:
            pinyin.write(content[i])
from feature_gen import feature_gen
from mln_generator import mln_generator
import sys

# Script entry: instantiate the feature generator and run feature generation.
# NOTE(review): here feature_gen is used as a class/callable with a
# generate_features() method, unlike other modules where it is a plain
# function — confirm which feature_gen module is intended.
# NOTE(review): mln_generator and sys are imported but unused in this
# visible chunk — presumably used further down, or leftovers; verify.
fg = feature_gen()
fg.generate_features()