def ctm_train_model(sample_save_path, svm_type, param, model_save_path):
    '''Train a model from a sample file, save it, and report training-set metrics.

    The SVM backend is selected with ``svm_type``, the problem is read from
    ``sample_save_path``, trained with ``param`` and stored at
    ``model_save_path``. The model is then run on the same samples it was
    trained on and the three metrics returned by ``tms_svm.predict`` are
    printed: as (Micro, Macro, ACC) when more than two distinct labels are
    present, otherwise as (f_score, recall, presion).

    Returns the trained model.
    '''
    tms_svm.set_svm_type(svm_type)
    targets, samples = tms_svm.read_problem(sample_save_path)
    model = tms_svm.train(targets, samples, param)
    tms_svm.save_model(model_save_path, model)

    # More than two distinct labels selects the multi-class report format.
    is_multiclass = len(set(targets)) > 2

    # Evaluate on the training samples themselves.
    pred_labels, metrics, pred_values = tms_svm.predict(targets, samples, model)
    first, second, third = metrics
    if is_multiclass:
        report = "(Micro=%g, Macro=%g, ACC=%g)"
    else:
        report = "(f_score=%g,recall=%g,presion=%g)"
    print(report % (first, second, third))
    return model
def predict_rsl(config_file, data_file_name, result_save_path, param):
    '''Predict every sample of a libsvm-format file and save the results.

    The model is the element at index 3 of ``load_tms_model(config_file)``.
    Each line of ``data_file_name`` is parsed as
    ``<label> <index>:<value> ...``; a line holding only a label is treated
    as a sample without features.

    For every sample one line is written to ``result_save_path``:
    predicted label, score and original label, each followed by a tab.

    ``param`` is appended to ``'-b '`` to build the option string handed to
    ``tms_svm.predict`` (presumably LIBSVM's probability-estimate switch --
    confirm). When ``param`` is truthy the score is picked directly from the
    values returned by predict; otherwise it is
    ``tms_svm.classer_value`` of those values.
    '''
    print("-----------------正在加载训练模型-------------------\n")
    option = '-b ' + str(param)
    print(config_file)
    model = load_tms_model(config_file)[3]
    print(model)
    print("-----------------正在对样本进行预测-------------------\n")

    # Read the whole input before the output file is opened (and truncated).
    with open(data_file_name, 'r') as f_data:
        lines = f_data.readlines()

    # The context manager guarantees the result file is closed even when
    # parsing or prediction raises part-way through.
    with open(result_save_path, 'w') as fs:
        for line in lines:
            parts = line.split(None, 1)
            # In case of an instance with all-zero features.
            if len(parts) == 1:
                parts += ['']
            label, features = parts
            xi = {}
            y = [float(label)]
            for e in features.split():
                ind, val = e.split(":")
                xi[int(ind)] = float(val)

            p_lab, p_acc, p_sc = tms_svm.predict(y, [xi], model, option)
            label = p_lab[0]
            if param:
                # NOTE(review): if predict returns float labels, str(1.0) is
                # '1.0' and never equals '1', so index 0 would always be
                # used -- confirm the label type.
                if str(label) == str(1):
                    sc = p_sc[0][1]
                else:
                    sc = p_sc[0][0]
            else:
                sc = tms_svm.classer_value(p_sc[0])
            print(sc)
            fs.write(str(label) + "\t" + str(sc) + "\t" + str(y[0]) + "\t\n")
    print("-----------------预测完毕-------------------")
def cal_sc_optim(lab, m, text, dic_list, local_fun, global_weight, str_splitTag):
    '''Score one text with model ``m``.

    Takes a label, the model, the text to predict, the dictionary and the
    token separator. The text is stripped, split on ``str_splitTag`` and
    turned into an SVM problem by ``cons_pro_for_svm`` using the local
    weighting function resolved by ``measure.local_f(local_fun)``.

    Returns a pair: the first predicted label and
    ``tms_svm.classer_value`` of the first sample's values.
    '''
    weighting = measure.local_f(local_fun)
    tokens = text.strip().split(str_splitTag)
    targets, samples = cons_pro_for_svm(lab, tokens, dic_list, weighting, global_weight)
    predicted, _accuracy, values = tms_svm.predict(targets, samples, m)
    top_label = predicted[0]
    score = tms_svm.classer_value(values[0])
    return top_label, score
def predict_rsl(config_file, data_file_name, result_save_path, param):
    '''Predict every sample of a libsvm-format file and save the results.

    The model is the element at index 3 of ``load_tms_model(config_file)``.
    Each line of ``data_file_name`` is parsed as
    ``<label> <index>:<value> ...``; a line holding only a label is treated
    as a sample without features.

    For every sample one line is written to ``result_save_path``:
    predicted label, score and original label, each followed by a tab.

    ``param`` is appended to ``'-b '`` to build the option string handed to
    ``tms_svm.predict`` (presumably LIBSVM's probability-estimate switch --
    confirm). When ``param`` is truthy the score is picked directly from the
    values returned by predict; otherwise it is
    ``tms_svm.classer_value`` of those values.
    '''
    print("-----------------正在加载训练模型-------------------\n")
    option = '-b ' + str(param)
    print(config_file)
    model = load_tms_model(config_file)[3]
    print(model)
    print("-----------------正在对样本进行预测-------------------\n")

    # Read the whole input before the output file is opened (and truncated).
    with open(data_file_name, 'r') as f_data:
        lines = f_data.readlines()

    # The context manager guarantees the result file is closed even when
    # parsing or prediction raises part-way through.
    with open(result_save_path, 'w') as fs:
        for line in lines:
            parts = line.split(None, 1)
            # In case of an instance with all-zero features.
            if len(parts) == 1:
                parts += ['']
            label, features = parts
            xi = {}
            y = [float(label)]
            for e in features.split():
                ind, val = e.split(":")
                xi[int(ind)] = float(val)

            p_lab, p_acc, p_sc = tms_svm.predict(y, [xi], model, option)
            label = p_lab[0]
            if param:
                # NOTE(review): if predict returns float labels, str(1.0) is
                # '1.0' and never equals '1', so index 0 would always be
                # used -- confirm the label type.
                if str(label) == str(1):
                    sc = p_sc[0][1]
                else:
                    sc = p_sc[0][0]
            else:
                sc = tms_svm.classer_value(p_sc[0])
            print(sc)
            fs.write(str(label) + "\t" + str(sc) + "\t" + str(y[0]) + "\t\n")
    print("-----------------预测完毕-------------------")
def cal_sc_optim(lab, m, text, dic_list, local_fun, global_weight, str_splitTag):
    '''Score one text with model ``m``.

    Takes a label, the model, the text to predict, the dictionary and the
    token separator. The text is stripped, split on ``str_splitTag`` and
    turned into an SVM problem by ``ctmutil.cons_pro_for_svm`` using the
    local weighting function resolved by ``measure.local_f(local_fun)``.

    Returns a pair: the first predicted label and
    ``tms_svm.classer_value`` of the first sample's values. According to the
    original author's note this is the direct score for binary problems and
    a combined score for multi-class problems.
    '''
    weighting = measure.local_f(local_fun)
    tokens = text.strip().split(str_splitTag)
    targets, samples = ctmutil.cons_pro_for_svm(lab, tokens, dic_list, weighting, global_weight)
    predicted, _accuracy, values = tms_svm.predict(targets, samples, m)
    top_label = predicted[0]
    score = tms_svm.classer_value(values[0])
    return top_label, score
def extract_im_feature(filename, content_indexs, feature_indexs, dic_path, svm_model, delete, str_splitTag, tc_splitTag):
    '''Load an SVM model and walk the lines of ``filename``.

    Each line is stripped and split on ``tc_splitTag``; the fields selected
    by ``content_indexs`` are concatenated, each one prefixed with
    ``str_splitTag``.

    ``feature_indexs``, ``dic_path`` and ``delete`` are accepted but not used
    by the visible code, and nothing is returned.
    '''
    m = tms_svm.load_model(svm_model)
    # The context manager closes the input file even if a line fails.
    with open(filename, 'r') as f:
        for line in f:
            text = line.strip().split(tc_splitTag)
            text_temp = "".join(str_splitTag + text[i] for i in content_indexs)
            # NOTE(review): predict is called without any arguments and
            # neither ``m`` nor ``text_temp`` is used -- this looks
            # unfinished; confirm the intended call before relying on it.
            p_lab, p_acc, p_sc = tms_svm.predict()
def cal_sc_optim(lab, m, text, dic_list, local_fun, global_weight, str_splitTag):
    '''Score one text with model ``m``.

    Takes a label, the model, the text to predict, the dictionary and the
    token separator. The text is stripped, split on ``str_splitTag`` and
    turned into an SVM problem by ``cons_pro_for_svm`` using the local
    weighting function resolved by ``measure.local_f(local_fun)``.

    Returns a pair: the first predicted label and
    ``tms_svm.classer_value`` of the first sample's values.
    '''
    weighting = measure.local_f(local_fun)
    tokens = text.strip().split(str_splitTag)
    targets, samples = cons_pro_for_svm(lab, tokens, dic_list, weighting, global_weight)
    predicted, _accuracy, values = tms_svm.predict(targets, samples, m)
    top_label = predicted[0]
    score = tms_svm.classer_value(values[0])
    return top_label, score
def save_train_for_lsa(test_path, model_save_path, lsa_train_save_path):
    '''Predict a sample set with a saved classifier and dump it in LSA format.

    The problem is read from ``test_path`` and predicted with the model
    loaded from ``model_save_path``. One line per sample is written to
    ``lsa_train_save_path``::

        <int label>\\t<first prediction value>\\t<index>:<value>\\t...\\n

    with the features ordered by ascending index.
    '''
    y, x = tms_svm.read_problem(test_path)
    m = tms_svm.load_model(model_save_path)
    p_lab, p_acc, p_sc = tms_svm.predict(y, x, m)

    # The context manager closes the output file even if a sample fails.
    with open(lsa_train_save_path, 'w') as f:
        for i, label in enumerate(y):
            fields = [str(int(label)), str(p_sc[i][0])]
            ordered = sorted(x[i].items(), key=lambda item: item[0])
            fields.extend(str(index) + ":" + str(value) for index, value in ordered)
            # Every field, including the last one, is followed by a tab.
            f.write("\t".join(fields) + "\t\n")
def save_train_for_lsa(test_path, model_save_path, lsa_train_save_path):
    '''Predict a sample set with a saved classifier and dump it in LSA format.

    The problem is read from ``test_path`` and predicted with the model
    loaded from ``model_save_path``. One line per sample is written to
    ``lsa_train_save_path``::

        <int label>\\t<first prediction value>\\t<index>:<value>\\t...\\n

    with the features ordered by ascending index.
    '''
    y, x = tms_svm.read_problem(test_path)
    m = tms_svm.load_model(model_save_path)
    p_lab, p_acc, p_sc = tms_svm.predict(y, x, m)

    # The context manager closes the output file even if a sample fails.
    with open(lsa_train_save_path, 'w') as f:
        for i, label in enumerate(y):
            fields = [str(int(label)), str(p_sc[i][0])]
            ordered = sorted(x[i].items(), key=lambda item: item[0])
            fields.extend(str(index) + ":" + str(value) for index, value in ordered)
            # Every field, including the last one, is followed by a tab.
            f.write("\t".join(fields) + "\t\n")
def cal_sc_optim(model_file_name, data_file_name, result_save_path):
    '''Predict every sample of a libsvm-format file with a saved LIBSVM model.

    The model is loaded from ``model_file_name`` with
    ``libsvm.svm_load_model`` and converted with ``toPyModel``. If it cannot
    be loaded, a message is printed and the function returns without
    predicting anything.

    Each line of ``data_file_name`` is parsed as
    ``<label> <index>:<value> ...``; a line holding only a label is treated
    as a sample without features. For every sample one line is written to
    ``result_save_path``: the predicted label and
    ``tms_svm.classer_value`` of the prediction values, each followed by a
    tab.
    '''
    print("-----------------正在加载训练模型-------------------\n")
    model = libsvm.svm_load_model(model_file_name)
    if not model:
        print("can't open model file %s" % model_file_name)
        # Nothing can be predicted without a model.
        return
    model = toPyModel(model)
    print("-----------------正在对样本进行预测-------------------\n")

    # Read the whole input before the output file is opened (and truncated).
    with open(data_file_name) as f_data:
        lines = f_data.readlines()

    # Open the result file once: reopening it in 'w' mode for every sample
    # would keep only the last prediction.
    with open(result_save_path, 'w') as fs:
        for line in lines:
            parts = line.split(None, 1)
            # In case of an instance with all-zero features.
            if len(parts) == 1:
                parts += ['']
            label, features = parts
            xi = {}
            for e in features.split():
                ind, val = e.split(":")
                xi[int(ind)] = float(val)
            y = [float(label)]
            # Labels and samples are passed as parallel one-element lists,
            # and the model loaded above is the one used for prediction.
            p_lab, p_acc, p_sc = tms_svm.predict(y, [xi], model)
            label = p_lab[0]
            sc = tms_svm.classer_value(p_sc[0])
            fs.write(str(label) + "\t" + str(sc) + "\t\n")
    print("-----------------预测完毕-------------------")
def cal_sc_optim(model_file_name, data_file_name, result_save_path):
    '''Predict every sample of a libsvm-format file with a saved LIBSVM model.

    The model is loaded from ``model_file_name`` with
    ``libsvm.svm_load_model`` and converted with ``toPyModel``. If it cannot
    be loaded, a message is printed and the function returns without
    predicting anything.

    Each line of ``data_file_name`` is parsed as
    ``<label> <index>:<value> ...``; a line holding only a label is treated
    as a sample without features. For every sample one line is written to
    ``result_save_path``: the predicted label and
    ``tms_svm.classer_value`` of the prediction values, each followed by a
    tab.
    '''
    print("-----------------正在加载训练模型-------------------\n")
    model = libsvm.svm_load_model(model_file_name)
    if not model:
        print("can't open model file %s" % model_file_name)
        # Nothing can be predicted without a model.
        return
    model = toPyModel(model)
    print("-----------------正在对样本进行预测-------------------\n")

    # Read the whole input before the output file is opened (and truncated).
    with open(data_file_name) as f_data:
        lines = f_data.readlines()

    # Open the result file once: reopening it in 'w' mode for every sample
    # would keep only the last prediction.
    with open(result_save_path, 'w') as fs:
        for line in lines:
            parts = line.split(None, 1)
            # In case of an instance with all-zero features.
            if len(parts) == 1:
                parts += ['']
            label, features = parts
            xi = {}
            for e in features.split():
                ind, val = e.split(":")
                xi[int(ind)] = float(val)
            y = [float(label)]
            # Labels and samples are passed as parallel one-element lists,
            # and the model loaded above is the one used for prediction.
            p_lab, p_acc, p_sc = tms_svm.predict(y, [xi], model)
            label = p_lab[0]
            sc = tms_svm.classer_value(p_sc[0])
            fs.write(str(label) + "\t" + str(sc) + "\t\n")
    print("-----------------预测完毕-------------------")
def ctm_model_predict(test_path, m):
    '''Read the test samples at ``test_path`` and predict them with model ``m``.

    Returns whatever ``tms_svm.predict`` returns for the loaded problem.
    '''
    problem = tms_svm.read_problem(test_path)
    targets, samples = problem
    outcome = tms_svm.predict(targets, samples, m)
    return outcome
def ctm_model_predict(test_path, m):
    '''Read the test samples at ``test_path`` and predict them with model ``m``.

    Returns whatever ``tms_svm.predict`` returns for the loaded problem.
    '''
    problem = tms_svm.read_problem(test_path)
    targets, samples = problem
    outcome = tms_svm.predict(targets, samples, m)
    return outcome