def training(filename, path):
    """Train a TMS model on *filename*, writing all artifacts under *path*.

    Uses field index 1 as the text column, segments the raw input
    (seg=1), and weights features with plain term frequency
    (local "tf", global "one").
    """
    tms.tms_train(
        filename,
        indexes=[1],
        main_save_path=path,
        seg=1,
        local_fun="tf",
        global_fun="one",
    )
import tms

# Train on the unsegmented corpus (seg=1 segments it first), saving the
# model under ./, then score the test file with the written config.
tms.tms_train("../data/Traindata_noseg.txt", main_save_path="./", seg=1)
tms.tms_predict(
    "../data/Testdata.txt",
    "./model/tms.config",
    result_save_path="./tms.result",
)
#!/usr/bin/python
# _*_ coding: utf-8 _*_
# NOTE(review): the encoding declaration originally appeared after the
# imports; per PEP 263 it only takes effect on line 1 or 2, so it has
# been moved here.
import sys

# Make the locally checked-out trainmodel package importable.
sys.path.insert(0, "D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\src")
sys.path.insert(0, "D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\")

import tms

# Segment (seg=1) column 2 of the ad corpus, filter Chinese stopwords,
# and train a model under ./data/, keeping every feature (ratio=1).
tms.tms_train(
    "D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\data\\ad.txt",
    main_save_path="./data/",
    seg=1,
    indexes=[2],
    str_splitTag=" ",
    stopword_filename="./chinese_stopword.txt",
    ratio=1,
)

# Predict on a (labelled) sample file with the freshly trained model.
tms.tms_predict(
    "D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\data\\adtrain3.txt",
    "D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\data\\model\\tms.config",
    result_save_path="./data/pre.result",
    seg=1,
    indexes=[2],
)

# Summarise the prediction quality of the result file.
tms.tms_analysis(".\\data\\pre.result")
"""Example: the data folder holds post.train / post.test samples whose
lines carry three fields: label, title, content.  Neither file is
word-segmented.  This example is meant to:

1. Segment and train on the title only; artifacts saved under ../data/
   and named title.*; libsvm with an RBF kernel; keep the top 40% of
   terms; feature weight tf*idf.
2. Segment and train on title+content together (title_content.*,
   libsvm, top 20% of terms, plain tf weighting).
3. Segment post.test, then predict with the trained models, writing
   post.result with the original label alongside the prediction.
4. Print the model's F-measure / Recall / Precision.
5. Sweep thresholds over [0, 1] and save per-threshold
   F/Recall/Precision into post.analysis.
"""

# Step 1: title-only model (libsvm, RBF kernel, top 40% of terms, tf*idf).
tms.tms_train(
    "../data/post.train",
    indexes=[1],
    main_save_path="../data/",
    stopword_filename="../data/stopwords.txt",
    svm_type="libsvm",
    svm_param="-t 2",
    config_name="title.config",
    dic_name="title.key",
    model_name="title.model",
    train_name="title.train",
    param_name="title.param",
    ratio=0.4,
    seg=1,
    local_fun="tf",
    global_fun="idf",
)

# Steps 2-5, kept for reference:
# tms.tms_train("../data/post.train",indexes=[1,2],main_save_path="../data/",stopword_filename="../data/stopwords.txt",svm_type="libsvm",config_name="title_content.config",dic_name="title_content.key",model_name="title_content.model",train_name="title_content.train",param_name="title_content.param",ratio=0.2,seg=1,local_fun="tf",global_fun="one")
# tms.tms_predict_multi("../data/post.test",config_files=["../data/model/title.config","../data/model/title_content.config"],indexes_lists=[[1],[1,2]],result_save_path="../data/post.result",result_indexes=[0],seg=1)
# tms.tms_analysis("../data/post.result",step=2,output_file="",indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2)
# tms.tms_analysis("../data/post.result",step=4,output_file="../data/post.analysis",min=0,max=1,indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2)
# NOTE(review): stale SVN merge-conflict residue ("=======" and
# ">>>>>>> .r167") was left embedded in the comment text below; it has
# been removed.
# tms.tms_analysis("../result/linear_title.result",step=4,output_file="../data/linear_title.analysis",indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2,min=0,max=2)
# import tms
# tms.tms_predict_multi("../data/weijin.test", ["../data/aliws/model/lineartitle.config","../data/aliws/model/lineartitle_content.config","../data/aliws/model/svmtitle.config","../data/aliws/model/svmtitle_content.config"],[[2],[2,3],[2],[2,3]],result_indexes=[0,1,2,3,4],result_save_path="../data/weijin.result")

# Build the dictionary from the training sample first, then convert the
# test sample into the corresponding svm input format using that
# dictionary.
# NOTE(review): this chunk relies on `tms` and `os` being imported
# earlier in the file — confirm against the full module.
filepath = u"E:\\算法与技术学习\\y语料库\\PaperCopus\\"
trainFile = u"reuters_raw_train_0.txt"
testFile = u"reuters_raw_test_0.txt"

# Train on pre-segmented text (seg=0), liblinear, keep top 40% of
# terms, tf * rf weighting; fields are tab-separated, tokens
# space-separated.
tms.tms_train(
    os.path.join(filepath, trainFile),
    main_save_path=filepath,
    stopword_filename=u"E:\\算法与技术学习\\y语料库\\PaperCopus\\stopwords.txt",
    svm_type="liblinear",
    config_name="weijin_ik_20120110.config",
    dic_name="weijin_ik_20120110.key",
    model_name="weijin_ik_20120110.model",
    train_name="weijin_ik_20120110.train",
    param_name="weijin_ik_20120110.param",
    ratio=0.4,
    seg=0,
    local_fun="tf",
    global_fun="rf",
    str_splitTag=" ",
    tc_splitTag="\t",
)

# Convert the test sample with the dictionary produced above.
tms.cons_train_sample_for_svm(
    os.path.join(filepath, testFile),
    u"E:\\算法与技术学习\\y语料库\\PaperCopus\\model\\weijin_ik_20120110.key",
    u"E:\\算法与技术学习\\y语料库\\PaperCopus\\new.txt",
    local_fun="tf",
    str_splitTag=" ",
    tc_splitTag="\t",
)
#-*- coding: utf-8 -*-
import tms

if __name__ == '__main__':
    # Segment and train the Twitter emotion corpus; artifacts land in ./.
    tms.tms_train("../twitterData/emotion/emotionTrain.txt", main_save_path="./", seg=1)
    # Follow-up segmentation/prediction steps, kept for reference:
    # tms.tms_segment("../twitterData/emotion/twitterTestData.txt",[1],"../twitterData/emotion/twitterTestData1.txt","^","\t",1)
    # tms.tms_predict("../twitterData/emotion/twitterTestData1.txt","./model/tms.config",result_save_path="../twitterData/emotion/tms.result")
# -*- coding: utf-8 -*- import tms import datetime import time def timediff(timestart, timestop): t = timestop - timestart time_day = t.days s_time = t.seconds ms_time = t.microseconds / 1000000 usedtime = int(s_time + ms_time) time_hour = usedtime / 60 / 60 time_minute = (usedtime - time_hour * 3600) / 60 time_second = usedtime - time_hour * 3600 - time_minute * 60 time_micsecond = (t.microseconds - t.microseconds / 1000000) / 1000 retstr = "%d天%d小时%d分%d秒%d毫秒" % (time_day, time_hour, time_minute, time_second, time_micsecond) return retstr if __name__ == "__main__": beginTime = datetime.datetime.now() tms.tms_train("../twitterData/twitterTrain.txt", main_save_path="./", seg=1, global_fun="idf") tms.tms_segment("../twitterData/twitterTestData.txt", [1], "../twitterData/twitterTestData1.txt", "^", "\t", 1) tms.tms_predict( "../twitterData/twitterTestData1.txt", "./model/tms.config", result_save_path="../twitterData/tms.result" ) endTime = datetime.datetime.now() d = timediff(beginTime, endTime) print d
# Multi-model prediction and analysis examples, kept for reference:
# tms.tms_predict_multi("../data/binary_seged.test", ["../data/libsvm_model/tms.config","../data/liblinear_model/tms.config"],indexes_lists=[[1],[1]],result_save_path="../data/binary_seged.result")
# tms.tms_analysis("../data/binary_seged.result",indexes=[0,1,2,3,4],true_label_index=4)

# Segment a file:
# tms.tms_segment("../data/binary.train", indexes=[1])

# Feature selection:
# tms.tms_feature_select("../data/binary_seged.train", indexes=[1], global_fun="idf", dic_name="test.key", ratio=0.05, stopword_filename="")

# Convert an input file into libsvm / liblinear input format:
# tms.cons_train_sample_for_svm("../data/binary_seged.train", "../data/model/dic.key", "../data/tms.train", [1])

# Grid-search the best SVM parameters / train on unsegmented input:
# tms.tms_train("../data/binary.train",seg=1)

'''Example: the data folder holds post.train / post.test samples whose
lines carry three fields: label, title, content.  Neither file is
word-segmented.  This example is meant to:
1. Segment and train on the title only (artifacts under ../data/ named
   title.*; libsvm, RBF kernel, top 40% of terms, tf*idf weighting).
2. Segment and train on title+content together (title_content.*,
   liblinear, top 20% of terms, tf weighting).
3. Segment post.test, predict with the trained models, and write
   post.result with the original label alongside the prediction.
4. Print the model's F-measure / Recall / Precision.
5. Sweep thresholds over [0, 1] and save per-threshold
   F/Recall/Precision into post.analysis.
'''

# Step 1: title-only model.
tms.tms_train(
    "../data/post.train",
    indexes=[1],
    main_save_path="../data/",
    stopword_filename="../data/stopwords.txt",
    svm_type="libsvm",
    svm_param="-t 2",
    config_name="title.config",
    dic_name="title.key",
    model_name="title.model",
    train_name="title.train",
    param_name="title.param",
    ratio=0.4,
    seg=1,
    local_fun="tf",
    global_fun="idf",
)

# Step 2, kept for reference:
# tms.tms_train("../data/post.train",indexes=[1,2],main_save_path="../data/",stopword_filename="../data/stopwords.txt",svm_type="liblinear",config_name="title_content.config",dic_name="title_content.key",model_name="title_content.model",train_name="title_content.train",param_name="title_content.param",ratio=0.2,seg=1,local_fun="tf",global_fun="one")
#tms.tms_predict_multi("../data/post.test",config_files=["../data/model/title.config","../data/model/title_content.config"],indexes_lists=[[1],[1,2]],result_save_path="../data/post.result",result_indexes=[0],seg=1) #tms.tms_analysis("../data/post.result",step=2,output_file="",indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2) #tms.tms_analysis("../data/post.result",step=4,output_file="../data/post.analysis",min=0,max=1,indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2)
#-*- coding: utf-8 -*- import tms import datetime import time def timediff(timestart, timestop): t = (timestop-timestart) time_day = t.days s_time = t.seconds ms_time = t.microseconds / 1000000 usedtime = int(s_time + ms_time) time_hour = usedtime / 60 / 60 time_minute = (usedtime - time_hour * 3600 ) / 60 time_second = usedtime - time_hour * 3600 - time_minute * 60 time_micsecond = (t.microseconds - t.microseconds / 1000000) / 1000 retstr = "%d天%d小时%d分%d秒%d毫秒" %(time_day, time_hour, time_minute, time_second, time_micsecond) return retstr if __name__ == '__main__': beginTime = datetime.datetime.now() tms.tms_train("../twitterData/twitterTrain.txt",main_save_path="./",seg=1,global_fun ='idf') tms.tms_segment("../twitterData/twitterTestData.txt",[1],"../twitterData/twitterTestData1.txt","^","\t",1) tms.tms_predict("../twitterData/twitterTestData1.txt","./model/tms.config",result_save_path="../twitterData/tms.result") endTime = datetime.datetime.now() d = timediff(beginTime , endTime) print d
import tms

# End-to-end example: segment + train the raw corpus, then run the
# resulting model over the test file.
tms.tms_train(
    "../data/Traindata_noseg.txt",
    main_save_path="./",
    seg=1,
)
tms.tms_predict(
    "../data/Testdata.txt",
    "./model/tms.config",
    result_save_path="./tms.result",
)
#-*- coding: utf-8 -*-
import tms

if __name__ == '__main__':
    # Train the emotion classifier from the raw (unsegmented) corpus.
    tms.tms_train(
        "../twitterData/emotion/emotionTrain.txt",
        main_save_path="./",
        seg=1,
    )
    # Segmentation / prediction steps, kept for reference:
    # tms.tms_segment("../twitterData/emotion/twitterTestData.txt",[1],"../twitterData/emotion/twitterTestData1.txt","^","\t",1)
    # tms.tms_predict("../twitterData/emotion/twitterTestData1.txt","./model/tms.config",result_save_path="../twitterData/emotion/tms.result")
#!/usr/bin/python
#_*_ coding: utf-8 _*_
# author: 张知临 [email protected]
import tms
import os

# NOTE(review): stale SVN merge-conflict residue ("=======" and
# ">>>>>>> .r167") was left embedded in the comment text below; it has
# been removed.
# Earlier experiments, kept for reference:
# tms.tms_train("../data/weijin.train",indexes=[4],main_save_path="../model/",stopword_filename="../data/stopwords.txt",svm_type="liblinear",config_name="linear_title.config",dic_name="linear_title.key",model_name="linear_title.model",train_name="linear_title.train",param_name="linear_title.param",ratio=0.4,seg=1,local_fun="tf",global_fun="rf")
# tms.tms_predict_multi("../data/weijin.test",config_files=["../model/model/linear_title.config"],indexes_lists=[[4]],result_save_path="../result/linear_title.result",result_indexes=[0],seg=1)
# tms.tms_analysis("../result/linear_title.result",step=4,output_file="../data/linear_title.analysis",indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2,min=0,max=2)
# tms.tms_predict_multi("../data/weijin.test", ["../data/aliws/model/lineartitle.config","../data/aliws/model/lineartitle_content.config","../data/aliws/model/svmtitle.config","../data/aliws/model/svmtitle_content.config"],[[2],[2,3],[2],[2,3]],result_indexes=[0,1,2,3,4],result_save_path="../data/weijin.result")

# Build the dictionary from the training sample first, then convert the
# test sample into the corresponding svm input format using that
# dictionary.
filepath = u"E:\\算法与技术学习\\y语料库\\PaperCopus\\"
trainFile = u"reuters_raw_train_0.txt"
testFile = u"reuters_raw_test_0.txt"

# Train on pre-segmented text (seg=0), liblinear, keep top 40% of
# terms, tf * rf weighting; fields tab-separated, tokens space-separated.
tms.tms_train(
    os.path.join(filepath, trainFile),
    main_save_path=filepath,
    stopword_filename=u"E:\\算法与技术学习\\y语料库\\PaperCopus\\stopwords.txt",
    svm_type="liblinear",
    config_name="weijin_ik_20120110.config",
    dic_name="weijin_ik_20120110.key",
    model_name="weijin_ik_20120110.model",
    train_name="weijin_ik_20120110.train",
    param_name="weijin_ik_20120110.param",
    ratio=0.4,
    seg=0,
    local_fun="tf",
    global_fun="rf",
    str_splitTag=" ",
    tc_splitTag="\t",
)

# Convert the test sample with the dictionary produced above.
tms.cons_train_sample_for_svm(
    os.path.join(filepath, testFile),
    u"E:\\算法与技术学习\\y语料库\\PaperCopus\\model\\weijin_ik_20120110.key",
    u"E:\\算法与技术学习\\y语料库\\PaperCopus\\new.txt",
    local_fun="tf",
    str_splitTag=" ",
    tc_splitTag="\t",
)