示例#1
0
def training(filename, path):
    """Train a TMS model on *filename*, writing all artifacts under *path*.

    Trains on field index 1 with segmentation enabled, tf local weighting
    and no global weighting ("one").
    """
    options = dict(
        indexes=[1],
        main_save_path=path,
        seg=1,
        local_fun="tf",
        global_fun="one",
    )
    tms.tms_train(filename, **options)
def training(filename, path):
    """Train a TMS model from *filename*; save everything under *path*."""
    tms.tms_train(
        filename,
        indexes=[1],
        main_save_path=path,
        seg=1,
        local_fun="tf",
        global_fun="one",
    )
示例#3
0
# Minimal end-to-end example: train on the (unsegmented, seg=1) training
# file, then predict the test file with the freshly written model config.
import tms
tms.tms_train("../data/Traindata_noseg.txt",main_save_path="./",seg=1)
tms.tms_predict("../data/Testdata.txt","./model/tms.config",result_save_path="./tms.result")
示例#4
0
#!/usr/bin/python
#_*_ coding: utf-8 _*_
# NOTE(review): moved the coding declaration up here -- per PEP 263 it is
# only honoured on line 1 or 2 of the file; below the imports (where it
# previously sat) it had no effect.

import sys
# Make the locally installed tms package importable from its absolute path.
sys.path.insert(
    0,
    "D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\src")
sys.path.insert(
    0, "D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\")
import tms

# Train on field 2 of ad.txt (seg=1 presumably segments the raw text first --
# TODO confirm against tms.tms_train), filtering with a Chinese stop-word
# list and keeping every feature (ratio=1).
tms.tms_train("D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\data\\ad.txt",\
              main_save_path="./data/",\
              seg=1,indexes=[2],\
              str_splitTag=" ",\
              stopword_filename="./chinese_stopword.txt",\
              ratio=1)

# Predict with the trained model and write the raw results.
tms.tms_predict("D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\data\\adtrain3.txt","D:\\09Limited_buffer\\earlywarningbyci\\classification\\trainmodel\\data\\model\\tms.config",\
                result_save_path="./data/pre.result",\
                seg=1,\
                indexes=[2])

# Summarise the prediction results.
tms.tms_analysis(".\\data\\pre.result")
"""假设data文件夹下有一个post.train和post.test的训练样本和测试样本,每一行有3个字段:label title content。样本都没有分词
该例子需要完成:
1、对title进行分词、训练,模型保存在../data/post/ 下,所有的文件都有title命名,SVM模型选择使用libsvm,核函数使用rbf,选择选择保留top 40%的词,特征权重使用tf*idf
2、对title和content一起进行分词、训练,模型保存在../data/post/ 下,所有的文件都有title_content命名,SVM模型选择使用libsvm,选择选择保留top 20%的词,特征权重使用tf
3、先对post.test进行分词,然后使用已经训练好的模型对post.test进行预测。结果以post.result命名,将原label与结果一同输出。
4、计算模型的预测F值、Recall、Precision,并将结果输出在屏幕上。
5、计算从[0,1]区间内各个阈值下对应的F值、Recall、Precision,将结果保存在post.analysis
"""
tms.tms_train(
    "../data/post.train",
    indexes=[1],
    main_save_path="../data/",
    stopword_filename="../data/stopwords.txt",
    svm_type="libsvm",
    svm_param="-t 2",
    config_name="title.config",
    dic_name="title.key",
    model_name="title.model",
    train_name="title.train",
    param_name="title.param",
    ratio=0.4,
    seg=1,
    local_fun="tf",
    global_fun="idf",
)
# tms.tms_train("../data/post.train",indexes=[1,2],main_save_path="../data/",stopword_filename="../data/stopwords.txt",svm_type="libsvm",config_name="title_content.config",dic_name="title_content.key",model_name="title_content.model",train_name="title_content.train",param_name="title_content.param",ratio=0.2,seg=1,local_fun="tf",global_fun="one")
# tms.tms_predict_multi("../data/post.test",config_files=["../data/model/title.config","../data/model/title_content.config"],indexes_lists=[[1],[1,2]],result_save_path="../data/post.result",result_indexes=[0],seg=1)
# tms.tms_analysis("../data/post.result",step=2,output_file="",indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2)
# tms.tms_analysis("../data/post.result",step=4,output_file="../data/post.analysis",min=0,max=1,indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2)
示例#6
0
#tms.tms_analysis("../result/linear_title.result",step=4,output_file="../data/linear_title.analysis",indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2,min=0,max=2)=======
#import tms
#tms.tms_predict_multi("../data/weijin.test", ["../data/aliws/model/lineartitle.config","../data/aliws/model/lineartitle_content.config","../data/aliws/model/svmtitle.config","../data/aliws/model/svmtitle_content.config"],[[2],[2,3],[2],[2,3]],result_indexes=[0,1,2,3,4],result_save_path="../data/weijin.result")>>>>>>> .r167

#先根据训练样本取得词典后,然后再根据词典把测试样本转换为相应的格式。
filepath = u"E:\\算法与技术学习\\y语料库\\PaperCopus\\"
trainFile = u"reuters_raw_train_0.txt"
testFile = u"reuters_raw_test_0.txt"
tms.tms_train(
    os.path.join(filepath, trainFile),
    main_save_path=filepath,
    stopword_filename=u"E:\\算法与技术学习\\y语料库\\PaperCopus\\stopwords.txt",
    svm_type="liblinear",
    config_name="weijin_ik_20120110.config",
    dic_name="weijin_ik_20120110.key",
    model_name="weijin_ik_20120110.model",
    train_name="weijin_ik_20120110.train",
    param_name="weijin_ik_20120110.param",
    ratio=0.4,
    seg=0,
    local_fun="tf",
    global_fun="rf",
    str_splitTag=" ",
    tc_splitTag="\t")
tms.cons_train_sample_for_svm(
    os.path.join(filepath, testFile),
    u"E:\\算法与技术学习\\y语料库\\PaperCopus\\model\\weijin_ik_20120110.key",
    u"E:\\算法与技术学习\\y语料库\\PaperCopus\\new.txt",
    local_fun="tf",
    str_splitTag=" ",
    tc_splitTag="\t")
#-*- coding: utf-8 -*-
import tms

if __name__ == '__main__':
    # Train an emotion classifier from the raw Twitter data; seg=1
    # presumably segments the text before training -- TODO confirm.
    tms.tms_train("../twitterData/emotion/emotionTrain.txt",main_save_path="./",seg=1)
    # Segmentation/prediction of the test data, kept for reference:
    #tms.tms_segment("../twitterData/emotion/twitterTestData.txt",[1],"../twitterData/emotion/twitterTestData1.txt","^","\t",1)
    #tms.tms_predict("../twitterData/emotion/twitterTestData1.txt","./model/tms.config",result_save_path="../twitterData/emotion/tms.result")
# -*- coding: utf-8 -*-
import tms
import datetime
import time


def timediff(timestart, timestop):
    """Format the elapsed time between two datetimes.

    Args:
        timestart: the earlier ``datetime.datetime``.
        timestop: the later ``datetime.datetime``.

    Returns:
        A string of the form "D天H小时M分S秒MS毫秒"
        (days/hours/minutes/seconds/milliseconds).
    """
    t = timestop - timestart
    time_day = t.days
    s_time = t.seconds
    # t.microseconds < 1_000_000, so this is always 0 and usedtime equals
    # t.seconds.  Floor division (//) keeps every intermediate an int under
    # both Python 2 and Python 3 -- with plain /, Python 3 produced floats
    # and the minute/second breakdown came out wrong.
    ms_time = t.microseconds // 1000000
    usedtime = int(s_time + ms_time)
    time_hour = usedtime // 60 // 60
    time_minute = (usedtime - time_hour * 3600) // 60
    time_second = usedtime - time_hour * 3600 - time_minute * 60
    # Subtracting t.microseconds // 1000000 (always 0) leaves microseconds
    # unchanged; // 1000 converts them to whole milliseconds.
    time_micsecond = (t.microseconds - t.microseconds // 1000000) // 1000
    retstr = "%d天%d小时%d分%d秒%d毫秒" % (time_day, time_hour, time_minute, time_second, time_micsecond)
    return retstr


if __name__ == "__main__":
    beginTime = datetime.datetime.now()
    tms.tms_train("../twitterData/twitterTrain.txt", main_save_path="./", seg=1, global_fun="idf")
    tms.tms_segment("../twitterData/twitterTestData.txt", [1], "../twitterData/twitterTestData1.txt", "^", "\t", 1)
    tms.tms_predict(
        "../twitterData/twitterTestData1.txt", "./model/tms.config", result_save_path="../twitterData/tms.result"
    )
    endTime = datetime.datetime.now()
    d = timediff(beginTime, endTime)
    print d
示例#9
0
#tms.tms_predict_multi("../data/binary_seged.test", ["../data/libsvm_model/tms.config","../data/liblinear_model/tms.config"],indexes_lists=[[1],[1]],result_save_path="../data/binary_seged.result")
#tms.tms_analysis("../data/binary_seged.result",indexes=[0,1,2,3,4],true_label_index=4)

'''对文件进行分词'''
#tms.tms_segment("../data/binary.train", indexes=[1])

'''特征选择'''
#tms.tms_feature_select("../data/binary_seged.train", indexes=[1], global_fun="idf", dic_name="test.key", ratio=0.05, stopword_filename="")

'''将输入文件构造为libsvm和liblinear的输入格式'''
#tms.cons_train_sample_for_svm("../data/binary_seged.train", "../data/model/dic.key", "../data/tms.train", [1])

'''对SVM模型选择最优的参数'''


'''对没有经过分词的文件进行训练'''
#tms.tms_train("../data/binary.train",seg=1)

# Scenario (translation of the Chinese notes below): data/ holds unsegmented
# post.train / post.test with 3 fields per line (label, title, content):
# (1) title-only libsvm RBF model, top 40% words, tf*idf; (2) title+content
# liblinear model, top 20% words, tf; (3) segment and predict post.test;
# (4) print F / Recall / Precision; (5) threshold sweep over [0,1] saved to
# post.analysis.
'''假设data文件夹下有一个post.train和post.test的训练样本和测试样本,每一行有3个字段:label title content。样本都没有分词
该例子需要完成:
1、对title进行分词、训练,模型保存在../data/post/ 下,所有的文件都有title命名,SVM模型选择使用libsvm,核函数使用rbf,选择选择保留top 40%的词,特征权重使用tf*idf
2、对title和content一起进行分词、训练,模型保存在../data/post/ 下,所有的文件都有title_content命名,SVM模型选择使用liblinear,选择选择保留top 20%的词,特征权重使用tf
3、先对post.test进行分词,然后使用已经训练好的模型对post.test进行预测。结果以post.result命名,将原label与结果一同输出。
4、计算模型的预测F值、Recall、Precision,并将结果输出在屏幕上。
5、计算从[0,1]区间内各个阈值下对应的F值、Recall、Precision,将结果保存在post.analysis
'''
# Step 1 is live; steps 2-5 are kept commented out for reference.
tms.tms_train("../data/post.train",indexes=[1],main_save_path="../data/",stopword_filename="../data/stopwords.txt",svm_type="libsvm",svm_param="-t 2",config_name="title.config",dic_name="title.key",model_name="title.model",train_name="title.train",param_name="title.param",ratio=0.4,seg=1,local_fun="tf",global_fun="idf")
#tms.tms_train("../data/post.train",indexes=[1,2],main_save_path="../data/",stopword_filename="../data/stopwords.txt",svm_type="liblinear",config_name="title_content.config",dic_name="title_content.key",model_name="title_content.model",train_name="title_content.train",param_name="title_content.param",ratio=0.2,seg=1,local_fun="tf",global_fun="one")
#tms.tms_predict_multi("../data/post.test",config_files=["../data/model/title.config","../data/model/title_content.config"],indexes_lists=[[1],[1,2]],result_save_path="../data/post.result",result_indexes=[0],seg=1)
#tms.tms_analysis("../data/post.result",step=2,output_file="",indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2)
#tms.tms_analysis("../data/post.result",step=4,output_file="../data/post.analysis",min=0,max=1,indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2)
示例#10
0
#-*- coding: utf-8 -*-
import tms
import datetime
import time

def timediff(timestart, timestop):
    """Format the elapsed time between two datetimes.

    Args:
        timestart: the earlier ``datetime.datetime``.
        timestop: the later ``datetime.datetime``.

    Returns:
        A string of the form "D天H小时M分S秒MS毫秒"
        (days/hours/minutes/seconds/milliseconds).
    """
    t = timestop - timestart
    time_day = t.days
    s_time = t.seconds
    # t.microseconds < 1_000_000, so this term is always 0 and usedtime
    # equals t.seconds.  Floor division (//) keeps the arithmetic integral
    # under both Python 2 and Python 3 -- plain / made Python 3 produce
    # floats and a wrong minute/second breakdown.
    ms_time = t.microseconds // 1000000
    usedtime = int(s_time + ms_time)
    time_hour = usedtime // 60 // 60
    time_minute = (usedtime - time_hour * 3600) // 60
    time_second = usedtime - time_hour * 3600 - time_minute * 60
    # t.microseconds // 1000000 is 0, so this just converts microseconds to
    # whole milliseconds.
    time_micsecond = (t.microseconds - t.microseconds // 1000000) // 1000
    retstr = "%d天%d小时%d分%d秒%d毫秒" % (time_day, time_hour, time_minute, time_second, time_micsecond)
    return retstr

if __name__ == '__main__':
    # Time the train -> segment -> predict pipeline end to end.
    beginTime = datetime.datetime.now()
    tms.tms_train("../twitterData/twitterTrain.txt",main_save_path="./",seg=1,global_fun ='idf')
    tms.tms_segment("../twitterData/twitterTestData.txt",[1],"../twitterData/twitterTestData1.txt","^","\t",1)
    tms.tms_predict("../twitterData/twitterTestData1.txt","./model/tms.config",result_save_path="../twitterData/tms.result")
    endTime = datetime.datetime.now()
    d = timediff(beginTime , endTime)
    # print(d) is a function call on Python 3 and a parenthesized print
    # statement on Python 2, so it works under both; the bare `print d`
    # form is a syntax error under Python 3.
    print(d)
示例#11
0
# Minimal end-to-end example: train on the (unsegmented, seg=1) training
# file, then predict the test file with the freshly written model config.
import tms
tms.tms_train("../data/Traindata_noseg.txt", main_save_path="./", seg=1)
tms.tms_predict("../data/Testdata.txt",
                "./model/tms.config",
                result_save_path="./tms.result")
示例#12
0
#-*- coding: utf-8 -*-
import tms

if __name__ == '__main__':
    # Train an emotion classifier from the raw Twitter data; seg=1
    # presumably segments the text before training -- TODO confirm.
    tms.tms_train("../twitterData/emotion/emotionTrain.txt",
                  main_save_path="./",
                  seg=1)
    # Segmentation/prediction of the test data, kept for reference:
    #tms.tms_segment("../twitterData/emotion/twitterTestData.txt",[1],"../twitterData/emotion/twitterTestData1.txt","^","\t",1)
    #tms.tms_predict("../twitterData/emotion/twitterTestData1.txt","./model/tms.config",result_save_path="../twitterData/emotion/tms.result")
示例#13
0
#!/usr/bin/python
#_*_ coding: utf-8 _*_
#author: 张知临 [email protected]
# One import per line (PEP 8); previously `import tms,os`.
import tms
import os
# NOTE(review): stripped the stray "=======" / ">>>>>>> .r167" fragments that
# trailed two of the commented-out lines below -- leftover SVN merge-conflict
# markers, not part of the examples.
#tms.tms_train("../data/weijin.train",indexes=[4],main_save_path="../model/",stopword_filename="../data/stopwords.txt",svm_type="liblinear",config_name="linear_title.config",dic_name="linear_title.key",model_name="linear_title.model",train_name="linear_title.train",param_name="linear_title.param",ratio=0.4,seg=1,local_fun="tf",global_fun="rf")
#tms.tms_predict_multi("../data/weijin.test",config_files=["../model/model/linear_title.config"],indexes_lists=[[4]],result_save_path="../result/linear_title.result",result_indexes=[0],seg=1)
#tms.tms_analysis("../result/linear_title.result",step=4,output_file="../data/linear_title.analysis",indexes=[0,1,2],predicted_label_index=0,predicted_value_index=1,true_label_index=2,min=0,max=2)
#import tms
#tms.tms_predict_multi("../data/weijin.test", ["../data/aliws/model/lineartitle.config","../data/aliws/model/lineartitle_content.config","../data/aliws/model/svmtitle.config","../data/aliws/model/svmtitle_content.config"],[[2],[2,3],[2],[2,3]],result_indexes=[0,1,2,3,4],result_save_path="../data/weijin.result")


# Build the dictionary from the training sample first, then use it to convert
# the test sample into the corresponding SVM input format.
filepath = u"E:\\算法与技术学习\\y语料库\\PaperCopus\\"
trainFile=u"reuters_raw_train_0.txt"
testFile=u"reuters_raw_test_0.txt"
tms.tms_train(os.path.join(filepath,trainFile),main_save_path=filepath,stopword_filename=u"E:\\算法与技术学习\\y语料库\\PaperCopus\\stopwords.txt",svm_type="liblinear",config_name="weijin_ik_20120110.config",dic_name="weijin_ik_20120110.key",model_name="weijin_ik_20120110.model",train_name="weijin_ik_20120110.train",param_name="weijin_ik_20120110.param",ratio=0.4,seg=0,local_fun="tf",global_fun="rf",str_splitTag=" ",tc_splitTag="\t")
tms.cons_train_sample_for_svm(os.path.join(filepath,testFile), u"E:\\算法与技术学习\\y语料库\\PaperCopus\\model\\weijin_ik_20120110.key",u"E:\\算法与技术学习\\y语料库\\PaperCopus\\new.txt", local_fun="tf",str_splitTag=" ",tc_splitTag="\t")