def reducer(self, key, values):
        '''reducer function'''
        prob_y = []
        prob_x = []
        line = key.split(None, 1)
        # Set the SVM training parameters (5-fold cross-validation on the candidate c/g).
        if len(line) == 1:
            svm_param = " -v 5 -c " + str(line[0])
        elif len(line) >= 2:
            svm_param = " -v 5 -c " + str(line[0]) + " -g " + str(line[1])

        # Collect and parse the training samples (libsvm sparse format).
        for value in values:
            value = value.split(None, 1)
            if len(value) == 1: value += ['']
            label, features = value
            xi = {}
            for e in features.split():
                ind, val = e.split(":")
                xi[int(ind)] = float(val)
            prob_y += [float(label)]
            prob_x += [xi]

        # Run cross-validation training with the assembled parameters and samples.
        tms_svm.set_svm_type(svm_type)
        ratio = tms_svm.train(prob_y, prob_x, svm_param)
        self.write_output(key, str(ratio))
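A hedged sketch of the inputs the reducer above expects: the key holds the candidate c (and optionally g) values, each value is one training sample in libsvm sparse format, and the concrete numbers below are made up for illustration; the parsing mirrors the reducer's own.
# Illustrative key/value pair (assumed format, not data from the original project).
sample_key = "0.5 0.125"                        # candidate "c g" pair
sample_values = ["+1 1:0.4 7:1.2", "-1 3:0.9"]  # "label index:value ..." samples
for value in sample_values:
    label, features = (value.split(None, 1) + [''])[:2]
    xi = dict((int(i), float(v)) for i, v in (e.split(":") for e in features.split()))
    print label, xi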
Example #2
def ctm_predict_multi(text, indexes_lists, dic_list, model_list,
                      local_fun_list, global_weight_list, str_splitTag):
    '''Predict with multiple models, e.g. when one text has to be scored by several models.
    indexes_lists is two-dimensional.
    The main process to predict the text score; it supports deploying more than one model,
    where each model consists of indexes, a dictionary and a model.
    '''
    k = len(dic_list)  # number of prediction models
    label_list = [0] * k
    score_list = [0] * k

    for j in range(k):
        indexes = indexes_lists[j]
        model = model_list[j]
        dic = dic_list[j]
        local_fun = local_fun_list[j]
        global_weight = global_weight_list[j]
        if len(text) < indexes[-1] + 1 or len(text) < tms_predict_config.result_indexes[-1] + 1:
            label = 0
            sc = 0
        else:
            text_temp = ""
            for index in indexes:
                text_temp += str_splitTag + text[index]
            # libsvm models expose get_svm_type(); liblinear models expose get_nr_feature().
            if dir(model).count("get_svm_type") == 1:
                tms_svm.set_svm_type("libsvm")
            if dir(model).count("get_nr_feature") == 1:
                tms_svm.set_svm_type("liblinear")
            label, sc = cal_sc_optim(1, model, text_temp, dic, local_fun,
                                     global_weight, str_splitTag)
        score_list[j] = float(sc)
        label_list[j] = float(label)
    return label_list, score_list
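A hedged usage sketch for the function above, assuming two models loaded via load_tms_model (defined further down); the config paths, field indexes and split tag are illustrative assumptions.
# All paths, indexes and tags below are assumptions for illustration only.
lf_a, dic_a, gw_a, model_a, _ = load_tms_model("model_a/tms.config")
lf_b, dic_b, gw_b, model_b, _ = load_tms_model("model_b/tms.config")
fields = "123\tsome^segmented^title\tsome^segmented^body".split("\t")
labels, scores = ctm_predict_multi(fields, [[1], [2]],
                                   [dic_a, dic_b], [model_a, model_b],
                                   [lf_a, lf_b], [gw_a, gw_b], "^")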
Example #4
def tms_train_model(problem_path, svm_type="libsvm", param="", model_save_path="../svm.model"):
    '''Model training routine. Depending on the arguments it can train either a libsvm or a liblinear model.
    Required argument:
        problem_path: path (and name) of the input problem file.
    Optional arguments:
        svm_type: SVM type, "libsvm" or "liblinear". Defaults to "libsvm".
        param: user-supplied SVM parameters; note that libsvm and liblinear accept different options. Defaults to "".
        model_save_path: path where the model is saved, "../svm.model" by default.
    '''
    tms_svm.set_svm_type(svm_type)
    train_model.ctm_train_model(problem_path, param, model_save_path)
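A hedged usage sketch: the problem file path is an illustrative assumption; the file is expected to be in the libsvm sparse format read by tms_svm.read_problem.
# "../svm.train" is an assumed path to a libsvm-format problem file.
tms_train_model("../svm.train", svm_type="liblinear", param="-c 1.0",
                model_save_path="../svm.model")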
Example #6
def grid(problem_path, result_save_path, svm_type, coarse_c_range, coarse_g_range, fine_c_step, fine_g_step):
    '''Main entry point for the grid search.
    svm_type: type of model to use, "libsvm" or "liblinear".
    coarse_c_range: coarse-grained search range for c, a tuple (begin, end, step).
    coarse_g_range: coarse-grained search range for g, a tuple (begin, end, step).
    fine_c_step: fine-grained step for c; the search range is (fine_c - coarse_c_step, fine_c + coarse_c_step, fine_c_step). If 0, it is fixed to (fine_c, fine_c, fine_c).
    fine_g_step: fine-grained step for g; the search range is (fine_g - coarse_g_step, fine_g + coarse_g_step, fine_g_step). If 0, it is fixed to (fine_g, fine_g, fine_g).
    '''
    tms_svm.set_svm_type(svm_type)
    y, x = tms_svm.read_problem(problem_path)
    fw = file(result_save_path, 'w')
    c, g = grid_search_for_large_data(y, x, fw, coarse_c_range, coarse_g_range,
                                      fine_c_step, fine_g_step)
    fw.close()
    return c, g
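A hedged usage sketch for the grid search above; the file paths are assumptions, and the coarse ranges are the log2 exponent ranges that ctm_train (further down) uses for libsvm.
# Paths are illustrative assumptions; the ranges are exponents of 2, as in ctm_train below.
best_c, best_g = grid("../svm.train", "../svm.param", "libsvm",
                      (-5, 7, 2), (3, -10, -2), 0.5, 0.5)
print "best c = %s, best g = %s" % (best_c, best_g)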
def ctm_train_model(sample_save_path, svm_type, param, model_save_path):
    '''Train a model given the sample file, the training parameters and the path where the
    model is saved; finally report how the model performs on the training samples.'''
    tms_svm.set_svm_type(svm_type)
    y, x = tms_svm.read_problem(sample_save_path)
    m = tms_svm.train(y, x, param)
    tms_svm.save_model(model_save_path, m)
    labels = {}.fromkeys(y).keys()
    if len(labels) > 2:
        pred_labels, (Micro, Macro, ACC), pred_values = tms_svm.predict(y, x, m)
        print "(Micro=%g, Macro=%g, ACC=%g)" % (Micro, Macro, ACC)
    else:
        pred_labels, (f_score, recall, precision), pred_values = tms_svm.predict(y, x, m)
        print "(f_score=%g, recall=%g, precision=%g)" % (f_score, recall, precision)
    return m
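A hedged usage sketch: the sample path is an assumption, and the parameter string mirrors the default svm_param of the command-line driver further down.
# "../svm.train" is an assumed sample path; "-s 0 -t 2 -c 1.0 -g 0.25" is an RBF-kernel setting.
m = ctm_train_model("../svm.train", "libsvm", "-s 0 -t 2 -c 1.0 -g 0.25", "../svm.model")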
Example #8
def load_tms_model(config_file):
    '''Load the dictionary, global weights, local weighting function and SVM model described by a model config file.'''
    model_main_path = os.path.dirname(config_file)
    f = file(config_file, 'r')
    for line in f.readlines():
        text = line.split(":")
        if text[0].strip() == "DicName":
            dic, global_weight = fileutil.read_dic_ex(os.path.join(model_main_path, text[1].strip()), dtype=str)
        if text[0].strip() == "ModelName":
            tms_svm.set_svm_type(tms_svm.detect_svm_type(os.path.join(model_main_path, text[1].strip())))
            model = tms_svm.load_model(os.path.join(model_main_path, text[1].strip()))
        if text[0].strip() == "LocalFun":
            local_fun = measure.local_f(text[1].strip())
        if text[0].strip() == "WordSeg":
            seg = int(float(text[1]))
    f.close()
    return local_fun, dic, global_weight, model, seg
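A hedged sketch of the key:value configuration this loader parses; the file name and entry values are illustrative assumptions (only the keys the code checks are shown).
# Assumed contents of model/tms.config, one "Key:Value" entry per line:
#   DicName:dic.key
#   ModelName:tms.model
#   LocalFun:tf
#   WordSeg:1
local_fun, dic, global_weight, model, seg = load_tms_model("model/tms.config")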
Example #11
def ctm_predict_multi(filename,
                      config_files,
                      indexes_lists,
                      result_save_path,
                      result_indexes,
                      str_splitTag,
                      tc_splitTag,
                      seg,
                      delete=False,
                      change_decode=False,
                      in_decode="UTF-8",
                      out_encode="GBK"):
    '''Predict with multiple models, e.g. when one text has to be scored by several models.
    title_indexes, dic_path and model_path are two-dimensional.
    '''
    if seg != 0:
        print "----------------- Segmenting the source text -------------------"
        all_index = list()
        for index in indexes_lists:
            all_index.extend(index)
        segment_file = os.path.dirname(filename) + "/segmented"
        segment.file_seg(filename, all_index, segment_file, str_splitTag,
                         tc_splitTag, seg)
        filename = segment_file
    k = len(config_files)  # number of prediction models
    dic_list = []
    local_fun_list = []
    model_list = []
    global_weight_list = []
    for i in range(k):
        local_fun, dic, global_weight, model, seg_ori = load_tms_model(
            config_files[i])
        dic_list.append(dic)
        local_fun_list.append(local_fun)
        model_list.append(model)
        global_weight_list.append(global_weight)

    print "-----------------正在对源文本进行词干化处理-------------------"
    stem.stemFile(filename, str_splitTag, tc_splitTag)

    f = file(filename, 'r')
    fs = file(result_save_path, 'w')
    print "-----------------正在对样本进行预测-------------------"
    for line in f.readlines():
        if len(line.strip()) < 1:
            continue
        if change_decode == True:
            line = line.decode(in_decode).encode(out_encode, 'ignore')
        text = line.strip().split(tc_splitTag)

        for j in range(k):
            indexes = indexes_lists[j]
            model = model_list[j]
            dic = dic_list[j]
            local_fun = local_fun_list[j]
            global_weight = global_weight_list[j]
            if len(text) < indexes[-1] + 1 or len(text) < result_indexes[-1] + 1:
                label = 0
                sc = 0
            else:
                text_temp = ""
                for index in indexes:
                    text_temp += str_splitTag + text[index]
                if dir(model).count("get_svm_type") == 1:
                    tms_svm.set_svm_type("libsvm")
                if dir(model).count("get_nr_feature") == 1:
                    tms_svm.set_svm_type("liblinear")
                label, sc = cal_sc_optim(1, model, text_temp, dic, local_fun,
                                         global_weight, str_splitTag)
            fs.write(str(label) + "\t" + str(sc) + "\t")
        for index in result_indexes:
            if index > len(text) - 1:
                break
            fs.write(text[index] + "\t")
        fs.write("\n")
    f.close()
    fs.close()
    print "-----------------预测完毕-------------------"
Example #13
def ctm_train(filename, indexes, main_save_path, stopword_filename, svm_param,
              config_name, dic_name, model_name, train_name, svm_type,
              param_name, ratio, delete, str_splitTag, tc_splitTag, seg,
              param_select, global_fun, local_fun, label_file):
    '''Automated training pipeline: word segmentation, feature selection and dictionary
    rebuilding, then automatic search for the best SVM parameters on the new dictionary,
    SVM training with those parameters, and finally saving the trained model.
    Files that get saved (a main save path must be provided):
        model files: the dictionary .key and the model .model
        temporary files: the SVM training data file .train
    filename: file containing the training text.
    indexes: the fields (columns) to train on.
    main_save_path: path where the model is saved.
    stopword_filename: name and path of the stop-word file; by default no stop words are used.
    svm_type: SVM type, "libsvm" or "liblinear".
    svm_param: user-supplied SVM parameters; libsvm and liblinear accept different options, e.g. "-s 0 -t 2 -c 0.2".
    dic_name: user-defined dictionary name, e.g. "dic.key".
    model_name: user-defined model name, e.g. "svm.model".
    train_name: user-defined training-sample name, e.g. "svm.train".
    param_name: user-defined parameter file name, e.g. "svm.param".
    ratio: fraction of words kept by feature selection, e.g. 0.4.
    delete: whether samples whose feature values are all 0 are dropped, True or False.
    str_splitTag: separator used by word segmentation, e.g. "^".
    tc_splitTag: separator between the fields of a training sample, e.g. "\t".
    seg: segmentation choice: 0 = no segmentation, 1 = mmseg, 2 = aliws.
    param_select: whether to run the SVM parameter grid search; True searches, False skips it.
    local_fun: local weighting used for feature weights, x(i,j) = local(i,j) * global(i); choices are tf and logtf.
    global_fun: global weighting scheme: "one", "idf" or "rf".
    label_file: file describing the class labels.
    '''

    print "-----------------创建模型文件保存的路径-----------------"
    if os.path.exists(main_save_path):
        if os.path.exists(os.path.join(main_save_path, "model")) is False:
            os.makedirs(os.path.join(main_save_path, "model"))
    if os.path.exists(main_save_path):
        if os.path.exists(os.path.join(main_save_path, "temp")) is False:
            os.makedirs(os.path.join(main_save_path, "temp"))

    # Set the SVM model type.

    tms_svm.set_svm_type(svm_type)

    # If no stop-word file is given, no stop words are used.
    if stopword_filename == "":
        stop_words_dic = dict()
    else:
        stop_words_dic = fileutil.read_dic(stopword_filename)

    # If segmentation is requested, segment the source file.
    if seg != 0:
        print "----------------- Segmenting the source text -------------------"
        segment_file = os.path.dirname(filename) + "/segmented"
        segment.file_seg(filename, indexes, segment_file, str_splitTag,
                         tc_splitTag, seg)
        filename = segment_file

    # Stem the original training samples.
    print "----------------- Stemming the source text -------------------"
    stem.stemFile(filename, str_splitTag, tc_splitTag)

    print "-----------------现在正在进行特征选择---------------"
    dic_path = os.path.join(main_save_path, "model", dic_name)
    feature_select(filename,
                   indexes,
                   global_fun,
                   dic_path,
                   ratio,
                   stop_words_dic,
                   str_splitTag=str_splitTag,
                   tc_splitTag=tc_splitTag)

    print "-----------------再根据特征选择后的词典构造新的SVM分类所需的训练样本------------------- "
    problem_save_path = os.path.join(main_save_path, "temp", train_name)
    local_fun_str = local_fun
    local_fun = measure.local_f(local_fun)
    label = cons_train_sample_for_cla(filename, indexes, local_fun, dic_path,
                                      problem_save_path, delete, str_splitTag,
                                      tc_splitTag)

    if param_select == True:
        print "--------------------选择最优的c,g------------------------------"
        search_result_save_path = main_save_path + "temp/" + param_name
        if svm_type == "libsvm":
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (3, -10, -2)
            fine_c_step = 0.5
            fine_g_step = 0.5
            c, g = grid_search_param.grid(problem_save_path,
                                          search_result_save_path, svm_type,
                                          coarse_c_range, coarse_g_range,
                                          fine_c_step, fine_g_step)
            svm_param = svm_param + " -c " + str(c) + " -g " + str(g)
        if svm_type == "liblinear" or (svm_type == "libsvm" and
                                       is_linear_kernal(svm_param) is True):
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (1, 1, 1)
            fine_c_step = 0.5
            fine_g_step = 0
            c, g = grid_search_param.grid(problem_save_path,
                                          search_result_save_path, svm_type,
                                          coarse_c_range, coarse_g_range,
                                          fine_c_step, fine_g_step)
            svm_param = svm_param + " -c " + str(c)

    print "-----------------训练模型,并将模型进行保存----------"
    model_save_path = main_save_path + "model/" + model_name
    ctm_train_model(problem_save_path, svm_type, svm_param, model_save_path)

    print "-----------------保存模型配置-----------------"
    f_config = file(os.path.join(main_save_path, "model", config_name), 'w')
    save_config(f_config, dic_name, model_name, local_fun_str, global_fun, seg,
                svm_type, svm_param, label_file, label)
    f_config.close()
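A hedged usage sketch that wires the documented arguments together; every path, name and weighting choice below is an illustrative assumption.
# Paths and names below are assumptions for illustration only.
ctm_train("data/train.txt", [1], "model_dir/", "",
          svm_param="-s 0 -t 2", config_name="tms.config", dic_name="dic.key",
          model_name="tms.model", train_name="tms.train", svm_type="libsvm",
          param_name="tms.param", ratio=0.4, delete=True, str_splitTag="^",
          tc_splitTag="\t", seg=1, param_select=True, global_fun="idf",
          local_fun="tf", label_file="")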
Example #15
def main():
    usage = "usage:%prog [options] version=%prog 1.0"
    parser = OptionParser(usage=usage)
    parser.add_option("-s",
                      "--step",
                      type="choice",
                      choices=["1", "2", "3", "4", "5"],
                      dest="step",
                      help="step1 is auto training the svm model")
    parser.add_option("-p", "--path", dest="save_main_path")
    parser.add_option("-P", "--problem_path", dest="problem_save_path")
    parser.add_option("-i",
                      "--indexes",
                      dest="indexes",
                      action="callback",
                      type="string",
                      default=[1],
                      callback=list_callback)
    parser.add_option("-w",
                      "--stopword",
                      action="store_false",
                      dest="stopword",
                      default=True)
    parser.add_option("-n",
                      "--config_name",
                      dest="config_name",
                      default="tms.config")
    parser.add_option("-d", "--dic_name", dest="dic_name", default="dic.key")
    parser.add_option("-D", "--dic_path", dest="dic_path")
    parser.add_option("-m",
                      "--model_name",
                      dest="model_name",
                      default="tms.model")
    parser.add_option("-t",
                      "--train_name",
                      dest="train_name",
                      default="tms.train")
    parser.add_option("-a",
                      "--param_name",
                      dest="param_name",
                      default="tms.param")
    parser.add_option("-r", "--ratio", dest="ratio", type="float", default=0.4)
    parser.add_option("-A",
                      "--svm_param",
                      dest="svm_param",
                      default="'-s 0 -t 2 -c 1.0 -g 0.25'")
    parser.add_option("-T",
                      "--tc_splitTag",
                      dest="tc_splitTag",
                      type="string",
                      default="\t")
    parser.add_option("-S",
                      "--str_splitTag",
                      dest="str_splitTag",
                      type="string",
                      default="^")
    parser.add_option("-v",
                      "--svm_type",
                      dest="svm_type",
                      default="libsvm",
                      type="choice",
                      choices=["libsvm", "liblinear"])
    parser.add_option("-e",
                      "--segment",
                      type="choice",
                      dest="segment",
                      default=0,
                      choices=[0, 1, 2])
    parser.add_option("-c",
                      "--param_select",
                      action="store_false",
                      dest="param_select",
                      default=True)
    parser.add_option("-g",
                      "--global_fun",
                      dest="global_fun",
                      default="one",
                      type="choice",
                      choices=["one", "idf", "rf"])
    parser.add_option("-l",
                      "--local_fun",
                      dest="local_fun",
                      default="tf",
                      type="choice",
                      choices=["tf"])
    parser.add_option("-b",
                      "--label_file",
                      dest="label_file",
                      type="string",
                      default="")
    options, args = parser.parse_args()
    if options.indexes:
        indexes = [int(i) for i in options.indexes]
    if options.step:
        step = int(options.step)

    if options.stopword == False:
        stopword_filename = ""
    else:
        stopword_filename = os.path.dirname(args[0]) + "/stopwords.txt"

    if options.svm_param:
        svm_param = options.svm_param.replace("'", "")
    if step == 1:
        train_model.ctm_train(args[0],
                              indexes,
                              options.save_main_path,
                              stopword_filename,
                              config_name=options.config_name,
                              svm_type=options.svm_type,
                              segment=options.segment,
                              param_select=options.param_select,
                              global_fun=options.global_fun,
                              local_fun=options.local_fun,
                              svm_param=svm_param,
                              dic_name=options.dic_name,
                              model_name=options.model_name,
                              train_name=options.train_name,
                              param_name=options.param_name,
                              ratio=options.ratio,
                              delete=True,
                              str_splitTag=options.str_splitTag,
                              tc_splitTag=options.tc_splitTag,
                              label_file=options.label_file)
    if step == 2:
        train_model.ctm_feature_select(args[0],
                                       indexes,
                                       options.global_fun,
                                       options.save_main_path,
                                       options.dic_name,
                                       options.ratio,
                                       stopword_filename,
                                       str_splitTag=options.str_splitTag,
                                       tc_splitTag=options.tc_splitTag)

    if step == 3:
        if os.path.exists(options.save_main_path):
            if os.path.exists(options.save_main_path + "temp/") is False:
                os.makedirs(options.save_main_path + "temp/")
        sample_save_path = options.save_main_path + "temp/svm.train"
        train_model.cons_train_sample_for_cla(
            args[0],
            indexes,
            options.local_fun,
            options.dic_path,
            sample_save_path,
            delete=True,
            str_splitTag=options.str_splitTag,
            tc_splitTag=options.tc_splitTag)

    if step == 4:
        search_result_save_path = options.save_main_path + "temp/" + "svm.param"
        tms_svm.set_svm_type(options.svm_type)
        if options.svm_type == "libsvm":
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (3, -10, -2)
            fine_c_step = 0.5
            fine_g_step = 0.5
        if options.svm_type == "liblinear":
            coarse_c_range = (-5, 7, 2)
            coarse_g_range = (1, 1, 1)
            fine_c_step = 0.5
            fine_g_step = 0
        c, g = grid_search_param.grid(args[0], search_result_save_path,
                                      options.svm_type, coarse_c_range,
                                      coarse_g_range, fine_c_step, fine_g_step)
        print "best c = %s\t g = %s\n" % (c, g)

    if step == 5:
        model_save_path = options.save_main_path + "model/" + options.model_name
        train_model.ctm_train_model(options.problem_save_path, svm_param,
                                    model_save_path)
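A hedged command-line sketch for the option parser above; the script name and data path are illustrative assumptions.
# Assumed invocation for step 1 (full training); unset options fall back to their defaults.
#   python tms_train.py -s 1 -p model_dir/ -v libsvm -g idf -l tf data/train.txt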