Example #1
def train_liblinear(args):
    model_name, gold_dir, dirs = args[0], args[1], args[2:]
    vectors, predicates = get_data(gold_dir, dirs)
    prob = problem(map(num_to_class, predicates), vectors)
    param = parameter('-s 0')
    model = train(prob, param)
    save_model(model_name, model)
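Not part of the scraped example, but useful context: once train_liblinear has written the model file, liblinearutil's load_model and predict are the standard counterparts for reading it back. A minimal sketch, assuming held-out test_classes/test_vectors exist (placeholder names):

from liblinearutil import load_model, predict

# Reload the model file written by train_liblinear() above and score
# held-out data; test_classes and test_vectors are placeholder names.
model = load_model(model_name)
pred_labels, pred_acc, pred_vals = predict(test_classes, test_vectors, model)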
Example #2
File: trainer.py Project: zbxzc35/HunTag
 def save(self):
     sys.stderr.write('saving model...')
     save_model(self.modelName + '.model', self.model)
     sys.stderr.write('done\nsaving label and feature lists...')
     self.labelCounter.saveToFile(self.modelName + '.labelNumbers')
     self.featCounter.saveToFile(self.modelName + '.featureNumbers')
     sys.stderr.write('done\n')
Example #3
 def save(self):
     sys.stderr.write('saving model...')
     save_model(self.modelName+'.model', self.model)
     sys.stderr.write('done\nsaving label and feature lists...')
     self.labelCounter.saveToFile(self.modelName+'.labelNumbers')
     self.featCounter.saveToFile(self.modelName+'.featureNumbers')
     sys.stderr.write('done\n')
Example #4
def train_liblinear(args):
    model_name, gold_dir, dirs = args[0], args[1], args[2:]
    vectors, predicates = get_data(gold_dir, dirs)
    prob = problem(map(num_to_class, predicates), vectors)
    param = parameter('-s 0')
    model = train(prob, param)
    save_model(model_name, model)
Example #5
 def save(self, model_path):
     """Saves the model as a directory at the given path."""
     if os.path.exists(model_path):
         subprocess.check_output(["rm", "-rf", model_path])
     os.makedirs(model_path)
     pickle.dump(self.__feature_extractor,
                 open(os.path.join(model_path, "feature_extractor"), "wb"),
                 protocol=pickle.HIGHEST_PROTOCOL)
     liblinearutil.save_model(os.path.join(model_path, "liblinear_model"),
                              self.__liblinear_model)
Example #6
 def save(self, model_path):
     """Saves the model as a directory at the given path."""
     if os.path.exists(model_path):
         subprocess.check_output(["rm", "-rf", model_path])
     os.makedirs(model_path)
     pickle.dump(self.__feature_extractor,
                 open(os.path.join(model_path, "feature_extractor"), "wb"),
                 protocol=pickle.HIGHEST_PROTOCOL)
     liblinearutil.save_model(os.path.join(model_path, "liblinear_model"),
                              self.__liblinear_model)
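A hedged counterpart to the save() above, not taken from the source project: the same directory layout can be read back by unpickling the feature extractor and calling liblinearutil.load_model on the model file.

import os
import pickle
import liblinearutil

# Sketch of a matching load(): restore both artifacts written by save().
def load(model_path):
    with open(os.path.join(model_path, "feature_extractor"), "rb") as f:
        feature_extractor = pickle.load(f)
    liblinear_model = liblinearutil.load_model(
        os.path.join(model_path, "liblinear_model"))
    return feature_extractor, liblinear_model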
Example #7
def train_and_save(name, method):
    '''Runs training.
    
    Must use the liblinear library.
    '''
    print 'Training dataset {} with method {}.'.format(name, method)
    model = get_training_method(method)(name)
    try:
        llb.save_model('models/{}.model'.format(name), model)
        print 'Saved model.'
    except Exception:
        print 'Could not save model.'
Example #8
def train( C, Y_train, X_train, x_lines ):
    """
    This function takes in the training labels and features and creates a model and saves that model
    :param C       : list containing parameter C
    :param X_train : training features
    :param Y_train : training labels
    :return None
    """
    # for c in C:
    param = '-s 2 -c ' + str(C)
    model = lu.train(Y_train, X_train, param)
    lu.save_model("model/lmods2_tamper" + str(round(C,2)) + "_" + str(x_lines) + "l.model", model)
Example #9
def train(c, Y_train, X_train):
    """
    This function takes in the training labels and features and creates a model and saves that model
    :param C       : list containing parameter C
    :param X_train : training features
    :param Y_train : training labels
    :return None
    """
    #for c in C:
    param = '-s 2 -c ' + str(c)
    model = lu.train(Y_train, X_train, param)
    lu.save_model("model/lmods2_"+str(round(c,2))+".model", model)
Example #10
 def SaveModel(self):
     if(self.classifierType == "SVM" and self.packageType == "liblinear"):
         from liblinearutil import save_model
         save_model(self.featuresSerializationFileName, self.classifierModel)
     elif(self.classifierType == "DecisionTree" and self.packageType == "nltk"):
         # Open the serialization file
         serializationFile = open(self.featuresSerializationFileName, 'wb')
         
         # Save the model
         pickle.dump(self.classifierModel, serializationFile)
         
         # Close the serialization file
         serializationFile.close()
     else:
         print("Unsupported classifier and package type for SaveModel")
Example #11
    def SaveModel(self):
        if (self.classifierType == "SVM" and self.packageType == "liblinear"):
            from liblinearutil import save_model
            save_model(self.featuresSerializationFileName,
                       self.classifierModel)
        elif (self.classifierType == "DecisionTree"
              and self.packageType == "nltk"):
            # Open the serialization file
            serializationFile = open(self.featuresSerializationFileName, 'wb')

            # Save the model
            pickle.dump(self.classifierModel, serializationFile)

            # Close the serialization file
            serializationFile.close()
        else:
            print("Unsupported classifier and package type for SaveModel")
Example #12
	def _save_models(self, model_dir, label_samples, train_samples, train_labels, train_feature_samples):
		out_json = {}
		out_json['tuples'] = self.model_keys
		out_json['train_samples_file'] = 'train_samples.json'
		out_json['feature_lexicon_file'] = 'feature_lexicon.json'
		out_json['ontology_file'] = 'ontology.json'
		output = codecs.open(os.path.join(model_dir, 'config.json'), 'w', 'utf-8')
		json.dump(out_json, output, indent=4)
		output.close()

		# save ontology file
		shutil.copyfile(self.ontology_file, os.path.join(model_dir,out_json['ontology_file']))

		# save train samples
		output = codecs.open(os.path.join(model_dir, out_json['train_samples_file']), 'w', 'utf-8')
		train_json = {}
		train_json['train_samples'] = train_samples
		train_json['label_samples'] = label_samples
		train_json['train_feature_samples'] = train_feature_samples
		train_json['train_labels'] = train_labels
		json.dump(train_json, output, indent=4)
		output.close()

		# save train sample nums
		output = codecs.open(os.path.join(model_dir, 'train_samples_number.json'), 'w', 'utf-8')
		train_number_json={}
		for key, labels in train_labels.items():
			pos_num = 0
			neg_num = 0
			for label in labels:
				if label == 0:
					neg_num += 1
				elif label == 1:
					pos_num += 1
			train_number_json[key] = {0:neg_num, 1:pos_num}
		json.dump(train_number_json, output, indent=4)
		output.close()

		# save feature
		self.feature.save_Lexicon(os.path.join(model_dir, out_json['feature_lexicon_file']))

		# save svm models
		for model_key in self.model_keys:
			save_model(os.path.join(model_dir, '%s.svm.m' %(model_key)), self.models[model_key])
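A hedged sketch, not from the source project, of reading these per-tuple models back by walking the config.json manifest written above:

import codecs
import json
import os
from liblinearutil import load_model

# Reload every per-key SVM saved by _save_models() above.
def _load_models(model_dir):
    with codecs.open(os.path.join(model_dir, 'config.json'), 'r', 'utf-8') as f:
        config = json.load(f)
    return {key: load_model(os.path.join(model_dir, '%s.svm.m' % key))
            for key in config['tuples']}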
Example #13
    def save(self, model_path):
        """
        Saves the model as a directory at the given path.

        @type model_path: str
        @param model_path: path to save the trained model
        """
        ## if-else statement added on 06.02.2017
        if (self.__feature_extractor.feature_template
                == "relational") and (self.__feature_extractor.parser_type
                                      == "spacy"):
            print("Relational model with spaCy parser cannot be saved")
        else:
            pickle.dump(self.__feature_extractor,
                        open(
                            os.path.join(model_path, "feature_extractor"),
                            "wb"),
                        protocol=pickle.HIGHEST_PROTOCOL)
            # save trained model in the model_path directory
            liblinearutil.save_model(
                os.path.join(model_path, "liblinear_model"),
                self.__liblinear_model)
Example #14
File: pos.py Project: Adderty/Pyrus
	def save(self, modelname, featuresname):
		svm.save_model(modelname, self._svm_model)
		self._features.save(open(featuresname, 'wb'))
Example #15
def train(instance_file, model_file, param):
    y, x = ll.svm_read_problem(instance_file)
    prob = ll.problem(y, x)
    m = ll.train(prob, param)
    ll.save_model(model_file, m)
    print 'done training', model_file
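svm_read_problem parses LIBSVM-format text, one "<label> <index>:<value> ..." line per instance. A small hedged sketch of the input this train() helper consumes (file name and contents are illustrative):

import liblinearutil as ll

with open('tiny_instances.txt', 'w') as f:   # hypothetical instance file
    f.write('+1 1:0.5 3:1.2\n')
    f.write('-1 2:0.7 4:0.1\n')
y, x = ll.svm_read_problem('tiny_instances.txt')
# y == [1.0, -1.0]; x == [{1: 0.5, 3: 1.2}, {2: 0.7, 4: 0.1}]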
Example #16
    def TrainFromAttrDictFiles(self, attr_dict_file, feature_list, percent, tokenizer_mode, model_dir):
        # deal with model dir
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir, True)
        os.mkdir(model_dir)

        input = codecs.open(attr_dict_file, "r", "utf-8")
        attr_data = json.load(input)
        input.close()

        # stat train samples
        print "stat train samples"
        self.feature = attr_feature(feature_list, percent, tokenizer_mode)
        self.feature.stat_Lexicon(attr_data)
        self.attrs = attr_data["attr_data_index"].keys()

        self.train_samples = []
        for utter in attr_data["sub_utter_data"]:
            feature_vector = self.feature.ExtractFeatureFromSent(utter)
            self.train_samples.append(feature_vector)

        self.attrs_labels = {}
        for attr in self.attrs:
            self.attrs_labels[attr] = [0] * len(self.train_samples)
            for index in attr_data["attr_data_index"][attr]:
                self.attrs_labels[attr][index] = 1

        # train svm model
        print "train svm models"
        for attr in self.attrs:
            print "Train attr: %s" % (attr)
            prob = problem(self.attrs_labels[attr], self.train_samples)
            param = parameter("-s 0 -c 1")
            self.models[attr] = liblinear.train(prob, param)

        # save model
        print "save models"
        out_json = {}
        out_json["attrs"] = self.attrs
        out_json["train_samples_file"] = "train_samples.json"
        out_json["feature_lexicon_file"] = "feature_lexicon.json"
        out_json["ontology_file"] = "ontology.json"
        output = codecs.open(os.path.join(model_dir, "config.json"), "w", "utf-8")
        json.dump(out_json, output, indent=4)
        output.close()

        # save train samples
        output = codecs.open(os.path.join(model_dir, out_json["train_samples_file"]), "w", "utf-8")
        train_sample_json = {}
        train_sample_json["samples"] = self.train_samples
        train_sample_json["label_index"] = self.attrs_labels
        json.dump(train_sample_json, output, indent=4)
        output.close()

        # save feature
        self.feature.save_Lexicon(os.path.join(model_dir, out_json["feature_lexicon_file"]))

        # save svm models
        for attr in self.attrs:
            save_model(os.path.join(model_dir, "%s.svm.m" % (attr)), self.models[attr])

        print "Done!"
Example #17
 def save(self, path):
     liblinear.save_model(path + '-model', self._model)
     del (self._model)
     f = open(path, 'wb')
     pickle.dump((self._labels, self._features), f)
     f.close()
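A hedged counterpart, not from the source project: because save() stores the LIBLINEAR model and the pickled (labels, features) pair separately, loading reverses both steps.

import pickle
import liblinearutil as liblinear

# Sketch of the matching load for the split save() above.
def load(path):
    model = liblinear.load_model(path + '-model')
    with open(path, 'rb') as f:
        labels, features = pickle.load(f)
    return model, labels, features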
Example #18
def train_model():
    """训练模型
    """
    y, x = svm_read_problem(TRAIN_INPUT_FILE)
    m = train(y, x, "-c 4")
    save_model(SVM_MODEL_FILE, m)
Example #19
File: svm.py Project: Imperat/Pyrus
 def save(self, path):
     liblinear.save_model(path + "-model", self._model)
     del (self._model)
     ml.Classifier.save(self, path)
Example #20
File: svm.py Project: akartbayev/yatk
	def save(self, path):
		liblinear.save_model(path + '-model', self._model)
		del(self._model)
		f = open(path, 'wb')
		pickle.dump((self._labels, self._features), f)
		f.close()
Example #21
File: svm.py Project: ranxian/treeparser
 def train(self):
     if os.path.isfile("svm.model") and self.useModel:
         self.model = llu.load_model("svm.model")
     else:
         self.model = llu.train(self.ys, self.xs, self.train_param)
         llu.save_model("svm.model", self.model)
Example #22
def train(word_dict):
    get_feature(word_dict, "data/train.dat", "data/train.format")
    get_feature(word_dict, "data/test.dat", "data/test.format")
    train_y, train_x = linear.svm_read_problem("data/train.format")
    model = linear.train(train_y, train_x) 
    linear.save_model("model.dat", model)
Example #23
def insert_predictions(data, features, active_features, baseline_file, date_0, date_A):
    folds = get_folds(data)
    test_data = folds[0]
    train_data = {}
    train_data.update(folds[1])
    train_data.update(folds[2])
    train_data.update(folds[3])
    train_data.update(folds[4])

    print "Normalize"
    normalized_train_features, mean, std = normalize_train(train_data, features)
    normalized_test_features = normalize_test(test_data, features, mean, std)

    print "Tune"
    best_c = get_best_c(train_data, normalized_train_features)
    
    print "Train"
    model = train(train_data, normalized_train_features, best_c)

    print "Predict"
    predictions = predict(test_data, normalized_test_features, model)

    print "Insert"
    conn = pymongo.MongoClient('localhost')
    db = conn['nicta']
    pred_coll = db['predictions_' + date_A]
    vid_coll = db['videos']
    w_coll = db['weights_' + date_A]

    feat_coll = db['features_' + date_0 + '_10']
    sample_tweets = {}
    print 'Getting features'
    feat_idx = 0
    for result in feat_coll.find():
        feat_idx += 1
        if feat_idx % 10000 == 0:
            print feat_idx, len(sample_tweets)

        vid_id = result['_id']
        if vid_id not in predictions:
            continue

        tweets = result['value']['sample_tweets']
        authors = result['value']['sample_authors']
        all_tweets = result['value']['all_tweets']

        text_concat = ""
        for twt in db['tweet'].find({'_id': {'$in' : all_tweets}}, ['text']):
            text_concat += twt['text']

        compressed = zlib.compress(text_concat.encode('utf8'))

        sample_tweets[vid_id] = {
            'tweets' : tweets,
            'authors' : authors,
            'average' : result['value']['tweet_count'] / float(10),
            'compressed' : len(compressed)
        }

    print 'Videos with sample tweets:', len(sample_tweets)

    for line in open(baseline_file):
        tokens = line.strip().split(',')
        vid_id = tokens[0]
        if vid_id not in test_data:
            continue

        A_score = int(tokens[1])

        test_data[vid_id]['train_views'] = A_score


    print 'Inserting'
    pred_coll.drop()
    for vid_id in predictions:
        if vid_id not in sample_tweets:
            continue

        pred_coll.insert({
            'id' : vid_id,
            'score' : predictions[vid_id]['score'],
            'actual' : test_data[vid_id]['class'],
            'upload_date' : test_data[vid_id]['upload_date'],
            'B_views' : test_data[vid_id]['views'],
            'A_views' : test_data[vid_id]['train_views'],
            'features' : features[vid_id],
            'diversity' : sample_tweets[vid_id]['compressed'],
            'active_features' : active_features[vid_id],
            'sample_tweets' : sample_tweets[vid_id]['tweets'],
            'sample_authors' : sample_tweets[vid_id]['authors'],
            'normalized_features' : normalized_test_features[vid_id],
            'average_tweets' : sample_tweets[vid_id]['average']
        })

    pred_coll.ensure_index("A_views")

    liblinearutil.save_model('MODEL.DAT', model)
    weight_line = False
    weights = []
    for line in open('MODEL.DAT'):
        if not weight_line and line.strip() != 'w':
            continue
        elif line.strip() == 'w':
            weight_line = True
            continue
        else:
            weights.append(float(line.strip()))

    w_coll.remove()
    w_coll.insert({'weights' : weights})

    conn.close()
    print "Done"
Example #24
def predict(input_file,
            input_type,
            labels_files,
            output_dir,
            fill_diagonal=None,
            rows_file=None,
            test=False,
            save_model=False):
    if input_type == "edgelist":
        data, IDs = load_edgelist(input_file, fill_diagonal)
    elif input_type == "adjacency":
        data, IDs = load_adjacency(input_file, rows_file, fill_diagonal)
    elif input_type == "numpy":
        data, IDs = load_numpy(input_file, rows_file, fill_diagonal)
    else:
        data, IDs = load_feature_row_file(input_file, rows_file)

    labeled_IDs = []
    labels = []
    weights = []
    for labels_file in labels_files:
        file_labeled_IDs, file_labels, file_weights = \
            load_weighted_labels(labels_file)
        labeled_IDs += file_labeled_IDs
        labels += list(file_labels)
        weights += list(file_weights)
    labels = np.array(labels)
    weights = np.array(weights)

    train_data, _, train_labels, train_weights = get_class_data(
        data, IDs, labeled_IDs, labels, weights, classes=[1, -1])

    print("Training...")
    model = ll.train(train_weights, train_labels, train_data, "-s 7")

    print("Predicting...")
    if test:
        test_data, test_IDs, test_labels, _ = get_class_data(data,
                                                             IDs,
                                                             labeled_IDs,
                                                             labels,
                                                             weights,
                                                             classes=[0])
    else:
        test_data = data
        test_IDs = IDs

    pred_labels, _, probs = \
        ll.predict([], test_data, model, "-b 1")

    # keep only the probability that the gene is positive
    probs = np.array([sublist[0] for sublist in probs])

    print("\nSaving predictions...")
    output_file = os.path.join(
        output_dir, "_".join(os.path.basename(labels_files[0]).split("_")[1:]))
    with open(output_file, "w") as f:
        for ID, label, prob in zip(test_IDs, pred_labels, probs):
            f.write("{0}\t{1}\n".format(ID, prob))

    if save_model:
        print("\nSaving model...")
        model_file = os.path.basename(labels_file).split(".")[0] + ".model"
        ll.save_model(os.path.join(output_dir, model_file), model)
Example #25
    L = 12

    codes = np.random.randint(2, size=(train_features.shape[0], L))

    for i in range(mu.shape[0]):
        print('----------')
        print('[ITER] {:3d} mu = {:.4f}'.format(i, mu[i]))
        t_start = timeit.default_timer()
        models = h_step(train_features, codes, verbose=True)
        (A, old_Z) = f_step(train_features, models, verbose=True)
        (codes, loss) = z_step(train_features, models, A, old_Z, mu[i])
        t_end = timeit.default_timer()

        recon_error = np.linalg.norm(train_features - codes.dot(A), axis=1)**2
        print('[ITER] {:3d} train set recon error: {:.4f}'.format(
            i,
            sum(recon_error) / train_features.shape[0]))
        print('[ITER] {:3d} test  set recon error: {:.4f}'.format(
            i, test_recon(test_features, models, A)))
        print('[ITER] {:3d} {:.4f} seconds elapsed'.format(i, t_end - t_start))
        # print('[ITER] {:3d} loss: {:.4f}'.format(i, loss))

    for (m, i) in zip(models, range(len(models))):
        liblinearutil.save_model(
            'models/tr{0:05d}-L{1:02d}-b{2:02d}.model'.format(
                train_features.shape[0], L, i), m)

    hash(features, num_train, L)
    print(calc_mean_ap(labels, num_test, num_train, L))
    print(calc_precision_at_k(labels, num_test, num_train, L, 50))
Example #26
def train_model():
    """训练模型
    """
    y, x = svm_read_problem(TRAIN_INPUT_FILE)
    m = train(y, x, '-c 4')
    save_model(SVM_MODEL_FILE, m)
Example #27
 def save(self, modelname, featuresname):
     svm.save_model(modelname, self._svm_model)
     self._features.save(open(featuresname, 'wb'))
Example #28
	def TrainFromSubsegFiles(self, ontology_file, sub_segments_file, feature_list, percent, tokenizer_mode, model_dir):
		# deal with model dir
		if os.path.exists(model_dir):
			shutil.rmtree(model_dir,True)
		os.mkdir(model_dir)

		self.ontology_file = ontology_file
		self.tagsets = ontology_reader.OntologyReader(ontology_file).get_tagsets()

		input = codecs.open(sub_segments_file, 'r', 'utf-8')
		sub_segments = json.load(input)
		input.close()

		# stat train samples
		print 'stat train samples'
		self.feature = feature(self.tagsets, feature_list, percent, tokenizer_mode)
		self.feature.stat_Lexicon(sub_segments)
		self.slots = self._stat_slot(sub_segments)
		for slot in self.slots:
			self.train_samples[slot] = [[],[]]

		for session in sub_segments['sessions']:
			print '%d' %(session['session_id'])
			for sub_seg in session['sub_segments']:
				feature_vector = self.feature.ExtractFeatureFromSubseg(sub_seg)
				mentioned_slots = []
				for slot, values in sub_seg['frame_label'].items():
					if slot == 'INFO':
						for value in values:
							slot_name = '%s:%s' %(slot,value)
							if slot_name not in mentioned_slots:
								mentioned_slots.append(slot_name)
					else:
						if slot not in mentioned_slots:
							mentioned_slots.append(slot)
				for slot, train_samples in self.train_samples.items():
					if slot in mentioned_slots:
						train_samples[0].append(1)
					else:
						train_samples[0].append(0)

					train_samples[1].append(feature_vector)

		# train svm model
		print 'train svm models'
		for slot, train_samples in self.train_samples.items():
			print 'Train slot: %s' %(slot)
			prob = problem(train_samples[0], train_samples[1])
			param = parameter('-s 0 -c 1')
			self.models[slot] = liblinear.train(prob, param)

		# save model
		print 'save models'
		out_json = {}
		out_json['slots'] = self.slots
		out_json['train_samples_file'] = 'train_samples.json'
		out_json['feature_lexicon_file'] = 'feature_lexicon.json'
		out_json['ontology_file'] = 'ontology.json'
		output = codecs.open(os.path.join(model_dir, 'config.json'), 'w', 'utf-8')
		json.dump(out_json, output, indent=4)
		output.close()

		# save ontology file
		shutil.copyfile(self.ontology_file, os.path.join(model_dir,out_json['ontology_file']))

		# save train samples
		output = codecs.open(os.path.join(model_dir, out_json['train_samples_file']), 'w', 'utf-8')
		json.dump(self.train_samples, output, indent=4)
		output.close()

		# save feature
		self.feature.save_Lexicon(os.path.join(model_dir, out_json['feature_lexicon_file']))

		# save svm models
		for slot in self.slots:
			save_model(os.path.join(model_dir, '%s.svm.m' %(slot)), self.models[slot])

		print 'Done!'
Example #29
 def save(self, path):
     liblinear.save_model(path + '-model', self._model)
     del (self._model)
     ml.Classifier.save(self, path)
Example #30
 featp = parsePath(featp)
 scores = []
 for line in open(testfile, "r").readlines():
     filename, score, label = line.strip().split()
     scores.append((filename, float(score)))
 scores.sort(key=operator.itemgetter(1), reverse=True)
 # binary weighting, CPRF
 poslst = ["%s%s" % (featp, one[0]) for one in scores[:topn]]
 neglst = ["%s%s" % (featp, one[0]) for one in scores[-botm:]]
 if (oripos != ""):
     poslst.extend([line.strip() for line in open(oripos, "r").readlines()])
 if (orineg != ""):
     neglst.extend([line.strip() for line in open(orineg, "r").readlines()])
 rerankingmodel = trainmodel(poslst, neglst)
 if (rerankingmodelf != ""):
     save_model(rerankingmodelf, rerankingmodel)
 #print rerankingmodel.get_decfun()[0][:5] # len() 12288
 model = rerankingmodel.get_decfun()[0]
 #thres = -0.7
 featfiles = getFiles(featp, "bof")
 featfiles.sort()
 for i in xrange(len(featfiles)):
     filename = os.path.basename(featfiles[i])
     feat = parseLS(open(featfiles[i], "r").read())
     score = predict_this(model, feat)
     if (smoothing):
         scores = [score]
         for j in xrange(i - 2, i + 3):
             if ((j < len(featfiles)) and (j >= 0)):
                 feat = parseLS(open(featfiles[j], "r").read())
                 scores.append(predict_this(model, feat))