def gen_top_k_group_by_model(group_by_field, click_weight = utils.click_weight, year = 'all'):
	"""
	Generate and dump the group by model with top k hotel clusters 
	:param group_by_field: group by field to generate the group with hotel cluster relevance scores
	:param click_weight: the weight for the clicks
	:param year: Year filter on training data
	:return: the topk group by model with respect to group_by_field
	"""
	dump_path = utils.model_path + \
		'_'.join(['top', str(utils.k), 'cw', str(utils.click_weight), 'group', group_by_field, 'year', year]) + '.pkl'

	if os.path.exists(dump_path):
		print 'file: ' + dump_path + ' exists!'
		return

	source = train
	if year == '2013':
		source = train_2013
	elif year == '2014':
		source = train_2014
	
	agg = source.groupby([group_by_field, 'hotel_cluster'])['is_booking'].agg(['sum','count'])
	agg['count'] -= agg['sum']
	agg = agg.rename(columns = {'sum':'bookings','count':'clicks'})
	agg['relevance'] = agg['bookings'] + click_weight * agg['clicks'] # the weighted sum of bookings count and clicks count
	agg.reset_index(inplace = True)
	top_clusters = agg.groupby([group_by_field]).apply(top_k_relevence)
	top_clusters = pd.DataFrame(top_clusters).rename(columns={0:'hotel_cluster'})

	joblib.dump(top_clusters, dump_path)
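
# Note: the snippet above relies on a helper named top_k_relevence that is not shown here.
# A minimal sketch of what such a helper could look like (an assumption, not the original
# implementation), keeping the utils.k most relevant clusters of each group:
def top_k_relevence(group):
	# sort one group's hotel clusters by the weighted relevance score and keep the best k
	best = group.sort_values(by='relevance', ascending=False)['hotel_cluster'].values[:utils.k]
	return ' '.join(str(int(c)) for c in best)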
Example #2
def training_stage3(dftrain,dfvalid,cat1,i):
    fname = ddir + 'joblib/stage3_'+str(cat1)+ext
    df = dftrain[dftrain.Categorie1 == cat1].reset_index(drop=True)
    dfv = dfvalid[dfvalid.Categorie1 == cat1].reset_index(drop=True)
    labels = np.unique(df.Categorie3)
    if len(labels)==1:
        joblib.dump((labels,None,None),fname)
        scv = -1
        sct = -1
        print 'training',cat1,'\t\t(',i,') : N=',len(df),'K=',len(labels)
        print 'training',cat1,'\t\t(',i,') : training=',sct,'validation=',scv
        return (sct,scv)
    vec,X = vectorizer_stage3(df.txt)
    Y = df['Categorie3'].values
    cla = LogisticRegression(C=best_regularisation.get(cat1,100))
    cla.fit(X,Y)
    labels = np.unique(df.Categorie3)
    sct = cla.score(X[:min(10000,len(df))],Y[:min(10000,len(df))])
    if len(dfv)==0:
        scv = -1
    else:
        Xv = vec.transform(dfv.txt)
        Yv = dfv['Categorie3'].values
        scv = cla.score(Xv,Yv)
    print 'training',cat1,'\t\t(',i,') : N=',len(df),'K=',len(labels)
    print 'training',cat1,'\t\t(',i,') : training=',sct,'validation=',scv
    joblib.dump((labels,vec,cla),fname)
    del vec,cla
    return (sct,scv)
Example #3
 def predict_test(self,clf, tag):
     np.random.seed(1919) 
     if os.path.isdir('../model/'+tag) == False: 
         os.mkdir('../model/'+tag)   
     print "Dir made : "+str(datetime.datetime.now())
     
     print "Fit Started : "+str(datetime.datetime.now())
     clf.fit(self.X, self.y)    
     
     print "Dump Started : "+str(datetime.datetime.now())    
     joblib.dump(clf, '../model/'+tag+'/'+tag+'.pkl')
     
     print "Prediction Started : "+str(datetime.datetime.now())
     output_arr = clf.predict_proba(self.x_test)
     
     f = open("../data/output_"+str(tag), "w")
     f.write("id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9\n")
     i=1
     for row in output_arr:
         row = map(str, row)
         f.write(str(i)+","+str(",".join(row))+"\n")
         i += 1
     f.close()
     
     print "ALL DONE : "+str(datetime.datetime.now())
Example #4
def main():
	pos_features_path = '/home/retailyze/Downloads/INRIAPerson/checkb/cropped/svm/featuresPos160_60.npy'
	neg_features_path = '/home/retailyze/Downloads/INRIAPerson/checkb/cropped/svm/featuresNeg160_60.npy'

	saving_loc = '/home/retailyze/Downloads/INRIAPerson/checkb/cropped/svm/'

	pos_features = np.load(pos_features_path)[:, 0::3]
	neg_features = np.load(neg_features_path) [:, 0::3]
	train, val = prepare_features(pos_features, neg_features, True, saving_loc)
	del pos_features
	del neg_features

	clf = svm.SVC(kernel='rbf')

	logging.info('starts training')
	clf.fit(train[:, 1:], train[:, 0])
	del train
	logging.info('starts predicting')
	predicted = clf.predict(val[:, 1:])
	conf_mat = confusion_matrix(predicted, val[:, 0])
	acc = accuracy_score(val[:, 0], predicted)
	del val
	del predicted
	logging.info('Confusion matrix: %s' %conf_mat)
	logging.info('Accuracy: %s' %acc)
	logging.info('saving model')
	joblib.dump(clf, join(saving_loc, 'svm_rbf_scaled.pkl'))
Example #5
def transform_data():
    from solaris.run import load_data
    from sklearn.externals import joblib

    data = load_data('data/data.pkl')

    kringing = PertubatedKriging()
    #kringing = PertubatedSpline()

    data['description'] = '%r: %r' % (kringing, kringing.est)
    print data['description']

    print('_' * 80)
    print(kringing)
    print

    for key in ['train', 'test']:
        print('_' * 80)
        print('transforming %s' % key)
        print
        X = data['X_%s' % key]

        X = kringing.fit_transform(X)
        data['X_%s' % key] = X

    print
    print('dumping data')
    joblib.dump(data, 'data/interp10_data.pkl')
    IPython.embed()
Example #6
    def train(self):
        with gzip.open(constants.TRAIN_EXPANDED, 'r') as source:
            reader = csv.reader(source)
            next(reader, None)

            n_sample = 0
            labels = []
            features = []
            for feature_vector in reader:
                s_features = feature_vector[2:6] + feature_vector[7:]
                s_label = int(feature_vector[1])
                features.append(s_features)
                labels.append(s_label)

                # print 'features', s_features
                # print 'labels', s_label
                # print 'norm features', normalized_features

                n_sample += 1
                if n_sample % 500000 == 0:
                    self.clf.partial_fit(features, labels)
                    features = []
                    labels = []
                    print 'Processing sample [%s]' % n_sample

            # train on the samples left over after the last full batch of 500000
            if features:
                self.clf.partial_fit(features, labels)

        print 'Finished training'
        print 'Estimated parameters [%s]' % self.clf.get_params()

        # saving model into file
        joblib.dump(self.clf, constants.MODEL_FILENAME, compress=9)
Example #7
def tuning_xgbst(X, y):
    clf = xgb.XGBClassifier(n_estimators=10000,
                            scale_pos_weight=1.0, #1400.0/13458.0,
                            max_depth=6,
                            objective='binary:logistic',
                            learning_rate=0.02,
                            gamma=0.1,
                            min_child_weight=3,
                            max_delta_step=0,
                            subsample=0.7,
                            colsample_bytree=0.4,
                            colsample_bylevel=1.0,
                            reg_alpha=0,
                            reg_lambda=3000,
                            seed=0,
                            nthread=-1)
    print clf.get_params()
    # skf = StratifiedShuffleSplit(y, n_iter=5, test_size=0.25, random_state=0)
    skf = StratifiedKFold(y, n_folds=3, random_state=0)
    fold = 1
    for train_index, val_index in skf:
        print 'fold ', fold
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        # eval_metric use the parameters in XGBoost doc
        clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
                eval_metric='auc', early_stopping_rounds=1000, verbose=False)
        print "best_score", clf.best_score
        joblib.dump(clf, './models/xgbst/CV_' + str(fold) + '.model')
        fold += 1
Example #8
def train_svm(feedback, classes):

    print "Building n-grams"

    X_train_counts = count_vect.fit_transform(feedback) # converting string to the bag - of - words form, using bi-grams

    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) # weighting the words from bag-of-words form

    '''
    The vocabulary built on the training set has to be saved for later classification:
    the set of words seen at classification time will differ from the training set, but
    the word-to-column mapping must stay identical. In other words, if the word "animal"
    was mapped to index 3 during training, it must be mapped to index 3 again during
    classification.
    '''

    pickle.dump(count_vect.vocabulary_,open("feature.pkl","wb")) # Saving vocab

    print "Saving words features"

    c = svm.SVC(kernel = 'rbf',gamma = 0.001, C = 100)

    print "training SVM"

    c.fit(X_train_tfidf, classes) # Training the SVM

    print "Training completed..."

    joblib.dump(c, 'filename.pkl', compress= 9) # Saving the Support vectors
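
# A classification-side sketch of how the artifacts saved above could be reused (an
# assumption, not part of the original snippet). The bi-gram setting of the original
# count_vect is assumed to be ngram_range=(1, 2); ideally the fitted tfidf_transformer
# would be persisted as well instead of being refit here.
def classify_feedback(texts):
    import pickle
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    vocabulary = pickle.load(open("feature.pkl", "rb"))   # vocab saved by train_svm
    vect = CountVectorizer(vocabulary=vocabulary, ngram_range=(1, 2))

    X_counts = vect.transform(texts)                      # same word -> same column index
    X_tfidf = TfidfTransformer().fit_transform(X_counts)

    clf = joblib.load('filename.pkl')                     # SVM saved by train_svm
    return clf.predict(X_tfidf)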
def extract_features():
    des_type = 'HOG'

    # If feature directories don't exist, create them
    if not os.path.isdir(pos_feat_ph):
        os.makedirs(pos_feat_ph)

    # If feature directories don't exist, create them
    if not os.path.isdir(neg_feat_ph):
        os.makedirs(neg_feat_ph)

    print "Calculating the descriptors for the positive samples and saving them"
    for im_path in glob.glob(os.path.join(pos_im_path, "*")):
        #print im_path
        
        im = imread(im_path, as_grey=True)
        if des_type == "HOG":
            fd = hog(im, orientations, pixels_per_cell, cells_per_block, visualize, normalize)
        fd_name = os.path.split(im_path)[1].split(".")[0] + ".feat"
        fd_path = os.path.join(pos_feat_ph, fd_name)
        joblib.dump(fd, fd_path)
    print "Positive features saved in {}".format(pos_feat_ph)

    print "Calculating the descriptors for the negative samples and saving them"
    for im_path in glob.glob(os.path.join(neg_im_path, "*")):
        im = imread(im_path, as_grey=True)
        if des_type == "HOG":
            fd = hog(im,  orientations, pixels_per_cell, cells_per_block, visualize, normalize)
        fd_name = os.path.split(im_path)[1].split(".")[0] + ".feat"
        fd_path = os.path.join(neg_feat_ph, fd_name)
    
        joblib.dump(fd, fd_path)
    print "Negative features saved in {}".format(neg_feat_ph)

    print "Completed calculating features from training images"
 def train(self, seg_corpus, dep_corpus, path=None):
     assert seg_corpus.keys() == dep_corpus.keys()
     features, labels = self.extract_features_from_corpus(
         dep_corpus, seg_corpus=seg_corpus)
     self._train(features, labels)
     if path is not None:
         joblib.dump(self.pipeline, path, compress=1, cache_size=1e9)
Example #11
def check_covertype(datasets_folder):
    print("Checking availability of the covertype dataset")
    archive_path = os.path.join(datasets_folder, 'covtype.data.gz')
    covtype_dir = os.path.join(datasets_folder, "covertype")
    samples_path = os.path.join(covtype_dir, "samples.pkl")
    targets_path = os.path.join(covtype_dir, "targets.pkl")

    if not os.path.exists(covtype_dir):
        os.makedirs(covtype_dir)

    if not os.path.exists(archive_path):
        print("Downloading dataset from %s (10.7MB)" % COVERTYPE_URL)
        open(archive_path, 'wb').write(urlopen(COVERTYPE_URL).read())
    else:
        print("Found archive: " + archive_path)

    if not os.path.exists(samples_path) or not os.path.exists(targets_path):
        print("Parsing the data and splitting input and labels...")
        f = open(archive_path, 'rb')
        Xy = np.genfromtxt(gzip.GzipFile(fileobj=f), delimiter=',')

        X = Xy[:, :-1]
        y = Xy[:, -1].astype(np.int32)

        joblib.dump(X, samples_path)
        joblib.dump(y, targets_path )
    print("=> Success!")
Example #12
def fetch_vgg_architecture(caffemodel_parsed=None, caffemodel_protobuffer=None):
    """Fetch a pickled version of the caffe model, represented as list of
    dictionaries."""

    default_filename = os.path.join(VGG_PATH, 'vgg.pickle')
    if caffemodel_parsed is not None:
        if os.path.exists(caffemodel_parsed):
            return joblib.load(caffemodel_parsed)
        else:
            if os.path.exists(default_filename):
                import warnings
                warnings.warn('Did not find %s, but found %s. Loading it.' %
                              (caffemodel_parsed, default_filename))
                return joblib.load(default_filename)
    else:
        if os.path.exists(default_filename):
            return joblib.load(default_filename)

    # We didn't find the file: let's create it by parsing the protobuffer
    protobuf_file = fetch_vgg_protobuffer_file(caffemodel_protobuffer)
    model = _parse_caffe_model(protobuf_file)

    if caffemodel_parsed is not None:
        joblib.dump(model, caffemodel_parsed)
    else:
        joblib.dump(model, default_filename)

    return model
def gbm_fit(params, cv_folds):
    gbm = GradientBoostingRegressor(**params)
    gbm.fit(x_train, y_train)

    # Check accuracy of model
    # No need for validation data because of cross validation
    # Training data is split into cv_folds folds: the model is trained on (cv_folds - 1)
    # folds and the remaining fold is used for validation, rotating over all folds
    cv_scores_mse = cross_validation.cross_val_score(gbm, x_train, y_train, cv=cv_folds, scoring='mean_squared_error')
    print '\nModel Report'
    print ('MSE Score: Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g' %
          (np.mean(cv_scores_mse), np.std(cv_scores_mse), np.min(cv_scores_mse), np.max(cv_scores_mse)))
    feat_imp = pd.Series(gbm.feature_importances_, features).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()

    # Check actual performance on test data
    final_predictions = gbm.predict(x_test)
    test['health_score_in_week'] = final_predictions
    test.to_csv(output_file, columns=['user_id', 'date', 'steps', 'total_sleep', 'resting_hr',
                                      'step_week_slope', 'sleep_week_slope', 'hr_week_slope',
                                      'curr_health_score', 'health_score_in_week'])

    # Save the model to file 'health_prediction.pkl'
    joblib.dump(gbm, 'health_prediction.pkl', compress=1)
def train(trainingData, pklFile):
	# ========================================================================= #
	# =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= #
	# ========================================================================= #
	if (pklFile == ''):
		os.system('rm -rf learntModel && mkdir learntModel')  # '&&' so mkdir runs only after rm finishes
		pklFile = 'learntModel/learntModel.pkl'
	
	# ========================================================================= #
	# ================= STEP 2. PREPARE AND FORMATTING DATA =================== #
	# ========================================================================= #
	NUMBER_OF_FEATURES = len(trainingData[0]) - 1
	NUMBER_OF_TRAINING_POINTS = len(trainingData)

	x = trainingData[:, range(0, NUMBER_OF_FEATURES)]
	y = trainingData[:, NUMBER_OF_FEATURES]
	
	# ========================================================================= #
	# ============== STEP 3. DECLARE PRIMITIVES BEFORE THE PARTY ============== #
	# ========================================================================= #
	minSquareError = np.inf
	targetAlpha = None
	alphas = np.logspace(-10, -2, 500)			
	
	# ========================================================================= #
	# ===== STEP 4. PERFORM FITTING WITH THE BEST ALPHA AND SAVE THE MODEL ==== #
	# ========================================================================= #
	clf = LogisticRegressionCV(Cs=alphas)
	clf.fit(x, y)
	joblib.dump(clf, pklFile)
	
	return {"intercept": clf.intercept_, "coef":clf.coef_, "alpha":clf.C_, "accuracy":clf.score(x,y)}
def trainFixed():
	'''
	train a machine learner based on data from some fixed parameter point.
	save to fixed.pkl
	'''
	print "Entering train fixed"
	trainAndTarget = np.loadtxt('traindata.dat')
	traindata = trainAndTarget[:,0:2]
	targetdata = trainAndTarget[:,2]

	massPoints = np.unique(traindata[:,1])
	chunk = len(traindata)/len(massPoints)/2
	shift = len(traindata)/2


	#plot for fixed mu=0 training
	print "training fixed"
	clf = svm.NuSVR()
	reducedtrain = 	np.concatenate((traindata[4*chunk : 5*chunk,0], 
		traindata[4*chunk+shift : 5*chunk+shift , 0]))
	reducedtarget = np.concatenate((targetdata[4*chunk : 5*chunk], 
		targetdata[4*chunk+shift : 5*chunk+shift]))

	clf.fit(reducedtrain.reshape((len(reducedtrain),1)), reducedtarget)  
	joblib.dump(clf, 'fixed.pkl') 
def single_run(X, y,
               estimator, train, test,
               estimator_idx, split_idx,
               output_dir=None):
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]

    if output_dir is not None:
        debug_folder = join(output_dir, "split_{}_est_{}".format(split_idx,
                                                                 estimator_idx))
        if not os.path.exists(debug_folder):
            os.makedirs(debug_folder)
        estimator.set_params(debug_folder=debug_folder)
        estimator.fit(X_train, y_train, probe_list=[(X_test, y_test)])
        # estimator.fit(X_train, y_train)
    else:
        estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)
    score = np.sqrt(mean_squared_error(y_hat, y_test))
    print('RMSE %s: %.3f' % (estimator, score))

    if output_dir is not None:
        with open(join(debug_folder, 'score'), 'w+') as f:
            f.write('score : %.4f' % score)
        dump(estimator, join(debug_folder, 'estimator'), compress=9)

    return score
def primaryMetLookup():
    fhin = open('CosmicMutantExport_v60_190712.csv', 'rU')
    fhin.readline()
    data1 = fhin.readlines()
    fhin.close()
    sampleOrigin = {}
    for line in data1:  # the header line was already consumed by readline() above
        flds=  line.split(',')
        if flds[4] not in sampleOrigin: # in case the psite does not exist

            sampleOrigin[flds[4]] = Set([flds[23]])
        else:
            originSet = sampleOrigin[flds[4]]
            originSet.add(flds[23])
            sampleOrigin[flds[4]] = originSet


    keys = sampleOrigin.keys()
    samples = 0
    amb = 0
    for k in keys:
        origin = sampleOrigin[k]
        if len(origin) > 1:
            amb+=1
        else:
            samples+=1
    print 'ambiguous samples = ', amb
    print 'fine samples = ', samples
    print 'total number of samples = ', str(samples+amb)
    joblib.dump(sampleOrigin, 'samples_origin.pkl')
Example #18
File: main.py  Project: ppapaya/cail-2018
def train_pipeline(kind, cut, vectorizer, model_trainer, do_cut=False, do_vectorizer=False, record_num=None):
    print('reading...')
    alltext, accu_label, law_label, time_label = data.read_trainData("./data/data_train.json", record_num)

    if do_cut:
        print('cutting...')
        train_text = cut.cut(alltext)
        joblib.dump(train_text, './data/{}_cut_train.txt'.format(cut.name))

        print('cleaning...')
        cleaner = Cleaner()
        cleaned_train_text = cleaner.clean(train_text)
        joblib.dump(cleaned_train_text, './data/{}_cut_train_cleaned.txt'.format(cut.name))
    else:
        print('load existing cut file {}...'.format('./data/{}_cut_train_cleaned.txt'.format(cut.name)))
        cleaned_train_text = joblib.load('./data/{}_cut_train_cleaned.txt'.format(cut.name))

    vectorizer_name = '{}_{}'.format(cut.name, vectorizer.name)
    if do_vectorizer:
        print('{} training...'.format(vectorizer_name))
        vectorizer = vectorizer.train(cleaned_train_text)
        joblib.dump(vectorizer,
                    './model/{}/predictor/model/{}_vectorizer.model'.format(model_trainer.name, vectorizer_name))
        print('{} vectorizing...'.format(vectorizer))
        vec = vectorizer.transform(cleaned_train_text)
        joblib.dump(vec, './data/vec_{}.txt'.format(vectorizer_name))
    else:
        print('load existing vec file {}...'.format('./data/vec_{}.txt'.format(vectorizer_name)))
        vec = joblib.load('./data/vec_{}.txt'.format(vectorizer_name))

    print('{} training...'.format(kind))
    model = model_trainer.train(vec, accu_label)
    joblib.dump(model, './model/{}/predictor/model/{}_{}.model'.format(model_trainer.name, vectorizer_name, kind))
def main():
    """ Generates features and fits classifier. """
    
    featureIndexes = processData(os.path.join(dataFolder, "avito_train.tsv"), itemsLimit=300000)
    trainFeatures,trainTargets, trainItemIds=processData(os.path.join(dataFolder,"avito_train.tsv"), featureIndexes, itemsLimit=300000)
    testFeatures, testItemIds=processData(os.path.join(dataFolder,"avito_test.tsv"), featureIndexes)
    joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds), os.path.join(dataFolder,"train_data.pkl"))
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(os.path.join(dataFolder,"train_data.pkl"))
    logging.info("Feature preparation done, fitting model...")
    clf = SGDClassifier(    loss="log", 
                            penalty="l2", 
                            alpha=1e-4, 
                            class_weight="auto")
    clf.fit(trainFeatures,trainTargets)

    logging.info("Predicting...")
    
    predicted_scores = clf.predict_proba(testFeatures).T[1]

    
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder,output_file), "w")
    f.write("id\n")
    
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        f.write("%d\n" % (item_id))
    f.close()
    logging.info("Done.")
Example #20
def compute(filename):
	fileArray = filename.split("/")
	operator = fileArray[-1].split(".")[0]

	print 'SVM received operator = ' + operator
	ip = open(filename)
	i = 0
	A = []
	B = []
	for line in ip:
		temp = []
		elm = line.rstrip("\n").split(" ");
		temp = [np.exp(float(row)) for row in range(len(elm)-1) ]

		# A.append([temp[0] ,temp[1]])
		A.append(temp)
		B.append(np.float(elm[len(elm)-1]))

	clf = svm.SVR()
	clf.fit(A,B)


	# f.close()
	modelURI = "Models/"+operator+"/"
	if not os.path.exists(modelURI):
		os.makedirs(modelURI)
	
	modelURI += 'm.pkl'
	joblib.dump(clf, modelURI)

	print 'SUCCESS,' + modelURI + ' written to disk.'
def normalize_one(name):
  out_name = path(name).splitext()[0] + '.dat'
  a = sio.loadmat(name)
  desc = a['desc']
  frames = a['frames']
  normalize_sift(desc, inplace=True)
  dump(dict(frames=frames, desc=desc), out_name, compress=3)
def trainClassifier(clf,
      dir,model_file='adaptive',
      data_file='train',
      seed=1234,
    ):
  '''
   Train classifier
  '''
  print 'Training classifier'

  data = np.loadtxt('{0}/train_{1}.dat'.format(dir,data_file)) 
  traindata = data[:,:-1]
  targetdata = data[:,-1]
  pdb.set_trace()

  if model_g == 'mlp':
    train_mlp((traindata, targetdata), save_file='{0}/{1}_F0_F1.pkl'.format(dir,model_file))
  else:
    rng = np.random.RandomState(seed)
    indices = rng.permutation(traindata.shape[0])
    traindata = traindata[indices]
    targetdata = targetdata[indices]
    scores = cross_validation.cross_val_score(clf, traindata, targetdata)
    print "Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std() * 2)
    clf.fit(traindata,targetdata)
    #clf.plot_importance_matrix(vars_names)
    joblib.dump(clf, '{0}/{1}_F0_F1.pkl'.format(dir,model_file))
    def setTestInputforNN(self, collection={}, sel_words=[]):
        list_of_strings = []
        list_of_salary = []
        count = 0
        sel_words_set = set(sel_words)
        sel_words_list = list(sel_words_set)
        for document in collection:
            count += 1
            title = document.getTitle()
            description = document.getDescription()
            salary = (int)(document.getSalaryNorm())
            words = re.split(" ", title) + re.split(" ", description)
            # words = [x for x in words if x in sel_words]
            wordsUnique = set(words)
            wordsUnique = wordsUnique & sel_words_set
            words = [x for x in words if x in wordsUnique]
            documentString = " ".join(words)
            list_of_strings.append(documentString)
            list_of_salary.append(salary)

            if not (count % 15000):
                break

        vectorizer = CountVectorizer(vocabulary=sel_words, min_df=1)
        self.inp = vectorizer.fit_transform(list_of_strings)
        from sklearn.externals import joblib

        joblib.dump(self.inp.tocsr(), "test_dataset_in.joblib")

        self.inp_size = len(list_of_strings)
        output = np.array(list_of_salary)
        self.target = output.reshape(len(list_of_strings), 1)
        joblib.dump(self.target, "test_dataset_out.joblib")

        return [self.inp, self.target]
Example #24
File: classify.py  Project: cmor/gala
def save_classifier(cl, fn, use_joblib=True, **kwargs):
    """Save a classifier to disk.

    Parameters
    ----------
    cl : classifier object
        Pickleable object or a classify.VigraRandomForest object.
    fn : string
        Writeable path/filename.
    use_joblib : bool, optional
        Whether to prefer joblib persistence to pickle.
    kwargs : keyword arguments
        Keyword arguments to be passed on to either `pck.dump` or 
        `joblib.dump`.

    Returns
    -------
    None

    Notes
    -----
    For joblib persistence, `compress=3` is the default.
    """
    if isinstance(cl, VigraRandomForest):
        cl.save_to_disk(fn)
    elif use_joblib and sklearn_available:
        if 'compress' not in kwargs:
            kwargs['compress'] = 3
        joblib.dump(cl, fn, **kwargs)
    else:
        with open(fn, 'wb') as f:  # binary mode: pickle's highest protocol is a binary format
            pck.dump(cl, f, protocol=kwargs.get('protocol', -1))
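
# Hypothetical usage sketch (not from the original module): persist a fitted scikit-learn
# classifier through save_classifier, then reload it with joblib; X_train and y_train are
# assumed to exist.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
save_classifier(rf, 'rf_classifier.joblib')        # goes through joblib, compress=3 by default
rf_restored = joblib.load('rf_classifier.joblib')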
Example #25
def rf_fit():

	train_inp,valid_inp,train_target,valid_target = prepare_input()

	rf = RandomForestClassifier(random_state=31,n_jobs=-1,verbose=1,n_estimators=100,min_samples_split=5)
	start = time.time()

	rf.fit(train_inp,train_target)

	end = time.time()
	print "fitting took {:0.4} seconds".format(end-start)

	training_output = rf.predict_proba(train_inp)
	validation_output = rf.predict_proba(valid_inp)

	training_error = log_loss(train_target,training_output)
	validation_error = log_loss(valid_target,validation_output)

	print "Train error: {:02.4f}".format(training_error)
	print "Validation error: {:02.4f}".format(validation_error)


	joblib.dump(rf,rf_filename)


	return rf
Example #26
def trainModel():

	# data preprocessing
	data_train = joblib.load('data/data_train.pkl')
	label_train = joblib.load('data/label_train.pkl')

	print data_train.shape

	clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.1, degree=0.1, gamma=1.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

	#clf.set_params(kernel='rbf')

	print clf

	print data_train.shape
	print label_train.shape

	print 'begin training....'
	clf.fit(data_train,label_train)
	print 'finish training....'
	print clf
	joblib.dump(clf, 'model/svm.pkl')
	
	return None
Example #27
def perform_cluster_analysis(dataset):

    filename = 'elbow_plot.dat'

    if os.path.exists(cpath + filename):
        data = joblib.load(cpath + filename)
        K = data[0]
        meandistortions = data[1]
    else:
        X = dataset
        print 'X Shape: ', X.shape

        #K = range(1, 50, 5)
        K = [1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
        #K = [1, 2, 5, 10, 50, 100]
        meandistortions = []
        cluster_centers = []
        for k in K:
            print k
            kmeans = KMeans(n_clusters=k, n_jobs=3)
            kmeans.fit(X)
            #import ipdb; ipdb.set_trace() # debugging code
            #meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))/X.shape[0])
            meandistortions.append(kmeans.inertia_)
            cluster_centers.append(kmeans.cluster_centers_)
            #print 'k: ', k, ' Cluster Centers: ', kmeans.cluster_centers_
        data = [K, meandistortions]
        joblib.dump(data, cpath + filename, compress=8)

    plot_name = "elbow_plot.png"
    title = 'Selecting k with the Elbow Method'
    xlabel = 'Number of Clusters (k)'
    ylabel = 'Average Distortion'
    xyplot(K, meandistortions, 0, 0, 0, 0, title, xlabel, ylabel, staticpath + plot_name, line=1, y_log=0)
Example #28
def xgb_fit():

	train_inp,valid_inp,train_target,valid_target = prepare_input()

	dtrain = xgb.DMatrix(train_inp,label=train_target)
	dvalid = xgb.DMatrix(valid_inp)


	param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' }
	param['nthread'] = 4
	param['eval_metric'] = 'auc'
	param['subsample'] = 0.7
	param['colsample_bytree']= 0.7
	param['min_child_weight'] = 0
	param['booster'] = "gblinear"

	watchlist  = [(dtrain,'train')]
	num_round = 300
	early_stopping_rounds=10
	bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)

	joblib.dump(bst,bst_filename)


	train_pred = bst.predict(xgb.DMatrix(train_inp))
	valid_pred = bst.predict(xgb.DMatrix(valid_inp))
def train_classifier():
    pos_feat_path = positive_features_path
    neg_feat_path = negative_features_path

    model_path = classifier_model_path

    feature_vectors = []
    labels = []

    for feat_path in glob.glob(os.path.join(pos_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        print len(fd)
        if len(fd):
            fd = fd.astype(numpy.object)
            feature_vectors.append(fd)
            labels.append(1)

    for feat_path in glob.glob(os.path.join(neg_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        print len(fd)
        if len(fd):
            fd = fd.astype(numpy.object)
            feature_vectors.append(fd)
            labels.append(0)

    classifier = LinearSVC()
    print "Training classifier"
    classifier.fit(feature_vectors, labels)
    print "Classifier successfully trained"
    if not os.path.isdir(os.path.split(model_path)[0]):
        os.makedirs(os.path.split(model_path)[0])
    joblib.dump(classifier, model_path)
Example #30
X_eval = [x[0] for x in samples_eval]
Y_train = [x[1] for x in samples_train]
Y_eval = [x[1] for x in samples_eval]

clf = svm.SVC(C=1.0,
              cache_size=200,
              class_weight="balanced",
              coef0=0.0,
              decision_function_shape='ovo',
              degree=3,
              gamma='auto',
              kernel='rbf',
              max_iter=-1,
              probability=True,
              random_state=None,
              shrinking=True,
              tol=0.001,
              verbose=False)
clf.fit(X_train, Y_train)

predicted = clf.predict_proba(X_eval)
score = clf.score(X_eval, Y_eval)
decided = clf.decision_function(X_eval)
expected = Y_eval

joblib.dump(clf, 'model.pkl', protocol=0)

print("Samples: " + str(len(X_train)))

print(score)
Example #31
data = {
    "train": {
        "X": X_train,
        "y": y_train
    },
    "test": {
        "X": X_test,
        "y": y_test
    }
}

# alphas from 0.0 to 0.95 in steps of 0.05 (np.arange excludes the stop value)
alphas = np.arange(0.0, 1.0, 0.05)

for alpha in alphas:
    # Use Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])

    preds = reg.predict(data["test"]["X"])
    mse = mean_squared_error(preds, data["test"]["y"])
    run.log('alpha', alpha)
    run.log('mse', mse)

    model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)
    # save model in the outputs folder so it automatically gets uploaded;
    # joblib writes straight to the target path, so no separate file handle is needed
    joblib.dump(value=reg,
                filename=os.path.join('./outputs/', model_file_name))

    print('alpha is {0:.2f}, and mse is {1:0.2f}'.format(alpha, mse))
Example #32
                                                        test_size=0.35,
                                                        random_state=0)

    print "Starting the training process..."

    #Start the training process
    clf.fit(X_train, Y_train)

    #If SHOW_CONFUSION_MATRIX is true, prints the confusion matrix
    if SHOW_CONFUSION_MATRIX:
        print "Confusion Matrix:"
        Y_predicted = clf.predict(X_test)
        print confusion_matrix(Y_test, Y_predicted)

    print "\nBest estimator parameters: "
    print clf.best_estimator_

    #Calculates the score of the best estimator found.
    score = clf.score(X_test, Y_test)

    print "\nSCORE: {score}\n".format(score=score)

    print "Saving the model...",

    #Saves the model to the "model.pkl" file
    joblib.dump(clf, 'model.pkl')
    #Saves the classes to the "classes.pkl" file
    joblib.dump(classes, 'classes.pkl')

    print "DONE"
Example #33
def get_saved_columns(file='enc_columns.txt'):
    with open(file, 'r') as f:
        return eval(f.readline())


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X = data_enc.iloc[:].values

sc = MinMaxScaler()
X = sc.fit_transform(X.astype(np.float64))

#Save scaler
from sklearn.externals import joblib
joblib.dump(sc, "scaler.save")

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers

classifier = Sequential()
classifier.add(
    Dense(units=135,
          kernel_initializer='glorot_normal',
Example #34
from sklearn import tree

# Bumpy: 0
# Smooth: 1
features = [[140, 1], [130, 1], [150, 0], [170, 0]]

# Apple: 0
# Orange: 1
labels = [0, 0, 1, 1]

classifier = tree.DecisionTreeClassifier()
classifier = classifier.fit(features, labels)

from sklearn.externals import joblib
joblib.dump(classifier, 'myModel.pkl')
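
# Reloading the persisted tree and classifying a new fruit (a usage sketch, assuming the
# feature layout used above: [weight_in_grams, texture] with bumpy=0 and smooth=1):
loaded = joblib.load('myModel.pkl')
print(loaded.predict([[160, 0]]))  # a heavy, bumpy fruit -> expected label 1 (orange)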
def train_and_generate_model():
    #global scaler

    K.clear_session()

    data_len = len(exchange_rates)
    train_len = int(len(exchange_rates) / TRAINDATA_DIV)

    print("data size: " + str(data_len))
    print("train len: " + str(train_len))

    tr_input_mat = []
    tr_angle_mat = []
    for i in range(1000, train_len, OUTPUT_LEN):
        tr_input_mat.append([
            exchange_rates[i], (exchange_rates[i] - exchange_rates[i - 1]) /
            exchange_rates[i - 1],
            get_rsi(exchange_rates, i),
            get_ma(exchange_rates, i),
            get_ma_kairi(exchange_rates, i),
            get_bb_1(exchange_rates, i),
            get_bb_2(exchange_rates, i),
            get_ema(exchange_rates, i),
            get_ema_rsi(exchange_rates, i),
            get_cci(exchange_rates, i),
            get_mo(exchange_rates, i),
            get_lw(exchange_rates, i),
            get_ss(exchange_rates, i),
            get_dmi(exchange_rates, i),
            get_vorarity(exchange_rates, i),
            get_macd(exchange_rates, i),
            judge_chart_type(exchange_rates[i - CHART_TYPE_JDG_LEN:i])
        ])
        tr_input_mat.append([
            reverse_exchange_rates[i],
            (reverse_exchange_rates[i] - reverse_exchange_rates[i - 1]) /
            reverse_exchange_rates[i - 1],
            get_rsi(reverse_exchange_rates, i),
            get_ma(reverse_exchange_rates, i),
            get_ma_kairi(reverse_exchange_rates, i),
            get_bb_1(reverse_exchange_rates, i),
            get_bb_2(reverse_exchange_rates, i),
            get_ema(reverse_exchange_rates, i),
            get_ema_rsi(reverse_exchange_rates, i),
            get_cci(reverse_exchange_rates, i),
            get_mo(reverse_exchange_rates, i),
            get_lw(reverse_exchange_rates, i),
            get_ss(reverse_exchange_rates, i),
            get_dmi(reverse_exchange_rates, i),
            get_vorarity(reverse_exchange_rates, i),
            get_macd(reverse_exchange_rates, i),
            judge_chart_type(reverse_exchange_rates[i - CHART_TYPE_JDG_LEN:i])
        ])

        tmp = (exchange_rates[i + OUTPUT_LEN] -
               exchange_rates[i]) / float(OUTPUT_LEN)
        if tmp >= 0:
            tr_angle_mat.append(1)
        else:
            tr_angle_mat.append(0)
        tmp = (reverse_exchange_rates[i + OUTPUT_LEN] -
               reverse_exchange_rates[i]) / float(OUTPUT_LEN)
        if tmp >= 0:
            tr_angle_mat.append(1)
        else:
            tr_angle_mat.append(0)

    X = np.array(tr_input_mat, dtype=np.float32)
    Y = np.array(tr_angle_mat, dtype=np.float32)

    X, scaler = preprocess_data(X)
    Y, encoder = preprocess_labels(Y)

    joblib.dump(scaler, "./sklearn.scaler.dump")

    np.random.seed(1337)  # for reproducibility

    nb_classes = Y.shape[1]
    print(nb_classes, 'classes')

    dims = X.shape[1]
    print(dims, 'dims')

    neuro_num = 50

    # setup deep NN
    model = Sequential()
    model.add(Dense(neuro_num, input_shape=(dims, ), activation="relu"))
    #model.add(Dense(neuro_num, activation="relu"))
    #model.add(BatchNormalization((neuro_num,)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(int(neuro_num / 2), activation="relu"))
    #model.add(BatchNormalization((neuro_num/2,)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(nb_classes, activation="sigmoid"))

    model.summary()

    model.compile(loss='binary_crossentropy', optimizer="adam")

    # # TPU
    tpu_grpc_url = "grpc://" + os.environ["COLAB_TPU_ADDR"]
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        tpu_grpc_url)
    strategy = keras_support.TPUDistributionStrategy(tpu_cluster_resolver)
    tpu_model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)

    print("Training model...")
    start = time.time()
    tpu_model.fit(X,
                  Y,
                  batch_size=1024,
                  epochs=1000,
                  verbose=2,
                  validation_split=0.15)
    process_time = time.time() - start
    print("excecution time of training: " + str(process_time))

    dump_fd = open("./keras.model.json", "w")
    model_json_str = model.to_json()
    dump_fd.write(model_json_str)
    model.save_weights("./keras.weight")
    dump_fd.close()
Example #36
 def save(self, save_name):
     joblib.dump(self, save_name, compress=6)
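
 # Counterpart sketch (an assumption, not part of the original class): an object saved this
 # way is restored with joblib.load, provided the class definition is importable at load time.
 @classmethod
 def load(cls, save_name):
     return joblib.load(save_name)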
Example #37
#print(data.columns)
new_data = data[["pclass", "sex"]]
print(new_data.head())
new_output = data[["survived"]]
print(new_output.head())
new_data['pclass'].replace('3rd', 3, inplace=True)
new_data['pclass'].replace('2nd', 2, inplace=True)
new_data['pclass'].replace('1st', 1, inplace=True)
#new_data['sex'].replace('female',0,inplace=True)
#new_data['sex'].replace('male',1,inplace=True)
new_data['sex'] = np.where(new_data['sex'] == 'female', 0, 1)
X_tr, X_te, y_tr, y_te = train_test_split(new_data,
                                          new_output,
                                          test_size=0.33,
                                          random_state=42)

print(new_data.head())

print('After train_test_split')
print(X_tr.shape)
print(X_te.shape)
print(y_tr.shape)
print(y_te.shape)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_tr, y_tr)
acc = rf.score(X_te, y_te)
print(acc * 100)

joblib.dump(rf, 'rf1', compress=9)
Example #38
def batchtrain(self, njob = 1, phase = None, shared_memory = 'no', verbose='on'):
    t0 = time()
    nnodes = getattr(self, 'nnodes')
    dlen = getattr(self, 'dlen')
    dim = getattr(self, 'dim')
    mapsize = getattr(self, 'mapsize')
    

    #############################################
    # seting the parameters
    initmethod = getattr(self,'initmethod')
    mn = np.min(mapsize)
    if mn == 1:
        mpd = float(nnodes*10)/float(dlen)
    else:
        mpd = float(nnodes)/float(dlen)
    
    ms = max(mapsize[0],mapsize[1])
    if mn == 1:
        ms = ms/2.
    #Based on somtoolbox, Matlab
    #case 'train',    sTrain.trainlen = ceil(50*mpd);
    #case 'rough',    sTrain.trainlen = ceil(10*mpd); 
    #case 'finetune', sTrain.trainlen = ceil(40*mpd);
    if phase == 'rough':
        #training length
        trainlen = int(np.ceil(30*mpd))
        #radius for updating
        if initmethod == 'random':
            radiusin = max(1, np.ceil(ms/3.))
            radiusfin = max(1, radiusin/6.)
#             radiusin = max(1, np.ceil(ms/1.))
#             radiusfin = max(1, radiusin/2.)
        elif initmethod == 'pca':
            radiusin = max(1, np.ceil(ms/8.))
            radiusfin = max(1, radiusin/4.)
    elif phase == 'finetune':
        #training length

        #radius for updating
        if initmethod == 'random':
            trainlen = int(np.ceil(50*mpd))
            radiusin = max(1, ms/12.) #from radius fin in rough training
            radiusfin = max(1, radiusin/25.)

#             radiusin = max(1, ms/2.) #from radius fin in rough training
#             radiusfin = max(1, radiusin/2.)
        elif initmethod == 'pca':
            trainlen = int(np.ceil(40*mpd))
            radiusin = max(1, np.ceil(ms/8.)/4)
            radiusfin = 1  #max(1, ms/128)
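
    # Worked example of the parameter arithmetic above (illustration, not original code):
    # with mapsize = (20, 30) and dlen = 10000 rows, nnodes = 20*30 = 600, so mpd = 600/10000 = 0.06
    # and ms = 30. The rough phase with random init then gives trainlen = ceil(30*0.06) = 2,
    # radiusin = max(1, ceil(30/3)) = 10 and radiusfin = max(1, 10/6) ~= 1.67.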
    
    radius = np.linspace(radiusin, radiusfin, trainlen)
    ##################################################    
    
    UD2 = getattr(self, 'UD2')
    New_Codebook_V = np.empty((nnodes, dim))
    New_Codebook_V = getattr(self, 'codebook')
    
    #print 'data is in shared memory?', shared_memory
    if shared_memory == 'yes':
        data = getattr(self, 'data')
        Data_folder = tempfile.mkdtemp()
        data_name = os.path.join(Data_folder, 'data')
        dump(data, data_name)
        data = load(data_name, mmap_mode='r')
    else:
        data = getattr(self, 'data')        
    #X2 is part of euclidean distance (x-y)^2 = x^2 +y^2 - 2xy that we use for each data row in bmu finding.
    #Since it is a fixed value we can skip it during bmu finding for each data point, but later we need it calculate quantification error
    X2 = np.einsum('ij,ij->i', data, data)
    if verbose=='on':
        print '%s training...' %phase
        print 'radius_ini: %f , radius_final: %f, trainlen: %d' %(radiusin, radiusfin, trainlen)
    neigh_func = getattr(self,'neigh')
    for i in range(trainlen):
        if neigh_func == 'Guassian':
            #in case of Gaussian neighborhood
            H = np.exp(-1.0*UD2/(2.0*radius[i]**2)).reshape(nnodes, nnodes)
        if neigh_func == 'Bubble':
            # in case of Bubble function
#             print radius[i], UD2.shape
#             print UD2
            H = l(radius[i],np.sqrt(UD2.flatten())).reshape(nnodes, nnodes) + .000000000001
#             print H

        t1 = time()
        bmu = None
        bmu = self.para_bmu_find(data, New_Codebook_V, njb = njob)
        if verbose=='on':
            print
        #updating the codebook
        t2 = time()
        New_Codebook_V = self.update_codebook_voronoi(data, bmu, H, radius)
        #print 'updating nodes: ', round (time()- t2, 3)            
        if verbose=='on':
            print "epoch: %d ---> elapsed time:  %f, quantization error: %f " %(i+1, round(time() - t1, 3),np.mean(np.sqrt(bmu[1] + X2)))          
    setattr(self, 'codebook', New_Codebook_V)
    bmu[1] = np.sqrt(bmu[1] + X2)
    setattr(self, 'bmu', bmu)
Example #39
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.externals import joblib

# Read in data
data = pd.read_csv('clean_data.csv')
texts = data['text'].astype(str)
y = data['is_offensive']

# Vectorize the text
vectorizer = CountVectorizer(stop_words='english', min_df=0.0001)
X = vectorizer.fit_transform(texts)

# Train the model
model = LinearSVC(class_weight="balanced", dual=False, tol=1e-2, max_iter=1e5)
cclf = CalibratedClassifierCV(base_estimator=model)
cclf.fit(X, y)

# Save the model
joblib.dump(vectorizer, 'vectorizer.joblib')
joblib.dump(cclf, 'model.joblib') 
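
# Scoring-side sketch (an assumption, not part of the original script): reload both
# artifacts and get the probability that new texts are offensive.
vec_loaded = joblib.load('vectorizer.joblib')
clf_loaded = joblib.load('model.joblib')
print(clf_loaded.predict_proba(vec_loaded.transform(["some new comment"]))[:, 1])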
Example #40
# plt.ylabel('Loss')
# plt.legend(loc='best')
# plt.show()

#############################

from sklearn import svm
from sklearn import datasets

clf = svm.SVC()
iris = datasets.load_iris()
x, y = iris.data, iris.target
clf.fit(x, y)

#method 1 : pickle
# import pickle
# # with open('save/clf.pickle','wb') as f:
# #     pickle.dump(clf,f)
#
# with open('save/clf.pickle','rb') as f:
#     clf2 = pickle.load(f)
#     print (clf2.predict(x[0:1]))

# method 2 : joblib
from sklearn.externals import joblib
#Save
joblib.dump(clf, 'save/clf.pkl')
#resitore
clf3 = joblib.load('save/clf.pkl')
print(clf3.predict(x[0:1]))
Example #41
    result_value = round(predition_row[result_index], 2)
    if not results[real_result].get(result_value):
        results[real_result][result_value] = 0
    results[real_result][result_value] += 1

for result in results.keys():
    keys = sorted(results[result].keys(), reverse=True)
    current_sum = 0
    for result_value in keys:
        value = results[result][result_value]
        current_sum += value
        results_file.write("%s\t%s\t%s\t%s%%\n" %
                           (result, result_value, value,
                            round(1.0 * current_sum / total[result], 2)))

joblib.dump(clf, model_output_path)

# Write used features
feature_importance = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(feature_importance)[::-1]

base_name = model_output_path.split(".pkl")[0]

with open(base_name + ".features", "w") as _file:
    for f in range(train[features].shape[1]):
        _file.write("%s\n" % features[indices[f]])

first_model_number = None
found_final = False
with open(base_name + ".recommended_features", "w") as _file:
Example #42
    end = nh3_co_end
    for i in range(len(start)):
        tmp_data = Db_seelct(start[i], end[i])
        data = np.vstack((data, tmp_data))
        for i in range(len(tmp_data)):
            y_train.append(class_bin[6])

    x_train = reader.get_Ratio(data)
    y_train = np.array(y_train)

    # x_train = minmax_scale(x_train,model_dir,train=True)

    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    joblib.dump(scaler, model_dir+"scaler.pkl")

    modelTrain(x_train, y_train, save_name)

#-----------------------------------------------------------------------------------------------------------

    y_valid=[]

    normal_start = [65600]
    normal_end = [65700]

    h2s_start = [63868]
    h2s_end = [63968]

    nh3_start = [64610]
    nh3_end = [64710]
Example #43
    train_clf(clf, X_train, y_train)

    print("F1 score for training set is: {:.4f}".format(
        pred_clf(clf, X_train, y_train)))
    print("F1 score for testing set is: {:.4f}\n".format(
        pred_clf(clf, X_test, y_test)))


from sklearn.metrics import accuracy_score

params = {
    'max_depth': 9,
    'subsample': 0.5,
    'learning_rate': 0.01,
    'min_samples_leaf': 1,
    'random_state': 0
}
gbc = GradientBoostingClassifier(n_estimators=290, **params)
clf_ = gbc.fit(X_train, y_train)
y_pred = clf_.predict(X_test)
print('Accuracy is {}'.format(accuracy_score(y_test, y_pred)))
train_predict(gbc, X_train, y_train, X_test, y_test)

#pickling my model
from sklearn.externals import joblib
joblib.dump(gbc, 'model1.pkl')
print("Model dumped!")
model_columns = list(X.columns)
joblib.dump(model_columns, 'model_columns.pkl')
print("Model columns dumped")
Example #44
#%%
print(X_train[0].shape)
X = X_train[0]
for i in range(1,len(X_train)):
    X = np.vstack((X,X_train[i]))
print(X.shape)

#%%
from sklearn.cluster import MiniBatchKMeans,KMeans

cluster = MiniBatchKMeans(n_clusters=500, batch_size=50)
# cluster = KMeans(n_clusters=500,random_state= 54, max_iter= 5000,n_jobs = 2)
cluster.fit(X)
from sklearn.externals import joblib
joblib.dump(cluster, 'kmeans_model3.m')

#%%
km_model = joblib.load('kmeans_model3.m')

#%%
from sklearn import preprocessing
def get_image_presentation(dense_sampling_images, centroids_model):
    image_presentation = np.zeros((len(dense_sampling_images), len(centroids_model.cluster_centers_)))
    histograms = []
    count = 0
    for image in dense_sampling_images:
        hist = centroids_model.predict(image)
        histograms.append(hist)
        for label in hist:
            image_presentation[count][label] += 1
    def run(self):
        self.output().makedirs()

        df = pd.read_csv(self.input().path,
                         usecols=[self.id_column, self.feature_name])
        joblib.dump(df, self.output().path, compress=1)
Example #46
def main():
    if not (args.use_w1_w2_embeddings or args.use_paraphrase_vectors):
        raise ValueError(
            'At least one of "use_w1_w2_embeddings" or "use_paraphrase_vectors" should be set.'
        )

    # Load the datasets
    logger.info('Loading the datasets from {}'.format(args.dataset_prefix))
    train_set = DatasetReader(args.dataset_prefix + '/train.tsv')
    val_set = DatasetReader(args.dataset_prefix + '/val.tsv',
                            label2index=train_set.label2index)
    test_set = DatasetReader(args.dataset_prefix + '/test.tsv',
                             label2index=train_set.label2index)

    # Generate the feature vectors using the paraphrasing model
    logger.info('Generating feature vectors...')
    train_features, val_features, test_features = [], [], []

    if args.use_paraphrase_vectors:
        logger.info('Reading word embeddings from {}...'.format(
            args.word_embeddings_for_model))
        wv, model_words = load_binary_embeddings(
            args.word_embeddings_for_model)

        logger.info('Loading paraphrasing model from {}...'.format(
            args.paraphrase_model_dir))
        model = Model.load_model(args.language_model_dir, wv)

        model_words = ['[w1]', '[w2]', '[par]'] + model_words
        modelw2index = {w: i for i, w in enumerate(model_words)}
        UNK = modelw2index['unk']

    if args.use_w1_w2_embeddings:
        logger.info('Reading word embeddings from {}...'.format(
            args.word_embeddings_for_dist))
        wv, words = load_binary_embeddings(args.word_embeddings_for_dist)
        w2index = {w: i for i, w in enumerate(words)}
        UNK = w2index['unk']

        train_features.append(
            np.vstack([
                np.concatenate(
                    [wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]])
                for (w1, w2) in train_set.noun_compounds
            ]))
        val_features.append(
            np.vstack([
                np.concatenate(
                    [wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]])
                for (w1, w2) in val_set.noun_compounds
            ]))
        test_features.append(
            np.vstack([
                np.concatenate(
                    [wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]])
                for (w1, w2) in test_set.noun_compounds
            ]))

    # Tune the hyper-parameters using the validation set
    logger.info('Classifying...')
    reg_values = [0.5, 1, 2, 5, 10]
    penalties = ['l2']
    k_values = [10, 15, 25, 50] if args.use_paraphrase_vectors else [0]
    classifiers = ['logistic', 'svm']
    f1_results = []
    descriptions = []
    models = []
    all_test_instances = []

    for k in k_values:
        curr_train_features, curr_val_features, curr_test_features = train_features, val_features, test_features
        if args.use_paraphrase_vectors:
            curr_train_features += [
                predict_paraphrases(model, train_set.noun_compounds,
                                    model_words, modelw2index, UNK, k)
            ]
            curr_val_features += [
                predict_paraphrases(model, val_set.noun_compounds, model_words,
                                    modelw2index, UNK, k)
            ]
            curr_test_features += [
                predict_paraphrases(model, test_set.noun_compounds,
                                    model_words, modelw2index, UNK, k)
            ]

        train_instances = [
            np.concatenate(list(f)) for f in zip(*curr_train_features)
        ]
        val_instances = [
            np.concatenate(list(f)) for f in zip(*curr_val_features)
        ]
        test_instances = [
            np.concatenate(list(f)) for f in zip(*curr_test_features)
        ]

        for cls in classifiers:
            for reg_c in reg_values:
                for penalty in penalties:
                    descriptions.append(
                        'K: {}, Classifier: {}, Penalty: {}, C: {:.2f}'.format(
                            k, cls, penalty, reg_c))

                    # Create the classifier
                    if cls == 'logistic':
                        classifier = LogisticRegression(
                            penalty=penalty,
                            C=reg_c,
                            multi_class='multinomial',
                            n_jobs=20,
                            solver='sag')
                    else:
                        classifier = LinearSVC(penalty=penalty,
                                               dual=False,
                                               C=reg_c)

                    logger.info(
                        'Training with classifier: {}, penalty: {}, c: {:.2f}...'
                        .format(cls, penalty, reg_c))
                    classifier.fit(train_instances, train_set.labels)
                    val_pred = classifier.predict(val_instances)
                    p, r, f1, _ = evaluate(val_set.labels,
                                           val_pred,
                                           val_set.index2label,
                                           do_full_reoprt=False)
                    logger.info(
                        'K: {}, Classifier: {}, penalty: {}, c: {:.2f}, precision: {:.3f}, recall: {:.3f}, F1: {:.3f}'
                        .format(k, cls, penalty, reg_c, p, r, f1))
                    f1_results.append(f1)
                    models.append(classifier)
                    all_test_instances.append(test_instances)

    best_index = np.argmax(f1_results)
    description = descriptions[best_index]
    classifier = models[best_index]
    logger.info('Best hyper-parameters: {}'.format(description))

    # Save the best model to a file
    logger.info('Copying the best model...')
    joblib.dump(classifier, '{}/best.pkl'.format(args.model_dir))

    # Evaluate on the test set
    logger.info('Evaluation:')
    test_instances = all_test_instances[best_index]
    test_pred = classifier.predict(test_instances)
    precision, recall, f1, support = evaluate(test_set.labels,
                                              test_pred,
                                              test_set.index2label,
                                              do_full_reoprt=True)
    logger.info('Precision: {:.3f}, Recall: {:.3f}, F1: {:.3f}'.format(
        precision, recall, f1))

    # Write the predictions to a file
    output_predictions(args.model_dir + '/predictions.tsv',
                       test_set.index2label, test_pred,
                       test_set.noun_compounds, test_set.labels)
Example #47
     for gene in lncRNA_names],
    dtype=float)  #to keep order!!!!!!!	y

proteins = np.concatenate((hsapiens_info, atha_info), axis=0)

y = []

for num in range(0, proteins.shape[0]):
    y.append(0)
for num in range(0, lncRNA_info.shape[0]):
    y.append(1)

y = np.asarray(y)

X = np.concatenate((proteins, lncRNA_info), axis=0)

print(y.shape)
print(X.shape)

X_normalized = preprocessing.normalize(X, norm='l2')

clf = GradientBoostingClassifier(n_estimators=100,
                                 learning_rate=0.02,
                                 subsample=0.6,
                                 max_depth=10,
                                 random_state=339)
clf.fit(X_normalized, y)

# uncomment to save classifier as pickle
joblib.dump(clf, 'model5.pkl')
Example #48
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(x_tr, y_tr)
y_pred = regression.predict(x_te)

plt.scatter(x_tr, y_tr, color='red')
plt.plot(x_tr, regression.predict(x_tr), color='blue')
plt.title('Seasonal Trend vs Price : Training Set')
plt.xlabel('Start of Season Price')
plt.ylabel('End of Season (Discounted) Price')
plt.savefig('templates/retailvsdiscounted.jpg')
plt.show()

plt.scatter(x_te, y_te, color='red')
plt.plot(x_tr, regression.predict(x_tr), color='blue')
plt.title('Seasonal Trend vs Price : Test Set')
plt.xlabel('Start of Season Price')
plt.ylabel('End of Season (Discounted) Price')
plt.savefig('templates/pricevdiscountedtest.jpg')
plt.show()


def getdiscount(price):
    print("The discounted value for given item : ", regression.predict(price))


getdiscount(5000)

from sklearn.externals import joblib
joblib.dump(regression, 'linmodel.pkl')
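# Hedged usage sketch (not part of the original snippet): loading the dumped
# regressor back and predicting the discounted price for one start-of-season
# price. Assumes 'linmodel.pkl' was written by the dump call above; note that
# scikit-learn's predict() expects a 2-D array, hence the [[...]] wrapper.
from sklearn.externals import joblib

loaded_regression = joblib.load('linmodel.pkl')
print(loaded_regression.predict([[5000]]))  # estimated end-of-season price for a 5000 start price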
def predict(config_file=None,
            data_dirs=None,
            annotation_file=None,
            file_format=None,
            model_meta_file=None,
            output_dir=None,
            segment=None,
            predict_proba=False,
            convert_to=None,
            return_predictions=True):
    """high-level function for prediction of syllable labels.
    Accepts either a config file or a set of parameters and
    uses them to predict labels for syllable segments in audio files,
    based on features extracted from those segments.

    Parameters
    ----------
    config_file : string
        filename of YAML file that configures label prediction
    data_dirs : list
        of str, directories that contain audio files from which features should be extracted.
        hvc.extract attempts to create an annotation.csv file based on the audio file types in
        the directories.
    annotation_file : str
        filename of an annotation.csv file
    file_format : str
        format of audio files. One of the following: {'cbin','wav'}
    model_meta_file : str
        filename of .meta file for classifier to use.
    output_dir : str
        Name of parent directory in which to create output. If parent directory does not exist, it is created.
        Default is current working directory.
    segment : bool
        if True, segment song. If annotation file is passed as an argument, then segments from that file
        are used. If data_dirs is passed as an argument, and segment is False, then the FeatureExtractor
        will look for annotation files, and will raise an error if none are found. Default when data_dirs
        is passed as an argument is True, i.e. it is assumed the user has not already segmented the song
        and wants to do this in an automated way, then apply classifiers to the segments.
    predict_proba : bool
        If True, estimate probabilities that labels are correct. Default is False.
    convert_to : str
        if specified, format to which predictions are converted as annotation files.
        Default is None, in which case no annotation files are made.
    return_predictions : bool
        If True, return feature file with predicted labels. Default is True.

    Returns
    -------
    predictions : dict
        feature file returned as a Python dictionary, with the additional
        (key, value) pair of 'pred_labels', a Numpy array containing the
        labels predicted by the classifier.
        Only returned if return_predictions = True.
    """
    if config_file and (file_format or model_meta_file or output_dir or segment
                        or convert_to):
        raise ValueError('Cannot specify config_file and other parameters '
                         'when calling hvc.predict, '
                         'please specify either config_file or all other '
                         'parameters ')

    if config_file and data_dirs:
        raise ValueError('Please specify either config_file or data_dirs, '
                         'not clear which to use when both are specified')

    if config_file and annotation_file:
        raise ValueError('Please specify either config_file or annotation_file, '
                         'not clear which to use when both are specified')

    home_dir = os.getcwd()

    if config_file:
        predict_config = hvc.parseconfig.parse_config(config_file, 'predict')
        print('parsed predict config')

        for todo in predict_config['todo_list']:
            # get absolute path before changing directories
            # in case user specified output as a relative dir
            output_dir = os.path.abspath(todo['output_dir'])
            output_dir = os.path.join(output_dir, 'predict_output_'
                                      + hvc.utils.timestamp())
            if not os.path.isdir(output_dir):
                os.mkdir(output_dir)

            extract_params = {
                'output_dir': output_dir,
                'data_dirs': todo['data_dirs'],
                'labels_to_use': 'all',
                'file_format': todo['file_format']
            }

            model_meta_file = joblib.load(todo['model_meta_file'])
            feature_file_for_model = model_meta_file['feature_file']
            print('loading feature file')
            feature_file = joblib.load(feature_file_for_model)

            feature_extractor = feature_file['feature_extractor']
            print('extracting features')
            feature_extractor.extract(**extract_params,
                                      segment=True,
                                      make_output_subdir=False)

            os.chdir(output_dir)
            ftr_files = glob('features_created*')
            model_filename = model_meta_file['model_filename']
            model_name = model_meta_file['model_name']
            if model_name in valid_models['sklearn']:
                clf = joblib.load(model_filename)
                scaler = model_meta_file['scaler']
            elif model_name in valid_models['keras']:
                # repeated imports are cheap (cached in sys.modules), so import lazily here
                import keras.models
                clf = keras.models.load_model(model_filename)
                spect_scaler = model_meta_file['spect_scaler']

            for ftr_file in ftr_files:
                print("predicting labels for features in file: {}"
                      .format(ftr_file))
                ftr_file_dict = joblib.load(ftr_file)
                if model_name in valid_models['sklearn']:
                    features = ftr_file_dict['features']
                    if np.any(np.isnan(features)):  # if any rows have nan values for features
                        features_has_nan = True
                        # Initialize predictions vector, to later assign nan values
                        # to predictions for those rows
                        pred_labels_nan = np.full((features.shape[0],), 'nan')  # has to be same dtype as predictions
                        # Need to remove rows with nans before normalization + classification
                        features_not_nan_rows = np.where(~np.isnan(features).any(axis=1))[0]
                        features = features[features_not_nan_rows, :]
                    else:
                        features_has_nan = False
                    features_scaled = scaler.transform(features)
                    pred_labels = clf.predict(features_scaled)
                    if features_has_nan:
                        # index pred_labels into pred_labels_nan
                        # so that all nan rows will have 'nan' as prediction
                        pred_labels_nan[features_not_nan_rows] = pred_labels
                        # now make pred_labels point to ndarray with 'nan' predictions included
                        pred_labels = pred_labels_nan

                elif model_name in valid_models['keras']:
                    neuralnet_inputs_dict = ftr_file_dict['neuralnet_inputs']
                    inputs_key = model_meta_file['feature_list'][0]
                    neuralnet_inputs = neuralnet_inputs_dict[inputs_key]
                    neuralnet_inputs_scaled = spect_scaler.transform(neuralnet_inputs)
                    neuralnet_inputs_scaled = neuralnet_inputs_scaled[:, :, :, np.newaxis]
                    pred_labels = clf.predict(neuralnet_inputs_scaled)
                    label_binarizer = model_meta_file['label_binarizer']
                    pred_labels = label_binarizer.inverse_transform(pred_labels)

                ftr_file_dict['pred_labels'] = pred_labels

                if 'predict_proba' in todo:
                    if todo['predict_proba']:
                        pred_probs = clf.predict_proba(features_scaled)
                        ftr_file_dict['pred_probs'] = pred_probs
                joblib.dump(ftr_file_dict, ftr_file)

                if 'convert' in todo:
                    songfiles = ftr_file_dict['songfiles']
                    songfile_IDs = ftr_file_dict['songfile_IDs']
                    if todo['convert'] == 'notmat':
                        all_sampfreqs = ftr_file_dict['all_sampfreqs']
                        print('converting to .not.mat files')
                        for curr_song_id, songfile_name in enumerate(songfiles):
                            these = np.asarray(songfile_IDs) == curr_song_id
                            segment_params = ftr_file_dict['segment_params']
                            annotation.make_notmat(filename=songfile_name,
                                                   labels=ftr_file_dict['pred_labels'][these],
                                                   onsets_Hz=ftr_file_dict['onsets_Hz'][these],
                                                   offsets_Hz=ftr_file_dict['offsets_Hz'][these],
                                                   samp_freq=all_sampfreqs[curr_song_id],
                                                   threshold=segment_params['threshold'],
                                                   min_syl_dur=segment_params['min_syl_dur'],
                                                   min_silent_dur=segment_params['min_silent_dur'],
                                                   clf_file=model_filename,
                                                   alternate_path=output_dir)

            os.chdir(home_dir)

    elif data_dirs or annotation_file:
        if data_dirs and annotation_file:
            raise ValueError('hvc.predict received values for both data_dirs and '
                             'annotation_file arguments, unclear which to use. '
                             'Please only specify one or the other.')

        if model_meta_file is None:
            raise ValueError('model_meta_file is required as an argument when hvc.predict '
                             'is called with data_dirs or annotation_file.')

        if convert_to is not None:
            if convert_to not in valid_convert_types:
                raise ValueError('file format to convert predictions to, {}, is not a '
                                 'valid format'.format(convert_to))

        if segment is None:
            # default to True
            if data_dirs and (annotation_file is None):
                segment = True
            else:
                segment = False

        model_meta_file = joblib.load(model_meta_file)
        model_filename = model_meta_file['model_filename']
        model_name = model_meta_file['model_name']
        if predict_proba:
            if model_name not in valid_models['sklearn']:
                raise ValueError('predict_proba argument set to True, but model in {} is {}, '
                                 'which is not a valid scikit-learn model and does not have '
                                 'a predict_proba method built in'.format(model_filename,
                                                                          model_name))

        if output_dir is None:
            output_dir = os.getcwd()
        output_dir = os.path.abspath(output_dir)
        output_dir = os.path.join(output_dir, 'predict_output_'
                                  + hvc.utils.timestamp())
        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)

        feature_file_for_model = model_meta_file['feature_file']
        print('loading feature file')
        feature_file = joblib.load(feature_file_for_model)

        extract_params = {
            'output_dir': output_dir,
            'labels_to_use': 'all',
            'file_format': file_format,
            'segment': segment
        }

        if annotation_file:
            extract_params['annotation_file'] = annotation_file
        elif data_dirs:
            extract_params['data_dirs'] = data_dirs

        feature_extractor = feature_file['feature_extractor']
        print('extracting features')
        feature_extractor.extract(**extract_params,
                                  make_output_subdir=False)

        os.chdir(output_dir)
        ftr_files = glob('features_created*')
        if model_name in valid_models['sklearn']:
            clf = joblib.load(model_filename)
            scaler = model_meta_file['scaler']
        elif model_name in valid_models['keras']:
            # repeated imports are cheap (cached in sys.modules), so import lazily here
            import keras.models
            clf = keras.models.load_model(model_filename)
            spect_scaler = model_meta_file['spect_scaler']

        for ftr_file in ftr_files:
            print("predicting labels for features in file: {}"
                  .format(ftr_file))
            ftr_file_dict = joblib.load(ftr_file)
            if model_name in valid_models['sklearn']:
                features = ftr_file_dict['features']
                if np.any(np.isnan(features)):  # if any rows have nan values for features
                    features_has_nan = True
                    # Initialize predictions vector, to later assign nan values
                    # to predictions for those rows
                    pred_labels_nan = np.full((features.shape[0],), 'nan')  # has to be same dtype as predictions
                    # Need to remove rows with nans before normalization + classification
                    features_not_nan_rows = np.where(~np.isnan(features).any(axis=1))[0]
                    features = features[features_not_nan_rows, :]
                else:
                    features_has_nan = False
                features_scaled = scaler.transform(features)
                pred_labels = clf.predict(features_scaled)
                if features_has_nan:
                    # index pred_labels into pred_labels_nan
                    # so that all nan rows will have 'nan' as prediction
                    pred_labels_nan[features_not_nan_rows] = pred_labels
                    # now make pred_labels point to ndarray with 'nan' predictions included
                    pred_labels = pred_labels_nan
            elif model_name in valid_models['keras']:
                neuralnet_inputs_dict = ftr_file_dict['neuralnet_inputs']
                inputs_key = model_meta_file['feature_list'][0]
                neuralnet_inputs = neuralnet_inputs_dict[inputs_key]
                neuralnet_inputs_scaled = spect_scaler.transform(neuralnet_inputs)
                neuralnet_inputs_scaled = neuralnet_inputs_scaled[:, :, :, np.newaxis]
                pred_labels = clf.predict(neuralnet_inputs_scaled)
                label_binarizer = model_meta_file['label_binarizer']
                pred_labels = label_binarizer.inverse_transform(pred_labels)

            ftr_file_dict['pred_labels'] = pred_labels

            if predict_proba:
                pred_probs = clf.predict_proba(features_scaled)
                ftr_file_dict['pred_probs'] = pred_probs
            joblib.dump(ftr_file_dict, ftr_file)

            if convert_to:
                songfiles = ftr_file_dict['songfiles']
                songfile_IDs = ftr_file_dict['songfile_IDs']
                if convert_to == 'notmat':
                    all_sampfreqs = ftr_file_dict['all_sampfreqs']
                    print('converting to .not.mat files')
                    for curr_song_id, songfile_name in enumerate(songfiles):
                        these = np.asarray(songfile_IDs) == curr_song_id
                        segment_params = ftr_file_dict['segment_params']
                        annotation.make_notmat(filename=songfile_name,
                                               labels=ftr_file_dict['pred_labels'][these],
                                               onsets_Hz=ftr_file_dict['onsets_Hz'][these],
                                               offsets_Hz=ftr_file_dict['offsets_Hz'][these],
                                               samp_freq=all_sampfreqs[curr_song_id],
                                               threshold=segment_params['threshold'],
                                               min_syl_dur=segment_params['min_syl_dur'],
                                               min_silent_dur=segment_params['min_silent_dur'],
                                               clf_file=model_filename,
                                               alternate_path=output_dir)

        if return_predictions:
            predict_dict = {}
            for ftr_file in ftr_files:
                ftrs = joblib.load(ftr_file)
                if predict_dict == {}:
                    predict_dict['labels'] = ftrs['labels']
                    predict_dict['pred_labels'] = ftrs['pred_labels']
                    predict_dict['songfile_IDs'] = ftrs['songfile_IDs']
                    predict_dict['onsets_Hz'] = ftrs['onsets_Hz']
                    predict_dict['offsets_Hz'] = ftrs['offsets_Hz']
                    predict_dict['songfiles'] = ftrs['songfiles']
                    predict_dict['feature_list'] = ftrs['feature_list']
                    predict_dict['labels_to_use'] = ftrs['labels_to_use']
                    if 'features' in ftrs:
                        predict_dict['features'] = ftrs['features']
                        predict_dict['features_arr_column_IDs'] = ftrs['features_arr_column_IDs']
                    if 'feature_group_ID_dict' in ftrs:
                        predict_dict['feature_group_ID_dict'] = ftrs['feature_group_ID_dict']
                        predict_dict['feature_list_group_ID'] = ftrs['feature_list_group_ID']
                    if 'pred_probs' in ftrs:
                        predict_dict['pred_probs'] = ftrs['pred_probs']
                    if 'neuralnet_inputs' in ftrs:
                        predict_dict['neuralnet_inputs'] = ftrs['neuralnet_inputs']
                else:  # if we already loaded one feature file and predict_dict is not empty
                    # then concatenate
                    predict_dict['labels'] = np.concatenate((predict_dict['labels'], ftrs['labels']))
                    predict_dict['pred_labels'] = np.concatenate((predict_dict['pred_labels'], ftrs['pred_labels']))
                    predict_dict['songfile_IDs'] = np.concatenate((predict_dict['songfile_IDs'], ftrs['songfile_IDs']))
                    predict_dict['onsets_Hz'] = np.concatenate((predict_dict['onsets_Hz'], ftrs['onsets_Hz']))
                    predict_dict['offsets_Hz'] = np.concatenate((predict_dict['offsets_Hz'], ftrs['offsets_Hz']))
                    if 'features' in predict_dict:
                        predict_dict['features'] = np.concatenate((predict_dict['features'], ftrs['features']))
                    if 'neuralnet_inputs' in predict_dict:
                        for key in ftrs['neuralnet_inputs']:
                            predict_dict['neuralnet_inputs'][key] = \
                                np.concatenate((predict_dict['neuralnet_inputs'][key],
                                                ftrs['neuralnet_inputs'][key]))
                    if 'pred_probs' in predict_dict:
                        predict_dict['pred_probs'] = np.concatenate((predict_dict['pred_probs'], ftrs['pred_probs']))
            os.chdir(home_dir)
            return predict_dict
        else:
            os.chdir(home_dir)
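# Hedged usage sketch (not part of the original source): calling the predict()
# function above without a YAML config, passing parameters directly. The
# directory, file format, and .meta file path are illustrative assumptions.
predictions = predict(data_dirs=['./audio/bird1/day1'],
                      file_format='cbin',
                      model_meta_file='./models/classifier.meta',
                      output_dir='./predict_output',
                      predict_proba=True,
                      return_predictions=True)
print(predictions['pred_labels'][:10])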
Example #50
    verbose=2,
)

#train the model
model.fit(X_train, Y_train)

#test the model
from sklearn.metrics import accuracy_score, log_loss
Y_pred = model.predict(X_train)
Y_prob = model.predict_proba(X_train)
print(
    'The accuracy obtained for train data is {:.4f} and the cross entropy is {:.4f}'
    .format(accuracy_score(Y_train, Y_pred), log_loss(Y_train, Y_prob)))

Y_pred = model.predict(X_test)
Y_prob = model.predict_proba(X_test)
print(
    'The accuracy obtained for test data is {:.4f} and the cross entropy is {:.4f}'
    .format(accuracy_score(Y_test, Y_pred), log_loss(Y_test, Y_prob)))

#metrics of the model
from sklearn.metrics import classification_report, confusion_matrix
names = ['case' + str(s) for s in range(0, ncases)]
print(classification_report(Y_test, Y_pred, target_names=names))
print(confusion_matrix(Y_test, Y_pred))

#save the model
from sklearn.externals import joblib
joblib.dump(model, 'analysis/models/neuralnetwork.pkl')
joblib.dump(scaler, 'analysis/models/neuralnetwork_scaler.pkl')
    def _train_and_analyze_predictors(self):
        log.info('Training and analyzing predictors...')
        problem = SupervisedLearningPipeline.CLASSIFICATION
        meta_report = None
        fm_io = FeatureMatrixIO()

        # Build paths for output.
        pipeline_file_name = inspect.getfile(inspect.currentframe())
        data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
            self, pipeline_file_name)

        # Test BifurcatedSupervisedClassifier and SupervisedClassifier.
        algorithms_to_test = list()
        algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)
        # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
        #     algorithms_to_test.append('bifurcated-%s' % algorithm)
        log.debug('algorithms_to_test: %s' % algorithms_to_test)

        # Train and analyse algorithms.
        for algorithm in algorithms_to_test:
            log.info('Training and analyzing %s...' % algorithm)
            # If report_dir does not exist, make it.
            report_dir = '/'.join([data_dir, algorithm])
            if not os.path.exists(report_dir):
                os.makedirs(report_dir)

            # Define hyperparams.
            hyperparams = {}
            hyperparams['algorithm'] = algorithm
            hyperparams[
                'hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
            hyperparams['max_iter'] = 1024
            hyperparams['random_state'] = self._random_state

            # If bifurcated algorithm, define bifurcator.
            if 'bifurcated' in algorithm:
                # bifurcator = LAB.pre == 0
                hyperparams['bifurcator'] = '%s.pre' % self._var
                hyperparams[
                    'bifurcation_strategy'] = BifurcatedSupervisedClassifier.EQUAL
                hyperparams['bifurcation_value'] = 0
                hyperparams['bifurcated'] = True

            # Train classifier.
            predictor_path = self._build_model_dump_path(algorithm)
            if os.path.exists(
                    predictor_path) and 'bifurcated' not in algorithm:
                log.debug('Loading model from disk...')
                # TODO(sbala): Fix joblib.load so that it works for bifurcated
                # supervised classifiers.
                self._predictor = joblib.load(predictor_path)
                self._features = self._X_train.columns
                status = SupervisedClassifier.TRAINED
            else:
                status = SupervisedLearningPipeline._train_predictor(
                    self, problem, [0, 1], hyperparams)

            # If failed to train, write an error report.
            y_train_counts = self._y_train[
                self._y_train.columns[0]].value_counts()
            y_test_counts = self._y_test[
                self._y_test.columns[0]].value_counts()
            if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
                # Skip all analysis and reporting.
                # This will be true for all algorithms, so just return.
                # Build error report.
                algorithm_report = DataFrame(
                    {
                        'lab_panel': [self._var],
                        'algorithm': [algorithm],
                        'error': [status],
                        'y_train.value_counts()': [y_train_counts.to_dict()],
                        'y_test.value_counts()': [y_test_counts.to_dict()]
                    },
                    columns=[
                        'lab_panel', 'algorithm', 'error',
                        'y_train.value_counts()', 'y_test.value_counts()'
                    ])
                header = [
                    'LabNormalityPredictionPipeline("%s", 10000)' % self._var
                ]
                # Write error report.
                fm_io.write_data_frame_to_file(algorithm_report, \
                    '/'.join([report_dir, '%s-normality-prediction-report.tab' % (self._var)]), \
                    header)
            # If successfully trained, append to a meta report.
            elif status == SupervisedClassifier.TRAINED:
                pipeline_prefix = '%s-normality-prediction-%s' % (self._var,
                                                                  algorithm)

                SupervisedLearningPipeline._analyze_predictor(
                    self, report_dir, pipeline_prefix)
                SupervisedLearningPipeline._analyze_predictor_traindata(
                    self, report_dir)

                # continue # Do not generate stats results here...

                if meta_report is None:
                    meta_report = fm_io.read_file_to_data_frame('/'.join(
                        [report_dir,
                         '%s-report.tab' % pipeline_prefix]))
                else:
                    algorithm_report = fm_io.read_file_to_data_frame('/'.join(
                        [report_dir,
                         '%s-report.tab' % pipeline_prefix]))
                    log.debug('algorithm_report: %s' % algorithm_report)
                    meta_report = meta_report.append(algorithm_report)
                # Write predictor to disk.
                predictor = SupervisedLearningPipeline.predictor(self)
                predictor_path = self._build_model_dump_path(algorithm)
                joblib.dump(predictor, predictor_path)

        # After building per-algorithm reports, write to meta report.
        # Note that if there were insufficient samples to build any of the
        # algorithms, then meta_report will still be None.
        if meta_report is not None:
            header = [
                'LabNormalityPredictionPipeline("%s", 10000)' % self._var
            ]
            fm_io.write_data_frame_to_file(meta_report, \
                '/'.join([data_dir, '%s-normality-prediction-report.tab' % self._var]), header)
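# Hedged sketch (illustrative, not from the source above) of the cache pattern
# the method applies to predictor_path: load a previously dumped model when it
# exists on disk, otherwise train one and dump it. LogisticRegression and the
# 'cached_model.pkl' path are placeholder assumptions.
import os
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression

def load_or_train(X, y, model_path='cached_model.pkl'):
    """Return a fitted classifier, training only when no cached dump exists."""
    if os.path.exists(model_path):
        return joblib.load(model_path)
    clf = LogisticRegression()
    clf.fit(X, y)
    joblib.dump(clf, model_path)
    return clf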
def main():

    train_images, val_images = [], []
    train_folders = train_data
    val_folders = test_data if TESTING else val_data

    if not TRAINING:

        if len(sys.argv) != 2:
            print(
                "ERROR! Usage: python3 train.py *load_directory (if validating or testing)*"
            )
            return

        load_dir = './log/train/' + sys.argv[1]

        svm = joblib.load(load_dir + '/svm.pkl')
        kmeans = joblib.load(load_dir + '/kmeans.pkl')
        SURF_params = get_surf_params(load_dir + '/params.txt')

    if TRAINING:
        print("Getting train data...")
        train_labels_init = get_data(train_images, train_folders,
                                     class_labels_raw)
    else:
        print("Getting val/test data...")
        val_labels_init = get_data(val_images, val_folders, class_labels_raw)

    print("Getting SURF (train)...")
    if TRAINING:
        train_kps, train_descriptors = get_surf_features(
            train_images,
            hessian_threshold=hessian_threshold,
            upright=upright,
            extended=extended)
    else:
        print("Getting SURF (val/test)...")

        # print("hessian_threshold?: {}".format(int(SURF_params["hessian_threshold"])))
        # print("upright?: {}".format(str2bool(SURF_params["upright"])))
        # print("extended?: {}".format(str2bool(SURF_params["extended"])))

        val_kps, val_descriptors = get_surf_features(
            val_images,
            hessian_threshold=int(SURF_params["hessian_threshold"]),
            upright=str2bool(SURF_params["upright"]),
            extended=str2bool(SURF_params["extended"]))

    print("Rearranging data....")
    if TRAINING:
        train_descriptors_np_init = [
            np.array(curr_des) for curr_des in train_descriptors
        ]
    else:
        val_descriptors_np_init = [
            np.array(curr_des) for curr_des in val_descriptors
        ]

    train_descriptors_np, val_descriptors_np = [], []
    train_labels, val_labels = [], []

    if TRAINING:
        print("Clearing out invalid train")
        for i in range(len(train_descriptors_np_init)):
            if len(train_descriptors_np_init[i].shape) == 2:
                train_descriptors_np.append(train_descriptors_np_init[i])
                train_labels.append(train_labels_init[i])
    else:
        print("Clearing out invalid val/test")
        for i in range(len(val_descriptors_np_init)):
            if len(val_descriptors_np_init[i].shape) == 2:
                val_descriptors_np.append(val_descriptors_np_init[i])
                val_labels.append(val_labels_init[i])

    if TRAINING:
        print("Stacking train")
        train_descriptor_matrix = np.vstack(train_descriptors_np)
        train_image_index = np.ones(
            (train_descriptor_matrix.shape[0], ), dtype=int) * -1
    else:
        print("Stacking val/test")
        val_descriptor_matrix = np.vstack(val_descriptors_np)
        val_image_index = np.ones(
            (val_descriptor_matrix.shape[0], ), dtype=int) * -1

    if TRAINING:
        print("Indexer train")
        curr_pos = 0
        diff = 0
        for i in range(len(train_images)):
            if train_descriptors[i] is None:
                diff += 1
                continue
            next_pos = curr_pos + len(train_descriptors[i])
            train_image_index[curr_pos:next_pos] = i - diff
            curr_pos = next_pos
    else:
        print("Indexer val/test")
        curr_pos = 0
        diff = 0
        for i in range(len(val_images)):
            if val_descriptors[i] is None:
                diff += 1
                continue
            next_pos = curr_pos + len(val_descriptors[i])
            val_image_index[curr_pos:next_pos] = i - diff
            curr_pos = next_pos

    if TRAINING:
        print("Running k-means")
        kmeans = MiniBatchKMeans(
            n_clusters=vocab_size,
            # precompute_distances is a KMeans-only parameter and is not
            # accepted by MiniBatchKMeans, so it is left out here
            verbose=kmeans_verbose,
            # n_jobs=n_jobs, # num CPUs, -1 -> all CPUs
            # algorithm=algorithm
        ).fit(train_descriptor_matrix)

        train_centroid_labels = kmeans.labels_
        centroids = kmeans.cluster_centers_
    else:
        print("Getting centroid labels (val/test)")
        val_centroid_labels = kmeans.predict(val_descriptor_matrix)

    print("Building image histograms")
    if TRAINING:
        train_features = np.zeros((len(train_labels), vocab_size), dtype=int)
        for i in range(len(train_centroid_labels)):
            img_ind = train_image_index[i]
            train_features[img_ind, train_centroid_labels[i]] += 1
    else:
        val_features = np.zeros(
            (len(val_labels), int(SURF_params['vocab_size'])), dtype=int)
        for i in range(len(val_centroid_labels)):
            img_ind = val_image_index[i]
            val_features[img_ind, val_centroid_labels[i]] += 1

    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M')

    if TRAINING:
        SVM_params = [(0.01, "linear"), (0.1, "linear"), (0.5, "linear"),
                      (1.0, "linear"), (0.01, "rbf"), (0.1, "rbf"),
                      (0.5, "rbf"), (1.0, "rbf")]

        for param_set in SVM_params:

            C, kernel = param_set

            print("Training SVM")
            svm = SVC(C=C,
                      kernel=kernel,
                      verbose=svm_verbose,
                      shrinking=shrinking)
            svm.fit(train_features, train_labels)

            pred_train = svm.predict(train_features)
            train_diffs = pred_train - train_labels
            train_error = len(
                np.nonzero(train_diffs)[0]) / (train_diffs.shape[0])

            print("Training error is: {}".format(train_error))

            print("Saving confusion matrix (train)...")
            confMat_train = confusion_matrix(train_labels, pred_train)
            print(confMat_train)
            plt.matshow(confMat_train)
            plt.show()

            print_params(C, kernel)

            print("Logging data to:")
            cwd = os.getcwd()
            directory = cwd + "/log/train/" + "reg_" + str(
                param_set[0]) + "_kernel_" + param_set[1] + timestamp
            print(directory)

            svm_filename = directory + '/svm.pkl'
            kmeans_filename = directory + '/kmeans.pkl'
            params_filename = directory + '/params.txt'
            train_confmat_file = directory + '/confmat_train.npy'

            os.makedirs(directory)

            joblib.dump(svm, svm_filename)
            joblib.dump(kmeans, kmeans_filename)
            np.save(train_confmat_file, confMat_train)
            save_params(params_filename, C, kernel)
    else:

        pred_val = svm.predict(val_features)
        val_diffs = pred_val - val_labels
        val_error = len(np.nonzero(val_diffs)[0]) / (val_diffs.shape[0])

        print("Val error is: {}".format(val_error))

        print("Saving confusion matrix (val)...")
        confMat_val = confusion_matrix(val_labels, pred_val)
        print(confMat_val)
        plt.matshow(confMat_val)
        plt.show()

        cwd = os.getcwd()
        curr = 'val' if VALIDATING else 'test'
        directory = cwd + '/log/' + curr + '/' + timestamp
        os.makedirs(directory)
        val_confmat_file = directory + '/confmat.npy'
        np.save(val_confmat_file, confMat_val)
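# Hedged sketch (not in the original) of how one image would be classified at
# inference time with the bag-of-visual-words pipeline above: quantize its SURF
# descriptors with the fitted k-means, build a vocab_size histogram, then score
# the histogram with the trained SVM. `kmeans`, `svm`, and `vocab_size` are
# assumed to be the objects defined in the code above.
import numpy as np

def classify_image(descriptors, kmeans, svm, vocab_size):
    """Predict a class label for one image from its (n_keypoints, n_dims) SURF descriptor matrix."""
    words = kmeans.predict(np.asarray(descriptors, dtype=np.float64))
    histogram = np.bincount(words, minlength=vocab_size).reshape(1, -1)
    return svm.predict(histogram)[0]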
Example #53
from keras import backend as K
K.set_image_data_format('channels_last')

(X_train, y_train), (X_test, y_test) = cifar10.load_data()

## Convert images to grayscale

import cv2

# CIFAR-10 images are 3-channel RGB, so use COLOR_RGB2GRAY; build lists so the
# arrays can be stacked below (map() returns an iterator in Python 3)
X_train = [cv2.cvtColor(x, cv2.COLOR_RGB2GRAY) for x in X_train]
X_test = [cv2.cvtColor(x, cv2.COLOR_RGB2GRAY) for x in X_test]

## Fit

from sklearn import svm
import numpy

clf = svm.SVC(gamma=0.001, C=100.)

X_train = numpy.array(X_train)
X_train = numpy.array(X_train).reshape((X_train.shape[0], -1), order='F')

y_train = numpy.array(y_train).flatten()

clf.fit(X_train, y_train)

## Save classifier to disk

from sklearn.externals import joblib
joblib.dump(clf, 'classifiers/svm-classifier.pkl')
plt.figure()
plt.plot(range(len(y_pred[120:250])),
         y_pred[120:250],
         'b',
         label="price_predict")
plt.plot(range(len(y_pred[120:250])), y_test[120:250], 'r', label="price_test")
plt.legend(loc="upper right")
plt.show()

# Random Forest
try:
    clf = joblib.load('random.h5')  # load the saved model
except:
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    joblib.dump(clf, "random.h5")  # 保存
y_pred2 = clf.predict(X_test)
plt.figure()
plt.plot(range(len(y_pred2[120:250])),
         y_pred2[120:250],
         'b',
         label="price_predict")
plt.plot(range(len(y_pred2[120:250])),
         y_test[120:250],
         'r',
         label="price_test")
plt.legend(loc="upper right")
plt.show()
# Naive Bayes
try:
    mnb = joblib.load('mnb.pkl')
Example #55
def MLR_model(Training_features, Training_targets, Testing_features,
              Testing_targets, stock_limit, Price_table):
    max_profit_to_loss = -10000000000000000
    min_error = 1
    for L1 in [40, 80]:
        for L2 in [20]:
            for L3 in [10]:
                for alpha in [0.001]:
                    MLPR = MLPRegressor(activation='relu',
                                        hidden_layer_sizes=(L1, L2, L3),
                                        solver='lbfgs',
                                        alpha=alpha,
                                        batch_size=50,
                                        learning_rate='adaptive',
                                        max_iter=2000000,
                                        early_stopping=True)
                    MLPR.fit(Training_features, Training_targets)
                    Train_predict = MLPR.predict(Training_features)
                    predict_price = []
                    for q in range(len(Testing_features)):
                        predict_price.append(
                            MLPR.predict(Testing_features[q].reshape(1, -1)))

                    error = np.mean(
                        abs(
                            np.array(Train_predict) -
                            np.array(Training_targets)) /
                        np.array(Training_targets))
                    # Calculate profit
                    stock = 0
                    loss = 0
                    brought = 0
                    profit = 0
                    # For plotting
                    t = list(range(len(Testing_targets)))
                    BP = []
                    B = []
                    SP = []
                    S = []
                    for i in range(6, len(predict_price)):
                        if (i % 1 == 0):
                            current_price = Price_table[i]
                            if (
                                    stock <= predict_price[i] and stock > 0
                            ):  # Sell the stock if you have it as soon as hitting a down trend
                                if ((current_price - stock) >= 0):
                                    profit += (current_price - stock)
                                else:
                                    loss += abs(current_price - stock)
                                stock = 0
                                brought = 0
                                SP.append(t[i])
                                S.append(current_price)
                            elif (
                                    current_price <= predict_price[i]
                                    and stock == 0
                            ):  # Buy stock at the bottom of the down trend
                                stock = current_price
                                brought = 1
                                BP.append(t[i])
                                B.append(current_price)

                    if (loss == 0):
                        loss = 0.0001
                    if (profit / loss >= max_profit_to_loss):
                        model = MLPR
                        max_profit = profit
                        max_profit_to_loss = profit / loss
                        prices = predict_price
                        Buy_points = BP
                        Buys = B
                        Sell_points = SP
                        Sells = S
    # Save optimal model
    joblib.dump(model, 'MLPR.pkl')
    plot_data = [[t, Price_table, 'm'], [np.array(t), prices, 'b'],
                 [Buy_points, Buys, 'ro'], [Sell_points, Sells, 'go']]
    metric = max_profit_to_loss
    if (metric >= 10000):
        metric /= 15000

    return [max_profit, metric, plot_data]
Example #56
def get_weather(df, how='learning', freq=None):
    """
    Match a timeseries with weather data.
    df : [Dataframe]
    If how == 'learning':
        match with historical weather data.
    If how == 'forecast':
        match with forecast data. freq must be set when using this option.
    freq : Timedelta, e.g. "1H"
    """

    df = df.reset_index()

    # Check params
    if how not in ['learning', 'forecast']:
        logger.error(
            'Bad option for get_weather. You must choose between learning or forecast'
        )
        return df

    if how == 'forecast' and freq is None:
        logger.error("For forecast option, we must specify freq. Ex freq='1H'")

    # Process for learning matching
    if how == 'learning':
        lyon_meteo = pd.read_csv('data/lyon_weather.csv', parse_dates=['date'])
        lyon_meteo.rename(columns={'date': 'ts'}, inplace=True)

        # have to labelencode weather_desc
        LE = LabelEncoder()
        lyon_meteo['weather_desc'] = LE.fit_transform(
            lyon_meteo['weather_desc'])

        # Dump LabelEncoder
        joblib.dump(LE, 'model/Label_Encoder_Weather.pkl')

        # Resample data to 10-minute bins
        clean_lyon_meteo = lyon_meteo.resample(
            "10T", on="ts").mean().bfill().reset_index()
        df = df.merge(clean_lyon_meteo[[
            'ts', 'temp', 'humidity', 'weather_desc', 'cloudiness'
        ]],
                      on='ts',
                      how='left')

        df = df.sort_index()
        df = df.set_index('ts')
        return df

    # Process for forecast matching
    if how == 'forecast':
        lyon_forecast = pd.read_csv('data/lyon_forecast.csv',
                                    parse_dates=['forecast_at', 'ts'])
        lyon_forecast[
            'delta'] = lyon_forecast['ts'] - lyon_forecast['forecast_at']

        # Filter on delta with freq
        lyon_forecast = lyon_forecast[lyon_forecast['delta'] == freq]
        lyon_forecast.drop_duplicates(subset=['ts', 'delta'],
                                      keep='first',
                                      inplace=True)

        # Label encode weather_desc
        LE = joblib.load('model/Label_Encoder_Weather.pkl')
        lyon_forecast['weather_desc'] = LE.transform(
            lyon_forecast['weather_desc'])

        #Merging
        # We take the last forecast (on freq) using backward merging
        df = df.sort_values('ts')
        df_index_save = df.index  # Save the index; the merge will destroy it
        df = pd.merge_asof(left=df,
                           right=lyon_forecast[[
                               'ts', 'temp', 'humidity', 'weather_desc',
                               'cloudiness'
                           ]],
                           on='ts',
                           direction='backward')
        df.index = df_index_save

        # Re-sort to the original order (so the y_test order is not lost)
        df = df.sort_index()
        df = df.set_index('ts')

        return df
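# Hedged usage sketch (values are illustrative, not from the source above):
# enrich a 'ts'-indexed dataframe with historical weather for training, and
# with the forecast issued one hour ahead for prediction. `timeseries_df` is
# an assumed dataframe with a 'ts' datetime index.
import pandas as pd

train_df = get_weather(timeseries_df, how='learning')
forecast_df = get_weather(timeseries_df, how='forecast', freq=pd.Timedelta('1H'))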
Example #57
# -*- coding:utf-8 -*-

from sklearn.cluster import KMeans
from retrieval import load_feat_db
from sklearn.externals import joblib
from myconfig import DATASET_BASE, N_CLUSTERS
import os

if __name__ == '__main__':
    feats, color_feats, labels = load_feat_db()
    model = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_jobs=-1).fit(feats)
    model_path = os.path.join(DATASET_BASE, r'models', r'kmeans.m')
    joblib.dump(model, model_path)
    def save(self):
        self.active = True
        joblib.dump(self.learn, self.name_file)
                long_status=0
                    
            start_bar=start_bar+1

       
print('operations in profit: ' + str(round(operations_in_profit)) + " gain $ " + str(round(operations_in_profit * tp_factor)))

print('operations in loss: ' + str(round(operations_in_loss)) + " loss $ " + str(round(operations_in_loss)))

df_equity=pd.DataFrame(columns=["EquityLine"])
df_equity['EquityLine']=equity_curve
EquityLine=df_equity['EquityLine']
CloseLine=EquityLine.plot()

from sklearn.externals import joblib
joblib.dump(modelSVC, 'modelEURUSD0-35kbars.pkl')
    
# DISTRIBUTION OF BODY SIZES IN THE LOW (MINIMUM) CANDLES
bodyGraph=df_Lfail['Shadow_n'].plot.hist(title='body size',bins=50,alpha=0.5,color='blue',normed=True)   
#bodyGraph2=df_serie['Body'].plot.hist(title='body size',bins=300,alpha=0.2,color='red', normed=True) 

# PREPARE THE VALIDATION DF LIKE THE BACKTEST DF, I.E. WITH DATE INDEX, OHLC, NEXT/PAST COLUMNS AND BODY
  
# DROP THE VOLUME COLUMN, WHICH IS NOT NEEDED
df_validazione=df_validazione.drop(['Volume'],axis=1)
df_validazione=df_validazione.reset_index(drop=True)

## CREATE THE OHLC NEXT AND PAST COLUMNS SO THAT EACH ROW HOLDS THE COMPLETE THREE-BAR PATTERN
# collect the data of the following bar, so that when lows and highs are extracted there is enough data to look at correlations and 3-bar patterns
df_validazione['Open_n']=df_validazione["Open"].shift(-1)
df_validazione['High_n']=df_validazione["High"].shift(-1)
def train_model(feats_csv):

	df = pd.DataFrame()
	df = pd.read_csv(feats_csv).iloc[:,1:]

	y = np.ravel(df.iloc[:,-1:])
	X = np.array(df.iloc[:,:-1])

	############ 15 Best selected features using ANOVA F-value score function ###############
	X_new = SelectKBest(f_classif, k=15).fit_transform(X, y)
	selected_features = SelectKBest(f_classif, k=15).fit(X, y).get_support(indices = True)

	############ KNN manhattan ###############
	##### preprocessing: data scaling######## 
	min_max_scaler = MinMaxScaler()
	X_new = min_max_scaler.fit_transform(X_new)

	model = KNeighborsClassifier(n_neighbors = 1,algorithm = 'brute',metric = 'manhattan',weights = 'uniform')
	model.fit(X_new,y)

	newdir = '../kNN_clfr'
	os.mkdir(newdir)

	joblib.dump(model, os.path.join(newdir,'kNN.pkl')) 

	return
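# Hedged addendum (not part of the original snippet): the dump above saves only
# the kNN model, but scoring new samples also needs the fitted SelectKBest and
# MinMaxScaler. A minimal sketch of persisting and reusing all three together;
# the 'kNN_pipeline.pkl' file name is an assumption.
from sklearn.externals import joblib

def dump_knn_pipeline(selector, scaler, model, path='../kNN_clfr/kNN_pipeline.pkl'):
	"""Persist the fitted feature selector, scaler, and classifier as one artifact."""
	joblib.dump({'selector': selector, 'scaler': scaler, 'model': model}, path)

def predict_with_knn_pipeline(feats, path='../kNN_clfr/kNN_pipeline.pkl'):
	"""Apply the saved preprocessing before predicting on a raw feature matrix."""
	bundle = joblib.load(path)
	X_sel = bundle['selector'].transform(feats)
	X_scaled = bundle['scaler'].transform(X_sel)
	return bundle['model'].predict(X_scaled)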