def get_best_model_params(model, data, labels, model_type='b'):
    """Fit ``model`` on the training split and return the fitted estimator.

    The training feature dicts are vectorized with a fresh ``dv()`` and
    densified before fitting.

    * ``model_type == 'gmm'``: ``model`` itself is fitted on the features
      only (no labels are passed to ``fit``).
    * any other ``model_type``: ``model`` is wrapped in
      ``gs(model, get_hyper_parameters(model_type))``, fitted on features and
      labels, and its ``best_estimator_`` is printed.

    Args:
        model: estimator exposing ``fit``.
        data: forwarded to ``get_data_sets``.
        labels: forwarded to ``get_data_sets``.
        model_type: key forwarded to ``get_hyper_parameters``; the value
            ``'gmm'`` selects the label-free path.

    Returns:
        The fitted ``model`` for ``'gmm'``, otherwise the fitted ``gs``
        search object.
    """
    # Only the training half is used; scoring on the test half was disabled
    # in the original, so that half is intentionally ignored here.
    x_train, _x_test, y_train, _y_test = get_data_sets(data, labels)
    train_features = dv().fit_transform(x_train).todense()

    if model_type == 'gmm':
        # No search is needed on this path, so none is constructed.
        clf = model
        clf.fit(train_features)
    else:
        hyper_parameters = get_hyper_parameters(model_type)
        clf = gs(model, hyper_parameters)
        clf.fit(train_features, np.array(y_train))

    # Single-argument print(...) calls emit the same text under Python 2
    # (print statement) and Python 3 (print function).
    print("Best parameters set found on development set:")
    print("")
    if model_type != 'gmm':
        print(clf.best_estimator_)
    return clf
def train():
    """Fit an SVM on two hand-written example rows and pickle it.

    The real pipeline (building features from dependency parses) is not
    implemented yet; this function trains on two hard-coded feature dicts so
    that the vectorize -> fit -> persist flow can be exercised end to end.

    Side effects:
        Writes the fitted classifier to ``svm.pkl`` (relative path, binary
        mode), overwriting any existing file.

    Returns:
        The fitted ``svm.SVC`` classifier.
    """
    # TODO: for each sentence in the dependency parses, construct the tree.
    # While iterating through a sentence, several word pairs are considered;
    # for each pair, collect the feature dict as one row and the correct
    # action as its answer, e.g. via make_tree('', sentence, True), and train
    # on those instead of the examples below.

    # Example rows so that the SVM can be fitted.
    # Order of features -2,-1,0-,0+,1,2:
    # pos, lex, ch-L-pos, ch-L-lex, ch-R-pos, ch-R-lex
    example_features1 = {
        'pos-2': ':', 'pos-1': 'NNS', 'pos-0': 'IN',
        # The last key was a second 'pos1', which silently overwrote 'WP';
        # the parallel lex keys (lex1/lex2) show it is position 2.
        'pos0': 'NN', 'pos1': 'WP', 'pos2': ':',
        'lex-2': '-', 'lex-1': 'sellers', 'lex-0': 'of',
        'lex0': 'resort', 'lex1': 'who', 'lex2': '-',
        'chrrpos-1': 'DT', 'chrrpos0': 'JJ',
        'chrlex-1': 'the', 'chrlex0': 'last',
        'chlpos1': 'VBD', 'chllex1': 'were',
    }
    example_features2 = {
        'pos-2': 'NNS', 'pos-1': 'IN', 'pos-0': 'NN',
        'pos0': 'WP', 'pos1': ':',
        'lex-2': 'sellers', 'lex-1': 'of', 'lex-0': 'resort',
        'lex0': 'who', 'lex1': '-',
        'chrrpos-2': 'DT', 'chrrpos-0': 'JJ',
        'chrlex-2': 'the', 'chrlex-0': 'last',
        'chlpos0': 'VBD', 'chllex0': 'were',
    }

    # This feature matrix has two entries.
    example_features = [example_features1, example_features2]

    # For every considered word pair, the correct action (shift, right, left)
    # goes into the answer vector.
    example_answers = np.array(['right', 'left'])

    # Vectorize the dicts to turn string features into numerical columns.
    vec = dv()
    example_array = vec.fit_transform(example_features).toarray()

    # TODO: a later goal: once we have the matrix, sort and split label
    # (left, right, split) data so that we can run three different models.

    # Fit the SVM and persist it; `with` closes the file even if dump raises.
    clf = svm.SVC()
    clf.fit(example_array, example_answers)
    with open('svm.pkl', 'wb') as pkl:
        pickle.dump(clf, pkl)
    return clf
def perform(target_label):
    """Return the mean 10-fold accuracy of MultinomialNB for ``target_label``.

    Data sources (all columns read as ``object``):

    * ``data_set/content_preds.csv`` with ``target_label`` dropped, merged
      (inner, on ``profile_id``) with the ``target_label`` column of
      ``data_set/userinfo.csv``. Test rows of every fold come from here.
    * ``data_set/userinfo.csv`` restricted to the merged frame's columns.
      Training rows of every fold come from here.

    For each stratified fold, the training and test rows are vectorized
    together with one ``dv(sparse=False)`` so both share the same columns. A
    ``MultinomialNB`` is then fitted on the training rows and scored on the
    test rows.

    Args:
        target_label: name of the column to predict.

    Returns:
        Mean of the per-fold accuracy scores.
    """
    data_pred = pd.read_csv('data_set/content_preds.csv', dtype=object).drop(target_label, axis=1)
    user_label = pd.read_csv('data_set/userinfo.csv', dtype=object)[['profile_id', target_label]]
    data_merged = pd.merge(data_pred, user_label, on='profile_id', how='inner')
    user_info = pd.read_csv('data_set/userinfo.csv', dtype=object, usecols=list(data_merged.columns))

    kf = StratifiedKFold(data_merged[target_label], n_folds=10, shuffle=True)
    list_of_each_fold_score = []
    for tr_index, te_index in kf:
        # Build the id sets once per fold instead of once per row.
        tr_ids = set(data_merged['profile_id'][tr_index])
        te_ids = set(data_merged['profile_id'][te_index])

        tr_criterion = [x in tr_ids for x in user_info['profile_id']]
        data_tr = user_info[tr_criterion].copy()
        data_te = data_merged.loc[te_index].copy()
        data_concatenated = pd.concat([data_tr, data_te], ignore_index=True)
        # profile_id becomes numeric so the vectorizer passes it through as a
        # single column instead of one-hot encoding it.
        data_concatenated[['profile_id']] = data_concatenated[['profile_id']].astype(float)

        vectorizer = dv(sparse=False)
        # Materialize as a list: dict views are not sequences on Python 3.
        data_x_dict = list(data_concatenated.drop([target_label], axis=1).T.to_dict().values())
        data_x_vec = vectorizer.fit_transform(data_x_dict)
        data_x_frame = pd.DataFrame(data_x_vec)

        # Heuristic kept from the original: the last column with more than
        # two distinct values is taken to be profile_id (one-hot columns only
        # hold 0/1).
        for i in list(data_x_frame.columns):
            if len(data_x_frame[i].unique()) > 2:
                profile_id_index = i
        features_by_profile = data_x_frame.set_index(profile_id_index)

        # NOTE(review): the X frames are sorted by profile_id while the y
        # series keep their CSV row order; they only line up if the input
        # files are already sorted by a unique profile_id -- confirm.
        data_tr_x = features_by_profile.loc[[int(p) for p in tr_ids]].sort_index().copy()
        data_tr_y = data_tr[target_label].copy()
        data_te_x = features_by_profile.loc[[int(p) for p in te_ids]].sort_index().copy()
        data_te_y = data_te[target_label].copy()

        clf = MultinomialNB()
        clf.fit(data_tr_x, data_tr_y)
        y_pred = clf.predict(data_te_x)
        score = metrics.accuracy_score(data_te_y, y_pred)
        list_of_each_fold_score.append(score)
    return np.asarray(list_of_each_fold_score).mean()
# print type(x_num_scaled_matrix), type(cat_data) # print x_num_scaled_matrix # print cat_data return x_num_scaled_matrix, cat_data num_train_matrix, cat_train_matrix = encoding(train, numeric_cols, train_drop_cols) x_cat_train_data = cat_train_matrix.T.to_dict().values() # num_matrix1 = data.drop(category_cols, axis=1) # x_num_data = num_matrix1.T.to_dict().values() # print x_cat_data # print num_matrix vectorized = dv(sparse = False) ##NOTE: directly call transform function on training data will cause error since features are not loaded yet # we should call fit_transform on training data and then transform on test data # to make sure the test data's features coincide to training data's vec_x_cat_train = vectorized.fit_transform(x_cat_train_data) # print vec_x_cat_train, vec_x_cat_train.shape x_train = np.hstack((num_train_matrix, vec_x_cat_train)) # print x_train, x_train.shape # print x_test, x_test.shape # print vectorized sgd = SGDRegressor() sgd.fit(x_train, y) # print sgd.coef_
# In[ ]:


# In[ ]:

# Small dict example: list the keys of a two-entry mapping.
dd = {'ele1': 2, 'ele2': 3}
dd.keys()


# In[ ]:


# In[ ]:

# Preprocessing with sklearn.feature_extraction.DictVectorizer, which turns
# feature-value mappings into vectors. Each iris sample becomes a one-key
# dict holding its species name.
iris = datasets.load_iris()
y = iris.target
iris_dv = dv(sparse=False)
my_dict = [{'species': iris.target_names[i]} for i in y]
my_dict_trans = iris_dv.fit_transform(my_dict)


# In[ ]:

# Linear regression: fit on the Boston data and predict on the same samples.
boston = datasets.load_boston()
boston_X, boston_y = boston.data, boston.target
lr = LinearRegression()
lr.fit(boston_X, boston_y)
predictions = lr.predict(boston_X)


# In[ ]:
def get_data(data, labels):
    """Return the test split as a dense feature matrix plus a label array.

    ``get_data_sets(data, labels)`` must yield four values; only the second
    (test features) and fourth (test labels) are used. The features are
    vectorized with a freshly fitted ``dv()``.

    Returns:
        ``(features, targets)``: the densified vectorizer output and
        ``np.array`` of the test labels.
    """
    _, held_out_x, _, held_out_y = get_data_sets(data, labels)
    vectorized = dv().fit_transform(held_out_x)
    targets = np.array(held_out_y)
    return vectorized.todense(), targets
# Quora answer classifier: train on the labelled records, then predict a
# label for every test record.
#
# For HackerRank: comment out the three file-reading statements below and
# replace each 'y.pop(0)' with 'input()'.
y = []
with open('quoraAnswerClassifier.txt') as f:
    y = f.readlines()

# Header line: number of training records and a second integer (unused here).
n, m = [int(i) for i in y.pop(0).strip().split()]

train_label = []
_train = []
for i in range(n):
    a = y.pop(0).strip().split()
    a.pop(0)  # leading token is not a feature (presumably the record name)
    train_label.append(a.pop(0))
    # Remaining tokens are "index:value" pairs.
    b = [x.split(':') for x in a]
    _train.append({int(e[0]): float(e[1]) for e in b})

# Keep the fitted vectorizer: the test rows must be mapped onto the exact
# columns the model was trained on.
vectorizer = dv()
train = vectorizer.fit_transform(_train).toarray()
model = rf()
model.fit(train, train_label)

test_name = []
_test = []
for i in range(int(y.pop(0).strip())):
    a = y.pop(0).strip().split()
    test_name.append(a.pop(0))
    b = [x.split(':') for x in a]
    _test.append({int(e[0]): float(e[1]) for e in b})

# transform (not fit_transform): refitting on the test rows could produce a
# different column set or order than the training matrix.
test = vectorizer.transform(_test).toarray()
test_lable = model.predict(test)
def train():
    """Fit an SVM on two hand-written example rows and pickle it.

    The real pipeline (building features from dependency parses) is not
    implemented yet; this function trains on two hard-coded feature dicts so
    that the vectorize -> fit -> persist flow can be exercised end to end.

    Side effects:
        Writes the fitted classifier to ``svm.pkl`` (relative path, binary
        mode), overwriting any existing file.

    Returns:
        The fitted ``svm.SVC`` classifier.
    """
    # TODO: for each sentence in the dependency parses, construct the tree.
    # While iterating through a sentence, several word pairs are considered;
    # for each pair, collect the feature dict as one row and the correct
    # action as its answer, e.g. via make_tree('', sentence, True), and train
    # on those instead of the examples below.

    # Example rows so that the SVM can be fitted.
    # Order of features -2,-1,0-,0+,1,2:
    # pos, lex, ch-L-pos, ch-L-lex, ch-R-pos, ch-R-lex
    example_features1 = {
        'pos-2': ':',
        'pos-1': 'NNS',
        'pos-0': 'IN',
        'pos0': 'NN',
        'pos1': 'WP',
        # Was a second 'pos1' key, which silently overwrote 'WP'; the
        # parallel lex keys (lex1/lex2) show this is position 2.
        'pos2': ':',
        'lex-2': '-',
        'lex-1': 'sellers',
        'lex-0': 'of',
        'lex0': 'resort',
        'lex1': 'who',
        'lex2': '-',
        'chrrpos-1': 'DT',
        'chrrpos0': 'JJ',
        'chrlex-1': 'the',
        'chrlex0': 'last',
        'chlpos1': 'VBD',
        'chllex1': 'were',
    }
    example_features2 = {
        'pos-2': 'NNS',
        'pos-1': 'IN',
        'pos-0': 'NN',
        'pos0': 'WP',
        'pos1': ':',
        'lex-2': 'sellers',
        'lex-1': 'of',
        'lex-0': 'resort',
        'lex0': 'who',
        'lex1': '-',
        'chrrpos-2': 'DT',
        'chrrpos-0': 'JJ',
        'chrlex-2': 'the',
        'chrlex-0': 'last',
        'chlpos0': 'VBD',
        'chllex0': 'were',
    }

    # This feature matrix has two entries.
    example_features = [example_features1, example_features2]

    # For every considered word pair, the correct action (shift, right, left)
    # goes into the answer vector.
    example_answers = np.array(['right', 'left'])

    # Vectorize the dicts to turn string features into numerical columns.
    vec = dv()
    example_array = vec.fit_transform(example_features).toarray()

    # TODO: a later goal: once we have the matrix, sort and split label
    # (left, right, split) data so that we can run three different models.

    # Fit the SVM and persist it; `with` closes the file even if dump raises.
    clf = svm.SVC()
    clf.fit(example_array, example_answers)
    with open('svm.pkl', 'wb') as pkl:
        pickle.dump(clf, pkl)
    return clf