# One dict entry per Doc2Vec column of the test matrix: 'cD2V0', 'cD2V1', ...
for i in xrange(testdata.shape[1]):
    c_dict_test['cD2V' + str(i)] = testdata[:, i]

# Materialise the per-split Doc2Vec feature dicts as DataFrames.
out_c_pd_train = pd.DataFrame(c_dict_train)
out_c_pd_test = pd.DataFrame(c_dict_test)

# load data with pandas
train_data = '../train_after_preproc.csv'
test_data = '../test_after_preproc.csv'
train = pd.read_csv(train_data)
train = train.fillna("")
test = pd.read_csv(test_data)
test = test.fillna("")

# Hand-crafted word-count features for both splits.
cftrain = fe.count_word_feature(train)
cftest = fe.count_word_feature(test)
query_label_stat = pd.read_csv('../query_label_stat.csv')

# A+
# Chain: count features -> A features (presumably model-A outputs; built
# earlier in the file — confirm) -> per-query label statistics.
features_train = pd.merge(cftrain, out_a_pd_train)
features_test = pd.merge(cftest, out_a_pd_test)
features_train = pd.merge(features_train, query_label_stat)
features_test = pd.merge(features_test, query_label_stat)

# The doc-to-doc similarity column is excluded from this feature set.
features_train = features_train.drop('d2dsim', axis=1)
features_test = features_test.drop('d2dsim', axis=1)

y_train = features_train.median_relevance.values
# Doc2Vec ("C") features: one column per Doc2Vec dimension, keyed by row id.
c_dict_test = {'id': idx_test}
for i in xrange(traindata.shape[1]):
    c_dict_train['cD2V' + str(i)] = traindata[:, i]
for i in xrange(testdata.shape[1]):
    c_dict_test['cD2V' + str(i)] = testdata[:, i]
out_c_pd_train = pd.DataFrame(c_dict_train)
out_c_pd_test = pd.DataFrame(c_dict_test)

#########################Ensemble A and B########################################################################################
train = pd.read_csv(trainfap).fillna("")
test = pd.read_csv(testfap).fillna("")
cftrain = fe.count_word_feature(train)
cftest = fe.count_word_feature(test)
query_label_stat = pd.read_csv('../query_label_stat.csv')

# Build the ensemble tables by CHAINING merges: count features, then the
# A features, then the B features, then per-query label statistics.
# BUG FIX: the original reassigned features_train/features_test from
# cftrain/cftest at each step (discarding the A and B merges entirely,
# despite the "Ensemble A and B" banner) and merged out_b_pd_train — the
# TRAIN-side B frame — into the test table. Both are corrected here to
# match the chained-merge pattern used elsewhere in this file.
features_train = pd.merge(cftrain, out_a_pd_train)
features_test = pd.merge(cftest, out_a_pd_test)
features_train = pd.merge(features_train, out_b_pd_train)
features_test = pd.merge(features_test, out_b_pd_test)
features_train = pd.merge(features_train, query_label_stat)
features_test = pd.merge(features_test, query_label_stat)
# NOTE(review): this chunk begins mid-expression — the opening of the
# pipeline construction (e.g. "clf = Pipeline([") lies before this view.
# Visible stages: truncated SVD (200 comps) -> standardisation -> RBF SVC.
# gamma=0.0 meant "auto" (1/n_features) in the sklearn versions of this era
# — TODO confirm against the project's pinned sklearn.
('svd', TruncatedSVD(n_components=200, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)),
('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
('svm', SVC(C=11, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None))])

# Fit on the source split and predict labels for the target split;
# collect the predictions ('bp') keyed by example id into B_pd.
clf.fit(s_data, s_labels)
t_labels = clf.predict(t_data)
B_pd = pd.DataFrame({'id':idx,'bp':t_labels})

# myl
# Reload the preprocessed splits (missing values -> empty strings) and
# rebuild the word-count features plus per-query label statistics.
train = pd.read_csv('../train_after_preproc.csv').fillna("")
test = pd.read_csv('../test_after_preproc.csv').fillna("")
# cftrain_rv0 = fe.count_word_feature(train,1)
# cftrain_rv1 = fe.count_word_feature(train,2)
cftrain = fe.count_word_feature(train)
cftest = fe.count_word_feature(test)
query_label_stat = pd.read_csv('../query_label_stat.csv')
# features_train_rv0 = pd.merge(cftrain_rv0,query_label_stat)
# features_train_rv1 = pd.merge(cftrain_rv1,query_label_stat)
features_train = pd.merge(cftrain,query_label_stat)
features_test = pd.merge(cftest,query_label_stat)
# y_rv0 = features_train_rv0.median_relevance.values
# X_train_rv0 = features_train_rv0.drop(['query','id','median_relevance'],axis=1).values
# y_rv1 = features_train_rv1.median_relevance.values
# X_train_rv1 = features_train_rv1.drop(['query','id','median_relevance'],axis=1).values
# Training labels for this feature set (median relevance per example).
y = features_train.median_relevance.values