binary=True, lowercase=True, stop_words=None, ngram_range=(1, ngram)) X1 = count_vect.fit_transform(dataset1["data"]) y1 = dataset1["target"] # print("finish", "transform") # feature-level X2 = dataset2["data"] y2 = dataset2["target"] y = y1 if feature < X1.shape[1]: X1 = SelectKBest(chi2, k=feature).fit_transform(X1, y) X1 = X1.todense() # print("finish", "Kbest") X = np.concatenate((X1, np.matrix(X2)), axis=1) # print("finish", "append") #for c in Cs: key = " ".join( ["feature", str(feature), "c", str(10), "ngram", str(ngram)]) try: clf = tree.DecisionTreeClassifier() # clf = LogisticRegression(multi_class='ovr', C=10) # clf = svm.SVC(C=c, kernel='linear') scores = cross_val_score(clf, X, y, cv=10, n_jobs=1, verbose=0)
# Fit a ridge regression on the origin category's text + stat features,
# then report transfer R^2 against every target category.
#
# Bug fixed: the original re-ran fit_transform (vectorizer AND SelectKBest)
# on each target category, rebuilding the vocabulary and re-selecting the
# top-k features from the *target* data.  The resulting columns no longer
# lined up with the columns the model was trained on, so reg.score()
# compared mismatched features.  The transformers fitted on the origin
# category are now reused (transform only) for every target, and the
# loop-invariant origin-side fitting is hoisted out of the loop.
count_vect = CountVectorizer(min_df=0, max_df=9999, binary=True,
                             lowercase=True, stop_words=None,
                             ngram_range=(1, 5))
y = data_y[origin_category]
x_text = count_vect.fit_transform(data_text_x[origin_category])
# Cap k at the vocabulary size so small corpora don't raise; keep the
# fitted selector so targets get the *same* feature columns as training.
selector = SelectKBest(chi2, k=min(5000, x_text.shape[1]))
x_text = selector.fit_transform(x_text, y)
x_stat = data_stat_x[origin_category]
x = np.concatenate((x_text.todense(), np.matrix(x_stat)), axis=1)

# Regression model, fitted once on the origin category.
reg = linear_model.Ridge(alpha=1.0)
reg.fit(x, y)

for target_category in data_text_x:
    y2 = data_y[target_category]
    # transform (NOT fit_transform): reuse the origin vocabulary and
    # the origin feature selection so columns align with the model.
    x_text2 = selector.transform(
        count_vect.transform(data_text_x[target_category]))
    x_stat2 = data_stat_x[target_category]
    x2 = np.concatenate((x_text2.todense(), np.matrix(x_stat2)), axis=1)
    r_square = reg.score(x2, y2)
    print(origin_category, target_category, str(r_square))