def dict_vectorize(dict_list):
    """Fit a DictVectorizer on a list of feature dicts and return the fitted vectorizer."""
    assert isinstance(dict_list, list)
    from sklearn.feature_extraction import DictVectorizer
    # DictVectorizer.fit returns self, so fit-and-return collapses to one expression.
    return DictVectorizer().fit(dict_list)
def export(self, query, n_topics, n_words, title="PCA Export", fname="PCAExport"):
    """Project topic vectors into 2-D with PCA and save an annotated scatter plot.

    For each topic, the first topic word found in `query` is used as the
    point's text label. The figure is written to `fname` via matplotlib.
    """
    vec = DictVectorizer()
    # Rows are per-topic feature dicts built from the top n_words of each topic.
    rows = topics_to_vectorspace(self.model, n_topics, n_words)
    X = vec.fit_transform(rows)
    pca = skPCA(n_components=2)
    X_pca = pca.fit(X.toarray()).transform(X.toarray())
    match = []
    for i in range(n_topics):
        # All words of topic i, ranked over the full dictionary.
        topic = [t[1] for t in self.model.show_topic(i, len(self.dictionary.keys()))]
        m = None
        for word in topic:
            if word in query:
                match.append(word)
                break
    # NOTE(review): if some topic has no word in `query`, `match` is shorter
    # than n_topics and match[i] below raises IndexError — confirm callers
    # always pass a query covering every topic.
    pyplot.figure()
    for i in range(X_pca.shape[0]):
        pyplot.scatter(X_pca[i, 0], X_pca[i, 1], alpha=.5)
        pyplot.text(X_pca[i, 0], X_pca[i, 1], s=' '.join([str(i), match[i]]))
    pyplot.title(title)
    pyplot.savefig(fname)
    pyplot.close()
def pair_vectors(pairs, features, words, output_path):
    """Build a hypernym-detection dataset of vector differences.

    Positive examples are (general - specific) vectors for each known pair;
    an equal number of random non-pairs form the negatives. The data is
    written in svmlight format plus a JSON record of the word pairs.
    (Python 2 code: uses itertools.izip.)
    """
    vectorizer = DictVectorizer()
    vectors = vectorizer.fit_transform(x[1] for x in features)
    # Map each word (lemma before the '/' POS suffix) to its feature row.
    vector_map = {word: vector for word, vector in
                  itertools.izip((x[0].split('/')[0] for x in features), vectors)}
    # Positive examples
    positive = []
    record = []
    for specific, general in pairs:
        positive.append(vector_map[general] - vector_map[specific])
        record.append( (specific, general, 1) )
    pair_set = set([tuple(x) for x in pairs])
    # Negative examples: random word pairs not in the known pair set.
    non_positive = []
    for i in range(len(positive)):
        first = second = None
        while first == second or (first, second) in pair_set:
            # NOTE(review): single-argument randint looks like numpy.random.randint
            # (stdlib random.randint needs two args) — confirm the import.
            first = words[random.randint(len(words))]
            second = words[random.randint(len(words))]
        non_positive.append(vector_map[second] - vector_map[first])
        record.append( (first, second, 0) )
    data = vstack(positive + non_positive)
    target = [1]*len(positive) + [0]*len(non_positive)
    # Save dataset
    with open(os.path.join(output_path,'wn-noun-dependencies.mat'), 'wb') as data_file:
        dump_svmlight_file(data, target, data_file)
    with open(os.path.join(output_path,'wn-noun-dependencies.json'), 'w') as record_file:
        json.dump(record, record_file)
def _train(self, train_data, resources):
    """Fit (or load cached) DictVectorizer and TfidfVectorizer for this sample size.

    Both vectorizers are cached on disk under root_dic keyed by len(train_data),
    and stored into `resources` for downstream steps. (Python 2 code.)
    """
    sample_length = len(train_data)
    dict_status_path = os.path.join(root_dic,
                                    'dict_vectorizer_{}.status'.
                                    format(sample_length))
    if os.path.isfile(dict_status_path):
        # Reuse a previously fitted vectorizer for the same sample size.
        dictVectorizer = joblib.load(dict_status_path)
    else:
        dictVectorizer = DictVectorizer()
        # NOTE(review): 'record' relies on old pandas abbreviation matching;
        # modern pandas only accepts orient='records' — confirm pandas version.
        dictVectorizer.fit(train_data[self.features].
                           fillna(0).
                           to_dict('record'))
        joblib.dump(dictVectorizer, dict_status_path)
    tfidf_status_path = os.path.join(root_dic,
                                     'tfidf_vectorizer_{}.status'.
                                     format(sample_length))
    if os.path.isfile(tfidf_status_path):
        tfidf = joblib.load(tfidf_status_path)
    else:
        tfidf = TfidfVectorizer(min_df=40, max_features=300)
        tfidf.fit(train_data.essay)
        joblib.dump(tfidf, tfidf_status_path)
    resources['dictVectorizer'] = dictVectorizer
    resources['tfidf'] = tfidf
    print 'Head Processing Completed'
    return train_data, resources
def generate_matrix():
    """Build the question feature matrix X, label vector y, and fitted vectorizer.

    Pass 1 collects per-question lists of feature dicts and labels from the
    corpus; pass 2 sums each question's transformed rows into one sparse row.
    Progress is printed every 100 items.
    """
    feature_dicts = []
    labels = []
    extractor = features.IpadicFeature()
    done = 0
    print('create feature dictionary')
    for question, answer in load_corpus():
        feature_dicts.append(list(extractor.transform(question)))
        normalized = normalize.normalize_askfm(answer, h2z=False)
        labels.append(isnot_shitsumon(normalized))
        done += 1
        if done % 100 == 0:
            print(done)
    vectorizer = DictVectorizer()
    vectorizer.fit(itertools.chain(*feature_dicts))
    done = 0
    print('create feature vector')
    rows = []
    for dicts in feature_dicts:
        # Sum the per-token rows into one count vector for this question.
        total = None
        for single in dicts:
            transformed = vectorizer.transform(single)
            total = transformed if total is None else total + transformed
        rows.append(total)
        done += 1
        if done % 100 == 0:
            print(done)
    return scipy.sparse.vstack(rows), numpy.array(labels), vectorizer
def get_vector(name, feature_names, full_vector):
    """Return a complete feature vector for `name`.

    Extracts last-letter features from the name, vectorizes them, and scatters
    the values into `full_vector` at the positions given by `feature_names`
    (zeroing all other positions).

    :param name: the name to featurize (non-empty string)
    :param feature_names: full ordered list of feature names
    :param full_vector: 1-D array to fill, len == len(feature_names)
    :return: the filled `full_vector`
    """
    name_features = {}
    name_features["last_letter"] = name[-1]
    name_features["last_two"] = name[-2:]
    # BUG FIX: was `0 if ... else 0`, which made the feature constantly zero;
    # a vowel ending should yield 1.
    name_features["last_is_vowel"] = 1 if name[-1] in "aeiouy" else 0
    vectorizer = DictVectorizer()
    small_vector = vectorizer.fit_transform(name_features).toarray()[0]
    small_feature_names = vectorizer.get_feature_names()
    hit_count = 0
    for index, feature_name in enumerate(feature_names):
        if feature_name in small_feature_names:
            full_vector[index] = small_vector[small_feature_names.index(feature_name)]
            hit_count += 1
        else:
            full_vector[index] = 0
    # Every locally extracted feature must exist in the global feature list.
    assert hit_count == len(small_feature_names) == small_vector.shape[0]
    assert full_vector.shape[0] == len(feature_names)
    return full_vector
def extractData(features, examples=None, scaler=None, featureOrder=None, scaling=False):
    """Vectorize feature dicts, impute missing values, and optionally scale.

    :param features: iterable of feature dicts
    :param examples: optional matrix used to fit the imputer instead of samples
    :param scaler: unused here, returned pass-through for API compatibility
    :param featureOrder: optional explicit column order (list of feature names)
    :param scaling: when True, scale samples (without centering)
    :return: [samples (dense), featureNames, fitted imputer, scaler]
    """
    vec = DictVectorizer()
    samples = vec.fit_transform(features)
    featureNames = vec.get_feature_names()
    # BUG FIX: `!= None` / `== None` perform elementwise comparison on
    # array/matrix operands and can raise or silently misbehave; identity
    # checks are the correct (and idiomatic) form.
    if featureOrder is not None:
        indices = [featureNames.index(feature) for feature in featureOrder]
        samples = samples[:, indices]
    imp = pp.Imputer(missing_values='NaN', strategy='mean')
    if examples is None:
        imp.fit(samples)
    else:
        imp.fit(examples)
    impSamples = imp.transform(samples)
    if (impSamples.shape == samples.shape):
        samples = impSamples
    else:
        # Imputation dropped all-NaN columns; fall back to zero-filling.
        print("too few samples to replace missing values, using 0's")
        samples[shouldReplace(samples)] = 0
    # if (scaler == None):
    #     scaler = pp.StandardScaler(with_mean=False)
    #     scaler.fit(samples)
    # samples = scaler.transform(samples)
    if (scaling):
        samples = pp.scale(samples, with_mean=False)
    if (sprs.isspmatrix(samples)):
        samples = samples.todense()
    return [samples, featureNames, imp, scaler]
def vectorize(train_features, test_features):
    """
    Convert per-instance feature-count dicts into dense vectors.

    :param train_features: {instance_id: {feature: count, ...}, ...}
    :param test_features:  {instance_id: {feature: count, ...}, ...}
    :return: (X_train, X_test), each mapping instance_id to a 1-D array
             aligned to the feature space fitted on the training data.
    """
    vec = DictVectorizer()
    vec.fit(train_features.values())

    def _to_rows(feature_map):
        # transform() silently drops features unseen at fit time, so test
        # rows stay aligned with the training feature space.
        return {instance_id: vec.transform(feats).toarray()[0]
                for instance_id, feats in feature_map.items()}

    return _to_rows(train_features), _to_rows(test_features)
def TransformIntoVectors(totalData, totalLabel):
    """Vectorize feature dicts into a sparse matrix and wrap labels in an array."""
    vectorizer = DictVectorizer(sparse=True)
    feature_matrix = vectorizer.fit_transform(totalData)
    label_vector = array(totalLabel)
    return (feature_matrix, label_vector)
class Projects:
    """Loads project rows from a CSV and exposes one-hot encoded feature views.

    Column indices below refer to positions in the CSV header. (Python 2 code:
    uses file-iterator .next() and list-returning map().)
    """
    def __init__(self, outcome_file):
        # Fixed column positions within each CSV row.
        self.state_feature_index = 7
        self.zip_feature_index = 8
        self.binary_feature_index = [12, 13, 14, 15, 16, 17, 19, 20, 32, 33]
        self.categorical_feature_index = [18, 21, 22, 25, 26, 27, 28]
        self.numerical_feature_index = [29, 30, 31]
        self.date_feature_index = 34
        # Shared vectorizer; refitted per feature group in all_features().
        self.vec = DictVectorizer(sparse=False)
        self.load_projects(outcome_file)
    def load_projects(self, outcome_file):
        """Read the CSV: first line is the header, rest keyed by project id (col 0)."""
        fin = open(outcome_file)
        self.project_feature_names = fin.next().strip().split(',')
        self.projects = dict((line.strip().split(',')[0], line.strip().split(','))\
            for line in fin)
        fin.close()
    def all_features(self, pids):
        """Return one-hot matrices (state, zip3, binary, categorical, discretized numeric) for `pids`.

        NOTE: each fit_transform refits self.vec, so the vectorizer's final
        state reflects only the last (numeric) group.
        """
        measurements_state = map(lambda k: {str(self.state_feature_index): self.projects[k][self.state_feature_index]}, pids)
        # Only the first 3 digits of the zip code are used.
        measurements_zip = map(lambda k: {str(self.zip_feature_index): self.projects[k][self.zip_feature_index][:3]}, pids)
        measurements_bin = map(lambda k: dict((str(fi), self.projects[k][fi]) for fi in self.binary_feature_index), pids)
        measurements_cat = map(lambda k: dict((str(fi), self.projects[k][fi]) for fi in self.categorical_feature_index), pids)
        #measurements_num = map(lambda k: [float(self.projects[k][fi]) for fi in self.numerical_feature_index], pids)
        # Numeric columns are discretized so they can be one-hot encoded too.
        measurements_num = map(lambda k: dict((str(fi), str(discretize_num(float(self.projects[k][fi])))) for fi in self.numerical_feature_index), pids)
        return self.vec.fit_transform(measurements_state), self.vec.fit_transform(measurements_zip), self.vec.fit_transform(measurements_bin), self.vec.fit_transform(measurements_cat), self.vec.fit_transform(measurements_num)#,np.array(measurements_num)
def test_dictvectorizer():
    """Exercise DictVectorizer over all sparse/dtype/sort/iterable combinations.

    Checks output sparsity, shape, element sum, inverse_transform round-trip,
    fit_transform vs transform equivalence, and sorted feature names.
    """
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]
    for sparse in (True, False):
        for dtype in (int, np.float32, np.int16):
            for sort in (True, False):
                for iterable in (True, False):
                    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                    # A fresh iterator each call — generators are single-use.
                    X = v.fit_transform(iter(D) if iterable else D)
                    assert_equal(sp.issparse(X), sparse)
                    assert_equal(X.shape, (3, 5))
                    assert_equal(X.sum(), 14)
                    assert_equal(v.inverse_transform(X), D)
                    if sparse:
                        # CSR matrices can't be compared for equality
                        assert_array_equal(X.A, v.transform(iter(D) if iterable
                                                            else D).A)
                    else:
                        assert_array_equal(X, v.transform(iter(D) if iterable
                                                          else D))
                    if sort:
                        assert_equal(v.feature_names_,
                                     sorted(v.feature_names_))
def tokenize(training_data, test_data):
    """One-hot encode the categorical columns of train/test frames and glue
    the encoded block onto the remaining (numeric) columns.

    Mutates the module-level `categorical_fields` by dropping 'Risk_Stripe'.
    Returns (new_train_df, new_test_df) with the original row indices.
    """
    # print training_data.shape
    if 'Risk_Stripe' in categorical_fields:
        categorical_fields.remove('Risk_Stripe')
    for col in categorical_fields:
        training_data[col] = training_data[col].map(str)
        test_data[col] = test_data[col].map(str)
    vec = DictVectorizer()
    # Fit on training categoricals; test set is only transformed.
    train_dicts = training_data[categorical_fields].T.to_dict().values()
    test_dicts = test_data[categorical_fields].T.to_dict().values()
    train_encoded = vec.fit_transform(train_dicts).toarray()
    test_encoded = vec.transform(test_dicts).toarray()

    def _merge(frame, encoded):
        # Remaining columns are cast to float and appended after the encoding.
        numeric = np.array(frame.drop(categorical_fields, axis=1)).astype(np.float)
        combined = np.concatenate((encoded, numeric), axis=1)
        return pd.DataFrame(combined, index=frame.index)

    return _merge(training_data, train_encoded), _merge(test_data, test_encoded)
def cv_prediction(feature_dict, feature, polarity, threshold, folds):
    """K-fold cross-validated logistic-regression scoring.

    Returns summed (not averaged) accuracy/precision/recall/f1 across folds;
    callers divide by `folds`. Uses the legacy sklearn KFold(n, n_folds=...)
    API. `feature_dict` is appended as an extra dummy training example with
    label 0 so the vectorizer sees the full feature vocabulary.
    """
    accuracy = 0
    precision = 0
    recall = 0
    f1 = 0
    count = 0
    dicvec = DictVectorizer()
    LR = LogisticRegression()
    kfold = KFold(len(polarity), n_folds=folds)
    for train, test in kfold:
        count += 1
        x = list()
        y = list()
        [x.append(feature[i]) for i in train]
        [y.append(polarity[i]) for i in train]
        # Dummy example carrying every known feature.
        x.append(feature_dict)
        y.append(0)
        LR.fit(dicvec.fit_transform(x), y)
        test_label = list()
        answer_label = list()
        [answer_label.append(polarity[j]) for j in test]
        for j in test:
            query = fit_feature(feature[j], feature_dict)
            # -1 marks a test item whose feature width doesn't match the model.
            result = -1 if query.shape[1] != len(feature_dict) else prediction(LR, query, threshold)
            test_label.append(result)
        accuracy += accuracy_score(answer_label, test_label)
        precision += precision_score(answer_label, test_label)
        recall += recall_score(answer_label, test_label)
        f1 += f1_score(answer_label, test_label)
        print('{}_fold finished.'.format(count))
    return accuracy, precision, recall, f1
def _dic_list_to_matrix(self, processedData, normalize):
    """Vectorize a list of feature dicts; optionally L2-normalize the rows.

    Returns (feature_names, matrix).
    """
    vec = DictVectorizer()
    matrix = vec.fit_transform(processedData)
    if normalize:
        matrix = preprocessing.normalize(matrix, norm='l2')
    return vec.get_feature_names(), matrix
def main():
    """Train a morphology generation model from aligned sentences on stdin.

    Writes (category, vectorizer, model) to the given model path via cPickle.
    (Python 2 code.)
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Train morphology generation model')
    parser.add_argument('category', help='Russian word category to (R/V/A/N/M)')
    parser.add_argument('model', help='output file for trained model')
    parser.add_argument('--penalty', help='regularization penalty', type=float, default=0.001)
    args = parser.parse_args()
    assert len(args.category) == 1
    # Early write to fail fast if the model path is not writable; the file is
    # overwritten with the real pickle below.
    with open(args.model, 'w') as f:
        f.write('write test / training...')
    logging.info('Extracting features for training data')
    training_features = []
    training_outputs = []
    for source, target, alignment in read_sentences(sys.stdin):
        for features, output in extract_instances(args.category, source, target, alignment):
            training_features.append(features)
            training_outputs.append(output)
    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(training_features)
    y = training_outputs
    logging.info('Training data size: %d instances x %d features', *X.shape)
    logging.info('Training model for category: %s (%d tags)', args.category, len(set(y)))
    model = LogisticRegression(C=args.penalty)
    model.fit(X, y)
    with open(args.model, 'w') as f:
        cPickle.dump((args.category, vectorizer, model), f, protocol=-1)
def learn_classify__svm_individual(data, folds, test_fold=4):
    """Train a linear SVM on all folds except `test_fold` and return test accuracy.

    :param data: nested mapping data[c][ind] -> {'features': dict, 'meta': {'stance': label}}
    :param folds: list of 5 lists of file names; parse_filename maps a name to (c, ind)
    :param test_fold: index of the held-out fold
    :return: accuracy on the held-out fold
    """
    test_folds = [0, 1, 2, 3, 4]
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    for i in test_folds:
        if i == test_fold:
            continue
        for name in folds[i]:
            c, ind = parse_filename(name)
            X_train.append(data[c][ind]['features'])
            y_train.append(data[c][ind]['meta']['stance'])
    for i in test_folds:
        if i != test_fold:
            continue
        for name in folds[i]:
            c, ind = parse_filename(name)
            X_test.append(data[c][ind]['features'])
            y_test.append(data[c][ind]['meta']['stance'])
    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train)
    # BUG FIX: was fit_transform, which refits the vectorizer on the test set
    # and re-indexes features — test columns then no longer correspond to the
    # columns the classifier was trained on. transform() keeps the train space.
    X_test = vectorizer.transform(X_test)
    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return accuracy_score(y_test, y_pred)
def get_LinearRegression_Acc(a, b, c):
    """Fit linear regression on training dicts `a` with targets `b`;
    return integer-rounded predictions for test dicts `c`."""
    from sklearn.feature_extraction import DictVectorizer
    import numpy as np
    from sklearn import linear_model

    # Fit the vectorizer on train + test so both share one feature space.
    vectorizer = DictVectorizer().fit(a + c)
    X_train = vectorizer.transform(a)
    y_train = b
    X_test = vectorizer.transform(c)

    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)

    # Round each continuous prediction to the nearest integer.
    return np.rint(model.predict(X_test))
def main():
    """Build and save per-tag word-count feature matrices from extracted text.

    Fits a DictVectorizer on the global word-count totals, then transforms
    each tag's per-document counts into that shared feature space.
    (Python 2 code: uses file() and list-returning map().)
    """
    # load data
    #path = 'generated/extracted_text'
    os.system("mkdir generated")
    path = 'extracted_text'
    data = map(json.loads, file(path))
    # count word for every tag
    tags = TAGS + ['boilerplate', 'boilerpipe']
    counts_per_tag = {}
    for tag in tags:
        counts = map(count, get(tag, data))
        counts_per_tag[tag] = counts
    # Aggregate counts so the vectorizer sees the complete vocabulary once.
    total = sum_up(counts_per_tag, len(data))
    # vectorize
    v = DictVectorizer()
    v.fit([total])
    features = {}
    for tag in tags:
        features[tag] = v.transform(counts_per_tag[tag])
    save('text_features', features)
    save('text_vectorizer', v)
    os.system("mv generated/text_features . ")
    os.system("mv generated/text_vectorizer . ")
def cat_to_dummy(df, cat_cols, test=False, vectorizer=None):
    """
    Convert categorical variables to dummies for either train or test dataset.

    :param df: input DataFrame
    :param cat_cols: names of categorical columns
    :param test: when True, use the supplied fitted `vectorizer` and return x only
    :param vectorizer: fitted DictVectorizer (required when test=True)
    :return: x for test data; (x, vectorizer) for training data
    """
    num_cols = list(set(df.columns) - set(cat_cols))
    # BUG FIX: fill NaNs *before* the str cast. astype(str) turns NaN into the
    # literal string 'nan', which made the original post-cast fillna('NA') a
    # no-op; missing categoricals now become the intended 'NA' category.
    cat = df[cat_cols].fillna('NA').astype(str)
    num = df[num_cols]
    x_num = num.values
    x_cat = cat.T.to_dict().values()
    if test:
        vec_x_cat = vectorizer.transform(x_cat)
        x = np.hstack((x_num, vec_x_cat))
        return x
    else:
        vectorizer = DV(sparse=False)
        vec_x_cat = vectorizer.fit_transform(x_cat)
        x = np.hstack((x_num, vec_x_cat))
        return x, vectorizer
def featurize(X): """ Featurizes using just word overlap features. Extracts the unigram features from the given train set. - X: numpy array with (n_train, max_seqlen, max_sentlen) Return: - vector: A numpy array of size (n_train, n_features) corresponding to the featurization of X. """ feats = [] n_train = len(X) print n_train for i in xrange(n_train): feat_dict = {} s1 = X[i][0] s2 = X[i][1] feat_dict.update(word_overlap_features(s1, s2)) feats.append(feat_dict) v = DictVectorizer(sparse=True) vector = v.fit_transform(feats) return vector
def get_neurosynth_terms(combined_df):
    """ Grab terms for each image, decoded with neurosynth.

    Fetches per-image decoding scores over HTTP (cached via `mem.cache`),
    vectorizes them, and returns a DataFrame with one column per term plus
    the image_id. Failed fetches contribute an empty term dict.
    (Python 2 code.)
    """
    terms = list()
    from sklearn.feature_extraction import DictVectorizer
    vectorizer = DictVectorizer()
    image_ids = list()
    for row in combined_df.iterrows():
        image_id = row[1]['image_id']
        image_ids.append(int(image_id))
        print "Fetching terms for image %i" % image_id
        # The NeuroVault id is the second-to-last path segment of the URL.
        image_url = row[1]['url_image'].split('/')[-2]
        try:
            elevations = mem.cache(url_get)(
                'http://neurosynth.org/decode/data/?neurovault=%s' % image_url)
            data = json.loads(elevations)['data']
            data = dict([(i['analysis'], i['r']) for i in data])
        except HTTPError:
            # Best-effort: missing decodings become all-zero rows after vectorizing.
            data = {}
        terms.append(data)
    X = vectorizer.fit_transform(terms).toarray()
    term_dframe = dict([('neurosynth decoding %s' % name, X[:, idx])
                        for name, idx in vectorizer.vocabulary_.items()])
    term_dframe['image_id'] = image_ids
    return pd.DataFrame(term_dframe)
def KFoldPredictionScore (X,y,k,header): from sklearn.svm import SVC from sklearn.feature_extraction import DictVectorizer vec = DictVectorizer() try: accuracy = 0.0 for X_train, y_train, X_test, y_test in k_fold_generator(X, y, k): vec = DictVectorizer() fit = vec.fit(X_train) X_train_counts = fit.transform(X_train) X_test_counts = fit.transform(X_test) clf = SVC(kernel="linear", C=0.025) try: clf.fit(X_train_counts.toarray(), y_train) #predict = clf.predict(X_test_counts.toarray()) accuracy += clf.score(X_test_counts.toarray(),y_test) # coef = clf._get_coef() # print(np.argsort(coef)[-20:]) #for i in range(0,len(X_test)): #print (X_test[i]['ID']+"\t"+y_test[i]+"\t"+predict[i]) except BaseException as b: print (b) print (header+"\t"+str(accuracy)) except BaseException as b: print (b)
def get_similarity_of_words():
    """Compare vocabularies of posts by rating category (1-5).

    Builds a per-rating Counter over integer-mapped words from the CSV,
    vectorizes the counters into a dense matrix, and prints pairwise
    Jaccard similarities between categories. (Python 2 code.)
    """
    f = open('final_rated_posts.csv', 'r')
    counters = []
    vocab_map = dict()
    index = 0
    # One counter per rating value 1..5.
    for _ in range(5):
        counters.append(Counter())
    for line in f:
        attributes = line.split('@@')
        words = set(attributes[1].strip().split())
        # Map words to stable integer ids to keep counter keys compact.
        words_mapped = []
        for word in words:
            if word not in vocab_map:
                vocab_map[word] = index
                index += 1
            words_mapped.append(vocab_map[word])
        label = int(attributes[2])
        if label in (1, 2, 3, 4, 5):
            counters[label-1] += Counter(list(words_mapped))
    f.close()
    vectorizer = DictVectorizer(sparse=False)
    data = vectorizer.fit_transform(counters)
    words_1 = set(counters[0].keys())
    words_5 = set(counters[4].keys())
    print len(words_1)
    print len(words_5)
    print len(words_1 - words_5)
    print len(words_5 - words_1)
    for i in xrange(len(data)):
        words_i = data[i]
        for j in xrange(len(data)):
            words_j = data[j]
            print "Comparing words of posts categorized as %d and %d" % (i + 1, j + 1)
            #print cosine_similarity(words_i, words_j)
            print jaccard_sim(set(counters[i]), set(counters[j]))
def get_linear_regression_model(x, y): print 'generating linear model' linreg = linear_model.LinearRegression() d = DictVectorizer() t_x = d.fit_transform(x) linreg.fit(t_x, y) return d, linreg
def create_matrix(dataCols, exclude_aids=None, return_pairs=False):
    """
    Build a dense feature matrix from nested {name: {aid: {pid: value}}} columns.

    Only (aid, pid) pairs present in *every* data column are kept.

    :param dataCols: mapping of column name -> {aid: {pid: value}}
    :param exclude_aids: aids to skip (default: none)
    :param return_pairs: also return the kept (aid, pid) pairs
    :return: (data, colnames) or (data, colnames, pairs)
    """
    # BUG FIX: mutable default argument ([]) replaced with a None sentinel;
    # behavior for all existing callers is unchanged.
    if exclude_aids is None:
        exclude_aids = []
    # Find aid-pid pairs
    pairs = []
    for aid in [a for a in dataCols.values()[0] if a not in exclude_aids]:
        for pid in dataCols.values()[0][aid]:
            # Keep the pair only if every data column has a value for it.
            addpair = True
            for dataCol in dataCols.values():
                try:
                    tmp = dataCol[aid][pid]
                except KeyError:
                    addpair = False
            if addpair:
                pairs.append((aid, pid))
    # Create the list of dictionaries
    dataDict = [{} for i in pairs]
    for n in range(len(pairs)):
        aid, pid = pairs[n]
        for dataName, data in dataCols.items():
            dataDict[n][dataName] = data[aid][pid]
    # Create the matrix
    vec = DictVectorizer()
    data = vec.fit_transform(dataDict).toarray()
    colnames = vec.get_feature_names()
    # Done
    if return_pairs:
        return data, colnames, pairs
    else:
        return data, colnames
def getTermStatistics(all_hits):
    """Fetch term vectors for document ids from Elasticsearch in batches of 100.

    Returns [tfidf_matrix, feature_names] built from documents that actually
    have a 'text' term vector. Connection/index settings come from the
    ELASTICSEARCH_* environment variables, with localhost/memex/page defaults.
    """
    host = environ['ELASTICSEARCH_SERVER'] if environ.get('ELASTICSEARCH_SERVER') else 'localhost'
    es = Elasticsearch(hosts=[host])
    tfidfs = []
    docs = []
    # Batch requests to keep each mtermvectors call bounded.
    for i in range(0, len(all_hits), 100):
        hits = all_hits[i:i+100]
        term_res = es.mtermvectors(index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                                   doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                                   term_statistics=True, fields=['text'], ids=hits)
        #pprint.pprint(term_res['docs'])
        for doc in term_res['docs']:
            #pprint.pprint(doc)
            # Skip documents with no term vectors (e.g. missing 'text' field).
            if doc.get('term_vectors'):
                if 'text' in doc['term_vectors']:
                    docs.append(doc['_id'])
                    tfidfs.append(terms_from_es_json(doc))
            #else:
            #    pprint.pprint(doc)
    #pprint.pprint(tfidfs)
    v = DictVectorizer()
    return [v.fit_transform(tfidfs), v.get_feature_names()]
def extract(self): """ Extract features for clustering """ continuous_features = [] discrete_features = [] for page, text in zip(self.pages, self.texts): # continuous features continuous_features.append([ process(page, text) for key, process in self.CONTINUOUS_FEATURES.iteritems() ]) # discrete features discrete_feature = dict(text['computed'].items()) discrete_feature['path'] = ' > '.join(text['path']) discrete_features.append(discrete_feature) # build numpy array continuous_features = preprocessing.scale(np.array(continuous_features)) # vectorize discrete features vectorizer = DictVectorizer() discrete_features = vectorizer.fit_transform(discrete_features).toarray() return np.hstack([continuous_features, discrete_features]).astype(np.float32)
def test_sklearn_nb(balanced):
    """Train/evaluate a MultinomialNB year predictor on movie plot word counts.

    Every third movie is held out as the test set; the rest trains the model.
    Logs the fraction of exactly-correct year predictions.
    """
    movie_words = process_plots_mp(balanced)
    # 2/3 train, 1/3 test split by index modulo 3.
    training_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 != 0]
    test_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 == 0]
    vec = DictVectorizer()
    training_features = vec.fit_transform([movie.wordcounts for movie in training_movies]).toarray()
    training_labels = np.array([movie.year for movie in training_movies])
    #LOGGER.debug("Original size of feature vectors: %d (issparse: %s)" % (
    #csr_matrix(training_features[-1]).toarray().size, str(issparse(training_features))
    #))
    mnb_classifier = MultinomialNB()
    mnb_classifier.fit(training_features, training_labels)
    test_features = vec.transform([movie.wordcounts for movie in test_movies])
    test_labels = np.array([movie.year for movie in test_movies])
    results = mnb_classifier.predict(test_features)
    correct = sum([1 for i, result in enumerate(results) if result == test_labels[i]])
    # NOTE(review): `correct / len(test_labels)` truncates to 0 under Python 2
    # integer division — confirm this module runs on Python 3 (or imports
    # __future__.division) for the accuracy percentage to be meaningful.
    LOGGER.info("skleanrn's MultinomialNB classifier predicted %d/%d correctly (%0.3f%% accuracy)" % (
        correct, len(test_labels), correct / len(test_labels) * 100
    ))
def retrain_models(username):
    """Retrain the per-user body and header classifiers and persist them.

    Body: TF-IDF features + linear SVM. Header: dict features + ridge
    classifier. Both models train on stored data plus new training messages.
    """
    train_x, train_y, body_x, body_y, head_x, head_y = \
        model_retriever.retrieve_data_db(username)

    # Body pipeline: TF-IDF over extracted body features.
    b_train_y = numpy.concatenate([body_y, train_y])
    body_vec = TfidfVectorizer(norm="l2")
    b_train_x = body_vec.fit_transform(
        [extract_body_features(msg) for msg in (body_x + train_x)])

    # Header pipeline: one-hot dict features.
    h_train_y = numpy.concatenate([head_y, train_y])
    head_vec = DictVectorizer()
    h_train_x = head_vec.fit_transform(
        [extract_header_features(msg) for msg in (head_x + train_x)])

    body_model = LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3)
    head_model = RidgeClassifier(tol=1e-2, solver="lsqr")
    body_model.fit(b_train_x, b_train_y)
    head_model.fit(h_train_x, h_train_y)

    print("Finished training models for "+username+"...")
    store_models(username, body_vec, body_model, head_vec, head_model)
def runsvd():
    """Train a factorization machine on (student, step) pairs and write predictions.

    Reads module-level parallel arrays (student_id, step_full_name,
    correct_first_attempt and their *2 test counterparts), fits pylibfm on the
    training pairs, predicts the test pairs, writes a CSV of predictions, and
    prints the RMSE against the true labels.
    """
    data = []
    y = []
    students=set()
    steps=set()
    #training input data
    for i in range(0, numOfLines):
        data.append({"student_id": str(student_id[i]), "step_id": str(step_full_name[i])})
        y.append(int(correct_first_attempt[i]))
        #students.add(student_id[i])
        #steps.add(step_full_name[i])
    #training output data
    data2 = []
    y2 = []
    for i in range(0, numOfLines2):
        data2.append({"student_id": str(student_id2[i]), "step_id": str(step_full_name2[i])})
        y2.append(int(correct_first_attempt2[i]))
    test_data = data2
    y_test = np.array(y2)
    train_data = data
    y_train = np.array(y)
    print(len(train_data))
    print(len(y_train))
    # Keep an untransformed copy; the list is vectorized in place below.
    train_data_same = copy.copy(train_data)
    v = DictVectorizer()
    X_train = v.fit_transform(train_data)
    #so far N=40 is good, iter=55
    fm = pylibfm.FM(num_factors=40, num_iter=55, verbose=True, task="classification", initial_learning_rate=0.2, learning_rate_schedule="optimal")
    fm.fit(X_train, y_train)
    # Evaluate — test data is transformed with the training vectorizer.
    train_data_same = v.transform(train_data_same)
    test_data = v.transform(test_data)
    preds = fm.predict(test_data)
    #print(y_train)
    #print(preds)
    with open('FactorizationMachineResult.csv', 'w') as csvfile:
        fieldnames = ['Row', 'Student ID', 'Correct First Attempt', 'real y']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for x in range(numOfLines2):
            writer.writerow({'Row': row_Num2[x], 'Student ID': student_id2[x] ,'Correct First Attempt': preds[x], 'real y': y_test[x]})
    #print("FM MSE: %.4f" % mean_squared_error(y_train,preds))
    rmse = mean_squared_error(y_test, preds)**0.5
    print("RMSE: %.4f" % rmse)
    return
## Fill missing ages with the mean age
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
## Fill missing fares with the mean fare
# BUG FIX: was test_data['Age'].mean() — fares must be imputed from the
# Fare column, not Age.
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)
## Fill missing embarkation ports with the most frequent port
print(train_data['Embarked'].value_counts())
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S', inplace=True)

# Feature selection
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]

## One-hot encode the categorical columns
dvec = DictVectorizer(sparse=False)
# FIX: 'records' is the orient spelling supported by all pandas versions;
# the old abbreviation 'record' raises in modern pandas.
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))
print(dvec.feature_names_)

# Build an ID3-style decision tree (entropy criterion)
clf = DecisionTreeClassifier(criterion='entropy')
# Train the tree
clf.fit(train_features, train_labels)
# Predict on the test set (transformed with the fitted vectorizer)
test_features = dvec.transform(test_features.to_dict(orient='records'))
pred_labels = clf.predict(test_features)
# Training-set accuracy of the tree
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print(u'score 准确率为%4lf' % acc_decision_tree)
# 10-fold cross-validated accuracy
acc_cross_decision_tree = np.mean(
    cross_val_score(clf, train_features, train_labels, cv=10))
def fit(self, X, y): """Build a WEASEL classifiers from the training set (X, y), Parameters ---------- X : nested pandas DataFrame of shape [n_instances, 1] Nested dataframe with univariate time-series in cells. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True) y = y.values if isinstance(y, pd.Series) else y # Window length parameter space dependent on series length self.n_instances, self.series_length = X.shape[0], len(X.iloc[0, 0]) self.max_window = min(self.series_length, self.max_window) self.window_sizes = list(range(self.min_window, self.max_window, self.win_inc)) max_acc = -1 self.highest_bit = (math.ceil(math.log2(self.max_window))+1) final_bag_vec = None for norm in self.norm_options: # transformers = [] for w, word_length in enumerate(self.word_lengths): all_words = [dict() for x in range(len(X))] transformers = [] for i, window_size in enumerate(self.window_sizes): # if w == 0: # only compute once, otherwise shorten transformer = SFA(word_length=np.max(word_length), alphabet_size=self.alphabet_size, window_size=window_size, norm=norm, anova=self.anova, binning_method=self.binning_strategy, bigrams=self.bigrams, remove_repeat_words=False, lower_bounding=False, save_words=False) sfa_words = transformer.fit_transform(X, y) transformers.append(transformer) # use the shortening of words trick # sfa_words = transformers[i]._shorten_bags(word_length) # TODO refactor? dicts not really needed here ... 
bag = sfa_words.iloc[:, 0] # chi-squared test to keep only relevent features # bag_vec = DictVectorizer(sparse=False).fit_transform(bag) # chi2_statistics, p = chi2(bag_vec, y) # relevant_features = np.where( # chi2_statistics >= self.chi2_threshold)[0] # merging bag-of-patterns of different window_sizes # to single bag-of-patterns with prefix indicating # the used window-length for j in range(len(bag)): for (key, value) in bag[j].items(): # if key in relevant_features: # chi-squared test # append the prefices to the words to # distinguish between window-sizes word = (key << self.highest_bit) | window_size # X_all_words[j].append((word, value)) all_words[j][word] = value # TODO use CountVectorizer instead on actual words ... ??? vectorizer = DictVectorizer(sparse=True) bag_vec = vectorizer.fit_transform(all_words) clf = LogisticRegression(max_iter=5000, solver="liblinear", dual=True, penalty="l2", random_state=self.random_state) current_acc = cross_val_score(clf, bag_vec, y, cv=5).mean() # clf = RandomForestClassifier(oob_score=True, # n_estimators=1000, # n_jobs=-1).fit(bag_vec, y) # current_acc = clf.oob_score_ # print("Train acc:", norm, word_length, current_acc) if current_acc > max_acc: max_acc = current_acc self.vectorizer = vectorizer self.clf = clf self.SFA_transformers = transformers self.best_word_length = word_length final_bag_vec = bag_vec if max_acc == 1.0: break # there can be no better model than 1.0 # # fit final model using all words # for i, window_size in enumerate(self.window_sizes): # self.SFA_transformers[i] = \ # SFA(word_length=np.max(self.word_lengths), # alphabet_size=self.alphabet_size, # window_size=window_size, # norm=norm, # anova=self.anova, # binning_method=self.binning_strategy, # bigrams=self.bigrams, # remove_repeat_words=False, # lower_bounding=False, # save_words=False) # self.SFA_transformers[i].fit_transform(X, y) self.clf.fit(final_bag_vec, y) self._is_fitted = True return self
parser.add_argument('student_data', type=argparse.FileType('r'),
                    help="the student data file in datashop format")
args = parser.parse_args()

# Transaction-format input is first rolled up to student-step format.
if args.ft == "transaction":
    ssr_file = transaction_to_student_step(args.student_data)
    ssr_file = open(ssr_file, 'r')
else:
    ssr_file = args.student_data

kcs, opps, y, stu, student_label, item_label = read_datashop_student_step(
    ssr_file)

# Get everything in the right matrix format:
# separate one-hot blocks for students (S), KCs (Q), and opportunities (O).
sv = DictVectorizer()
qv = DictVectorizer()
ov = DictVectorizer()
S = sv.fit_transform(stu)
Q = qv.fit_transform(kcs)
O = ov.fit_transform(opps)
X = hstack((S, Q, O))
y = np.array(y)

# Regularize the student intercepts (only the student block gets L2 weight 1.0)
l2 = [1.0 for i in range(S.shape[1])]
l2 += [0.0 for i in range(Q.shape[1])]
l2 += [0.0 for i in range(O.shape[1])]

# Bound the learning rates to be positive
bounds = [(None, None) for i in range(S.shape[1])]
n_hidden_2 = 32 # 隠れ層2のユニットの数 n_input = 4 # 与える変数の数 n_classes = 2 # 分類するクラスの数 今回は生き残ったか否かなので2 # csvファイルの読み込み df = pd.read_csv('train.csv', header=0) labelEncoder = preprocessing.LabelEncoder() df['Sex'] = labelEncoder.fit_transform(df['Sex']) # df['Cabin'] = labelEncoder.fit_transform(df['Cabin']) # df['Embarked'] = labelEncoder.fit_transform(df['Embarked']) #x_np = np.array(df[['Pclass', 'Sex', 'Age', 'Parch' ,'Fare']].fillna(0)) # x_np = np.array(df[['Pclass', 'Sex', 'Age' ,'Fare']].fillna(0)) x_np = np.array(df[['Pclass', 'Sex', 'Age', 'Fare']].fillna(0)) d = df[['SurvivedText']].to_dict('record') vectorizer = DictVectorizer(sparse=False) print(d) y_np = vectorizer.fit_transform(d) print(y_np) [x_train, x_test] = np.vsplit(x_np, [train_size]) # 入力データを訓練データとテストデータに分ける [y_train, y_test] = np.vsplit(y_np, [train_size]) # ラベルを訓練データをテストデータに分ける #x_train, x_test, y_train, y_test = train_test_split(x_np, y_np, test_size=0.3, random_state=0) # tf Graph input x = tf.placeholder("float", [None, n_input]) # 回答が二種類 y = tf.placeholder("float", [None, n_classes])
if len(right_feature) == k + 1: features['%s-%s' % (i + 1, i + k + 1)] = right_feature if len(left_feature) == k + 1: features['%s-%s' % (-i - 1, -i - k - 1)] = left_feature x.append(features) word_stripped = word.replace('-', '') return (x, [_build_feature_dict(word_stripped, k, size, size) for k in xrange(len(word_stripped))], #(np.array(y) == 0).astype(int), np.array(y, dtype=int) + 2, np.array(stress, dtype=int)) if __name__ == '__main__': X_train, y_train = [], [] vect_syl = DictVectorizer(sparse=True) vect_stress = DictVectorizer(sparse=True) vect_syl.feature_names_ = set() vect_stress.feature_names_ = set() # fit vectorizers for _, word, stress in syllabifications('../silabe.train.xml', 10): if len(word.strip().replace('-', '')) != len(stress): print >> sys.stderr, "Skipped %s" % word continue x_dict_syl, x_dict_stress, y_syl, y_stress = word_to_feature_dict( word.strip(), stress, size=4) for x in x_dict_syl: for f, v in x.iteritems(): if isinstance(v, (str, unicode)): f = "%s%s%s" % (f, vect_syl.separator, v) vect_syl.feature_names_.add(f)
j += 1 print "Progress : ", j, " / ", len(X_train), "\r", sys.stdout.flush() x_test[steal_ratio] = [] for dataPoint in X_test: x_test[steal_ratio].append(featureHybrid(dataPoint, steal_ratio)) # z = zip(x_train, y_train) # random.shuffle(z) # x_train = [a for a,b in z] # y_train = [b for a,b in z] v = DictVectorizer(sparse = True) x_train = v.fit_transform(x_train) print "Shape of Data : ", x_train.shape print "Positive Labels : ", sum(y_train) model[steal_ratio] = LinearSVC().fit(x_train, y_train) p_label = model[steal_ratio].predict(x_train) showResults(y_train, p_label) print "Testing set size : ", len(x_test[steal_ratio]) print "Positive Labels : ", sum(y_test) x_test[steal_ratio] = v.transform(x_test[steal_ratio]) p_label = model[steal_ratio].predict(x_test[steal_ratio]) showResults(y_test, p_label)
def __init__(self, lens, **kwargs):
    """Keep a reference to *lens* and create a fresh DictVectorizer."""
    # Hand any remaining keyword arguments to the base class first.
    super().__init__(**kwargs)
    self.vect = DictVectorizer()
    self.lens = lens
def __init__(self,
             dataset,
             training,
             test,
             config='config/properties.json',
             lr=0.001,
             num_factors=10,
             num_iter=100,
             threshold=4,
             implicit=False):
    """Build and immediately train a pylibfm factorization machine.

    dataset/config select the item-property graphs; training/test are
    rating files. With implicit=True ratings are treated as binary
    feedback and negative candidates are sampled per user.
    """
    self.dataset = dataset
    self.config_file = config
    self.properties = []
    self.implicit = implicit
    # Implicit feedback is binary, so a fixed 0.5 threshold is used.
    if self.implicit:
        self.threshold = 0.5
    else:
        self.threshold = threshold
    self._set_properties()
    self._read_item_attributes()
    print('finished reading item attributes')
    self.model = pylibfm.FM(num_factors=num_factors,
                            num_iter=num_iter,
                            verbose=True,
                            task="classification",
                            initial_learning_rate=lr,
                            learning_rate_schedule="optimal")
    self.x_train, self.y_train, self.train_users, self.train_items = self._load_data(
        training)
    self.x_test, self.y_test, self.test_users, self.test_items = self._load_data(
        test)
    if self.implicit:
        # need to generate negative candidates for training
        num_negative_candidates = 100
        all_items = self.train_items.union(self.test_items)
        unrated_items = [
            item for item in all_items if item not in self.train_items
        ]
        # Sorted so random.sample draws from a deterministic order.
        unrated_items = sorted(unrated_items)
        for user in self.train_users:
            negative_candidates = list(
                random.sample(unrated_items, num_negative_candidates))
            for item in negative_candidates:
                self.x_train.append(self._fetch_attributes(user, item))
                self.y_train.append(0.)
        for user in self.test_users:
            negative_candidates = list(
                random.sample(unrated_items, num_negative_candidates))
            for item in negative_candidates:
                self.x_test.append(self._fetch_attributes(user, item))
                self.y_test.append(0.)
    print('finished reading data')
    # One-hot encode the feature dicts; the test matrix reuses the
    # vocabulary fitted on the training dicts (transform, not fit_transform).
    self.vectorizer = DictVectorizer()
    self.x_train = self.vectorizer.fit_transform(self.x_train)
    self.x_test = self.vectorizer.transform(self.x_test)
    print('finished transforming data')
    self.model.fit(self.x_train, self.y_train)  # fit the model
    print('finished fitting model')
    # NOTE(review): item_ids is reset to [] immediately before the loop,
    # so the loop body never runs as written -- the code populating
    # item_ids looks lost; confirm against the original source.
    item_ids = []
    for item_id in item_ids:
        # Concatenate the user vector and item vector into one dense row.
        x = np.append(user_dic[user_id].A, item_dic[item_id].A).reshape(1, -1)
        x_tem = np.append(x_tem, x, axis=0)
        y_data.append(vector[item_id])
        users.append(user_id)
        items.append(item_id)
    # Stack this user's rows under the accumulated sparse matrix; the [1:]
    # slices skip row 0 (apparently a seed/dummy row -- confirm).
    x_data = sparse.vstack((x_data, sparse.csr_matrix(x_tem[1:])))
    print('Creating X_data Now:', user_id)
    return x_data.tocsr()[1:], np.array(y_data), np.array(users), np.array(
        items)


# Script entry: load the raw CSV rows and assemble the model matrices.
data = open('Data_Lon.csv').readlines()
user_data, user_style, item_attribute, item_tag = Load_data(data)
# One-hot encode user attribute dicts, then append style features;
# items are attributes + tags side by side.
v = DictVectorizer()
user_data = v.fit_transform(user_data)
user_data = sparse.hstack((user_data, user_style)).tocsr()
item_data = sparse.hstack((item_attribute, item_tag)).tocsr()
x_data, y_data, user_id, item_id = create_x_data()
# Persist everything as sparse .npz archives.
sparse.save_npz('x_data.npz', x_data)
sparse.save_npz('y_data.npz', sparse.csr_matrix(y_data))
sparse.save_npz('user_id.npz', sparse.csr_matrix(user_id))
sparse.save_npz('item_id.npz', sparse.csr_matrix(item_id))
# #filtered = [w for w, pos in refiltered if pos.startswith('NN')] # #词干化 # ps = PorterStemmer() # filtered = [ps.stem(w) for w in filtered] # return " ".join(filtered) def clear_title(title, remove_stopwords): raw_text = BeautifulSoup(title, 'lxml').get_text() letters = re.sub('[^a-zA-Z]', ' ', raw_text) words = letters.lower().split() if remove_stopwords: stop_words = set(stopwords.words('english')) words = [w for w in words if w not in stop_words] return ' '.join(words) dict_vec = DictVectorizer(sparse=False) PATH_TO_ORIGINAL_DATA = './datasets/' #PATH_TO_PROCESSED_DATA = '/path/to/store/processed/data/' data = pd.read_csv(PATH_TO_ORIGINAL_DATA + 'cleared_bugs', sep='\t') selected_columns = ['Product', 'Component', 'Assignee', 'Summary'] data = data[selected_columns] #print len(data['Product'].unique()) #print len(data['Component'].unique()) classes = data['Assignee'].unique() n_classes = len(classes) #print n_classes classmap = pd.Series(data=np.arange(n_classes), index=classes) #print classmap data = pd.merge(data, pd.DataFrame({ 'Assignee': classes,
# First fill the missing 'age' values; using the mean (or the median) is
# the strategy that distorts the model the least.
# inplace=True: modify the original object in place (no new object);
# inplace=False: return a new, modified object carrying the change.
X['age'].fillna(X['age'].mean(), inplace=True)

# Hold out 25% of the passengers for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=33)

# Turn categorical features into one-hot columns; numeric features pass
# through unchanged.
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)  # sparse=False: return a dense array
# FIX: orient must be 'records' (one dict per row); the misspelled alias
# 'record' was deprecated and then removed from pandas.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
# Transform the test features with the SAME fitted vocabulary.
X_test = vec.transform(X_test.to_dict(orient='records'))

# 1) Single decision tree.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()  # default configuration
dtc.fit(X_train, y_train)  # learn from the training split
dtc_y_predict = dtc.predict(X_test)  # predict on the held-out features

# 2) Random forest ensemble.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
class FMRec:
    """Factorization-machine recommender built on pylibfm, with item
    attributes read from per-property edgelist graphs."""

    def __init__(self,
                 dataset,
                 training,
                 test,
                 config='config/properties.json',
                 lr=0.001,
                 num_factors=10,
                 num_iter=100,
                 threshold=4,
                 implicit=False):
        """Load data, optionally sample implicit negatives, then train."""
        self.dataset = dataset
        self.config_file = config
        self.properties = []
        self.implicit = implicit
        # Implicit feedback is binary, so a fixed 0.5 threshold is used.
        if self.implicit:
            self.threshold = 0.5
        else:
            self.threshold = threshold
        self._set_properties()
        self._read_item_attributes()
        print('finished reading item attributes')
        self.model = pylibfm.FM(num_factors=num_factors,
                                num_iter=num_iter,
                                verbose=True,
                                task="classification",
                                initial_learning_rate=lr,
                                learning_rate_schedule="optimal")
        self.x_train, self.y_train, self.train_users, self.train_items = self._load_data(
            training)
        self.x_test, self.y_test, self.test_users, self.test_items = self._load_data(
            test)
        if self.implicit:
            # need to generate negative candidates for training
            num_negative_candidates = 100
            all_items = self.train_items.union(self.test_items)
            unrated_items = [
                item for item in all_items if item not in self.train_items
            ]
            # Sorted so random.sample draws from a deterministic order.
            unrated_items = sorted(unrated_items)
            for user in self.train_users:
                negative_candidates = list(
                    random.sample(unrated_items, num_negative_candidates))
                for item in negative_candidates:
                    self.x_train.append(self._fetch_attributes(user, item))
                    self.y_train.append(0.)
            for user in self.test_users:
                negative_candidates = list(
                    random.sample(unrated_items, num_negative_candidates))
                for item in negative_candidates:
                    self.x_test.append(self._fetch_attributes(user, item))
                    self.y_test.append(0.)
print('finished reading data') self.vectorizer = DictVectorizer() self.x_train = self.vectorizer.fit_transform(self.x_train) self.x_test = self.vectorizer.transform(self.x_test) print('finished transforming data') self.model.fit(self.x_train, self.y_train) # fit the model print('finished fitting model') def _set_properties(self): with codecs.open(self.config_file, 'r', encoding='utf-8') as config_read: property_file = json.loads(config_read.read()) for typology in property_file[self.dataset]: for property_name in property_file[self.dataset][typology]: self.properties.append(Property(property_name, typology)) def _fetch_attributes(self, user, item): # create a dictionary with user item interactions and item attributes d = {'user_id': user, 'item_id': item} attribute_dict = self.item_attributes[item] for prop_name, attribute in attribute_dict.items(): d[prop_name] = attribute return d def _read_item_attributes(self): self.item_attributes = defaultdict( dict) # dict of dict containing item attributes for prop in self.properties: # iterate in content based properties prop_name = prop.name if prop_name == 'feedback': # no need for feedback data here pass if 'feedback_' in prop_name: # no need for hybrid graphs prop_name = prop_name.replace('feedback_', '') with open('datasets/%s/graphs/%s.edgelist' % (self.dataset, prop_name)) as edgelist: for line in edgelist: line_split = line.strip('\n').split(' ') item = line_split[0] attribute = line_split[1] self.item_attributes[item][prop_name] = attribute def _load_data(self, data): X = [] y = [] users = set() items = set() with open(data) as data_file: for line in data_file: line_split = line.strip('\n').split(' ') user = line_split[0] item = line_split[1] rating = line_split[2] # create a dictionary with user item interactions and item attributes d = self._fetch_attributes(user, item) X.append(d) if int(rating) >= self.threshold: rating = 1 else: rating = 0 y.append(float(rating)) users.add(user) items.add(item) return X, y, 
            users, items

    def compute_user_item_features(self, user, item, items_liked_by_user,
                                   users_liking_the_item):
        """Return [FM score] for (user, item); falls back to [0.] when the
        pair is unknown to the vectorizer/model."""
        try:
            d = self._fetch_attributes(user, item)
            score = self.model.predict(self.vectorizer.transform(d))[0]
            features = [score]  # user item relatedness from fm model
        except KeyError:  # do not have user item pair in embedding
            features = [0.]
        return features

    def fit(self, x_train, y_train, qids_train):
        # No-op: the FM model is already trained in __init__.
        return 0

    def predict(self, x_test, qids_test):
        # Identity pass-through; scoring happens in compute_user_item_features.
        preds = x_test
        return preds

    @staticmethod
    def parse_args():
        """Define and parse the command-line arguments for entity2rec."""
        parser = argparse.ArgumentParser(description="Run entity2rec.")
        parser.add_argument('--dimensions',
                            type=int,
                            default=200,
                            help='Number of dimensions. Default is 200.')
        parser.add_argument('--iter',
                            default=5,
                            type=int,
                            help='Number of epochs in SGD')
        parser.add_argument('--workers',
                            type=int,
                            default=8,
                            help='Number of parallel workers. Default is 8.')
        parser.add_argument('--config_file',
                            nargs='?',
                            default='config/properties.json',
                            help='Path to configuration file')
        parser.add_argument('--dataset',
                            nargs='?',
                            default='Movielens1M',
                            help='Dataset')
        parser.add_argument('--train', dest='train', help='train',
                            default=None)
        parser.add_argument('--test', dest='test', help='test', default=None)
        parser.add_argument('--validation',
                            dest='validation',
                            default=None,
                            help='validation')
        parser.add_argument(
            '--all_items',
            dest='all_unrated_items',
            action='store_false',
            default=True,
            help=
            'Whether keeping the rated items of the training set as candidates. '
            'Default is AllUnratedItems')
        parser.add_argument('--implicit',
                            dest='implicit',
                            action='store_true',
                            default=False,
                            help='Implicit feedback with boolean values')
        parser.add_argument('--write_features',
                            dest='write_features',
                            action='store_true',
                            default=False,
                            help='Writes the features to file')
        parser.add_argument('--read_features',
                            dest='read_features',
                            action='store_true',
                            default=False,
                            help='Reads the features from a file')
        parser.add_argument(
            '--threshold',
            dest='threshold',
            default=4,
            type=int,
            help='Threshold to convert ratings into binary feedback')
        parser.add_argument('--num_users',
                            dest='num_users',
                            type=int,
                            default=False,
                            help='Sample of users for evaluation')
        parser.add_argument('--lr',
                            dest='lr',
                            type=float,
                            default=0.001,
                            help='Starting value for the learning rate')
        parser.add_argument('--hyper_opt',
                            dest='hyper_opt',
                            default=False,
                            action='store_true',
                            help='Sample of users for evaluation')
        return parser.parse_args()
def get_data_queue(args):
    """Load a rating dataset and return (train, valid, test) queues.

    For torch modes the queues are DataLoaders over index subsets; for
    args.mode == 'libfm' each queue is [one-hot sparse matrix, labels].
    """
    users, items, labels = [], [], []
    # Resolve the on-disk path for the requested dataset.
    if args.dataset == 'ml-100k':
        data_path = os.path.join(args.data, 'ml-100k', 'u.data')
    elif args.dataset == 'ml-1m':
        data_path = os.path.join(args.data, 'ml-1m', 'ratings.dat')
    elif args.dataset == 'ml-10m':
        data_path = os.path.join(args.data, 'ml-10m', 'ratings.dat')
    elif args.dataset == 'youtube-small':
        data_path = os.path.join(args.data, 'youtube-weighted-small.npy')

    if 'ml' in args.dataset:  # movielens dataset
        with open(data_path, 'r') as f:
            for i, line in enumerate(f.readlines()):
                # ml-100k is whitespace-separated; 1m/10m use '::'.
                if args.dataset == 'ml-100k':
                    line = line.split()
                elif args.dataset == 'ml-1m' or args.dataset == 'ml-10m':
                    line = line.split('::')
                users.append(int(line[0]) - 1)  # ids are 1-based on disk
                items.append(int(line[1]) - 1)
                labels.append(float(line[2]))
        # Standardize the ratings to zero mean / unit variance.
        # NOTE(review): the scaler is fit on ALL labels before splitting.
        labels = StandardScaler().fit_transform(np.reshape(
            labels, [-1, 1])).flatten().tolist()
        print('user', max(users), min(users))
        print('item', max(items), min(items))
        users, items, labels = shuffle(users, items, labels)
        indices = list(range(len(users)))
        num_train = int(len(users) * args.train_portion)
        num_valid = int(len(users) * args.valid_portion)

        if not args.mode == 'libfm':
            data_queue = torch.utils.data.TensorDataset(
                torch.tensor(users), torch.tensor(items),
                torch.tensor(labels))
            train_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[:num_train]),
                pin_memory=True)
            valid_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train:num_train + num_valid]),
                pin_memory=True)
            test_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train + num_valid:]),
                pin_memory=True)
        else:
            # prepare data format for libfm: one-hot via DictVectorizer
            data_queue = []
            for i in range(len(users)):
                data_queue.append({
                    'user': str(users[i]),
                    'item': str(items[i])
                })
            v = DictVectorizer()
            data_queue = v.fit_transform(data_queue)
            train_queue = [
                data_queue[:num_train],
                np.array(labels[:num_train])
            ]
            valid_queue = [
                data_queue[num_train:num_train + num_valid],
                np.array(labels[num_train:num_train + num_valid])
            ]
            test_queue = [
                data_queue[num_train + num_valid:],
                np.array(labels[num_train + num_valid:])
            ]
    else:  # 3-d dataset: (p, q, r) index triples with a label each
        [ps, qs, rs, labels] = np.load(data_path).tolist()
        labels = StandardScaler().fit_transform(np.reshape(
            labels, [-1, 1])).flatten().tolist()
        ps = [int(i) for i in ps]
        qs = [int(i) for i in qs]
        rs = [int(i) for i in rs]
        print('p', max(ps), min(ps))
        print('q', max(qs), min(qs))
        print('r', max(rs), min(rs))
        ps, qs, rs, labels = shuffle(ps, qs, rs, labels)
        indices = list(range(len(ps)))
        num_train = int(len(ps) * args.train_portion)
        num_valid = int(len(ps) * args.valid_portion)

        if not args.mode == 'libfm':
            data_queue = torch.utils.data.TensorDataset(
                torch.tensor(ps), torch.tensor(qs), torch.tensor(rs),
                torch.tensor(labels))
            train_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[:num_train]),
                pin_memory=True)
            valid_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train:num_train + num_valid]),
                pin_memory=True)
            test_queue = torch.utils.data.DataLoader(
                data_queue,
                batch_size=args.batch_size,
                sampler=torch.utils.data.sampler.SubsetRandomSampler(
                    indices[num_train + num_valid:]),
                pin_memory=True)
        else:
            # prepare data format for libfm
            data_queue = []
            for i in range(len(ps)):
                data_queue.append({
                    'p': str(ps[i]),
                    'q': str(qs[i]),
                    'r': str(rs[i])
                })
            v = DictVectorizer()
            data_queue = v.fit_transform(data_queue)
            train_queue = [
                data_queue[:num_train],
                np.array(labels[:num_train])
            ]
            valid_queue = [
                data_queue[num_train:num_train + num_valid],
                np.array(labels[num_train:num_train + num_valid])
            ]
            test_queue = [
                data_queue[num_train + num_valid:],
                np.array(labels[num_train + num_valid:])
            ]
    return train_queue, valid_queue, test_queue
    examples,
    ys,
    train_size=0.9,
    shuffle=True,
    random_state=RANDOM_SEED,
)
# split off train, validate from (tv) pieces.
ex_train, ex_vali, y_train, y_vali = train_test_split(
    ex_tv, y_tv, train_size=0.9, shuffle=True, random_state=RANDOM_SEED
)

#%% vectorize:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Assign feature columns deterministically from the training split only.
feature_numbering = DictVectorizer(sparse=False)
# Learn columns from training data (again)
feature_numbering.fit(ex_train)
# Translate our list of texts -> matrices of counts
rX_train = feature_numbering.transform(ex_train)
rX_vali = feature_numbering.transform(ex_vali)
rX_test = feature_numbering.transform(ex_test)

# Standardize using statistics from the training matrix only.
scaling = StandardScaler()
X_train = scaling.fit_transform(rX_train)
X_vali = scaling.transform(rX_vali)
X_test = scaling.transform(rX_test)
print(X_train.shape, X_vali.shape)

#%% train a model:
from sklearn.tree import DecisionTreeRegressor
"""Here we transform each input (a string) into a python dict full of features""" return [self._ff(s) for s in X] if __name__ == "__main__": # create Logistic Regression pipeline text_log_clf = Pipeline( [ ('ff', FF( lowercase=True, byte_unigrams=True, ) ), # This will convert python dicts into efficient sparse data structures ('dict', DictVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression(max_iter=500, verbose=2, C=100., solver='sag')), ] ) # create Naive Bayes pipeline text_nbc_clf = Pipeline( [ ('ff', FF( lowercase=True, byte_unigrams=True, ) ), # This will convert python dicts into efficient sparse data structures ('dict', DictVectorizer()),
if dataset == "train": with open('../data/train_2.dat') as f: for line in f: (userID, movieID, rating) = line.split(' ') data.append({"userID": str(userID), "movieID": str(movieID)}) try: # for matrix factorization, this was y.append(float(rating)) # y.append(float(rating)) except ValueError: print "Check line {l}".format(l=line) users.add(userID) movies.add(movieID) return (data, y, users, movies) train = get_unique_users_movies("train") test = get_unique_users_movies("test") X_train, y_train = train[0], train[1] X_test = test[0] print type(y_train) v = DictVectorizer() X_train_dv = v.fit_transform(X_train) X_test_dv = v.transform(X_test) print X_train_dv
# Demo: one-hot encoding with DictVectorizer, then two ways of scaling.
from sklearn.feature_extraction import DictVectorizer
'''one-hot编码'''
# One-hot encode a single categorical field across three records.
city_vectorizer = DictVectorizer()
X = [{'city': 'New York'}, {'city': 'San Francisco'}, {'city': 'Chapel Hill'}]
print(city_vectorizer.fit_transform(X).toarray())
'''特征标准化'''
# preprocessing.scale is equivalent to StandardScaler:
# zero mean, unit variance per column.
from sklearn import preprocessing
import numpy as np
X = np.array([[0., 0., 5., 13., 9., 1.],
              [0., 0., 13., 15., 10., 15.],
              [0., 3., 15., 2., 0., 11.]])
print(preprocessing.scale(X))
# RobustScaler centers/scales with median and IQR, so it copes better
# with outliers.
from sklearn.preprocessing import RobustScaler
robust = RobustScaler()
X_scaled = robust.fit_transform(X)
print(X_scaled)
# Show the available columns for reference.
print(train_data.columns.values.tolist())

# TF-IDF over the free-text description (terms in >= 5 documents).
tf_idf = TfidfVectorizer(min_df=5)
train_full_descr_transformed = tf_idf.fit_transform(
    train_data['FullDescription'].values.astype('U'), y=None)
test_full_descr_transformed = tf_idf.transform(
    test_data['FullDescription'].values.astype('U'))

# Replace missing categories with the literal string 'nan' so they get
# their own one-hot column instead of being dropped.
train_data['LocationNormalized'].fillna('nan', inplace=True)
train_data['ContractTime'].fillna('nan', inplace=True)

from sklearn.feature_extraction import DictVectorizer
enc = DictVectorizer()
# Test features reuse the vocabulary fitted on the training split.
X_train_categ = enc.fit_transform(
    train_data[['LocationNormalized', 'ContractTime']].to_dict('records'))
X_test_categ = enc.transform(test_data[['LocationNormalized',
                                        'ContractTime']].to_dict('records'))
"""
print ('X_train_categ size: ', X_train_categ.size, '\n')
print ('X_test_categ size: ', X_test_categ.size, '\n')
print ('test_data[[LocationNormalized, ContractTime]: ', test_data[['LocationNormalized', 'ContractTime']], '\n')
print ('X_train_categ: ', X_train_categ, '\n')
print ('train_full_descr_transformed size: ', train_full_descr_transformed.size, '\n')
print ('train_data[LocationNormalized] size: ', train_data['LocationNormalized'].size, '\n')
"""
from scipy.sparse import hstack
def __init__(self):
    """Create the learner's vectorizer, label encoder and classifier."""
    # Any classifier exposing fit/predict could stand in for this one.
    self.vectorizer = DictVectorizer()
    self.labelEncoder = LabelEncoder()
    self.model = LogisticRegression()
def train(data, classifier_file):  # do not change the heading of the function
    """Train a stress-pattern classifier from "word:phonemes" lines and
    pickle the fitted classifier and DictVectorizer to *classifier_file*."""
    data_list = data
    model_x = []  # one feature dict per word
    model_y = []  # stress-digit string per word (e.g. "010")
    # ARPAbet vowel / consonant inventories (currently unused below).
    vo_list = {'IH', 'UW', 'OY', 'AH', 'ER', 'EY', 'AO', 'AW', 'AY', 'EH',
               'AE', 'UH', 'IY', 'AA', 'OW'}
    co_list = {'W', 'K', 'HH', 'G', 'JH', 'Z', 'Y', 'N', 'V', 'SH', 'L',
               'NG', 'S', 'CH', 'R', 'D', 'B', 'TH', 'F', 'DH', 'T', 'P',
               'M', 'ZH'}
    # Affix inventories used as lexical features.
    strong_suffixes = {'al', 'ance', 'ancy', 'ant', 'ard', 'ary', 'àte',
                       'auto', 'ence', 'ency', 'ent', 'ery', 'est', 'ial',
                       'ian', 'iana', 'en', 'ésce', 'ic', 'ify', 'ine',
                       'ion', 'tion', 'ity', 'ive', 'ory', 'ous', 'ual',
                       'ure', 'wide', 'y', 'se', 'ade', 'e', 'ee', 'een',
                       'eer', 'ese', 'esque', 'ette', 'eur', 'ier', 'oon',
                       'que'}
    strong_prefixes = {'ad', 'co', 'con', 'counter', 'de', 'di', 'dis', 'e',
                       'en', 'ex', 'in', 'mid', 'ob', 'para', 'pre', 're',
                       'sub', 'a', 'be', 'with', 'for'}
    neutral_prefixes = {'down', 'fore', 'mis', 'over', 'out', 'un', 'under',
                        'up', 'anti', 'bi', 'non', 'pro', 'tri', 'contra',
                        'counta', 'de', 'dis', 'extra', 'inter', 'intro',
                        'multi', 'non', 'post', 'retro', 'super', 'trans',
                        'ultra'}
    # FIX: 'ly' and 'man' were separate entries joined into 'lyman' by a
    # missing comma (implicit string concatenation).
    neutral_suffixes = {'able', 'age', 'al', 'ate', 'ed', 'en', 'er', 'est',
                        'ful', 'hood', 'ible', 'ing', 'ile', 'ish', 'ism',
                        'ist', 'ize', 'less', 'like', 'ly', 'man', 'ment',
                        'most', 'ness', 'old', 's', 'ship', 'some', 'th',
                        'ward', 'wise', 'y'}
    suffixes = {'inal', 'ain', 'tion', 'sion', 'osis', 'oon', 'sce', 'que',
                'ette', 'eer', 'ee', 'aire', 'able', 'ible', 'acy', 'cy',
                'ade', 'age', 'al', 'al', 'ial', 'ical', 'an', 'ance',
                'ence', 'ancy', 'ency', 'ant', 'ent', 'ant', 'ent', 'ient',
                'ar', 'ary', 'ard', 'art', 'ate', 'ate', 'ate', 'ation',
                'cade', 'drome', 'ed', 'ed', 'en', 'en', 'ence', 'ency',
                'er', 'ier', 'er', 'or', 'er', 'or', 'ery', 'es', 'ese',
                'ies', 'es', 'ies', 'ess', 'est', 'iest', 'fold', 'ful',
                'ful', 'fy', 'ia', 'ian', 'iatry', 'ic', 'ic', 'ice',
                'ify', 'ile', 'ing', 'ion', 'ish', 'ism', 'ist', 'ite',
                'ity', 'ive', 'ive', 'ative', 'itive', 'ize', 'less', 'ly',
                'ment', 'ness', 'or', 'ory', 'ous', 'eous', 'ose', 'ious',
                'ship', 'ster', 'ure', 'ward', 'wise', 'ize', 'phy', 'ogy'}
    # NOTE(review): 'clus claus' looks like two entries fused by a space,
    # and 'h**o' looks like censored source data -- left unchanged.
    prefixes = {'ac', 'ad', 'af', 'ag', 'al', 'an', 'ap', 'as', 'at', 'an',
                'ab', 'abs', 'acer', 'acid', 'acri', 'act', 'ag', 'acu',
                'aer', 'aero', 'ag', 'agi', 'ig', 'act', 'agri', 'agro',
                'alb', 'albo', 'ali', 'allo', 'alter', 'alt', 'am', 'ami',
                'amor', 'ambi', 'ambul', 'ana', 'ano', 'andr', 'andro',
                'ang', 'anim', 'ann', 'annu', 'enni', 'ante', 'anthrop',
                'anti', 'ant', 'anti', 'antico', 'apo', 'ap', 'aph', 'aqu',
                'arch', 'aster', 'astr', 'auc', 'aug', 'aut', 'aud', 'audi',
                'aur', 'aus', 'aug', 'auc', 'aut', 'auto', 'bar', 'be',
                'belli', 'bene', 'bi', 'bine', 'bibl', 'bibli', 'biblio',
                'bio', 'bi', 'brev', 'cad', 'cap', 'cas', 'ceiv', 'cept',
                'capt', 'cid', 'cip', 'cad', 'cas', 'calor', 'capit',
                'capt', 'carn', 'cat', 'cata', 'cath', 'caus', 'caut',
                'cause', 'cuse', 'cus', 'ceas', 'ced', 'cede', 'ceed',
                'cess', 'cent', 'centr', 'centri', 'chrom', 'chron', 'cide',
                'cis', 'cise', 'circum', 'cit', 'civ', 'clam', 'claim',
                'clin', 'clud', 'clus claus', 'co', 'cog', 'col', 'coll',
                'con', 'com', 'cor', 'cogn', 'gnos', 'com', 'con', 'contr',
                'contra', 'counter', 'cord', 'cor', 'cardi', 'corp', 'cort',
                'cosm', 'cour', 'cur', 'curr', 'curs', 'crat', 'cracy',
                'cre', 'cresc', 'cret', 'crease', 'crea', 'cred', 'cresc',
                'cret', 'crease', 'cru', 'crit', 'cur', 'curs', 'cura',
                'cycl', 'cyclo', 'de', 'dec', 'deca', 'dec', 'dign', 'dei',
                'div', 'dem', 'demo', 'dent', 'dont', 'derm', 'di', 'dy',
                'dia', 'dic', 'dict', 'dit', 'dis', 'dif', 'dit', 'doc',
                'doct', 'domin', 'don', 'dorm', 'dox', 'duc', 'duct',
                'dura', 'dynam', 'dys', 'ec', 'eco', 'ecto', 'en', 'em',
                'end', 'epi', 'equi', 'erg', 'ev', 'et', 'ex', 'exter',
                'extra', 'extro', 'fa', 'fess', 'fac', 'fact', 'fec',
                'fect', 'fic', 'fas', 'fea', 'fall', 'fals', 'femto',
                'fer', 'fic', 'feign', 'fain', 'fit', 'feat', 'fid', 'fid',
                'fide', 'feder', 'fig', 'fila', 'fili', 'fin', 'fix',
                'flex', 'flect', 'flict', 'flu', 'fluc', 'fluv', 'flux',
                'for', 'fore', 'forc', 'fort', 'form', 'fract', 'frag',
                'frai', 'fuge', 'fuse', 'gam', 'gastr', 'gastro', 'gen',
                'gen', 'geo', 'germ', 'gest', 'giga', 'gin', 'gloss',
                'glot', 'glu', 'glo', 'gor', 'grad', 'gress', 'gree',
                'graph', 'gram', 'graf', 'grat', 'grav', 'greg', 'hale',
                'heal', 'helio', 'hema', 'hemo', 'her', 'here', 'hes',
                'hetero', 'hex', 'ses', 'sex', 'h**o', 'hum', 'human',
                'hydr', 'hydra', 'hydro', 'hyper', 'hypn', 'an', 'ics',
                'ignis', 'in', 'im', 'in', 'im', 'il', 'ir', 'infra',
                'inter', 'intra', 'intro', 'ty', 'jac', 'ject', 'join',
                'junct', 'judice', 'jug', 'junct', 'just', 'juven',
                'labor', 'lau', 'lav', 'lot', 'lut', 'lect', 'leg', 'lig',
                'leg', 'levi', 'lex', 'leag', 'leg', 'liber', 'liver',
                'lide', 'liter', 'loc', 'loco', 'log', 'logo', 'ology',
                'loqu', 'locut', 'luc', 'lum', 'lun', 'lus', 'lust',
                'lude', 'macr', 'macer', 'magn', 'main', 'mal', 'man',
                'manu', 'mand', 'mania', 'mar', 'mari', 'mer', 'matri',
                'medi', 'mega', 'mem', 'ment', 'meso', 'meta', 'meter',
                'metr', 'micro', 'migra', 'mill', 'kilo', 'milli', 'min',
                'mis', 'mit', 'miss', 'mob', 'mov', 'mot', 'mon', 'mono',
                'mor', 'mort', 'morph', 'multi', 'nano', 'nasc', 'nat',
                'gnant', 'nai', 'nat', 'nasc', 'neo', 'neur', 'nom', 'nom',
                'nym', 'nomen', 'nomin', 'non', 'non', 'nov', 'nox', 'noc',
                'numer', 'numisma', 'ob', 'oc', 'of', 'op', 'oct', 'oligo',
                'omni', 'onym', 'oper', 'ortho', 'over', 'pac', 'pair',
                'pare', 'paleo', 'pan', 'para', 'pat', 'pass', 'path',
                'pater', 'patr', 'path', 'pathy', 'ped', 'pod', 'pedo',
                'pel', 'puls', 'pend', 'pens', 'pond', 'per', 'peri',
                'phage', 'phan', 'phas', 'phen', 'fan', 'phant', 'fant',
                'phe', 'phil', 'phlegma', 'phobia', 'phobos', 'phon',
                'phot', 'photo', 'pico', 'pict', 'plac', 'plais', 'pli',
                'ply', 'plore', 'plu', 'plur', 'plus', 'pneuma', 'pneumon',
                'pod', 'poli', 'poly', 'pon', 'pos', 'pound', 'pop',
                'port', 'portion', 'post', 'pot', 'pre', 'pur',
                'prehendere', 'prin', 'prim', 'prime', 'pro', 'proto',
                'psych', 'punct', 'pute', 'quat', 'quad', 'quint', 'penta',
                'quip', 'quir', 'quis', 'quest', 'quer', 're', 'reg',
                'recti', 'retro', 'ri', 'ridi', 'risi', 'rog', 'roga',
                'rupt', 'sacr', 'sanc', 'secr', 'salv', 'salu', 'sanct',
                'sat', 'satis', 'sci', 'scio', 'scientia', 'scope',
                'scrib', 'script', 'se', 'sect', 'sec', 'sed', 'sess',
                'sid', 'semi', 'sen', 'scen', 'sent', 'sens', 'sept',
                'sequ', 'secu', 'sue', 'serv', 'sign', 'signi', 'simil',
                'simul', 'sist', 'sta', 'stit', 'soci', 'sol', 'solus',
                'solv', 'solu', 'solut', 'somn', 'soph', 'spec', 'spect',
                'spi', 'spic', 'sper', 'sphere', 'spir', 'stand', 'stant',
                'stab', 'stat', 'stan', 'sti', 'sta', 'st', 'stead',
                'strain', 'strict', 'string', 'stige', 'stru', 'struct',
                'stroy', 'stry', 'sub', 'suc', 'suf', 'sup', 'sur', 'sus',
                'sume', 'sump', 'super', 'supra', 'syn', 'sym', 'tact',
                'tang', 'tag', 'tig', 'ting', 'tain', 'ten', 'tent', 'tin',
                'tect', 'teg', 'tele', 'tem', 'tempo', 'ten', 'tin',
                'tain', 'tend', 'tent', 'tens', 'tera', 'term', 'terr',
                'terra', 'test', 'the', 'theo', 'therm', 'thesis', 'thet',
                'tire', 'tom', 'tor', 'tors', 'tort', 'tox', 'tract',
                'tra', 'trai', 'treat', 'trans', 'tri', 'trib', 'tribute',
                'turbo', 'typ', 'ultima', 'umber', 'umbraticum', 'un',
                'uni', 'vac', 'vade', 'vale', 'vali', 'valu', 'veh',
                'vect', 'ven', 'vent', 'ver', 'veri', 'verb', 'verv',
                'vert', 'vers', 'vi', 'vic', 'vicis', 'vict', 'vinc',
                'vid', 'vis', 'viv', 'vita', 'vivi', 'voc', 'voke', 'vol',
                'volcan', 'volv', 'volt', 'vol', 'vor', 'with', 'zo'}
    # The affix lookups are case-normalized via the module's upper() helper.
    neutral_prefixes = upper(neutral_prefixes)
    neutral_suffixes = upper(neutral_suffixes)
    strong_prefixes = upper(strong_prefixes)
    strong_suffixes = upper(strong_suffixes)
    full_suffixes_set = upper(suffixes)
    full_prefixes_set = upper(prefixes)
    # Stress digits that mark a phoneme as a (stressed/unstressed) vowel.
    suffix = {"1", "2", "0"}
    for line in data_list:
        # `feats` replaces the original name `dict`, which shadowed the builtin.
        feats = {}
        vow_index = []
        vowelCount = 0
        pattern = ""
        y = ""
        # Part-of-speech of the word itself.
        feats["pos"] = nltk.pos_tag([line.split(":")[0]])[0][1]
        word = line.split(":")[0]
        temp = check_prefix(word, neutral_prefixes)
        if temp:
            feats['neu_pre'] = temp
        temp = check_suffix(word, neutral_suffixes)
        if temp:
            feats['neu_suf'] = temp
        temp = check_prefix(word, strong_prefixes)
        if temp:
            feats['str_pre'] = temp
        temp = check_suffix(word, strong_suffixes)
        if temp:
            feats['str_suf'] = temp
        # FIX: the full-set checks had the arguments swapped -- the prefix
        # check was run against the suffix inventory and vice versa.
        temp = check_prefix(word, full_prefixes_set)
        if temp:
            feats['ful_pre'] = temp
        temp = check_suffix(word, full_suffixes_set)
        if temp:
            feats['ful_suf'] = temp
        # Phoneme sequence follows the ':'.
        line = line.split(":")[1].strip()
        syllables = line.split(" ")
        l = []
        for i in syllables:
            # Strip the trailing stress digit from vowel phonemes.
            l.append(i if not (i[-1].isdigit()) else i[:-1])
        # N-gram presence features (Counter over a set => all counts are 1).
        feats.update(Counter({''.join(i) for i in get_ngrams(l)}))
        feats['len'] = len(syllables)
        out = ''
        for i in range(len(syllables)):
            syl = syllables[i]
            if syl[-1] in suffix:
                # Vowel phoneme: record position and its stress digit.
                vowelCount += 1
                vow_index.append(i)
                out += syl[-1]
                # if syl[-1]=="1":
                #     model_y.append(vowelCount)
                pattern += "V"
            else:
                pattern += "C"
        model_y.append(out)
        vowelCount = 0
        feats["pattern"] = pattern
        feats['vow_len'] = len(vow_index)
        for i in vow_index:
            # Onset / nucleus / coda context around each vowel.
            vowelCount += 1
            if i - 1 >= 0:
                feats["onset2_" + str(vowelCount)] = syllables[i - 1]
            if i + 1 < len(syllables):
                feats["coda1_" + str(vowelCount)] = syllables[i + 1]
            feats["nucleus_" + str(vowelCount)] = syllables[i][:-1]
        model_x.append(feats)
    # print(pd.DataFrame(model_x))
    # print(model_y)
    v = DictVectorizer(sparse=True)
    X = v.fit_transform(model_x)
    classifier = LogisticRegression(penalty='l2', class_weight='balanced')
    classifier.fit(X, model_y)
    # Persist both the classifier and the fitted vectorizer, in order.
    with open(classifier_file, 'wb') as f:
        pickle.dump(classifier, f)
        pickle.dump(v, f)
sd_y = stats.stdev(y) for observation in x: score_x.append((observation - mean_x) / sd_x) for obseervation in y: score_y.append((observation - mean_y) / sd_y) return (sum([i * j for i, j in zip(score_x, score_y)])) / (n - 1) print(pearson(x, y)) ################# staff = [{ 'name': 'Steve Miller', 'age': 33. }, { 'name': 'Lyndon Jones', 'age': 12. }, { 'name': 'Baxter Morth', 'age': 18. }] vec = DictVectorizer() vec.fit_transform(staff).toarray() print(vec.get_feature_names())
class ImitationLearner(object):
    """DAgger-style imitation learner over a transition system."""

    # initialize the classifier to be learned
    def __init__(self):
        # Any classifier could be used here
        self.model = LogisticRegression()
        self.vectorizer = DictVectorizer()
        self.labelEncoder = LabelEncoder()

    # this function predicts an instance given the state
    # state keeps track the various actions taken
    # it does not change the instance in any way,
    # it does change the state
    # the predicted structured output is returned in the end
    def predict(self, structured_instance, state=None, expert_policy_prob=0.0):
        if state == None:
            state = self.transitionSystem(
                structured_instance=structured_instance)

        # predict all remaining actions
        # if we do not have any actions we are done
        while len(state.agenda) > 0:
            # for each action
            # pop it from the queue
            current_action = state.agenda.popleft()
            # extract features and add them to the action
            # (even for the optimal policy, it doesn't need the features but they are needed later on)
            current_action.features = state.extractFeatures(
                structured_instance=structured_instance,
                action=current_action)
            # the first condition is to avoid un-necessary calls to random which give me reproducibility headaches
            if (expert_policy_prob == 1.0) or (expert_policy_prob > 0.0 and
                                               random.random() <
                                               expert_policy_prob):
                # Follow the expert (gold-standard) policy.
                current_action.label = state.expert_policy(
                    structured_instance, current_action)
            else:
                # predict (probably makes sense to parallelize across instances)
                # vectorize the features:
                vectorized_features = self.vectorizer.transform(
                    current_action.features)
                # predict using the model
                normalized_label = self.model.predict(vectorized_features)
                # get the actual label (returns an array, get the first and only element)
                current_action.label = self.labelEncoder.inverse_transform(
                    normalized_label)[0]

            # add the action to the state making any necessary updates
            state.updateWithAction(current_action, structured_instance)

        # OK return the final state reached
        return state

    class params(object):
        # Training hyper-parameters: expert-policy decay rate and the
        # number of DAgger iterations.
        def __init__(self):
            self.learningParam = 0.1
            self.iterations = 40

    def train(self, structuredInstances, params):
        """Run DAgger: roll out with a decaying expert policy, relabel the
        visited states with the expert, and fit the classifier on the
        aggregate dataset."""
        # create the dataset
        trainingFeatures = []
        trainingLabels = []

        # for each iteration
        for iteration in range(params.iterations):
            # set the expert policy prob
            expertPolicyProb = pow(1 - params.learningParam, iteration)
            print("Iteration:" + str(iteration) + ", expert policy prob:" +
                  str(expertPolicyProb))
            for structuredInstance in structuredInstances:
                # so we obtain the predicted output and the actions taken are in state
                # this prediction uses the gold standard since we need this info for the expert policy actions
                final_state = self.predict(
                    structuredInstance, expert_policy_prob=expertPolicyProb)

                # initialize a second state to avoid having to roll-back
                stateCopy = self.transitionSystem(
                    structured_instance=structuredInstance)
                # The agenda seems to initialized fine
                for action in final_state.actionsTaken:
                    # DAgger just ask the expert
                    stateCopy.agenda.popleft()
                    expert_action_label = stateCopy.expert_policy(
                        structuredInstance, action)
                    # add the labeled features to the training data
                    trainingFeatures.append(action.features)
                    trainingLabels.append(expert_action_label)
                    # take the original action chosen to proceed
                    stateCopy.updateWithAction(action, structuredInstance)

        # OK, let's save the training data and learn some classifiers
        # vectorize the training data collected
        training_data = self.vectorizer.fit_transform(trainingFeatures)
        # encode the labels
        encoded_labels = self.labelEncoder.fit_transform(trainingLabels)
        # train
        self.model.fit(training_data, encoded_labels)
def main_gp():
    """Gaussian-process hyperparameter search over HSearchEnv runs.

    Encodes the hyper definitions through a DictVectorizer (one-hot for
    categorical options), builds GP bounds per encoded feature, then loops:
    fit a Bayesian optimizer on all prior runs stored in the database and
    evaluate one new candidate per iteration (max_iter=1 so parallel
    machines sharing the database can merge results between runs).
    """
    import gp, GPyOpt
    from sklearn.feature_extraction import DictVectorizer

    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--agent', type=str, default='ppo_agent',
                        help="Agent to use (ppo_agent|dqn_agent|etc)")
    parser.add_argument(
        '-g', '--gpu_split', type=float, default=1,
        help="Num ways we'll split the GPU (how many tabs you running?)")
    parser.add_argument('-n', '--net_type', type=str, default='lstm',
                        help="(lstm|conv2d) Which network arch to use")
    parser.add_argument(
        '--guess', action="store_true", default=False,
        help="Run the hard-coded 'guess' values first before exploring")
    parser.add_argument(
        '--gpyopt', action="store_true", default=False,
        help="Use GPyOpt library, or use basic sklearn GP implementation? GpyOpt shows more promise, but has bugs.")
    args = parser.parse_args()

    # Encode features
    hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
    hypers_, hardcoded = hsearch.hypers, hsearch.hardcoded
    hypers_ = {k: v for k, v in hypers_.items() if k not in hardcoded}
    hsearch.close()

    # Build a matrix of features, length = max feature size
    max_num_vals = 0
    for v in hypers_.values():
        max_num_vals = max(max_num_vals, len(v['vals']))
    empty_obj = {k: None for k in hypers_}
    mat = pd.DataFrame([empty_obj.copy() for _ in range(max_num_vals)])
    for k, hyper in hypers_.items():
        for i, v in enumerate(hyper['vals']):
            mat.loc[i, k] = v
    mat.ffill(inplace=True)

    # Above is Pandas-friendly stuff, now convert to sklearn-friendly &
    # pipe through one-hot encoding (DictVectorizer expands categoricals
    # into "attr=val" feature names).
    vectorizer = DictVectorizer()
    vectorizer.fit(mat.T.to_dict().values())
    feat_names = vectorizer.get_feature_names()

    # Map TensorForce actions to GPyOpt-compatible `domain`
    bounds = []
    for k in feat_names:
        hyper = hypers_.get(k, False)
        # FIX: reset per feature. Previously these were only assigned inside
        # `if hyper:`, so a one-hot column (not present in hypers_) that
        # followed a bounded hyper inherited stale bounded/min_/max_ values
        # (or raised NameError on the very first non-hyper feature).
        bounded, min_, max_ = False, None, None
        if hyper:
            bounded, min_, max_ = (hyper['type'] == 'bounded',
                                   min(hyper['vals']), max(hyper['vals']))
        if args.gpyopt:
            b = {'name': k, 'type': 'discrete', 'domain': (0, 1)}
            if bounded:
                b.update(type='continuous', domain=(min_, max_))
        else:
            b = [min_, max_] if bounded else [0, 1]
        bounds.append(b)

    def hypers2vec(obj):
        # Vectorize one hyper dict: skip hardcoded keys, bools -> floats,
        # None/falsy numerics -> 0.
        h = dict()
        for k, v in obj.items():
            if k in hardcoded:
                continue
            if type(v) == bool:
                h[k] = float(v)
            else:
                h[k] = v or 0.
        return vectorizer.transform(h).toarray()[0]

    def vec2hypers(vec):
        # Reverse the encoding
        # https://stackoverflow.com/questions/22548731/how-to-reverse-sklearn-onehotencoder-transform-to-recover-original-data
        # https://github.com/scikit-learn/scikit-learn/issues/4414
        if not args.gpyopt:
            vec = [vec]  # gp.py passes as flat, GPyOpt as wrapped
        # FIX: renamed from `reversed`, which shadowed the builtin.
        decoded = vectorizer.inverse_transform(vec)[0]
        obj = {}
        for k, v in decoded.items():
            if '=' not in k:
                obj[k] = v
                continue
            attr, val = k.split('=')[0], k.split('=')[1]
            # FIX: dedup on the attribute name. The original tested
            # `if k in obj`, which never matched (k contains '=', obj keys
            # do not), so the winner search ran redundantly per option.
            if attr in obj:
                continue  # we already handled this x=y logic (below)
            # Find the winner (max) option for this key
            score = v
            for k2, score2 in decoded.items():
                if k2.startswith(attr + '=') and score2 > score:
                    score, val = score2, k2.split('=')[1]
            obj[attr] = val
        # Bools come in as floats. Also, if the result is False they don't
        # come in at all! So we iterate hypers now instead of nesting this
        # logic in the decoded-iteration above.
        for k, v in hypers_.items():
            if v['type'] == 'bool':
                obj[k] = bool(round(obj.get(k, 0.)))
        return obj

    # Specify the "loss" function (which we'll maximize) as a single
    # rl_hsearch instantiate-and-run.
    def loss_fn(params):
        hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
        reward = hsearch.execute(vec2hypers(params))
        hsearch.close()
        return [reward]

    while True:
        # Pull every prior run (possibly from other machines) as pretraining.
        conn = data.engine.connect()
        sql = "SELECT hypers, reward_avg FROM runs WHERE flag=:f"
        runs = conn.execute(text(sql), f=args.net_type).fetchall()
        conn.close()
        X, Y = [], []
        for run in runs:
            X.append(hypers2vec(run.hypers))
            Y.append([run.reward_avg])
        print_feature_importances(X, Y, feat_names)

        if args.guess:
            guesses = {k: v['guess'] for k, v in hypers_.items()}
            X.append(hypers2vec(guesses))
            Y.append([None])
            args.guess = False

        if args.gpyopt:
            pretrain = {'X': np.array(X), 'Y': np.array(Y)} if X else {}
            opt = GPyOpt.methods.BayesianOptimization(
                f=loss_fn, domain=bounds, maximize=True, **pretrain)
            # using max_iter=1 because of database setup. Normally you'd go until convergence, but since we're using
            # a database for the runs we can parallelize runs across machines (connected to the same database). Then
            # between each run we can grab the result from the other machines and merge with our own; so only run
            # once, reset the model-fitting w/ the full database (which may have grown), and repeat
            opt.run_optimization(max_iter=1)
        else:
            gp.bayesian_optimisation2(n_iters=1, loss_fn=loss_fn,
                                      bounds=np.array(bounds),
                                      x_list=X, y_list=Y)
__author__ = 'davidoregan'

import numpy as np
from sklearn import svm
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction import DictVectorizer
# FIX: removed the duplicate `import numpy as np` that appeared twice.
from ast import literal_eval

# Vectorizer kept for the (currently disabled) fit_transform at the bottom.
vec = DictVectorizer()

mydata2 = pd.read_csv('output.csv')

# The file presumably holds comma-separated "key: value" pairs without the
# surrounding braces (hence the wrapping); literal_eval parses it safely,
# unlike eval. TODO confirm the file format.
with open("SVM's/output.csv") as f:
    dic = literal_eval('{' + f.read() + '}')

# FIX: Python 3 print function (was a Python 2 `print dic` statement,
# inconsistent with the print() calls used elsewhere in this file).
print(dic)

#pos_vectorized = vec.fit_transform(mydata2)
#缺失值处理 x["age"].fillna(x["age"].mean(), inplace=True) #转换成字典 x = x.to_dict(orient="records") #数据集划分 from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22) #字典特征抽取 from sklearn.feature_extraction import DictVectorizer transfer = DictVectorizer() x_train = transfer.fit_transform(x_train) x_test = transfer.transform(x_test) #决策树预估器: from sklearn.tree import DecisionTreeClassifier, export_graphviz estimator = DecisionTreeClassifier(criterion="entropy", max_depth=8) estimator.fit(x_train, y_train) #模型评估: y_predict = estimator.predict(x_test) print("y_predict:\n", y_predict) print("直接比对真实值和预测值:\n", y_test == y_predict) score = estimator.score(x_test, y_test) print("准确率:\n", score) #可视化决策树 ## http://webgraphviz.com/
print(df_encoded[:5, :]) # Print the shape of the original DataFrame print(df.shape) # Print the shape of the transformed array print(df_encoded.shape) ------------------------------------------------------------- # Import DictVectorizer from sklearn.feature_extraction import DictVectorizer # Convert df into a dictionary: df_dict df_dict = df.to_dict("records") # Create the DictVectorizer object: dv dv = DictVectorizer() # Apply dv on df: df_encoded df_encoded = dv.fit_transform(df_dict) # Print the resulting first five rows print(df_encoded[:5,:]) # Print the vocabulary print(dv.vocabulary_) ------------------------------------------------------- # Import necessary modules from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline from sklearn.model_selection import cross_val_score
traindata = pd.read_csv(this_folder + "/insurance-train.csv")
testdata = pd.read_csv(this_folder + "/insurance-test.csv")

# Feature columns shared by the train and test sets.
feature_cols = [
    'Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium', 'Policy_Sales_Channel',
    'Vintage'
]
X_train = traindata[feature_cols]
Y_train = traindata['Response']
X_test = testdata[feature_cols]

# One-hot encode the categorical columns via DictVectorizer.
# FIX: orient="records" -- the abbreviated "record" was deprecated in
# pandas 1.4 and removed in pandas 2.0.
vec = DictVectorizer()
X_train = vec.fit_transform(X_train.to_dict(orient="records"))
X_test = vec.transform(X_test.to_dict(orient="records"))

gnb = GaussianNB()
gnb.fit(X_train.toarray(), Y_train)  # GaussianNB requires a dense array
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)  # MultinomialNB accepts the sparse matrix

# Predict with each model and write one submission file apiece.
Y_test1 = gnb.predict(X_test.toarray())
output = pd.DataFrame({'id': testdata['id'], 'Response': Y_test1})
output.to_csv('Bayes_gnb.csv', index=False)

Y_test2 = mnb.predict(X_test.toarray())
output = pd.DataFrame({'id': testdata['id'], 'Response': Y_test2})
output.to_csv('Bayes_mnb.csv', index=False)
def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9,
            use_divrank=False, divrank_alpha=0.25):
    '''Score sentences by graph centrality (LexRank / DivRank).

    Args:
        sentences: list of sentence strings (Japanese; segmented with
            tools.word_segmenter_ja).
        continuous: if True, apply continuous LexRank -- edges carry the
            cosine similarity as weight instead of a binary link.
        sim_threshold: similarity cutoff for linking sentences when
            ``continuous`` is False.
        alpha: damping factor of PageRank / DivRank.
        use_divrank: rank with DivRank instead of PageRank.
        divrank_alpha: strength of the DivRank self-link [0.0-1.0]
            (not the damping factor; see divrank.py).

    Returns:
        tuple of (scores, sim_mat) where scores maps sentence index ->
        centrality score and sim_mat is the sentence-by-sentence cosine
        similarity matrix.

    Reference:
        Erkan & Radev, "LexRank: graph-based lexical centrality as salience
        in text summarization" (section 3).
        http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    '''
    # Pick the ranking algorithm and assemble its keyword arguments.
    ranker_params = {'max_iter': 1000}
    if use_divrank:
        ranker = divrank_scipy
        ranker_params['alpha'] = divrank_alpha
        ranker_params['d'] = alpha
    else:
        ranker = networkx.pagerank_scipy
        ranker_params['alpha'] = alpha

    # One term-frequency Counter per sentence, then vectorize them all.
    tf_per_sentence = [collections.Counter(tools.word_segmenter_ja(sent))
                       for sent in sentences]
    tf_vectorizer = DictVectorizer(sparse=True)
    tf_vectors = tf_vectorizer.fit_transform(tf_per_sentence)

    # Pairwise cosine similarity between all sentences.
    sim_mat = 1 - pairwise_distances(tf_vectors, tf_vectors, metric='cosine')

    # Decide which sentence pairs to link.
    if continuous:
        linked_rows, linked_cols = numpy.where(sim_mat > 0)
    else:
        linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)

    # Build the directed similarity graph over sentence indices.
    graph = networkx.DiGraph()
    graph.add_nodes_from(range(tf_vectors.shape[0]))
    for row, col in zip(linked_rows, linked_cols):
        if row == col:
            continue  # no self-links
        edge_weight = sim_mat[row, col] if continuous else 1.0
        graph.add_edge(row, col, weight=edge_weight)

    return ranker(graph, **ranker_params), sim_mat
# The first row of the CSV holds the column headers.
headers = next(reader)
print(headers)

featureList = []  # one {header: value} dict per data row
labelList = []    # class label taken from the last column of each row

for row in reader:
    labelList.append(row[len(row) - 1])
    rowDict = {}
    # Columns 1..len-2 become features; column 0 is skipped -- presumably
    # an ID column, and the last column is the label. TODO confirm.
    for i in range(1, len(row) - 1):
        rowDict[headers[i]] = row[i]
    featureList.append(rowDict)
print(featureList)

# One-hot encode the categorical features into a dense matrix.
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
print('dummyX: ' + str(dummyX))
print(vec.get_feature_names())
print('labelList: ' + str(labelList))

# vectorize class labels
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print('dummyY: ' + str(dummyY))

# Using decision tree for classification
# clf = tree.DecisionTreeClassifier()
# NOTE(review): this chunk begins mid-loop -- the enclosing `for` over
# sentences and the setup of transitions/stack/graph are outside this view.
transitions.append(trans)
x_templist.append(current_dictX)
y_templist.append(current_Y)
stack, graph = transition.empty_stack(stack, graph)

# Write the predicted heads back into each word of the sentence.
for word in sentence:
    word['head'] = graph['heads'][word['id']]

# NOTE(review): values were appended to x_templist/y_templist above, but
# x_temp_list/y_temp_list (different spelling) are extended here -- likely
# a half-done rename or a bug. Confirm which names are actually defined.
x_list.extend(x_temp_list)
y_list.extend(y_temp_list)

print("Encoding the features and classes...")
# Vectorize the feature matrix and carry out a one-hot encoding
vec = DictVectorizer(sparse=True)
X = vec.fit_transform(x_list)
# The statement below will swallow a considerable memory
# X = vec.fit_transform(X_dict).toarray()
# print(vec.get_feature_names())
y, nbr_to_class, classes_to_nbr = encode_classes(y_list)

print("Training the model...")
classifier = linear_model.LogisticRegression(penalty='l2', dual=True,
                                             solver='liblinear')
model = classifier.fit(X, y)
print(model)
print('Predicting')