def __init__(self, lemmatization=False):
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False

    self.sampler = InstanceHardnessThreshold(random_state=0)

    feature_extractors = [
        bug_features.has_str(),
        bug_features.has_regression_range(),
        bug_features.severity(),
        bug_features.keywords(),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.patches(),
        bug_features.landings(),
        bug_features.product(),
        bug_features.component(),
        bug_features.is_mozillian(),
        bug_features.bug_reporter(),
        bug_features.blocked_bugs_number(),
        bug_features.priority(),
        bug_features.has_cve_in_alias(),
        bug_features.comment_count(),
        bug_features.comment_length(),
        bug_features.reporter_experience(),
        bug_features.number_of_bug_dependencies(),
    ]

    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.hex(),
        feature_cleanup.dll(),
        feature_cleanup.synonyms(),
        feature_cleanup.crash(),
    ]

    self.extraction_pipeline = Pipeline([
        (
            "bug_extractor",
            bug_features.BugExtractor(
                feature_extractors,
                cleanup_functions,
                rollback=True,
                rollback_when=self.rollback,
            ),
        ),
        (
            "union",
            ColumnTransformer([
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(min_df=0.0001), "title"),
                (
                    "comments",
                    self.text_vectorizer(min_df=0.0001),
                    "comments",
                ),
            ]),
        ),
    ])

    self.clf = xgboost.XGBClassifier(n_jobs=16)
    self.clf.set_params(predictor="cpu_predictor")
for row in allLikesLS:
    aDictLikes2[row[0]].append(row[1])

combDICT = {}
for uid in unqLikesUIDs:
    tmpDICT = {}
    tmpLS = aDictLikes2[uid]
    for row in tmpLS:
        tmpDICT[str(row)] = 1
    combDICT[uid] = tmpDICT

tryTHIS = []
for uid in unqLikesUIDs:
    tryTHIS.append(combDICT[uid])

v = DictVectorizer()
likesMAT = v.fit_transform(tryTHIS)

# Drop the intermediate structures to reclaim memory.
del globals()['likes']
del globals()['likesUIDs']
del globals()['likesLIDs']
del globals()['lsLikesUIDs']
del globals()['lsLikesLIDs']
del globals()['setLikesUIDs']
del globals()['setLikesLIDs']
del globals()['allLikesLS']
del globals()['aDictLikes2']
del globals()['aUID']
del globals()['row']
del globals()['combDICT']
del globals()['uid']
from sklearn.metrics import precision_score
from sklearn import linear_model
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE

allegations = list()
officers = dict()
complainants = dict()
features = DictVectorizer()
results = []

#################################################### REFERENCE VARS
# column numbers for allegation sheet
id_colnum = 0
complainantid_colnum = 1
officerid_colnum = 2
allcat_colnum = 4
result_colnum = 11
investigator_colnum = 22
# column numbers for officer sheet
id_colnum = 0  # note: reuses the name from the allegation sheet above
gender_colnum = 4
race_colnum = 6
def get_similarity_model():
    model = Pipeline([("vect", DictVectorizer()),
                      ("neigh", NearestNeighbors(n_neighbors=6))])
    return model
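# A minimal usage sketch for the factory above (the toy feature dicts and the
# n_neighbors=2 query size are illustrative assumptions, not part of the
# original code). Pipeline does not forward kneighbors(), so the query is
# pushed through each named step explicitly.
docs = [{"rock": 3, "pop": 1}, {"rock": 1, "jazz": 2}, {"pop": 4}]
model = get_similarity_model()
model.fit(docs)
query = model.named_steps["vect"].transform([{"rock": 2}])
distances, indices = model.named_steps["neigh"].kneighbors(query, n_neighbors=2)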
def __init__(self):
    self.vectorizer = DictVectorizer()
def __init__(self, lemmatization=False, historical=False):
    BugModel.__init__(self, lemmatization)

    self.sampler = BorderlineSMOTE(random_state=0)

    feature_extractors = [
        bug_features.has_str(),
        bug_features.severity(),
        # Ignore keywords that would make the ML completely skewed
        # (we are going to use them as 100% rules in the evaluation phase).
        bug_features.keywords({"regression", "talos-regression", "feature"}),
        bug_features.is_coverity_issue(),
        bug_features.has_crash_signature(),
        bug_features.has_url(),
        bug_features.has_w3c_url(),
        bug_features.has_github_url(),
        bug_features.whiteboard(),
        bug_features.blocked_bugs_number(),
        bug_features.ever_affected(),
        bug_features.affected_then_unaffected(),
        bug_features.product(),
        bug_features.component(),
    ]

    if historical:
        feature_extractors += [
            bug_features.had_severity_enhancement(),
            bug_features.patches(),
            bug_features.landings(),
        ]

    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    self.extraction_pipeline = Pipeline(
        [
            (
                "bug_extractor",
                bug_features.BugExtractor(feature_extractors, cleanup_functions),
            ),
            (
                "union",
                ColumnTransformer(
                    [
                        ("data", DictVectorizer(), "data"),
                        ("title", self.text_vectorizer(min_df=0.001), "title"),
                        (
                            "first_comment",
                            self.text_vectorizer(min_df=0.001),
                            "first_comment",
                        ),
                        (
                            "comments",
                            self.text_vectorizer(min_df=0.001),
                            "comments",
                        ),
                    ]
                ),
            ),
        ]
    )

    self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    self.clf.set_params(predictor="cpu_predictor")
CSHeadline_TFIDF = json.load(
    open('./txt_CosineSimilarity/TFIDF_forCS_headline.txt', 'r', encoding="UTF-8"))
CSBody_TFIDF = json.load(
    open('./txt_CosineSimilarity/TFIDF_forCS_body.txt', 'r', encoding="UTF-8"))
CosineSimilarity = json.load(
    open('./txt_CosineSimilarity/TFIDF_CosineSimilarity.txt', 'r', encoding="utf-8"))

data = pd.read_csv("merged_traintest.csv")
# y = data['Stance_2cat']
y = data['Stance_4cat']  # (75385, 1)
# print("y", y)

dict_vec = DictVectorizer(sparse=True)
X1 = dict_vec.fit_transform(CSHeadline_TFIDF.values())
X1 = pd.DataFrame(X1.toarray(), columns=dict_vec.get_feature_names())
# print(X1)  # (75385 rows x 3293 columns)
X2 = dict_vec.fit_transform(CSBody_TFIDF.values())
X2 = pd.DataFrame(X2.toarray(), columns=dict_vec.get_feature_names())
# print(X2)  # (75385 rows x 4207 columns)
X = pd.concat([X1, X2], axis=1)
del X1
del X2
X3 = pd.DataFrame.from_dict(CosineSimilarity, orient='index',
                            columns=['CosineSimilarity'])  # (75385 rows, 1 column)
def fit_feature_dict(self, sequences):
    train_data = self.get_sequence_features(sequences)
    self.feature2matrix = DictVectorizer()
    self.feature2matrix.fit(train_data)
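# Hedged companion sketch (not in the original source): the transform step
# that mirrors fit_feature_dict above. The method name transform_sequences is
# an assumption; features unseen during fit are silently dropped by
# DictVectorizer.transform.
def transform_sequences(self, sequences):
    data = self.get_sequence_features(sequences)
    return self.feature2matrix.transform(data)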
from sklearn.metrics import accuracy_score

### Step 1 ###
testData = pd.read_csv("testing.csv")
trainingData = pd.read_csv("training.csv")
Xtest = testData.drop("target", axis=1)
Ytest = testData["target"]
Xtrain = trainingData.drop("target", axis=1)
Ytrain = trainingData["target"]

### Step 2 ###
Xtrain_dict = Xtrain.to_dict("records")
Xtest_dict = Xtest.to_dict("records")
dv = DictVectorizer()
Xtrain_encoded = dv.fit_transform(Xtrain_dict)
Xtest_encoded = dv.transform(Xtest_dict)
clf = DecisionTreeClassifier()
score = np.mean(cross_val_score(clf, Xtrain_encoded, Ytrain))
print(score)

### Step 3 ###
pipe_dv_dtc = make_pipeline(DictVectorizer(), DecisionTreeClassifier())
pipe_dv_dtc.fit(Xtrain_dict, Ytrain)
pred = pipe_dv_dtc.predict(Xtest_dict)
print(accuracy_score(Ytest, pred))
parser = argparse.ArgumentParser()
parser.add_argument(
    '-p', '--persist', action='store_true',
    help='Specify whether to make the model persistent in models/*')
parser.add_argument(
    '--noval', action='store_true',
    help="Specify whether to evaluate the model's performance")
args = parser.parse_args()

forest_clf = RandomForestRegressor(min_samples_leaf=5, random_state=42)
multi_clf = MultiOutputRegressor(forest_clf)
full_pipeline = Pipeline([('filterer', DictFilterer(exclude_u_sub)),
                          ('vectorizer', DictVectorizer(sparse=True)),
                          ('selectKBest', SelectKBest(multi_f_classif, k=1000)),
                          ('scaler', StandardScaler(with_mean=False)),
                          ('framer', ToSparseDF()),
                          ('clf', multi_clf)])

if __name__ == '__main__':
    from tables import Comment, User, db
    from collections import defaultdict

    comment_groups = Comment.query.with_entities(
        Comment.author, Comment.subreddit, db.func.count(Comment.subreddit))\
        .group_by(Comment.author, Comment.subreddit)\
        .all()
    subreddit_counts = defaultdict(dict)
    for author, subreddit, count in comment_groups:
    return self

def transform(self, X):
    return [{word: True for word in word_tokenize(document)}
            for document in X]

# Load the tweets. We only care about the message content, so we extract and
# store just their 'text' values:
tweets = []
with open(input_filename) as inf:
    for line in inf:
        if len(line.strip()) == 0:
            continue
        tweets.append(json.loads(line)['text'])

# Load the tweets' labels.
with open(labels_filename) as inf:
    labels = json.load(inf)

# Build the pipeline that ties all the parts together. It has three steps:
# the NLTKBOW transformer we created, a DictVectorizer transformer, and a
# BernoulliNB classifier.
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('bag-of-words', NLTKBOW()),
                     ('vectorizer', DictVectorizer()),
                     ('naive-bayes', BernoulliNB())])
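# An illustrative way to evaluate the pipeline above (not from the original
# snippet): score it with cross-validation, assuming the labels are binary
# and aligned one-to-one with the tweets.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(pipeline, tweets, labels, scoring='f1')
print("Mean F1: {:.3f}".format(scores.mean()))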
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4)
             for feat_num in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = {label: num for num, label in
                 enumerate(sorted({label for label in labels
                                   if not isinstance(label, (int, float))}))}
    # Add fake item to vectorizer for None
    label_map[None] = '00000'

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(convert_dir,
                          '{}_{}{}'.format(feature_name_prefix, i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)]
                 for j in range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs, label_map=label_map).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
                                                     to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    else:
        Writer.for_path(train_path, train_fs).write()
train_set = []
for index, row in sub_train.iterrows():
    wifi_dict = {}
    for wifi in row.wifi_infos.split(';'):
        bssid, signal, flag = wifi.split('|')
        wifi_dict[bssid] = int(signal)
    train_set.append(wifi_dict)

test_set = []
for index, row in sub_test.iterrows():
    wifi_dict = {}
    for wifi in row.wifi_infos.split(';'):
        bssid, signal, flag = wifi.split('|')
        wifi_dict[bssid] = int(signal)
    test_set.append(wifi_dict)

v = DictVectorizer(sparse=False, sort=False)
train_set = v.fit_transform(train_set)
test_set = v.transform(test_set)
train_set[train_set == 0] = np.NaN
test_set[test_set == 0] = np.NaN

sub_train = pd.concat([sub_train.reset_index(), pd.DataFrame(train_set)], axis=1)
sub_test = pd.concat([sub_test.reset_index(), pd.DataFrame(test_set)], axis=1)

lbl = LabelEncoder()
lbl.fit(list(sub_train['shop_id'].values))
sub_train['label'] = lbl.transform(list(sub_train['shop_id'].values))
num_class = sub_train['label'].max() + 1
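# Why the zeros become NaN above: DictVectorizer encodes a feature that is
# absent from a dict as 0, which here would be indistinguishable from a real
# signal reading. A tiny illustrative check (the values are made up):
demo = DictVectorizer(sparse=False).fit_transform([{'a': -60}, {'b': -70}])
# demo == [[-60.,   0.], [  0., -70.]] -- the zeros mark BSSIDs never seen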
def add_sample_metadata_as_features(exp: Experiment, fields, sparse=None, inplace=False):
    '''Add covariates from sample metadata to the data table as features for machine learning.

    This will convert the columns of categorical strings using a one-hot
    encoding scheme and add them into the data table as new features.

    .. note:: This is only for numeric and/or nominal covariates in sample
       metadata. If you want to add an ordinal column as a feature, use
       `pandas.Series.map` to convert the ordinal column to a numeric column
       first.

    Examples
    --------
    >>> exp = Experiment(np.array([[1,2], [3, 4]]), sparse=False,
    ...                  sample_metadata=pd.DataFrame({'category': ['A', 'B'],
    ...                                                'ph': [6.6, 7.7]},
    ...                                               index=['s1', 's2']),
    ...                  feature_metadata=pd.DataFrame({'motile': ['y', 'n']}, index=['otu1', 'otu2']))
    >>> exp
    Experiment with 2 samples, 2 features

    Let's add the columns of `category` and `ph` as features into data table:

    >>> new = exp.add_sample_metadata_as_features(['category', 'ph'])
    >>> new
    Experiment with 2 samples, 5 features
    >>> new.feature_metadata
               motile
    category=A    NaN
    category=B    NaN
    ph            NaN
    otu1            y
    otu2            n
    >>> new.data  # doctest: +SKIP
    array([[1. , 0. , 6.6, 1. , 2. ],
           [0. , 1. , 7.7, 3. , 4. ]])

    Parameters
    ----------
    fields : list of str
        the column names in the sample metadata. These columns will be
        converted to one-hot numeric code and then concatenated to the data
        table
    sparse : bool or None, optional
        use sparse or dense data matrix. When it is ``None``, it will follow
        the same sparsity of the current data table in the
        :class:`.Experiment` object
    inplace : bool
        change the :class:`.Experiment` object in place or return a copy of
        the changed object.

    Returns
    -------
    Experiment

    See Also
    --------
    sklearn.preprocessing.OneHotEncoder
    '''
    logger.debug('Add the sample metadata {} as features'.format(fields))
    if inplace:
        new = exp
    else:
        new = exp.copy()

    md = new.sample_metadata[fields]

    if sparse is None:
        sparse = new.sparse

    vec = DictVectorizer(sparse=sparse)
    encoded = vec.fit_transform(md.to_dict(orient='records'))

    if sparse:
        new.data = hstack((encoded, new.data), format='csr')
    else:
        new.data = np.concatenate([encoded, new.data], axis=1)
    # the order in the concatenation should be consistent with the data table
    new.feature_metadata = pd.concat(
        [pd.DataFrame(index=vec.get_feature_names()), new.feature_metadata])
    return new
def crossval(paths, annDir, eval_type, use_reach, relabeling,
             conservative_eval, limit_training, balance_dataset):
    ''' Puts it all together '''

    def in_neighborhood(datum, intervals):
        ''' Used to filter out a datum if it's not within the neighborhood of (min, max) '''
        for interval in intervals:
            minIx, maxIx = interval
            if datum.ctxIx >= minIx and datum.ctxIx <= maxIx:
                return True
        return False

    print "Parsing data"
    paths = set(paths)
    labels, features, data = parse_data(paths, annDir, use_reach, relabeling)

    # Group indexes by paper id
    groups = {p: [] for p in paths}
    for i, d in enumerate(data):
        groups[d.namespace].append(i)

    # Hack!! Drop papers that contributed no data.
    groups2 = {}
    for k, v in groups.iteritems():
        if len(v) != 0:
            groups2[k] = v
    groups = groups2

    print
    print "Cross-validation"
    print "Using %i papers" % len(groups2)
    if relabeling:
        print "Doing relabeling"
    if conservative_eval:
        print "Doing conservative evaluation"
    if limit_training:
        print "Limiting training data range"
    if balance_dataset:
        print "Balancing data set during training"
    if one_hit_all:
        print "One-hit-all"
    print "Total golden data: %i\tTotal expanded data: %i" % (
        len([d for d in data if d.golden]),
        len([d for d in data if not d.golden]))
    print

    # Only use the training data
    if limit_training:
        # Compute the range of the annotations
        intervals = defaultdict(list)
        k = 2
        for datum in data:
            if datum.golden:
                intervals[datum.namespace].append(
                    (datum.ctxIx - k, datum.ctxIx + k))

    # Make it a numpy array to index it more easily
    data = np.asarray(data)

    dv = DictVectorizer()
    dv.fit(features.values())

    # Build a feature vector and attach it to each datum
    vectors = {k: dv.transform(v) for k, v in features.iteritems()}

    c_results, p_results = [], []

    # Do the "cross-validation" only on those papers that have data
    for ix, path in enumerate(groups.keys()):
        print "Fold: %i" % (ix + 1)
        training_paths = paths - {path}

        X_train, X_test = [], []
        y_train, y_test = [], []
        data_train, data_test = [], []

        for datum in data:
            if datum.namespace in training_paths:
                if limit_training:
                    if not in_neighborhood(datum, intervals[datum.namespace]):
                        continue
                X_train.append(vectors[datum])
                y_train.append(labels[datum])
                data_train.append(datum)

        for datum in data:
            if datum.namespace not in training_paths:
                if conservative_eval:
                    if not datum.golden:
                        continue
                X_test.append(vectors[datum])
                y_test.append(labels[datum])
                data_test.append(datum)

        # Balance the dataset if necessary
        if balance_dataset:
            train_positive, train_negative = [], []
            for datum in data_train:
                if datum.label == 1:
                    train_positive.append(datum)
                else:
                    train_negative.append(datum)

            k = 4  # Ratio of negatives per positive for balancing
            size = len(train_positive) * k
            if size < len(train_negative):
                balanced_negatives = np.random.choice(
                    train_negative, size, replace=False).tolist()
            else:
                balanced_negatives = train_negative

            data_train = train_positive + balanced_negatives
            X_train = [vectors[datum] for datum in data_train]
            y_train = [labels[datum] for datum in data_train]

        p = len([d for d in data_train if d.label == 1])
        n = len([d for d in data_train if d.label == 0])
        r = n / float(p)
        print path
        print "Training data: %i positives\t%i negatives\t%f N:P ratio" % (p, n, r)

        p = len([d for d in data_test if d.label == 1])
        n = len([d for d in data_test if d.label == 0])
        r = n / float(p)
        print "Testing data: %i positives\t%i negatives\t%f N:P ratio" % (p, n, r)

        model_pred = machine_learning(vstack(X_train), y_train,
                                      vstack(X_test), y_test)
        policy_pred = policy(np.asarray(data_test))

        # One-hit-all approach
        if one_hit_all:
            ctx_types = list({d.ctxGrounded for d in data_test})
            ctx_types.sort()
            local_events = list({d.evt for d in data_test})
            local_events.sort()

            predicted_bag = set()
            for datum, prediction in it.izip(data_test, model_pred):
                if prediction == 1:
                    predicted_bag.add((datum.evt, datum.ctxGrounded))

            policy_bag = set()
            for datum, prediction in it.izip(data_test, policy_pred):
                if prediction == 1:
                    policy_bag.add((datum.evt, datum.ctxGrounded))

            truth_bag = set()
            for datum, prediction in it.izip(data_test, y_test):
                if prediction == 1:
                    truth_bag.add((datum.evt, datum.ctxGrounded))

            new_model_pred, new_policy_pred, new_truth = [], [], []
            for evt in local_events:
                for ctx in ctx_types:
                    if (evt, ctx) in predicted_bag:
                        new_model_pred.append(1)
                    else:
                        new_model_pred.append(0)

                    if (evt, ctx) in policy_bag:
                        new_policy_pred.append(1)
                    else:
                        new_policy_pred.append(0)

                    if (evt, ctx) in truth_bag:
                        new_truth.append(1)
                    else:
                        new_truth.append(0)

            y_test = new_truth
            model_pred = new_model_pred
            policy_pred = new_policy_pred
        ######################

        model_results = ClassificationResults("Model %s" % path, y_test, model_pred)
        policy_result = ClassificationResults("Policy %s" % path, y_test, policy_pred)
        print "Model scores: %s" % model_results
        print "Policy scores %s" % policy_result
        print

        c_results.append(model_results)
        p_results.append(policy_result)

    # return pd.Series(f1_diffs), model_f1s
    return c_results, p_results
def evaluate_combinations(X_train, Y_train, X_test, Y_test, model, w2v_vector):
    combination_results = dict()
    accuracy = []
    n_params = []
    caps = []
    pos = []
    NER = []
    context = []
    w2v = []
    F1 = []

    # Try every on/off combination of the five feature groups.
    comb_list = list(product([True, False], repeat=5))
    for i, each_comb in enumerate(comb_list):
        caps.append(each_comb[0])
        pos.append(each_comb[1])
        NER.append(each_comb[2])
        context.append(each_comb[3])
        w2v.append(each_comb[4])
        print("\ncaps : ", each_comb[0], " POS : ", each_comb[1],
              " NER : ", each_comb[2], " context : ", each_comb[3],
              " w2v : ", each_comb[4])

        train_dicts = make_feature_dicts(X_train, w2v_vector, token=True,
                                         caps=each_comb[0], pos=each_comb[1],
                                         NER=each_comb[2], context=each_comb[3],
                                         w2v=each_comb[4])
        vec = DictVectorizer()
        X_train_v = vec.fit_transform(train_dicts)

        clf = None
        if model == "SVM":
            clf = LinearSVC(C=0.1, random_state=123, class_weight="balanced",
                            max_iter=100, fit_intercept=True)
        elif model == "NN":
            clf = MLPClassifier(hidden_layer_sizes=(5, 10), solver='sgd',
                                learning_rate='adaptive',
                                activation='logistic', max_iter=50,
                                random_state=42)
        elif model == "MNB":
            clf = GaussianNB()
        # clf = LogisticRegression()
        clf.fit(X_train_v.toarray(), Y_train)
        """
        # checking feature weights
        for i, cls in enumerate(clf.classes_):
            print("\nFeature weights for class : ", cls, "\n")
            df = pd.DataFrame(data={"Features": vec.feature_names_,
                                    "weights": clf.coef_[i]})
            df = df.sort_values(axis=0, by='weights', ascending=False)
            print(df)
        """
        test_dicts = make_feature_dicts(X_test, w2v_vector, token=True,
                                        caps=each_comb[0], pos=each_comb[1],
                                        NER=each_comb[2], context=each_comb[3],
                                        w2v=each_comb[4])
        X_test_v = vec.transform(test_dicts)
        Y_preds = clf.predict(X_test_v.toarray())

        class_labels = ["Relevant", "Not Relevant", "Deceptive"]
        c_m = confusion_matrix(Y_test, Y_preds, labels=class_labels)
        cm_df = pd.DataFrame(c_m, index=class_labels, columns=class_labels)
        print('confusion matrix:\n%s\n' % str(cm_df))

        acc = accuracy_score(Y_test, Y_preds)
        print("Accuracy : ", acc)
        n_params.append(clf.coef_.size)
        accuracy.append(acc)
        f1 = calculate_f1(cm_df)
        print("f1: ", f1)
        F1.append(f1)

    df = pd.DataFrame(data={'F1': F1, 'Accuracy': accuracy,
                            'n_params': n_params, 'caps': caps, 'pos': pos,
                            'NER': NER, 'context': context, 'w2v': w2v})
    df = df[['F1', 'Accuracy', 'n_params', 'caps', 'pos', 'NER',
             'context', 'w2v']]
    df = df.sort_values(axis=0, by='F1', ascending=False)
    return df
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--clusters', type=int, default=5, required=False)
    parser.add_argument('--confidence', type=float, default=0.0, required=False)
    parser.add_argument('directory', nargs=1)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--kmeans', action='store_true')
    group.add_argument('--dbscan', action='store_true')
    args = parser.parse_args()

    confidence = args.confidence
    n_clusters = args.clusters
    directory = args.directory[0]

    documents = {}
    for filename in os.listdir(directory):
        if not filename.endswith('.labels'):
            continue
        with open(directory + '/' + filename, 'r') as f:
            documents[filename] = []
            for line in f.readlines():
                l_components = line.split('\t')
                conf = float(l_components[0])
                label = l_components[1][:-1]
                if conf > confidence:
                    documents[filename].append(label)

    v = DictVectorizer()
    dox = [{l: 1 for l in documents[d]} for d in documents]
    # Strip the '.labels' suffix (str.rstrip would strip characters, not the suffix).
    doc_names = [d[:-len('.labels')] for d in documents]
    X = v.fit_transform(dox)
    features = v.get_feature_names()

    if args.kmeans:
        km = KMeans(n_clusters=n_clusters)
        km.fit(X)

        # Sort cluster centers by proximity to centroid
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        closest_labels_dict = {i: "" for i in range(n_clusters)}
        for i in range(n_clusters):
            for ind in order_centroids[i, :6]:  # replace 6 with n words per cluster
                closest_labels_dict[i] += features[ind] + ", "
            closest_labels_dict[i] = closest_labels_dict[i].rstrip(', ')

        clusters = km.labels_.tolist()
        clusters_dict = {i: [] for i in range(n_clusters)}
        for c in range(len(clusters)):
            clusters_dict[clusters[c]].append(doc_names[c])

        print('<html>')
        print('<body>')
        print('<style>')
        print('img { height: 75px; }')
        print('h2 { font-family: sans-serif; } ')
        print('.box { max-width: 700px; }')
        print('</style>')
        print('<div class="box">')
        for k in clusters_dict:
            print('<h2>' + str(k) + ": " + closest_labels_dict[k] + '</h2>')
            for img in clusters_dict[k]:
                print('<img src="file://' + directory + '/' + img + '">')
        print('</div>')
        print('</body>')
        print('</html>')
    elif args.dbscan:
        raise NotImplementedError('DBSCAN clustering is not implemented yet')
def BasicNight():
    """Basic function, night: train one GradientBoostingClassifier per
    night-time question and persist each to disk."""
    # (column slices of the answers table, target column, pickle file)
    specs = [
        ([slice(2, 9)], 'L11P', 'Estbasic.pkl'),
        ([slice(2, 8), slice(10, 16)], 'L21P', 'EstL21.pkl'),
        ([slice(2, 8), slice(17, 22)], 'L22P', 'EstL22.pkl'),
        ([slice(2, 8), slice(23, 28)], 'L23P', 'EstL23.pkl'),
        ([slice(2, 8), slice(29, 34)], 'L24P', 'EstL24.pkl'),
        ([slice(45, 49)], 'L33P', 'EstL33.pkl'),
        ([slice(35, 39)], 'L31P', 'EstL31.pkl'),
        ([slice(40, 44)], 'L32P', 'EstL32.pkl'),
    ]
    for col_slices, target, model_file in specs:
        db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
        sql = "SELECT * FROM answers WHERE finished='1'"
        data = frame_query(sql, db)

        cols = []
        for s in col_slices:
            cols.extend(list(data.columns[s]))
        df_train = data[cols]
        Y_train = df_train[target].values
        del df_train[target]

        # ENCODING: one-hot encode the remaining columns
        X_tr = df_train.to_dict('records')
        enc = DictVectorizer(sparse=True)
        X_encoded_train = enc.fit_transform(X_tr)

        estimator = GradientBoostingClassifier()
        estimator.fit(X_encoded_train.toarray(), Y_train)
        joblib.dump(estimator, model_file)

    return 'success'
from sklearn.decomposition import LatentDirichletAllocation
import string
from nltk.stem.porter import PorterStemmer
import theano
from theano import tensor as T
from theano import function
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV


def clear_title(title, remove_stopwords):
    raw_text = BeautifulSoup(title, 'html.parser').get_text()
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    words = letters.lower().split()
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    return ' '.join(words)


dict_vec = DictVectorizer(sparse=False)

PATH_TO_ORIGINAL_DATA = '../datasets/'
data = pd.read_csv(PATH_TO_ORIGINAL_DATA + 'cleared_bugs', sep='\t')
selected_columns = ['Product', 'Component', 'Assignee', 'Summary', 'Changed']
data = data[selected_columns]

text = []
for title in data['Summary']:
    text.append(clear_title(title, True).split())
print text

from gensim.models import word2vec
model = word2vec.Word2Vec(text, workers=4, size=50, min_count=1, window=2)
model.wv.save_word2vec_format('summary.txt', binary=False)
PREFIX = 'data/'
train = pd.read_csv(PREFIX + 'train_items.csv')
train = train[train['price'] > 0].reset_index(drop=True)
test = pd.read_csv(PREFIX + 'test_items.csv')
sid = test.sample_id.values
del test['sample_id']

# In[16]:

# noinspection PyTypeChecker
vectorizer = make_union(
    on_field('name', tfidf_fabric()),
    on_field('text', tfidf_fabric()),
    on_field(['shipping', 'item_condition_id', 'category_name'],
             FunctionTransformer(to_records, validate=False),
             DictVectorizer()))

# In[17]:

cv = KFold(n_splits=10, shuffle=True, random_state=42)
train_ids, valid_ids = next(cv.split(train))
train, valid = train.iloc[train_ids], train.iloc[valid_ids]

# In[18]:

y_train = np.log1p(train['price'].values.reshape(-1, 1))
y_valid = valid['price'].values

X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
X_final = vectorizer.transform(preprocess(test)).astype(np.float32)
featureList = []  # list of dicts, one per instance
lableList = []    # list of labels

with open(r'E:\pycharm\ML\computer.csv', 'r') as f:
    r = csv.reader(f)
    header = next(r)  # header row
    for line in r:
        lableList.append(line[-1])
        dic = {}
        for i in range(1, len(line) - 1):
            dic[header[i]] = line[i]
        featureList.append(dic)
# pprint(featureList)

vec = DictVectorizer()  # one-hot encode the feature dicts
dummyX = vec.fit_transform(featureList).toarray()
# print(dummyX.shape)
# print(dummyX)
print(vec.get_feature_names())
# print(dummyX)

dummyY = preprocessing.LabelBinarizer().fit_transform(lableList)

# Use 'entropy' for information gain; the default criterion is 'gini'.
classifier = tree.DecisionTreeClassifier(criterion='entropy')
classifier.fit(dummyX, dummyY)
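# Hedged sketch (not in the original snippet): re-encode the first training
# instance with the fitted vectorizer and classify it, to confirm the
# pipeline round-trips.
first = vec.transform(featureList[:1]).toarray()
print(classifier.predict(first))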
def PredictionScoreLeaveOneOutSpecifyClassifier(X, y, limit, columnName,
                                                classifierNames, classifiers):
    from sklearn.metrics import f1_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.model_selection import LeaveOneOut
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    names = classifierNames
    outFile = open('output.txt', 'a')
    vec = DictVectorizer()
    for name, clf in zip(names, classifiers):
        try:
            loo = LeaveOneOut()
            loo.get_n_splits(X)

            y_test_all = []
            y_pred_all = []
            accuracy_total = 0
            count = 0
            for train_index, test_index in loo.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # Bag-of-words counts fit on the training fold only.
                count_vect = CountVectorizer()
                X_train_fit = count_vect.fit(X_train)
                X_train_counts = X_train_fit.transform(X_train)
                X_test_counts = X_train_fit.transform(X_test)

                # Re-weight the counts with TF-IDF.
                tfidf_transformer = TfidfTransformer()
                fit = tfidf_transformer.fit(X_train_counts)
                X_train_counts = fit.transform(X_train_counts)
                X_test_counts = fit.transform(X_test_counts)

                try:
                    clf.fit(X_train_counts.toarray(), y_train)
                    accuracy_total += clf.score(X_test_counts.toarray(), y_test)
                    count += 1
                    y_pred = clf.predict(X_test_counts.toarray())
                    y_pred_all.append(y_pred[0])
                    y_test_all.append(y_test[0])
                except BaseException as b:
                    print(b)

            f1 = f1_score(y_test_all, y_pred_all, average='weighted')
            precision = precision_score(y_test_all, y_pred_all,
                                        average='weighted')
            recall = recall_score(y_test_all, y_pred_all, average='weighted')
            line = (str(columnName) + "\t" + str(limit) + "\t" + str(name) +
                    "\t" + str(accuracy_total / count) + "\t" + str(f1) +
                    "\t" + str(precision) + "\t" + str(recall))
            print(line)
            outFile.write(line + "\n")
        except BaseException as b:
            print(b)
    outFile.close()
class MateSegmenter(object):
    """Class for performing discourse segmentation on constituency trees.

    """

    #: classifier object: default classification method
    DEFAULT_CLASSIFIER = LinearSVC(multi_class='ovr', class_weight='auto')

    #: str: path to default model to use in classification
    DEFAULT_MODEL = os.path.join(os.path.dirname(__file__), "data", "mate.model")

    #: pipeline object: default pipeline object used for classification
    DEFAULT_PIPELINE = Pipeline([('vectorizer', DictVectorizer()),
                                 ('var_filter', VarianceThreshold()),
                                 ('classifier', DEFAULT_CLASSIFIER)])

    def __init__(self, featgen=gen_features_for_segment, model=DEFAULT_MODEL):
        """Class constructor.

        """
        self.featgen = featgen
        self.pipeline = None
        self._update_model(model)

    def extract_features_from_corpus(self, dep_corpus, seg_corpus=None):
        all_features = []
        all_labels = []
        for text in sorted(dep_corpus.keys()):
            seg_forest = seg_corpus.get(text, None)
            features, labels = self.extract_features_from_text(
                dep_corpus[text], seg_forest=seg_forest)
            all_features.extend(features)
            all_labels.extend(labels)
        return all_features, all_labels

    def extract_features_from_text(self, dep_forest, seg_forest=None):
        features = []
        labels = []
        observations = get_observations(seg_forest, dep_forest)
        for sentence_index, address, dep_tree, class_ in sorted(observations):
            features.append(self.featgen(dep_tree, address))
            labels.append(class_)
        return features, labels

    def segment(self, dep_corpus, out_folder):
        for text, trees in dep_corpus.iteritems():
            print text
            discourse_tree = self.segment_text(trees)
            with open(out_folder + '/' + text + '.tree', 'w') as fout:
                fout.write(str(discourse_tree))

    def segment_text(self, dep_forest):
        features, _ = self.extract_features_from_text(dep_forest)
        predictions = self._predict(features)
        return self._segment_text(predictions, dep_forest)

    def _segment_text(self, predictions, parses):
        all_segments = []
        for sentence, dep_graph in enumerate(parses):
            # slice prediction vector
            sentence_length = dep_graph.length()
            sentence_predictions = predictions[:sentence_length]
            predictions = predictions[sentence_length:]
            # segment
            segments = self._segment_sentence(sentence_predictions, dep_graph)
            segment = segments[0][1]
            all_segments.append((sentence, segment))
        return DiscourseSegment(a_name='TEXT', a_leaves=all_segments)

    def _segment_sentence(self, sentence_predictions, dep_graph):
        if dep_graph.is_valid_parse_tree():
            # remove prediction annotations (just to be sure)
            dep_graph.deannotate(PREDICTION)
            # annotate dep_graph with sentence predictions
            dep_graph.annotate(sentence_predictions, PREDICTION)
            # call tree_segmenter
            segmenter = TreeSegmenter(a_type=DEPENDENCY)
            segments = segmenter.segment(dep_graph,
                                         a_predict=decision_function,
                                         a_word_access=word_access,
                                         a_strategy=GREEDY,
                                         a_root_idx=dep_graph.root[ADDRESS])
        else:
            # make a simple sentence segment for invalid parse trees
            leaves = [(i, word)
                      for i, (_, word) in enumerate(dep_graph.words(), 1)]
            dseg = DiscourseSegment(a_name=DEFAULT_SEGMENT, a_leaves=leaves)
            segments = [(0, dseg)]
        return segments

    def train(self, seg_corpus, dep_corpus, path=None):
        assert seg_corpus.keys() == dep_corpus.keys()
        features, labels = self.extract_features_from_corpus(
            dep_corpus, seg_corpus=seg_corpus)
        self._train(features, labels)
        if path is not None:
            joblib.dump(self.pipeline, path, compress=1, cache_size=1e9)

    def _train(self, features, labels):
        self.pipeline = MateSegmenter.DEFAULT_PIPELINE
        self.pipeline.fit(features, labels)

    def test(self, seg_corpus, dep_corpus):
        assert seg_corpus.keys() == dep_corpus.keys()
        features, labels = self.extract_features_from_corpus(
            dep_corpus, seg_corpus=seg_corpus)
        predicted_labels = self._predict(features)
        return self._score(labels, predicted_labels)

    def _predict(self, features):
        return self.pipeline.predict(features)

    def _score(self, labels, predicted_labels):
        _, _, macro_f1, _ = precision_recall_fscore_support(
            labels, predicted_labels, average='macro', warn_for=())
        _, _, micro_f1, _ = precision_recall_fscore_support(
            labels, predicted_labels, average='micro', warn_for=())
        return macro_f1, micro_f1

    def cross_validate(self, seg_corpus, dep_corpus, out_folder=None):
        assert seg_corpus.keys() == dep_corpus.keys()
        texts = np.array(sorted(seg_corpus.keys()))
        folds = KFold(len(texts), number_of_folds)

        # extract features for all texts
        all_features = {}
        all_labels = {}
        for text in texts:
            features, labels = self.extract_features_from_text(
                dep_corpus[text], seg_forest=seg_corpus[text])
            all_features[text] = features
            all_labels[text] = labels

        # do the cross-validation
        macro_F1s = []
        micro_F1s = []
        tp = fp = fn = tp_i = fp_i = fn_i = 0
        for i, (train, test) in enumerate(folds):
            print "# FOLD", i
            # train
            train_texts = texts[train]
            train_features = chained(
                [all_features[text] for text in train_texts])
            train_labels = chained([all_labels[text] for text in train_texts])
            print "  training on %d items..." % len(train_labels)
            self._train(train_features, train_labels)
            print "  extracted %d features using the dict vectorizer." % \
                len(self.pipeline.named_steps['vectorizer'].get_feature_names())
            # test (predicting textwise)
            test_labels = []
            pred_labels = []
            for text in texts[test]:
                features = all_features[text]
                labels = all_labels[text]
                predictions = self._predict(features)
                test_labels.extend(labels)
                pred_labels.extend(predictions)
                if out_folder is not None:
                    discourse_tree = self._segment_text(
                        predictions, dep_corpus[text])
                    with open(out_folder + '/' + text + '.tree', 'w') as fout:
                        fout.write(str(discourse_tree))
            macro_f1, micro_f1 = self._score(test_labels, pred_labels)
            macro_F1s.append(macro_f1)
            micro_F1s.append(micro_f1)
            tp_i, fp_i, fn_i = _cnt_stat(test_labels, pred_labels)
            tp += tp_i
            fp += fp_i
            fn += fn_i

        print "# Average Macro F1 = %3.1f +- %3.2f" % \
            (100 * np.mean(macro_F1s), 100 * np.std(macro_F1s))
        print "# Average Micro F1 = %3.1f +- %3.2f" % \
            (100 * np.mean(micro_F1s), 100 * np.std(micro_F1s))
        if tp or fp or fn:
            print "# F1_{tp,fp} %.2f" % (2. * tp / (2. * tp + fp + fn) * 100)
        else:
            print "# F1_{tp,fp} 0. %"

    def _update_model(self, model):
        if model is None:
            self.pipeline = MateSegmenter.DEFAULT_PIPELINE
        elif isinstance(model, str):
            if not os.path.isfile(model) or not os.access(model, os.R_OK):
                raise RuntimeError(
                    "Can't load model from file {:s}".format(model))
            self.pipeline = joblib.load(model)
        else:
            self.pipeline = model
def PredictionScoreLeaveOneOut(X, y, limit, columnName):
    from sklearn.metrics import f1_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.svm import SVC
    from sklearn.model_selection import LeaveOneOut
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    names = ["Linear SVM", "Nearest Neighbors", "RBF SVM", "Decision Tree",
             "Random Forest", "AdaBoost", "Naive Bayes"]
    classifiers = [
        SVC(kernel="linear", C=0.025, probability=True),
        KNeighborsClassifier(3),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB()
    ]

    outFile = open('output.txt', 'a')
    vec = DictVectorizer()
    for name, clf in zip(names, classifiers):
        try:
            loo = LeaveOneOut()
            loo.get_n_splits(X)

            y_test_all = []
            y_pred_all = []
            accuracy_total = 0
            count = 0
            for train_index, test_index in loo.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # Bag-of-words counts fit on the training fold only.
                count_vect = CountVectorizer()
                X_train_fit = count_vect.fit(X_train)
                X_train_counts = X_train_fit.transform(X_train)
                X_test_counts = X_train_fit.transform(X_test)

                # Re-weight the counts with TF-IDF.
                tfidf_transformer = TfidfTransformer()
                fit = tfidf_transformer.fit(X_train_counts)
                X_train_counts = fit.transform(X_train_counts)
                X_test_counts = fit.transform(X_test_counts)

                try:
                    clf.fit(X_train_counts.toarray(), y_train)
                    accuracy_total += clf.score(X_test_counts.toarray(), y_test)
                    count += 1
                    y_pred = clf.predict(X_test_counts.toarray())
                    y_pred_all.append(y_pred[0])
                    y_test_all.append(y_test[0])
                except BaseException as b:
                    print(b)

            f1 = f1_score(y_test_all, y_pred_all, average='weighted')
            precision = precision_score(y_test_all, y_pred_all,
                                        average='weighted')
            recall = recall_score(y_test_all, y_pred_all, average='weighted')
            line = (str(columnName) + "\t" + str(limit) + "\t" + str(name) +
                    "\t" + str(accuracy_total / count) + "\t" + str(f1) +
                    "\t" + str(precision) + "\t" + str(recall))
            print(line)
            outFile.write(line + "\n")
        except BaseException as b:
            print(b)
    outFile.close()
def __init__(self, mu):
    self.mu = mu
    self.vectorizer = DictVectorizer()
def convCatData(data_set):
    """One-hot encode the categorical columns of a data set via DictVectorizer."""
    dict_vect = DictVectorizer(sparse=False)
    # Coerce numeric-looking object columns to numbers, leaving true
    # categoricals as strings (replaces the removed DataFrame.convert_objects).
    data_frm = pd.DataFrame(data_set).apply(pd.to_numeric, errors='ignore')
    converted_data_set = dict_vect.fit_transform(
        data_frm.to_dict(orient='records'))
    return converted_data_set
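# Usage sketch for convCatData (the toy records are an assumption, not part
# of the original code): string columns expand to one-hot indicator columns
# while numeric columns pass through unchanged.
sample = [{'color': 'red', 'size': 1}, {'color': 'blue', 'size': 2}]
encoded = convCatData(sample)
# encoded columns: color=blue, color=red, size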
__author__ = 'Harsh'
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction import DictVectorizer
from datetime import datetime

# Load training and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Find the maximum date and subtract days from it to check how old the hotel is
vec = DictVectorizer()

# Encode labels
# labelencoder = preprocessing.LabelEncoder()
# train['City'] = labelencoder.fit_transform(train['City'])
# train['City Group'] = labelencoder.fit_transform(train['City Group'])
# train['Type'] = labelencoder.fit_transform(train['Type'])
# train['Open Date'] = labelencoder.fit_transform(train['Open Date'])


def diff_dates_2015(date_x):
    """Days between date_x and 01/01/2015."""
    date_format = "%m/%d/%Y"
    x = datetime.strptime(date_x, date_format)
    y = datetime.strptime('01/01/2015', date_format)
    delta = y - x
    return delta.days


train['Open Date'] = train['Open Date'].apply(lambda x: diff_dates_2015(x))
test['Open Date'] = test['Open Date'].apply(lambda x: diff_dates_2015(x))
for song in playlist['tracks']:
    artist = song['artist_name']
    artist_name.append(artist)
    unique_artists.add(artist)
playlist_vs_artists.append(collections.Counter(artist_name))
print("Slice : " + str(i) + " - Parsed")
start = end + 1

challenge_artists = set()
slice = json.load(open('challenge_set.json'))
for playlist in slice['playlists']:
    for song in playlist['tracks']:
        challenge_artists.add(song['artist_name'])

v = DictVectorizer(sparse=True)
X = csr_matrix.transpose(v.fit_transform(playlist_vs_artists))
print 'Vectorized!'

svd = TruncatedSVD(n_components=100)
X = svd.fit_transform(X)
print 'Reduced!'

neighbor = NearestNeighbors(n_neighbors=21, metric='cosine')
neighbor.fit(X)
print X

unique_artists = list(unique_artists)
unique_artists.sort()
print(len(unique_artists))
print 'Training Done!'

nearest_artists = {}
from sklearn.tree import DecisionTreeClassifier, export_graphviz

print('Reading dummy data ... ')
df = read_csv('flight_data.csv')
print('[Done]')

# 'Current time at origin',
# 'Current time at destination',
columns = ['Age', 'Nationality', 'Sleep quality (1-5)']

le = LabelEncoder()
y = le.fit_transform(df['Light Color'].values)

print('Training on past data ... ')
df = df[columns].to_dict(orient='records')
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(df)
# Hold out the last row from training.
X_train = X[:-1]
y_train = y[:-1]

grd = GradientBoostingClassifier(n_estimators=10)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print('[Done]\n')

# dot_data = export_graphviz(clf, out_file=None,
#                            feature_names=vectorizer.feature_names_,
#                            class_names=le.classes_.tolist(),
#                            filled=True, rounded=True,
#                            special_characters=True,
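# Hedged sanity check (not in the original snippet): predict the held-out
# last row and map the numeric prediction back to a color name.
pred = clf.predict(X[-1:])
print(le.inverse_transform(pred))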
import pandas as pd, numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from random import randint

if __name__ == "__main__":
    br = '\n'
    tips = pd.read_csv('data/tips.csv')
    data = tips.drop(['tip'], axis=1)
    target = tips['tip']
    v = ['sex', 'smoker', 'day', 'time']
    ls = data[v].to_dict(orient='records')
    vector = DictVectorizer(sparse=False, dtype=int)
    d = vector.fit_transform(ls)
    print('one hot encoding:')
    print(d[0:3], br)
    print('encoding order:')
    encode_order = vector.get_feature_names()
    print(encode_order, br)
    data = data.drop(['sex', 'smoker', 'day', 'time'], axis=1)
    X = data.values
    print('feature shape after removing categorical columns:')
    print(X.shape, br)
    # Append the one-hot columns back onto the numeric features, row by row.
    Xls, dls = X.tolist(), d.tolist()
    X = [np.array(row + dls[i]) for i, row in enumerate(Xls)]
    X = np.array(X)
    y = target.values
    print('feature shape after adding encoded data back:')
    print(X.shape, br)
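    # The row-by-row concatenation above can be collapsed into one vectorized
    # call; an illustrative equivalent (not in the original) that yields the
    # same array:
    X_alt = np.hstack([data.values, d])
    assert np.array_equal(X, X_alt)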