class Feat(object):
    def __init__(self):
        self.dvec = DictVectorizer(dtype=np.float32, sparse=False)

    def fit(self, trn):  # def fit(self, trn, dev, tst):
        self.dvec.fit(self.feat_basic(ci, sent)
                      for sent in trn
                      for ci, c in enumerate(sent["cseq"]))
        # self.tseqenc.fit([t for sent in trn for t in sent['tseq']])
        # self.tsenc.fit([t for sent in chain(trn, dev, tst) for t in sent['ts']])
        self.feature_names = self.dvec.get_feature_names()
        # self.ctag_classes = self.tseqenc.classes_
        # self.wtag_classes = self.tsenc.classes_
        logging.info(self.feature_names)
        logging.debug(" ".join(fn for fn in self.feature_names))
        # logging.info(self.ctag_classes)
        # logging.info(self.wtag_classes)
        self.NF = len(self.feature_names)
        # logging.info('NF: {} NC: {}'.format(self.NF, self.NC))
        logging.info("NF: {}".format(self.NF))

    def transform(self, sent):
        Xsent = self.dvec.transform(
            [self.feat_basic(ci, sent) for ci, c in enumerate(sent["cseq"])])  # nchar x nf
        slen = Xsent.shape[0]
        ysent = np.zeros((slen, 2), dtype=bool)
        ysent[range(slen), sent["lseq"]] = True
        # ysent = np.array(sent['lseq'])
        return Xsent, ysent

    def feat_basic(self, ci, sent):
        return {"c": sent["cseq"][ci]}
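# --- A minimal usage sketch for Feat. The toy corpus below is hypothetical:
# it only assumes what the class itself reads, namely sentence dicts with a
# character sequence "cseq" and per-character 0/1 labels "lseq".
import logging
import numpy as np
from sklearn.feature_extraction import DictVectorizer

toy_corpus = [{"cseq": list("abc"), "lseq": [0, 1, 0]},
              {"cseq": list("ab"), "lseq": [1, 0]}]
feat = Feat()
feat.fit(toy_corpus)                  # learns one indicator feature per character
X, y = feat.transform(toy_corpus[0])  # X: (3, NF) float32, y: (3, 2) one-hot bools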
class BinTransformer(object):
    """bins: int (number of bins) or percentile edges."""
    # Note: this class assumes a pandas version in which pd.cut/pd.qcut return
    # string bin labels; on modern pandas the Interval values would need to be
    # cast to str (or labels=... passed) before DictVectorizer can encode them.

    def __init__(self, bins, percentiles):
        self._dv = DictVectorizer()
        self._bins = bins
        self._bin_boundaries = {}
        self._percentiles = percentiles
        self._feature_names = []

    def fit(self, data):
        binned_data = data.copy()
        for col in data.columns:
            cut_func = pd.qcut if self._percentiles else pd.cut
            binned_data[col], self._bin_boundaries[col] = cut_func(
                data[col], self._bins, retbins=True)
        self._dv.fit(binned_data.T.to_dict().values())

    def transform(self, data):
        binned_data = data.copy()
        for col in data.columns:
            # Reuse the boundaries learned in fit() so train/test bins line up.
            binned_data[col] = pd.cut(data[col], self._bin_boundaries[col])
        binnedX = self._dv.transform(binned_data.T.to_dict().values())
        self._feature_names = self._dv.get_feature_names()
        return binnedX

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

    def get_feature_names(self):
        return self._feature_names
def vectorize(train_features, test_features):
    """Convert sets of features to vector representation.

    :param train_features: {instance_id: {f1: count, f2: count, ...}, ...}
    :param test_features:  {instance_id: {f1: count, f2: count, ...}, ...}
    :return:
        X_train: {instance_id: [f1_count, f2_count, ...], ...}
        X_test:  {instance_id: [f1_count, f2_count, ...], ...}
    """
    X_train = {}
    X_test = {}
    vec = DictVectorizer()
    vec.fit(train_features.values())
    for instance_id in train_features:
        X_train[instance_id] = vec.transform(train_features[instance_id]).toarray()[0]
    for instance_id in test_features:
        X_test[instance_id] = vec.transform(test_features[instance_id]).toarray()[0]
    return X_train, X_test
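# --- Usage sketch for vectorize() with made-up instance ids and features.
from sklearn.feature_extraction import DictVectorizer

train = {"doc1": {"hello": 2, "world": 1}, "doc2": {"foo": 1}}
test = {"doc3": {"hello": 1, "unseen": 5}}   # "unseen" is silently dropped:
X_train, X_test = vectorize(train, test)     # the vocabulary is fixed by fit()
print(X_test["doc3"])                        # counts only for features seen in train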
def dict_vectorize(dict_list):
    assert isinstance(dict_list, list)
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer()
    vec.fit(dict_list)
    return vec
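# --- Usage sketch for dict_vectorize() with toy dicts. String values become
# one-hot "key=value" columns; numeric values become numeric columns.
vec = dict_vectorize([{"color": "red", "size": 2}, {"color": "blue"}])
print(vec.get_feature_names())          # ['color=blue', 'color=red', 'size']
# (get_feature_names() was removed in newer scikit-learn in favor of
#  get_feature_names_out().)
X = vec.transform([{"color": "red"}])   # sparse row in the learned feature space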
class DictVectWrapper(object):
    def __init__(self):
        # Collect every bound method whose name contains "feature" (not at
        # position 0) and treat it as a feature extractor.
        self.feature_extractors = [
            extractor[1]
            for extractor in inspect.getmembers(self, predicate=inspect.ismethod)
            if extractor[0].find("feature") > 0
        ]
        self.dv = DictVectorizer()

    def fit(self, data):
        data_dics = []
        for datum in data:
            features = {}
            for feature in self.feature_extractors:
                features.update(feature(datum))
            data_dics.append(features)
        self.dv.fit(data_dics)

    def fit_transform(self, data):
        data_dics = []
        for datum in data:
            features = {}
            for feature in self.feature_extractors:
                features.update(feature(datum))
            data_dics.append(features)
        res = self.dv.fit_transform(data_dics)
        return res

    def transform(self, datum):
        features = {}
        for feature in self.feature_extractors:
            features.update(feature(datum))
        return self.dv.transform(features)
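# --- Usage sketch: a hypothetical subclass of DictVectWrapper. Any method
# whose name contains "feature" (not as a prefix) is auto-discovered.
import inspect
from sklearn.feature_extraction import DictVectorizer

class LengthFeatures(DictVectWrapper):
    def length_feature(self, datum):        # picked up: name contains "feature"
        return {"len": len(datum)}

    def first_char_feature(self, datum):    # also picked up
        return {"first=" + datum[0]: 1}

fx = LengthFeatures()
X = fx.fit_transform(["spam", "ham"])       # 2 x NF sparse matrix
row = fx.transform("eggs")                  # a single dict is accepted by DictVectorizer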
def main():
    # load data
    path = 'exracted_content/extractedrawtext'
    data = map(json.loads, file(path))
    # count words for every tag
    tags = TAGS + ['boilerplate', 'boilerpipe']
    counts_per_tag = {}
    for eachtag in tags:
        counts = map(counter, getItems(eachtag, data))
        counts_per_tag[eachtag] = counts
    total = sumUp(counts_per_tag, len(data))
    # vectorize
    vect = DictVectorizer()
    vect.fit([total])
    features = {}
    for eachtag in tags:
        features[eachtag] = vect.transform(counts_per_tag[eachtag])
    save('textfeature', features)
    save('textvector', vect)
class DictVectorizerModel:
    def fit(self, train):
        temp_list = []
        for item in train:
            dic = {"dict": item}
            temp_list.append(dic)
        self.model = DictVectorizer()
        self.model.fit(temp_list)

    def transform(self, dataframe, col_name):
        temp_list = []
        for item in dataframe:
            dic = {"dict": item}
            temp_list.append(dic)
        df = self.model.transform(temp_list).toarray()
        df = pd.DataFrame(df)
        df.index = dataframe.index
        df.columns = ["%s_%d" % (col_name, data) for data in df.columns]
        return df

    def get_model(self):
        return self.model

    def set_model(self, model):
        self.model = model
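# --- Usage sketch: one-hot encoding a single pandas Series with
# DictVectorizerModel (toy data; the column name is hypothetical).
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

s = pd.Series(["red", "blue", "red"], name="color")
enc = DictVectorizerModel()
enc.fit(s)                       # learns the "dict=<value>" vocabulary
dummies = enc.transform(s, "color")
print(dummies.columns.tolist())  # ['color_0', 'color_1']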
def _map_goose2streamitem(self, goose_sents, si_sents):
    goose_wc = []
    for sent in goose_sents:
        c = {}
        for token in sent:
            token = token.lower()
            if isinstance(token, unicode):
                token = token.encode("utf-8")
            c[token] = c.get(token, 0) + 1
        goose_wc.append(c)
    si_wc = []
    for sent in si_sents:
        c = {}
        for token in sent:
            token = token.lower()
            if isinstance(token, unicode):
                token = token.encode("utf-8")
            c[token] = c.get(token, 0) + 1
        si_wc.append(c)
    vec = DictVectorizer()
    vec.fit(goose_wc + si_wc)
    X_goose = vec.transform(goose_wc)
    X_si = vec.transform(si_wc)
    K = cosine_similarity(X_goose, X_si)
    I = np.argmax(K, axis=1)
    return I
def _train(self, train_data, resources):
    sample_length = len(train_data)
    dict_status_path = os.path.join(
        root_dic, 'dict_vectorizer_{}.status'.format(sample_length))
    if os.path.isfile(dict_status_path):
        dictVectorizer = joblib.load(dict_status_path)
    else:
        dictVectorizer = DictVectorizer()
        dictVectorizer.fit(
            train_data[self.features].fillna(0).to_dict('records'))
        joblib.dump(dictVectorizer, dict_status_path)
    tfidf_status_path = os.path.join(
        root_dic, 'tfidf_vectorizer_{}.status'.format(sample_length))
    if os.path.isfile(tfidf_status_path):
        tfidf = joblib.load(tfidf_status_path)
    else:
        tfidf = TfidfVectorizer(min_df=40, max_features=300)
        tfidf.fit(train_data.essay)
        joblib.dump(tfidf, tfidf_status_path)
    resources['dictVectorizer'] = dictVectorizer
    resources['tfidf'] = tfidf
    print 'Head Processing Completed'
    return train_data, resources
def dictionary(rows):
    dl = dictlist(rows2objects(rows))
    dv = DictVectorizer()
    dv.fit(dl)
    return dv
def vectorize_data(data_list, vec_data_fd):
    '''Takes in the data as a list of attribute-name:value dicts and converts
    it into vectorized form for processing by scikit-learn.
    Writes the feature mapping to one_hot_encoding.txt.'''
    vec = DictVectorizer()
    vec.fit(data_list)
    print len(vec.get_feature_names())
    vector_data = vec.transform(data_list).toarray()
    one_hot_names = vec.get_feature_names()
    # print the feature mappings
    feature_indices = range(0, len(one_hot_names))
    one_hot_mapping = zip(one_hot_names, feature_indices)
    with open('one_hot_encoding.txt', 'w') as file:
        for (name, idx) in one_hot_mapping:
            print >> file, "%s-->%d\n" % (name, idx)
    # print the one-hot encoding for each tuple.
    with open('vector_mappings.txt', 'w') as file:
        for row in vector_data:
            print >> file, vec.inverse_transform(row), "\n"
    sys.exit(1)  # NOTE: everything below this exit is unreachable
    for row in vector_data:
        row = [str(x) for x in row]
        row = ",".join(row)
        vec_data_fd.write(row)
        vec_data_fd.write("\n\n")
    return vector_data
def onehot_encoder(df):
    d = df.T.to_dict().values()
    dv = DictVectorizer()
    dv.fit(d)
    return dv
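# --- Usage sketch for onehot_encoder() with a toy DataFrame.
# df.T.to_dict().values() yields one dict per row; df.to_dict(orient="records")
# is the more direct modern equivalent.
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

df = pd.DataFrame({"color": ["red", "blue"], "n": [1, 2]})
dv = onehot_encoder(df)
X = dv.transform(df.T.to_dict().values())  # rows as dicts -> one-hot matrix
print(dv.get_feature_names())              # ['color=blue', 'color=red', 'n']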
def main():
    # load data
    #path = 'generated/extracted_text'
    os.system("mkdir generated")
    path = 'extracted_text'
    data = map(json.loads, file(path))
    # count words for every tag
    tags = TAGS + ['boilerplate', 'boilerpipe']
    counts_per_tag = {}
    for tag in tags:
        counts = map(count, get(tag, data))
        counts_per_tag[tag] = counts
    total = sum_up(counts_per_tag, len(data))
    # vectorize
    v = DictVectorizer()
    v.fit([total])
    features = {}
    for tag in tags:
        features[tag] = v.transform(counts_per_tag[tag])
    save('text_features', features)
    save('text_vectorizer', v)
    os.system("mv generated/text_features . ")
    os.system("mv generated/text_vectorizer . ")
def encode_categorical_features(features, sparse=True):
    from sklearn.feature_extraction import DictVectorizer
    enc = DictVectorizer(sparse=sparse)
    enc.fit(features)
    svm_features = enc.transform(features)
    return svm_features, enc
def generate_matrix():
    D = []
    y = []
    fex = features.IpadicFeature()
    progress = 0
    print('create feature dictionary')
    for q, a in load_corpus():
        D.append(list(fex.transform(q)))
        a = normalize.normalize_askfm(a, h2z=False)
        y.append(isnot_shitsumon(a))
        progress += 1
        if progress % 100 == 0:
            print(progress)
    dv = DictVectorizer()
    dv.fit(itertools.chain(*D))
    progress = 0
    print('create feature vector')
    X = []
    for ds in D:
        count = None
        for d in ds:
            v = dv.transform(d)
            if count is None:
                count = v
            else:
                count += v
        X.append(count)
        progress += 1
        if progress % 100 == 0:
            print(progress)
    X = scipy.sparse.vstack(X)
    y = numpy.array(y)
    return X, y, dv
def plotFeatures(featureListTrain, trainingsetLabels):
    cv = DictVectorizer()
    cv.fit(featureListTrain)
    print(len(cv.vocabulary_))
    print(cv.get_feature_names())
    X_train = cv.transform(featureListTrain)
    svm = LinearSVC()
    svm.fit(X_train, trainingsetLabels)
    plot_coefficients(svm, cv.get_feature_names())
def vectorize(self, categorical_features, continuous_features):
    vec = DictVectorizer(sparse=False)
    vec.fit(self.training_categorical_features)
    enc_categorical_features = vec.transform(categorical_features)
    merged_features = []
    for cont, cat in zip(continuous_features, enc_categorical_features):
        all_features_for_item = list(cont) + list(cat)
        # TODO why do I need this ||0 ? -- why isn't the imputer handling this?
        merged_features.append(
            [0.0 if math.isnan(y) else y for y in all_features_for_item])
    return merged_features
class make_dummies(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    '''uses pandas to transform categorical variables into one-hot encoding'''

    def __init__(self, dummy_cols):
        self.dummy_cols = dummy_cols
        self.dv = DictVectorizer()

    def fit(self, X, y=None):
        self.dv.fit(X[self.dummy_cols].to_dict(orient='records'))
        return self

    def transform(self, X):
        return self.dv.transform(X[self.dummy_cols].to_dict(orient='records'))
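# --- Usage sketch: make_dummies inside a Pipeline (toy data; column names are
# hypothetical). Note the transformer passes through only the dummy columns.
import pandas as pd
import sklearn.base
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

df = pd.DataFrame({"city": ["NY", "LA", "NY"], "x": [1.0, 2.0, 3.0]})
y = [0, 1, 0]
pipe = Pipeline([("dummies", make_dummies(dummy_cols=["city"])),
                 ("clf", LogisticRegression())])
pipe.fit(df, y)            # the classifier only sees the one-hot "city" columns
print(pipe.predict(df))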
def __get_bag_of_words(self, set, special_chars_bow=None):
    list_of_bows = list()
    if special_chars_bow is None:
        for user_posts in set:
            list_of_bows.append(
                self.vectorizer.transform(np.array(user_posts)).toarray())
    else:
        special_vectoriser = DictVectorizer()
        special_vectoriser.fit(Counter(s.split()) for s in special_chars_bow)
        print(special_vectoriser.vocabulary_)
        for user_posts in set:
            list_of_bows.append(
                special_vectoriser.transform(
                    Counter(s.split()) for s in np.array(user_posts)).toarray())
    return list_of_bows
def vectorize(data, s):
    '''
    :param data: list of instances for a given lexelt with the following structure:
        [(instance_id, left_context, head, right_context, sense_id), ...]
    :param s: list of words (features) for a given lexelt: [w1, w2, w3, ...]
    :return:
        vectors: {instance_id: [w_1 count, w_2 count, ...], ...}
        labels: {instance_id: sense_id}
    '''
    vectors = {}
    labels = {}
    # implement your code here
    vec = DictVectorizer()
    s_set = set(s)

    def vectorize_one(t):
        tokens_left = list(nltk.word_tokenize(t[1]))
        tokens_right = list(nltk.word_tokenize(t[3]))
        tokens = tokens_left + [t[2]] + tokens_right
        context_words = tokens_left[-window_size:] + tokens_right[0:window_size]
        context_window = dict(map(lambda x: ('BOW_' + x, 0), s))

        def inc_one(word):
            if word in s_set:
                key = 'BOW_' + word
                context_window.setdefault(key, 0)
                context_window[key] += 1

        try:
            map(lambda word: inc_one(word), context_words)
        except Exception as e:
            # print 'word', 'not in s ', e
            pass
        try:
            vectors[t[0]] = context_window
        except:
            pass
        labels[t[0]] = t[-1]

    map(vectorize_one, data)
    vec.fit(vectors.values())
    for instance_id in vectors:
        vectors[instance_id] = vec.transform(vectors[instance_id]).toarray()[0]
    return vectors, labels
def crossval(paths, annDir, eval_type, use_reach):
    ''' Puts it all together '''
    print "Parsing data"
    paths = set(paths)
    labels, vectors, hashes, data = parse_data(paths, annDir, use_reach)
    # Group indices by paper id
    groups = {p: [] for p in paths}
    for i, d in enumerate(data):
        groups[d.namespace].append(i)
    # Hack!! Drop papers that contributed no data points.
    groups2 = {}
    for k, v in groups.iteritems():
        if len(v) != 0:
            groups2[k] = v
    groups = groups2
    print "Using %i papers" % len(groups2)
    # Make it a numpy array to index it more easily
    data = np.asarray(data)
    dv = DictVectorizer()
    dv.fit(vectors)
    X = dv.transform(vectors)
    y = np.asarray(labels)
    f1_diffs = []
    model_f1s = {}
    indices = set(range(len(data)))
    # Leave-one-paper-out "cross-validation" over the remaining papers
    for path in groups.keys():
        others = paths - {path}
        test_ix = set(groups[path])
        train_ix = list(indices - test_ix)
        test_ix = list(test_ix)
        policy_f1, model_f1 = machine_learning(X, y, data, train_ix, test_ix)
        f1_diffs.append(model_f1 - policy_f1)
        model_f1s[path] = model_f1
    return pd.Series(f1_diffs), model_f1s
class Phi():
    def __init__(self):
        self.vectorizer = DictVectorizer()

    def fit(self, training_sentences):
        counts = []
        for sentence in training_sentences:
            words = [pair[0] for pair in sentence]
            tags = [pair[1] for pair in sentence]
            count = self.extract_features(words, tags)
            counts.append(count)
        # Fit the DictVectorizer to the data.
        # fit expects a list of dictionaries, so we give it the counts we built.
        self.vectorizer.fit(counts)

    def transform(self, word_sequence, tag_sequence):
        # Extract the feature count
        count = self.extract_features(word_sequence, tag_sequence)
        # Convert the count to a sparse vector using the DictVectorizer
        vector = self.vectorizer.transform(count)
        return vector

    def extract_features(self, word_sequence, tag_sequence):
        # Force them to be the same length
        length = len(word_sequence)
        # Append END to the tags (on a copy, so the caller's list is not mutated)
        tag_sequence = tag_sequence + ['END']
        # Create a list of word-tag and tag-tag features
        features = []
        for i in range(0, length):
            features.append((word_sequence[i], tag_sequence[i]))
            features.append((tag_sequence[i], tag_sequence[i + 1]))
        # Create a count out of the list of features
        count = Counter(features)
        return count

    def inverse_transform(self, vector):
        # Convert the vector to a count using the DictVectorizer
        counts = self.vectorizer.inverse_transform(vector)
        # inverse_transform returns a list of counts but we only gave it one vector
        return counts[0]
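# --- Usage sketch for Phi with toy tagged sentences (word, tag) pairs.
from collections import Counter
from sklearn.feature_extraction import DictVectorizer

train = [[("the", "DT"), ("dog", "NN")], [("a", "DT"), ("cat", "NN")]]
phi = Phi()
phi.fit(train)
v = phi.transform(["the", "cat"], ["DT", "NN"])  # sparse emission+transition counts
print(phi.inverse_transform(v))                  # back to a {feature: count} dict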
def encode_categorical_features(train, test):
    dict_vectorizer = DictVectorizer()
    categorical_features = []
    for column in train:
        if train[column].dtype == 'object':
            categorical_features.append(column)
    # to_dict returns a list of row dicts; it is the *transform* output that
    # needs .toarray() (DictVectorizer produces a sparse matrix by default).
    train_categorical_features = train[categorical_features].to_dict(orient='records')
    test_categorical_features = test[categorical_features].to_dict(orient='records')
    dict_vectorizer.fit(train_categorical_features)
    train_categorical_encoded = pandas.DataFrame(
        dict_vectorizer.transform(train_categorical_features).toarray())
    test_categorical_encoded = pandas.DataFrame(
        dict_vectorizer.transform(test_categorical_features).toarray())
    return train_categorical_encoded, test_categorical_encoded
class CategoricalConverter(BaseEstimator, TransformerMixin):
    """An sklearn BaseEstimator/TransformerMixin subclass, usable in
    sklearn.pipeline, that converts categorical columns to numeric ones.

    Parameters:
        method: string, one of "dummy", "groupmean", "valuecount".
            Default is "dummy".
        cate_cols: None or a list of column names.

    Returns:
        A pandas DataFrame with the categorical columns dropped.
        The original DataFrame is not modified.
    """

    def __init__(self, method="dummy", cate_cols=None):
        self.method = method
        self.cate_cols = cate_cols

    def fit(self, X, y):
        if self.cate_cols is None:
            self.cate_cols = get_cate_col(X)
        self.values = {}
        if self.method == "dummy":
            self.dvec = DictVectorizer(sparse=False)
            self.dvec.fit((X[self.cate_cols]).to_dict('records'))
        elif self.method == "groupmean":
            for col in self.cate_cols:
                tempdict = {}
                tempvals = [val for val in X[col].unique() if str(val) != "nan"]
                for val in tempvals:
                    tempdict[val] = y[(X[col] == val)].mean()
                self.values[col] = tempdict
        elif self.method == "valuecount":
            for col in self.cate_cols:
                self.values[col] = X[col].value_counts()
        return self

    def transform(self, X, y=None):
        XX = X.copy()
        if self.method == "dummy":
            temp_dummy = pd.DataFrame(
                data=self.dvec.transform((XX[self.cate_cols]).to_dict('records')),
                columns=self.dvec.get_feature_names(),
                index=XX.index)
            XX = pd.concat([temp_dummy, XX], axis=1)
        elif self.method in ["groupmean", "valuecount"]:
            for col in self.cate_cols:
                XX.loc[:, col + "_gpmean"] = XX[col].map(self.values[col])
        XX.drop(self.cate_cols, axis=1, inplace=True)
        return XX
class FullPickledRSTFeatureExtractor:
    def __init__(self, instancenums):
        self.vectorizer = DictVectorizer(dtype=float, sparse=True)
        self.atInstance = 0
        self.instanceNums = instancenums

    def fit(self, X, y=None):
        self.vectorizer = self.vectorizer.fit([self.getFeatures(x) for x in X])
        return self

    def getFeatures(self, text):
        features = {}  # {'textlen': len(text)}
        rstFile = './output_trees/review%d.pickle.gz' % self.instanceNums[self.atInstance]
        tree = getPickledTree(rstFile).tree
        features['size'] = tree_size(tree)
        if features['size'] > 0:
            features['normdepth'] = tree_depth(tree) / tree_size(tree)
            features['balance'] = abs(tree_balance(tree))
            features.update(relation_proportion(tree))
            features.update(parent_relation_proportion(tree))
        self.atInstance = self.atInstance + 1
        if self.atInstance >= len(self.instanceNums):
            self.atInstance = 0
        return features

    def setInstanceNums(self, nums):
        self.instanceNums = nums

    def setInstance(self, num):
        self.atInstance = num

    def transform(self, X, y=None):
        # DictVectorizer.transform takes only the feature dicts, not y.
        return self.vectorizer.transform([self.getFeatures(x) for x in X])
class FullTextRSTFeatureExtractor:
    def __init__(self, instancenums):
        self.vectorizer = DictVectorizer(dtype=float, sparse=True)
        self.atInstance = 0
        self.instanceNums = instancenums

    def fit(self, X, y=None):
        self.vectorizer = self.vectorizer.fit(
            [self.getTextFeatures(text) for text in X])
        return self

    def getTextFeatures(self, text):
        features = {}  # {'textlen': len(text)}
        rstFile = open('./rstParsed/review%d.brackets'
                       % self.instanceNums[self.atInstance], 'r')
        counter = Counter()
        for line in rstFile:
            eduRange, satOrNuc, rangeType = eval(line)
            counter[satOrNuc] += 1
            counter[rangeType] += 1
            counter['lines'] += 1
            counter['maxEDU'] = max(eduRange[1], counter['maxEDU'])
            counter['maxDif'] = max(eduRange[1] - eduRange[0], counter['maxDif'])
        features.update(counter)
        self.atInstance = self.atInstance + 1
        if self.atInstance >= len(self.instanceNums):
            self.atInstance = 0
        return features

    def setInstanceNums(self, nums):
        self.instanceNums = nums

    def setInstance(self, num):
        self.atInstance = num

    def transform(self, X, y=None):
        # DictVectorizer.transform takes only the feature dicts, not y.
        return self.vectorizer.transform([self.getTextFeatures(text) for text in X])
def KFoldPredictionScore(X, y, k, header):
    from sklearn.svm import SVC
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer()
    try:
        accuracy = 0.0
        for X_train, y_train, X_test, y_test in k_fold_generator(X, y, k):
            vec = DictVectorizer()
            fit = vec.fit(X_train)
            X_train_counts = fit.transform(X_train)
            X_test_counts = fit.transform(X_test)
            clf = SVC(kernel="linear", C=0.025)
            try:
                clf.fit(X_train_counts.toarray(), y_train)
                #predict = clf.predict(X_test_counts.toarray())
                accuracy += clf.score(X_test_counts.toarray(), y_test)
                # coef = clf._get_coef()
                # print(np.argsort(coef)[-20:])
                #for i in range(0, len(X_test)):
                #    print(X_test[i]['ID'] + "\t" + y_test[i] + "\t" + predict[i])
            except BaseException as b:
                print(b)
        print(header + "\t" + str(accuracy))
    except BaseException as b:
        print(b)
def _trainPOSDictVectorizer(self, goldTree, to_classify=None):
    sentences = list(goldTree)
    if to_classify:
        sentences.extend(to_classify)
    pos_tagged = self.get_pos_tags_for_sentences(sentences)
    items = []
    assert len(pos_tagged) == len(sentences)
    for sentence, pos in itertools.izip(sentences, pos_tagged):
        # feels silly, but there is the occasional encoding error
        # when using str(sentence)
        self.posCache[sentence.pprint().encode('utf-8')] = pos
        items.extend(self.extract_POS(sentence, pos))
    dv = DictVectorizer(sparse=False)
    dv.fit(items)
    #logger.debug("DictVectorizer vocab: %s", dv.vocabulary_)
    return dv
def numberize_features(dataset, unrolled_dataset, dv=None):
    '''Turn non-numeric features into sparse binary features;
    also return the feature map.'''
    # http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
    if dv is None:
        dv = DictVectorizer(sparse=False)  # can we make it true?
        dv = dv.fit(unrolled_dataset.flatten())
    return np.array(list(map(dv.transform, dataset))), dv
def numberize_features(dataset, sparse=True, dv=None):
    '''Turn non-numeric features into sparse binary features;
    also return the feature map.'''
    # http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
    if dv is None:
        dv = DictVectorizer(sparse=sparse)
        dv = dv.fit(dataset)
    return dv.transform(dataset), dv
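# --- Usage sketch for numberize_features(): fit on train, then reuse the same
# DictVectorizer for test so both share one feature map (toy data).
from sklearn.feature_extraction import DictVectorizer

train = [{"pos": "NN", "len": 3}, {"pos": "VB", "len": 4}]
test = [{"pos": "NN", "len": 5}]
X_train, dv = numberize_features(train)        # fits a new DictVectorizer
X_test, _ = numberize_features(test, dv=dv)    # reuses the train-time feature map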
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

df = pd.read_csv(
    'https://raw.githubusercontent.com/dlsun/data-science-book/master/data/titanic.csv'
)

# define the training data
cols = ["age", "sex", "pclass"]
X_train = df[cols]
y_train = df["survived"]

# convert categorical variables to dummy variables
vec = DictVectorizer(sparse=False)
X_train = X_train.to_dict(orient='records')
vec.fit(X_train)
X_train = pd.DataFrame(vec.transform(X_train)).fillna(0)
X_train.columns = vec.get_feature_names()

# fit the 5-nearest neighbors model
model = KNeighborsClassifier(n_neighbors=5)
scaler = StandardScaler()
pipeline = Pipeline([("scaler", scaler), ("model", model)])
pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)
print("Accuracy", accuracy_score(y_train, y_train_pred))
        else:
            temp[name] = entry[name]
    for name in num_var:
        if entry[name] == 'NA':
            continue
        else:
            temp[name] = float(entry[name])
    #temp["self_reported_fishing_vessel"] = entry["self_reported_fishing_vessel"] == "TRUE"
    examples.append(temp)

#%% vectorize:
from sklearn.feature_extraction import DictVectorizer

feature_numbering = DictVectorizer(sort=True, sparse=False)
feature_numbering.fit(examples)
X = feature_numbering.transform(examples)
print("Features as {} matrix.".format(X.shape))
del examples

#%% Split data
from sklearn.model_selection import train_test_split
import numpy as np

RANDOM_SEED = 12345678
y = np.array(ys)
# split off 10% for the train/validate (tv) pieces.
X_tv, rX_test, y_tv, y_test = train_test_split(
    X, y, train_size=0.1, shuffle=True, random_state=RANDOM_SEED
)
    train_size=0.9, shuffle=True, random_state=RANDOM_SEED,
)
# split off train, validate from (tv) pieces.
ex_train, ex_vali, y_train, y_vali = train_test_split(
    ex_tv, y_tv, train_size=0.9, shuffle=True, random_state=RANDOM_SEED
)

#%% vectorize:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

feature_numbering = DictVectorizer(sparse=False)
# Learn columns from training data (again)
feature_numbering.fit(ex_train)
# Translate our list of texts -> matrices of counts
rX_train = feature_numbering.transform(ex_train)
rX_vali = feature_numbering.transform(ex_vali)
rX_test = feature_numbering.transform(ex_test)

scaling = StandardScaler()
X_train = scaling.fit_transform(rX_train)
X_vali = scaling.transform(rX_vali)
X_test = scaling.transform(rX_test)
print(X_train.shape, X_vali.shape)

#%% train a model:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
    for n in range(npoints):
        x = X.iloc[n, :]
        list_X.append(x)
    return list_X


time_start = time.time()
df = pd.read_csv('datasets/training_data.csv')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# attribute = ['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
#              'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
#              'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
#              'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
#              'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
#              'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
#              'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd',
#              'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst',
#              'is_sm_ips_ports', 'attack_cat']  # 43 features
# to drop an unwanted feature: df_train.drop(df_train.columns[[0]], axis=1, inplace=True)
X = get_data_to_list(X)
dv = DictVectorizer()
dv.fit(X)
X_vec = dv.fit_transform(X)  # fit_transform refits; the separate fit() above is redundant
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.3)
clf_inforGain = DecisionTreeClassifier(criterion='entropy', random_state=100,
                                       max_depth=50, min_samples_leaf=2)
clf_inforGain.fit(X_train, y_train)
pred = clf_inforGain.predict(X_test)
print('rate:', metrics.accuracy_score(y_test, pred))
time_end = time.time()
print('time run:', time_end - time_start)
class SupervisedLearner:
    def __init__(self, abstract_reader, target="n", hold_out_a_test_set=False,
                 test_set_p=None):
        '''
        abstract_reader: a LabeledAbstractReader instance.
        target: the tag of interest (i.e., to be predicted)
        '''
        self.abstract_reader = abstract_reader
        self.target = target
        # this is a special target because we
        # enforce the additional constraint that
        # there be only one 'yes' vote per citation
        self.predicting_sample_size = target == "n"
        # reserve some data for testing?
        self.holding_out_a_test_set = hold_out_a_test_set
        if self.holding_out_a_test_set:
            assert test_set_p is not None
            self.test_set_p = test_set_p
        self.n_citations = len(self.abstract_reader)

    def plot_preds(self, preds, y):
        # (preds, y) = sl.cv()
        # sklearn wraps up the predicted results
        pos_indices = [i for i in xrange(len(y)) if y[i] > 0]
        all_preds = [preds[i][1] for i in xrange(len(y))]
        pos_preds = [preds[i][1] for i in pos_indices]

    def generate_features(self):
        print "generating feature vectors"
        # I don't think we ever want to flatten abstracts.
        self.features, self.y = self.features_from_citations(
            flatten_abstracts=False)
        self.vectorizer = DictVectorizer(sparse=True)
        # note that we keep structure around that keeps features
        # in citations together. specifically, features will be a
        # list of feature vectors representing words
        # in abstracts comprising distinct citations
        all_features = []
        for citation_fvs in self.features:
            all_features.extend(citation_fvs)
        self.vectorizer.fit(all_features)
        self.X_fv = []
        no_abstracts = 0
        for X_citation in self.features:
            if len(X_citation) > 0:
                self.X_fv.append(self.vectorizer.transform(X_citation))
            else:
                self.X_fv.append(None)
                no_abstracts += 1
        print "({0} had no abstracts!)".format(no_abstracts)
        #self.X_fv = [self.vectorizer.transform(X_citation) for X_citation in self.features if len(X_citation) > 0]
        if self.holding_out_a_test_set:
            self.set_held_out_indices()

    def set_held_out_indices(self):
        test_set_size = int(self.test_set_p * self.n_citations)
        print "setting aside a test set of size {0}".format(test_set_size)
        #import pdb; pdb.set_trace()
        self.test_indices = random.sample(range(self.n_citations), test_set_size)

    def select_train_citation_indices(self, train_p):
        '''
        this is somewhat confusing, but the idea here is to allow one to have
        a single, consistent test set and to increase the training set to see
        how this affects performance on said set.
        '''
        # first remove the held out indices.
        self.train_indices = [
            i for i in range(self.n_citations) if not i in self.test_indices
        ]
        # now draw a sample from the remaining (train) abstracts.
        train_set_size = int(train_p * len(self.train_indices))
        print "going to train on {0} citations".format(train_set_size)
        self.train_indices = random.sample(self.train_indices, train_set_size)

    '''
    @TODO this method is meant to supplant the following routine. The idea is
    that it is more general, i.e., allows us to assess performance on <tx>,
    etc.; not just <n>
    '''
    def train_and_test(self, test_size=.2, train_p=None):
        test_citation_indices = None
        train_citation_indices = None
        if self.holding_out_a_test_set:
            print "using the held-out test set!"
            test_size = len(self.test_indices)
            test_citation_indices = self.test_indices
            train_citation_indices = self.train_indices
        else:
            test_size = int(test_size * self.n_citations)
            test_citation_indices = random.sample(range(self.n_citations), test_size)
        print "test set of size {0} out of {1} total citations".format(
            test_size, self.n_citations)

    @staticmethod
    def max_index(a):
        return max((v, i) for i, v in enumerate(a))[1]

    def train_and_test_sample_size(self, test_size=.2, train_p=None):
        '''
        @TODO need to amend for predicting things other than sample size;
        in retrospect, should probably never flatten abstracts, since at
        test time we'll want to enforce certain constraints
        @TODO refactor -- this method is too long.
        '''
        test_citation_indices = None
        train_citation_indices = None
        if self.holding_out_a_test_set:
            print "using the held-out test set!"
            test_size = len(self.test_indices)
            test_citation_indices = self.test_indices
            train_citation_indices = self.train_indices
        else:
            test_size = int(test_size * self.n_citations)
            test_citation_indices = random.sample(range(self.n_citations), test_size)
        print "test set of size {0} out of {1} total citations".format(
            test_size, self.n_citations)
        X_train, y_train = [], []
        X_test, y_test = [], []
        test_citation_indices.sort()  # not necessary; tmp
        for i in xrange(self.n_citations):
            if self.X_fv[i] is not None:
                is_a_training_instance = (train_citation_indices is None or
                                          i in train_citation_indices)
                if not i in test_citation_indices and is_a_training_instance:
                    # we flatten these for training.
                    X_train.extend(self.X_fv[i])
                    y_train.extend(self.y[i])
                elif i in test_citation_indices:
                    # these we keep structured, though.
                    X_test.append(self.X_fv[i])
                    y_test.append(self.y[i])
        clf = SupervisedLearner._get_SVM()
        X_train = scipy.sparse.vstack(X_train)
        clf.fit(X_train, y_train)
        print "ok -- testing!"
        max_index = lambda a: max((v, i) for i, v in enumerate(a))[1]
        '''
        @TODO refactor. note that this will have to change
        for other targets (TX's, etc.)
        '''
        TPs, FPs, N_pos = 0, 0, 0
        for test_citation_i, citation_fvs in enumerate(X_test):
            true_lbls_i = y_test[test_citation_i]
            preds_i = clf.best_estimator_.decision_function(citation_fvs)
            # we set the index corresponding to the max
            # val (most likely entry) to 1; all else are 0
            preds_i_max = max_index(preds_i)
            preds_i = [-1] * len(preds_i)
            preds_i[preds_i_max] = 1
            # *abstract level* predictions.
            if not 1 in true_lbls_i:
                cit_n = test_citation_indices[test_citation_i]
                print "-- no sample size for abstract (biview_id) {0}!".format(
                    self.abstract_reader[cit_n]["biview_id"])
                # since we force a prediction for every abstract right now,
                # i'll penalize us here. this is an upperbound on precision.
                FPs += 1
            else:
                N_pos += 1
                if preds_i.index(1) == true_lbls_i.index(1):
                    TPs += 1
                else:
                    FPs += 1
        N = len(X_test)
        return TPs, FPs, N_pos, N

    def cv(self, predict_probs=False):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            self.X_fv, self.y, test_size=0.1)
        clf = SupervisedLearner._get_SVM()
        clf.fit(X_train, y_train)
        preds = None
        if predict_probs:
            # well, *log* probs, anyway
            preds = [p[1] for p in clf.predict_log_proba(X_test)]
        else:
            preds = clf.predict(X_test)
        return preds, y_test

    @staticmethod
    def _get_SVM():
        tune_params = [{"C": [1, 5, 10, 100, 1000]}]
        return GridSearchCV(LinearSVC(), tune_params, scoring="f1")

    def train(self):
        features, y = self.features_from_citations()
        self.vectorizer = DictVectorizer(sparse=True)
        X_fv = self.vectorizer.fit_transform(self.features)
        self.clf = SupervisedLearner._get_SVM()
        ##
        # @TODO grid search over c?
        self.clf.fit(X_fv, y)

    def features_from_citations(self, flatten_abstracts=False):
        X, y = [], []
        pb = progressbar.ProgressBar(len(self.abstract_reader), timer=True)
        for cit_id in range(len(self.abstract_reader)):
            # first we perform feature extraction over the
            # abstract text (X)
            merged_tags = self.abstract_reader.get(cit_id)
            #pdb.set_trace()
            p = TaggedTextPipeline(merged_tags, window_size=4)
            p.generate_features()
            # @TODO will eventually want to exploit sentence
            # structure, I think
            ####
            # IM: 'punct' = token has all punctuation
            # filter here is a lambda function used on the
            # individual word's hidden features
            ###
            # X_i = p.get_features(flatten=True, filter=lambda w: w['punct']==False)
            # y_i = p.get_answers(flatten=True, answer_key=lambda w: "n" in w["tags"], filter=lambda w: w['punct']==False)
            ####
            # IM: xml annotations are now all available in w["tags"] for each
            # word in the features list
            ####
            if self.predicting_sample_size:
                ###
                # restrict to integers only
                ###
                #X_i = p.get_features(flatten=True, filter=lambda w: w['num']==True)
                X_i = p.get_features(flatten=True, filter=integer_filter)
                y_i = p.get_answers(flatten=True, answer_key=is_sample_size,
                                    filter=integer_filter)
            else:
                X_i = p.get_features(flatten=False)
                y_i = p.get_answers(flatten=False, answer_key=is_target)
            if flatten_abstracts:
                X.extend(X_i)
                y.extend(y_i)
            else:
                X.append(X_i)
                y.append(y_i)
            pb.tap()
        return X, y

    def train_on_all_data(self):
        X_train, y_train = [], []
        for i in xrange(self.n_citations):
            if self.X_fv[i] is not None:
                # we flatten these for training.
                X_train.extend(self.X_fv[i])
                y_train.extend(self.y[i])
        clf = SupervisedLearner._get_SVM()
        X_train = scipy.sparse.vstack(X_train)
        print "fitting...."
        clf.fit(X_train, y_train)
        print "success!"
        return clf, self.vectorizer
time_start = time.time()
df = pd.read_csv('datasets/training_data.csv')
#df_test = pd.read_csv('datasets/test.csv')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# attribute = ['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
#              'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss',
#              'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
#              'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
#              'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl',
#              'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
#              'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd',
#              'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst',
#              'is_sm_ips_ports', 'attack_cat']  # 43 features
# to drop an unwanted feature: df_train.drop(df_train.columns[[0]], axis=1, inplace=True)
X = get_data_to_list(X)
#print(X.head())
#print(y.head())
#for i in test: print(i)
dv = DictVectorizer()
dv.fit(X)
X_vec = dv.fit_transform(X)  # fit_transform refits; the separate fit() above is redundant
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.3)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print('rate:', metrics.accuracy_score(y_test, pred))
time_end = time.time()
print('time run:', time_end - time_start)
'''
df_no_priceLink = df_no_price.drop(['link'], 1)
df_no_priceLinkTitle = df.drop(['ID', 'price', 'link', 'title'], 1)

# --------------------------------------------------------------------------------------------------------
# Do the same for our testing file
testing_no_priceLinkTitle = testing.drop(['price', 'link', 'title'], 1)

# --------------------------------------------------------------------------------------------------------
# Now use the sklearn DictVectorizer library to map each column from the data
# frame into a numpy array: it transforms lists of feature-value mappings to
# vectors.
dv = DictVectorizer()
dv.fit(df_no_priceLinkTitle.T.to_dict().values())

# --------------------------------------------------------------------------------------------------------
# Create linear regression object
LR = LinearRegression()

# Train the model using the training sets (the DataFrame without title, link,
# or price, and then price by itself)
LR.fit(dv.transform(df_no_priceLinkTitle.T.to_dict().values()), df.price)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % LR.score(
    dv.transform(df_no_priceLinkTitle.T.to_dict().values()), df.price))
# --------------------------------------------------------------------------------------------------------
class BaseLabeler(object):
    def __init__(self, path_train="", fileid_train_labeled="",
                 fileid_train_unlabeled="", train_instances_labeled=None,
                 train_instances_unlabeled=None, fileid_train_labeled_dep="",
                 fileid_train_unlabeled_dep="", filter_func=None,
                 special_label="", featselec_featorder=[], feature_list=[],
                 path_test="", fileid_test="", model=None, target_verbs=[],
                 discard_labels=[], train_method="supervised",
                 verbose_fileid=None):
        self.filter_func = filter_func
        self.feature_list = feature_list
        self.path_test = path_test
        self.fileid_test = fileid_test
        self.model = model
        self.target_verbs = target_verbs
        self.discard_labels = discard_labels
        self.prop_num_per_verb = None
        self.verbose_fileid = verbose_fileid
        self.train_feats, self.train_labels, self.train_props = self._load_instances(
            path_train, fileid_train_labeled, train_instances_labeled,
            filter_func, special_label, False, fileid_train_labeled_dep,
            self.prop_num_per_verb)
        if train_method == "self-training":
            unlabeled_instances = self._load_instances(
                path_train, fileid_train_unlabeled, train_instances_unlabeled,
                filter_func, special_label, True, fileid_train_unlabeled_dep,
                self.prop_num_per_verb)
            # Extract features
            self.train_unlabeled_props = []
            self.train_unlabeled_feats = []
            self.train_unlabeled_labels = []
            for argcand in unlabeled_instances:
                argcand_feats, argcand_prop = self.extract_features(argcand)
                self.train_unlabeled_feats.append(argcand_feats)
                self.train_unlabeled_props.append(argcand_prop)
                argcand_label = argcand["info"]["label"]
                if argcand_label == "NULL":
                    self.train_unlabeled_labels.append("NULL")
                elif special_label != "":
                    self.train_unlabeled_labels.append(special_label)
                else:
                    self.train_unlabeled_labels.append(argcand_label)
        self.featselec_featorder = featselec_featorder
        self.train_method = train_method

    def set_params(self, path_test="", fileid_test="", fileid_test_dep="",
                   prop_num_per_verb=None):
        self.path_test = path_test
        self.fileid_test = fileid_test
        self.fileid_test_dep = fileid_test_dep
        try:
            if self.prop_num_per_verb is None or prop_num_per_verb is not None:
                self.prop_num_per_verb = prop_num_per_verb
        except AttributeError:
            self.prop_num_per_verb = prop_num_per_verb
        return

    def _load_instances(self, path, fileid, instances=None, filter_func=None,
                        special_label="", test=False, fileid_dep="",
                        prop_num_per_verb=None):
        if instances is None:
            column_types = [
                "id", "words", "lemma", "pos", "feat", "clause", "fclause",
                "tree", "srl"
            ]
            reader = PropbankBrConllCorpusReader(path, fileid, column_types,
                                                 None, "S", False, True, "utf-8")
            column_types_dep = [
                "id", "words", "lemma", "pos", "feat", "head", "deprel",
                "fillpred", "srl"
            ]
            reader_dep = PropbankBrConllCorpusReader(path, fileid_dep,
                                                     column_types_dep, None,
                                                     "FCL", False, False,
                                                     "utf-8")
            # Get the argument candidates
            argcands, self.prop_num_per_verb = self._read_instances(
                reader, filter_func, reader_dep, prop_num_per_verb)
        else:
            argcands = instances
        if test:
            return argcands
        # Extract the necessary features from the argument candidates
        train_argcands_props = []
        train_argcands_feats = []
        train_argcands_target = []
        for argcand in argcands:
            argcand_label = argcand["info"]["label"]
            if (argcand_label in self.discard_labels) or ("C-" in argcand_label):
                continue
            arg_feats, arg_prop = self.extract_features(argcand)
            train_argcands_feats.append(arg_feats)
            train_argcands_props.append(arg_prop)
            if argcand_label == "NULL":
                train_argcands_target.append("NULL")
            elif special_label != "":
                train_argcands_target.append(special_label)
            else:
                train_argcands_target.append(argcand_label)
        # Create an encoder for the features
        self.feature_encoder = DictVectorizer()
        self.feature_encoder.fit(train_argcands_feats)
        # Create an encoder for the target labels
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(train_argcands_target)
        return train_argcands_feats, train_argcands_target, train_argcands_props

    def _read_instances(self, reader, filter_func=None, reader_dep=None,
                        prop_num_per_verb=None):
        arg_cands = []
        if reader_dep is None:
            info_sent = zip(reader.lexicalinfo_sents(),
                            reader.srl_instances(None, None, False), None)
        else:
            info_sent = zip(reader.lexicalinfo_sents(),
                            reader.srl_instances(None, None, False),
                            reader_dep.dep_parsed_sents())
        if prop_num_per_verb is None:
            prop_num_per_verb = dict()
        for lexinfo_sent, sent_ins, sent_ins_depgraph in info_sent:
            # Get the parse tree of the sentence
            tree = sent_ins.tree
            for ins in sent_ins:
                # Check if the instance belongs to one of the target verbs
                if (ins.verb_stem in self.target_verbs) or (self.target_verbs == []):
                    if ins.verb_stem in prop_num_per_verb:
                        prop_num_per_verb[ins.verb_stem] += 1
                    else:
                        prop_num_per_verb[ins.verb_stem] = 1
                    verb_prop_num = prop_num_per_verb[ins.verb_stem]
                    if filter_func is None:
                        # Get the gold arguments
                        for arg in ins.arguments:
                            arg_cands.append({
                                "ins": ins,
                                "verb_prop_num": verb_prop_num,
                                "info_sent": lexinfo_sent,
                                "info": self._format_argcand(arg, lexinfo_sent, tree),
                                "depgraph": sent_ins_depgraph
                            })
                    else:
                        # Prune the constituents of the sentence to get the
                        # argument candidates
                        pruned_argcands = filter_func(
                            tree, tree.leaf_treeposition(ins.verb_head))
                        # Format each argument candidate
                        for argcand_treepos in pruned_argcands:
                            argcand_span = util.treepos_to_tuple(
                                tree, argcand_treepos)
                            # Get the label of the argument candidate
                            for arg in ins.arguments:
                                if argcand_span == arg[0]:
                                    argcand_label = arg[-1]
                                    break
                            else:
                                argcand_label = "NULL"
                            arg_cands.append({
                                "ins": ins,
                                "verb_prop_num": verb_prop_num,
                                "info_sent": lexinfo_sent,
                                "depgraph": sent_ins_depgraph,
                                "info": self._format_argcand(
                                    (argcand_span, argcand_label),
                                    lexinfo_sent, tree, argcand_treepos)
                            })
        return arg_cands, prop_num_per_verb

    def extract_features(self, argcand):
        feats_dep = feature_extractor_dep(argcand, self.feature_list)
        feats_const = feature_extractor_const(argcand, self.feature_list,
                                              argcand["depgraph"])
        feats_const.update(feats_dep)
        return feats_const, argcand["verb_prop_num"]

    def fit_mix(self, model_name="LogisticRegression"):
        if self.model is None:
            if model_name == "LinearSVC":
                model = LinearSVC(C=1, loss="l2")
            elif model_name == "SVC":
                model = SVC(kernel="poly")
            elif model_name == "LogisticRegression":
                model = LogisticRegression(C=8, penalty="l1")
            else:
                raise ValueError("Invalid model name.")
        if self.train_method == "supervised":
            self.model = model
            self.model.fit(self.feature_encoder.transform(self.train_feats),
                           self.label_encoder.transform(self.train_labels))
        return self.model

    def _join_test_discarded(self, test, discarded, test_order, discarded_order):
        joined = test
        for arg, order in zip(discarded, discarded_order):
            joined.insert(order, arg)
        return joined

    def predict_mix(self, test_instances=[], filter_first=False):
        if test_instances == []:
            if self.fileid_test != None:
                # Get the instances from the test set
                test_instances = self._load_instances(
                    path=self.path_test, fileid=self.fileid_test,
                    filter_func=self.filter_func, test=True,
                    fileid_dep=self.fileid_test_dep,
                    prop_num_per_verb=self.prop_num_per_verb)
            else:
                return []
        # We got the instances right from the output of the identification
        # system; therefore, we need to filter out first
        # those that are not argument candidates.
        if filter_first:
            test_argcands = []
            test_argcands_order = []
            discarded_argcands = []
            discarded_argcands_order = []
            discarded_argcands_labels = []
            order = 0
            for argcand, label in test_instances:
                if label != "NULL":
                    test_argcands.append(argcand)
                    test_argcands_order.append(order)
                else:
                    discarded_argcands.append(argcand)
                    discarded_argcands_order.append(order)
                    discarded_argcands_labels.append("NULL")
                order += 1
        else:
            test_argcands = test_instances
        # Extract features
        test_argcands_feats = []
        for argcand in test_argcands:
            argcands_feats, _ = self.extract_features(argcand)
            test_argcands_feats.append(argcands_feats)
        # Transform the features to the format required by the classifier
        test_argcands_feats = self.feature_encoder.transform(test_argcands_feats)
        # Classify the candidate arguments
        test_argcands_targets = self.model.predict(test_argcands_feats)
        # Get the correct label names
        test_argcands_labels = self.label_encoder.inverse_transform(
            test_argcands_targets)
        if filter_first:
            test_argcands = self._join_test_discarded(
                test_argcands, discarded_argcands, test_argcands_order,
                discarded_argcands_order)
            test_argcands_labels = self._join_test_discarded(
                test_argcands_labels.tolist(), discarded_argcands_labels,
                test_argcands_order, discarded_argcands_order)
        return zip(test_argcands, test_argcands_labels)

    def set_model_parameters(self, model_name, verbose=3, file_path=""):
        if not self.model is None:
            model_name = self.model.__class__.__name__
        if model_name == "LinearSVC":
            model_to_set = LinearSVC()
            parameters = {"C": [1, 2, 4, 8], "loss": ["l1", "l2"]}
        elif model_name == "SVC":
            model_to_set = OneVsRestClassifier(SVC(kernel="poly"))
            parameters = {
                "estimator__C": [1, 2, 4, 8],
                "estimator__kernel": ["poly", "rbf"],
                "estimator__degree": [1, 2, 3, 4]
            }
        elif model_name == "LogisticRegression":
            model_to_set = LogisticRegression()
            parameters = {"penalty": ["l1", "l2"], "C": [1, 2, 4, 8]}
        else:
            raise ValueError("Invalid model name.")
        # Perform grid search with 10-fold cross-validation to estimate the parameters
        # cv_generator = StratifiedKFold(self.label_encoder.transform(self.train_labels), n_folds=7)
        cv_generator = KFold(len(self.train_labels), n_folds=10, shuffle=True)
        model_tunning = GridSearchCV(model_to_set, param_grid=parameters,
                                     scoring=f1_score, n_jobs=1,
                                     cv=cv_generator, verbose=verbose)
        # Perform parameter setting
        model_tunning.fit(self.train_feats,
                          self.label_encoder.transform(self.train_labels))
        if verbose > 0:
            print "Best model:"
            print model_tunning.best_estimator_
            print "Best parameters:"
            print model_tunning.best_params_
            print "Best score {}:".format(model_tunning.get_params()["score_func"])
            print model_tunning.best_score_
        if file_path != "":
            file_name = file_path + model_name + "AI_Semi.bin"
            if verbose > 0:
                print "Saving best model {}...".format(file_name)
            tunned_model_file = open(file_name, "wb")
            cPickle.dump(model_tunning.best_estimator_, tunned_model_file)
            tunned_model_file.close()
        self.model = model_tunning.best_estimator_
        return self.model

    def analyse_feature_salience(self, model_name="LogisticRegression",
                                 forward=True, verbose=0):
        if self.model is None:
            if model_name == "LinearSVC":
                model = LinearSVC(C=1, loss="l2")
            elif model_name == "SVC":
                model = SVC(kernel="poly")
            elif model_name == "LogisticRegression":
                model = LogisticRegression(C=8, penalty="l1")
            else:
                raise ValueError("Invalid model name.")
        else:
            model = self.model
        # cv_generator = KFold(len(self.train_labels), n_folds=10, shuffle=True)
        cv_generator = StratifiedKFold(
            self.label_encoder.transform(self.train_labels), n_folds=7)
        fscv = FeatureSalienceCV(
            model, cv=cv_generator, forward=forward,
            score_func=[precision_score, recall_score, f1_score],
            sort_by="f1_score", verbose=verbose)
        fscv.fit_mix(self.train_feats,
                     self.label_encoder.transform(self.train_labels))
        return fscv

    def analyse_feature_selection(self, model_name="LogisticRegression",
                                  forward=True, verbose=0):
        if self.model is None:
            if model_name == "LinearSVC":
                model = LinearSVC(C=1, loss="l2")
            elif model_name == "SVC":
                model = SVC(kernel="poly")
            elif model_name == "LogisticRegression":
                model = LogisticRegression(C=8, penalty="l1")
            else:
                raise ValueError("Invalid model name.")
        else:
            model = self.model
        # cv_generator = KFold(len(self.train_labels), n_folds=10, shuffle=True)
        cv_generator = StratifiedKFold(
            self.label_encoder.transform(self.train_labels), n_folds=7)
        fscv = FeatureSelectionCV(model, cv=cv_generator,
                                  feature_order=self.featselec_featorder,
                                  score_func=f1_score, verbose=verbose)
        fscv.fit_mix(self.train_feats,
                     self.label_encoder.transform(self.train_labels))
        return fscv

    def _format_argcand(self, argcand_tuple, lexinfo_sent, tree,
                        argcand_treepos=None):
        start_arg, end_arg = argcand_tuple[0]
        if argcand_treepos is None:
            argcand_treepos = tree.treeposition_spanning_leaves(start_arg, end_arg)
        argcand = dict()
        argcand["treepos"] = argcand_treepos
        argcand["span"] = argcand_tuple[0]
        argcand["label"] = argcand_tuple[-1]
        argcand["cat"] = util.get_postag(tree[argcand_treepos])
        argcand["lexinfo"] = dict()
        for i in range(start_arg, end_arg):
            id_token, word, lemma, pos, feat = lexinfo_sent[i]
            argcand["lexinfo"][id_token] = {
                "word": word,
                "lemma": lemma,
                "pos": pos,
                "feat": feat
            }
        return argcand
def make_conversion_data(num_feat_files, from_suffix, to_suffix, with_labels=True):
    num_examples = 500
    num_feats_per_file = 7
    np.random.seed(1234567890)
    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)
    # Create lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        # if we are not using labels, we do not want zero-valued features
        # because it may be the case that some subset of features end up
        # being all 0 and if this subset ends up being written out to a file
        # below, then for some formats (e.g., megam) nothing will get written
        # out which can cause issues when reading this file
        lowest_feature_value = 0 if with_labels else 1
        x = {
            "f{:03d}".format(feat_num):
            np.random.randint(lowest_feature_value, 4 + lowest_feature_value)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        if with_labels:
            labels.append(y)
        features.append(x)
    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        label_map = {
            label: num
            for num, label in enumerate(
                sorted({
                    label
                    for label in labels
                    if not isinstance(label, (int, float))
                }))
        }
        # Add fake item to vectorizer for None
        label_map[None] = '00000'
    else:
        label_map = None
    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))
    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'
    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(
            convert_dir, '{}_{}{}{}'.format(feature_name_prefix, i,
                                            with_labels_part, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {
                "f{:03d}".format(feat_num + j):
                features[example_num]["f{:03d}".format(feat_num + j)]
                for j in range(num_feats_per_file)
            }
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs, label_map=label_map).write()
        elif from_suffix in ['.arff', '.csv', '.tsv']:
            label_col = 'y' if with_labels else None
            Writer.for_path(train_path, train_fs, label_col=label_col).write()
        else:
            Writer.for_path(train_path, train_fs).write()
    # Write out the merged features in the `to_suffix` file format
    train_path = join(
        convert_dir, '{}{}_all{}'.format(feature_name_prefix,
                                         with_labels_part, to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)
    # we need to do this to get around the FeatureSet using NaNs
    # instead of None when there are no labels which causes problems
    # later when comparing featuresets
    if not with_labels:
        train_fs.labels = [None] * len(train_fs.labels)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    elif to_suffix in ['.arff', '.csv', '.tsv']:
        label_col = 'y' if with_labels else None
        Writer.for_path(train_path, train_fs, label_col=label_col).write()
    else:
        Writer.for_path(train_path, train_fs).write()
def Feature_Names(cluster_dict):
    vectorizer = DictVectorizer(sparse=False)
    vectorizer.fit([OrderedDict.fromkeys(cluster_dict.keys(), 1)])
    cluster_id_features = vectorizer.get_feature_names()
    return cluster_id_features
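# --- Usage sketch for Feature_Names() with a toy cluster dict. Because the
# dict values are numeric (all 1), the dict keys themselves become the
# feature names, returned in sorted order.
from collections import OrderedDict
from sklearn.feature_extraction import DictVectorizer

names = Feature_Names({"cluster_a": [1, 2], "cluster_b": [3]})
print(names)  # ['cluster_a', 'cluster_b']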
def encode_categorical_features(features, sparse=True):
    encoder = DictVectorizer(sparse=sparse)
    encoder.fit(features)
    encoded_features = encoder.transform(features)
    return encoded_features, encoder
train_set_size = int(train_set_ration * len(titan))
test_set_size = len(titan) - train_set_size
train_set = titan.loc[:train_set_size]
test_set = titan.loc[train_set_size:]

# Create features and target for train
train_features = feature_cleaning(train_set)
train_target = train_set['Survived']
test_features = feature_cleaning(test_set)
test_target = test_set['Survived']

# Pre-processing
vectorizer = DictVectorizer(sparse=False)
vectorizer.fit(train_features.to_dict(orient='records'))
cleaned_train_features = ghosl(train_features, vectorizer)
cleaned_test_features = ghosl(test_features, vectorizer)

# Make model (logistic regression)
lr = lrm.LogisticRegression()
lr.fit(cleaned_train_features, train_target)
print("Train set accuracy: " + str(show_result(cleaned_train_features, train_target, lr)))
print("Test set accuracy: " + str(show_result(cleaned_test_features, test_target, lr)))

eval_data = ps.read_csv('data/test.csv')
cleaned_eval_data = feature_cleaning(eval_data, False)
result = solve(cleaned_eval_data, lr, vectorizer)
class NMF_Label:
    ''' '''

    def __init__(self, n_components=20, distance='cos', sparse=True, tfidf=False):
        self.n_components = n_components
        self.distance = distance
        self.tfidf = tfidf
        # DictVectorizer for embedding
        self.dv_x = DictVectorizer(sparse=sparse)
        self.dv_y = DictVectorizer(sparse=sparse)
        # Model for Tfidf
        self.TFIDF = None

    def build(self, universe_x, universe_y):
        # Build DictVectorizer for feature (x)
        self.dv_x.fit([{x: 1} for x in universe_x])
        self.map_v2i_x = self.dv_x.vocabulary_
        self.map_i2v_x = dict(zip(self.map_v2i_x.values(), self.map_v2i_x.keys()))
        # Build DictVectorizer for target (y)
        self.dv_y.fit([{x: 1} for x in universe_y])
        self.map_v2i_y = self.dv_y.vocabulary_
        self.map_i2v_y = dict(zip(self.map_v2i_y.values(), self.map_v2i_y.keys()))

    def compile(self):
        # Deterministic, do nothing
        pass

    def fit(self, x=None, y=None, verbose=False):
        # Embed feature (x)
        embed_matrix_x = self.dv_x.transform([{v: 1 for v in arr} for arr in x])
        if self.tfidf:
            self.TFIDF = TfidfModel(
                list([(j, row[0, j]) for j in row.nonzero()[1]]
                     for row in embed_matrix_x),
                normalize=False)
            embed_matrix_x = self.dv_x.transform([
                {self.map_i2v_x[i]: w
                 for i, w in self.TFIDF[list(
                     (self.map_v2i_x[v], 1.0) for v in arr
                     if v in self.map_v2i_x)]}
                for arr in x
            ])
        # Embed target (y)
        embed_matrix_y = self.dv_y.transform([{v: 1 for v in arr} for arr in y])
        # Co-occurrence matrix
        # Raw
        co_matrix = embed_matrix_y.T.dot(embed_matrix_x)
        # Normalized (row-wise)
        co_matrix_norm = co_matrix / np.linalg.norm(co_matrix.A, ord=2, axis=1,
                                                    keepdims=True)
        # Factorize using NMF
        nmf = NMF(n_components=self.n_components, random_state=RANDOM_STATE)
        #, beta_loss='kullback-leibler', solver='mu')
        self.U = nmf.fit_transform(co_matrix)
        self.V = nmf.components_.T
        if verbose:
            print('Recon error: {0}, Raw matrix norm: {1}'.format(
                nmf.reconstruction_err_, np.linalg.norm(co_matrix.A, ord=2)))

    def predict(self, x, n_best=1):
        # Embed feature (x)
        if self.tfidf:
            embed_matrix_x = self.dv_x.transform([
                {self.map_i2v_x[i]: w
                 for i, w in self.TFIDF[list(
                     (self.map_v2i_x[v], 1.0) for v in arr
                     if v in self.map_v2i_x)]}
                for arr in x
            ])
        else:
            embed_matrix_x = self.dv_x.transform([{v: 1 for v in arr} for arr in x])
        # Transform embedded description into encoded space
        enc_x = embed_matrix_x.dot(self.V)
        # Match by finding NN in encoded space wrt rows of U
        if self.distance == 'cos':
            # Cosine distance: normalize the encoded vectors first
            U_norm = self.U / (np.linalg.norm(self.U, ord=2, axis=1,
                                              keepdims=True) + EPS)
            enc_x_norm = enc_x / (np.linalg.norm(enc_x, ord=2, axis=1,
                                                 keepdims=True) + EPS)
            dist_matrix = U_norm.dot(enc_x_norm.T)
            # y_idx = np.argmax(dist_matrix, axis=0)
            y_idx = np.argsort(dist_matrix, axis=0)[-n_best:, :].T
        # Recover target (y) from embed idx
        y = utils.asarray_of_list([[self.map_i2v_y[i] for i in arr]
                                   for arr in y_idx])
        return y

    def save_model(self, filename):
        with open(filename, 'wb') as file:
            pickle.dump((self.U, self.V), file)

    def load_model(self, filename):
        with open(filename, 'rb') as file:
            self.U, self.V = pickle.load(file)
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 0},
]
ground_truth = [1, 1, 1, 1, 0, 0, 0, 0]

# initialize the vectorizer with the training data
vectorizer.fit(training_examples)

# transform the training examples to vector form
X = vectorizer.transform(training_examples)
y = ground_truth  # call ground truth 'y', by convention

# train the classifier (a.k.a. 'fit' the classifier)
classifier.fit(X, y)

test_example = {'packed': 1, 'contains_encrypted': 0}
test_vector = vectorizer.transform(test_example)
print classifier.predict(test_vector)  # prints [1]

# visualize the decision tree
with open("classifier.dot", "w") as output_file:
    tree.export_graphviz(classifier,
                         feature_names=vectorizer.get_feature_names(),
                         out_file=output_file)
class NGramFrequencyExtractor(BaseEstimator, TransformerMixin):
    """
    Transformer object turning messages into frequency feature vectors,
    counting ngrams up to a specified maximum.

    scikit-learn documentation on creating estimators:
    http://scikit-learn.org/dev/developers/contributing.html#rolling-your-own-estimator
    """

    def __init__(self, lexicon, form=None,
                 default_form=lambda word, lexicon: word,
                 ngram_size=1, adjust_for_message_len=True):
        self.lexicon = lexicon
        self.form = form
        self.default_form = default_form
        self.ngram_size = ngram_size
        self.vectorizer = DictVectorizer()
        self.adjust_for_message_len = adjust_for_message_len

    def extract_frequency_dicts(self, X):
        frequency_dicts = []
        for message in X:
            tuple_ngrams = nltk.ngrams(self.retrieve_lexical_form(message),
                                       self.ngram_size)
            string_ngrams = [",".join(ngram) for ngram in tuple_ngrams]
            frequency_dict = Counter(string_ngrams)
            if self.adjust_for_message_len:
                for ngram in frequency_dict:
                    frequency_dict[ngram] = frequency_dict[ngram] / len(string_ngrams)
            frequency_dicts.append(frequency_dict)
        return frequency_dicts

    def fit(self, X, y=None):
        """
        Determines the list of tokens and ngrams to be used

        :param X: List of tokenised messages
        :type X: list(list(str))
        """
        frequency_dicts = self.extract_frequency_dicts(X)
        self.vectorizer.fit(frequency_dicts)
        return self

    def transform(self, X, y=None):
        """
        Transforms tokenised messages into frequency vectors

        :return: frequency vectors
        :rtype: numpy array of shape [n_samples, n_features]
        """
        frequency_dicts = self.extract_frequency_dicts(X)
        return self.vectorizer.transform(frequency_dicts)

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it

        :return: frequency vectors
        :rtype: numpy array of shape [n_samples, n_features]
        """
        frequency_dicts = self.extract_frequency_dicts(X)
        return self.vectorizer.fit_transform(frequency_dicts)

    def get_feature_names(self):
        try:
            return self.vectorizer.get_feature_names()
        except AttributeError:
            raise AttributeError("No feature names, object has not been fitted")

    def retrieve_lexical_form(self, message):
        if self.form is None:
            return message
        assert self.lexicon.has_feature(self.form)
        transformed_message = []
        for word in message:
            if word in self.lexicon and self.lexicon.get_feature_value_by_word(word, self.form):
                transformed_message.append(
                    self.lexicon.get_feature_value_by_word(word, self.form))
            else:
                transformed_message.append(self.default_form(word, self.lexicon))
        return transformed_message
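# A minimal usage sketch: with form=None the lexicon is never consulted, so
# lexicon=None is safe here; messages are lists of tokens.
messages = [["the", "cat", "sat"], ["the", "dog", "ran"]]
bigram_extractor = NGramFrequencyExtractor(lexicon=None, ngram_size=2)
X_bigrams = bigram_extractor.fit_transform(messages)
print(bigram_extractor.get_feature_names())  # e.g. ['cat,sat', 'dog,ran', 'the,cat', 'the,dog']
print(X_bigrams.toarray())                   # length-adjusted frequencies (0.5 each here)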
"Bedroom AbvGr", "Year Built", "Yr Sold", "Neighborhood"] X_train_dict = housing[features].to_dict(orient="records") y_train = housing["SalePrice"] # - # Now we will use Scikit-Learn to preprocess the features... # + from sklearn.feature_extraction import DictVectorizer from sklearn.preprocessing import StandardScaler vec = DictVectorizer(sparse=False) vec.fit(X_train_dict) X_train = vec.transform(X_train_dict) scaler = StandardScaler() scaler.fit(X_train) X_train_sc = scaler.transform(X_train) # - # ...and to fit the $k$-nearest neighbors model to the data. # + from sklearn.neighbors import KNeighborsRegressor # Fit a 10-nearest neighbors model. model = KNeighborsRegressor(n_neighbors=10) model.fit(X_train_sc, y_train)
class Normalizer(object):
    '''Converter and normalizer of data for the classifiers.'''

    def __init__(self, norm="l2"):
        super(Normalizer, self).__init__()
        self.featureNames = [
            "Provincia", "Canton", "Totalpobla", "Superficie",
            "Densidadpobln", "Urbano/Rural", "Genero", "Edad", "Dependencia",
            "Alfabeta", "Escolaridadpromedio", "Escolaridadregular",
            "Trabaja", "Asegurado", "Cantcasas", "Ocupantespromedio",
            "Condicion", "Hacinada", "Nacido", "Discapacitado",
            "Jefaturafemenina", "Jefaturacompartida", "Votoronda1",
            "Votoronda2"
        ]
        self.converter = DictVectorizer(sparse=False)
        self.norm = norm
        self.normalData = {}
        self.convertedData = {}
        self.tensorColumns = []

    def prepare_data_tensor(self, samples, pct_test):
        data = self.separate_data(samples, pct_test)
        for key in data:
            if "Classes" not in key:
                data[key] = self.convert_to_dict_list(data[key])
        return data

    '''
    Returns the data of the samples passed as a parameter, in a dictionary
    of the form:
    {
        "trainingFeatures": <training data>,
        "testingFeatures": <testing data>,
        "trainingFeaturesFirstInclude": <training data including the first-round vote>,
        "testingFeaturesFirstInclude": <testing data including the first-round vote>,
        "trainingClassesFirst": <first-round results for the training data>,
        "trainingClassesSecond": <second-round results for the training data>,
        "testingClassesFirst": <first-round results for the testing data>,
        "testingClassesSecond": <second-round results for the testing data>
    }
    Input: data produced by the sample generator, and the percentage to use for testing.
    Output: the data as a dictionary.
    '''
    def prepare_data(self, samples, pct_test):
        data = self.separate_data(samples, pct_test)
        # The data is converted to numbers only
        self.convert_data(data)
        return data

    '''
    Converts all the data to values between 0 and 1.
    Input: data already converted to numbers.
    Output: the new data is stored in the same dictionary.
    '''
    def normalize_data(self, data):
        data["trainingFeatures"] = normalize(data["trainingFeatures"],
                                             norm=self.norm, copy=False)
        data["testingFeatures"] = normalize(data["testingFeatures"],
                                            norm=self.norm, copy=False)
        data["trainingFeaturesFirstInclude"] = normalize(
            data["trainingFeaturesFirstInclude"], norm=self.norm, copy=False)
        data["testingFeaturesFirstInclude"] = normalize(
            data["testingFeaturesFirstInclude"], norm=self.norm, copy=False)

    '''
    Converts the data into dictionaries keyed by the indicators (the
    property names).
    Input: data to transform, of the form [["Genero", "Canton", ...], ...]
    Output: the data as a list of dictionaries.
    '''
    def convert_to_dict_list(self, samples):
        features = []
        for featureList in samples:
            dictFeatures = {}
            featureNum = 0
            for feature in featureList:
                try:
                    feature = float(feature)
                except ValueError:
                    # The property is a string
                    pass
                dictFeatures[self.featureNames[featureNum]] = feature
                featureNum += 1
            features.append(dictFeatures)
        return features

    def convert_to_list(self, data_dict):
        result = []
        for key in data_dict:
            result.append(data_dict[key])
        return result

    '''
    Converts the data to numeric values.
    Input: list of data in dictionary form.
    Output: stores the data in the same input dictionary.
    '''
    def convert_data(self, data):
        for key in data:
            if "Classes" not in key:
                data[key] = self.convert_to_dict_list(data[key])
        self.converter.fit(
            np.append(data["trainingFeatures"], data["testingFeatures"],
                      axis=0))
        data["trainingFeatures"] = self.converter.transform(
            data["trainingFeatures"])
        data["testingFeatures"] = self.converter.transform(
            data["testingFeatures"])
        self.converter.fit(
            np.append(data["trainingFeaturesFirstInclude"],
                      data["testingFeaturesFirstInclude"], axis=0))
        data["trainingFeaturesFirstInclude"] = self.converter.transform(
            data["trainingFeaturesFirstInclude"])
        data["testingFeaturesFirstInclude"] = self.converter.transform(
            data["testingFeaturesFirstInclude"])
        self.convertedData = copy.deepcopy(data)

    '''
    Splits the data into training and testing data.
    Input: the data produced by the generator, and the percentage to use for testing.
    Output: a dictionary with the separated data.
    '''
    def separate_data(self, samples, pct_test):
        samplesArray = np.array(samples)
        X = samplesArray[:, :22]
        y = samplesArray[:, 22:]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=pct_test, random_state=42)
        y_train_first_round = y_train[:, 0]
        y_train_second_round = y_train[:, 1]
        y_test_first_round = y_test[:, 0]
        y_test_second_round = y_test[:, 1]
        X_train_2 = np.append(X_train, y_train[:, :1], axis=1)
        X_test_2 = np.append(X_test, y_test[:, :1], axis=1)
        self.normalData = {
            "trainingFeatures": X_train,
            "testingFeatures": X_test,
            "trainingFeaturesFirstInclude": X_train_2,
            "testingFeaturesFirstInclude": X_test_2,
            "trainingClassesFirst": y_train_first_round,
            "trainingClassesSecond": y_train_second_round,
            "testingClassesFirst": y_test_first_round,
            "testingClassesSecond": y_test_second_round
        }
        return dict(self.normalData)

    def separate_data_2(self, samples, pct_test):
        samplesArray = np.array(samples)
        X = samplesArray
        y = samplesArray[:, 22:]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=pct_test, random_state=42)
        X_train_2 = np.delete(X_train, [22], axis=1)
        X_test_2 = np.delete(X_test, [22], axis=1)
        X_train_3 = X_train
        X_test_3 = X_test
        X_train = np.delete(X_train, [23], axis=1)
        X_test = np.delete(X_test, [23], axis=1)
        y_train_first_round = y_train[:, 0]
        y_train_second_round = y_train[:, 1]
        y_test_first_round = y_test[:, 0]
        y_test_second_round = y_test[:, 1]
        self.normalData = {
            "trainingFeatures": X_train,
            "testingFeatures": X_test,
            "trainingFeaturesSecond": X_train_2,
            "testingFeaturesSecond": X_test_2,
            "trainingFeaturesFirstInclude": X_train_3,
            "testingFeaturesFirstInclude": X_test_3,
            "trainingClassesFirst": y_train_first_round,
            "trainingClassesSecond": y_train_second_round,
            "testingClassesFirst": y_test_first_round,
            "testingClassesSecond": y_test_second_round
        }
        return {
            "trainingFeaturesFirst": X_train,
            "testingFeaturesFirst": X_test,
            "trainingFeaturesSecond": X_train_2,
            "testingFeaturesSecond": X_test_2,
            "trainingFeaturesFirstInclude": X_train_3,
            "testingFeaturesFirstInclude": X_test_3,
            "trainingClassesFirst": y_train_first_round,
            "trainingClassesSecond": y_train_second_round,
            "testingClassesFirst": y_test_first_round,
            "testingClassesSecond": y_test_second_round
        }

    def get_normal_data(self):
        return self.normalData

    def get_converted_data(self):
        return self.convertedData

    '''
    Compares the size of two lists and returns the size of the larger one.
    Input: two lists.
    Output: size of the larger list.
    '''
    def bigger_size(self, list1, list2):
        if len(list1) >= len(list2):
            return len(list1)
        return len(list2)

    '''
    Creates a list of zeros of the desired size.
    Input: size of the list to create.
    Output: list of zeros of the given size.
    '''
    def extra_list(self, num):
        return [0] * num
def model(load_model, model_type):
    # Create training and testing set
    X_dict_train, y_train = process_data('train')
    X_dict_validation = process_data('validation')

    # Create the validation set and turn it into one-hot encoded vectors
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    dict_one_hot_encoder.fit(X_dict_train)
    X_validation = dict_one_hot_encoder.transform(X_dict_validation)

    # Load model
    if load_model:
        print('Loading model from previous training...')
        if model_type == 'decision_tree':
            d_tree_file = open('./models/decision_tree_model.sav', 'rb')
        elif model_type == 'random_forest':
            d_tree_file = open('./models/random_forest_model.sav', 'rb')
        else:
            print("Cannot load model without model_type")
            return 0
        train_model = pickle.load(d_tree_file)
        d_tree_file.close()
    else:
        # Transform the training dictionary into one-hot encoded vectors
        X_train = dict_one_hot_encoder.transform(X_dict_train)
        print('Completed processing data')

        # Train the classifier (a.k.a. 'fit' the classifier)
        if model_type == 'decision_tree':
            train_model = DecisionTreeClassifier(criterion='gini',
                                                 min_samples_split=30)
        elif model_type == 'random_forest':
            train_model = RandomForestClassifier(n_estimators=100,
                                                 criterion='gini',
                                                 min_samples_split=30,
                                                 n_jobs=-1)
        else:
            print("Cannot set up model without model_type")
            return 0
        print("Started training...")
        train_model.fit(X_train, y_train)
        print('Completed training')

        # Save model
        if model_type == 'decision_tree':
            model_file = open('./models/decision_tree_model.sav', "wb")
        elif model_type == 'random_forest':
            model_file = open('./models/random_forest_model.sav', "wb")
        else:
            print("Cannot save model without model_type")
            return 0
        pickle.dump(train_model, model_file)
        model_file.close()
        print('Saved model')

    # Evaluate and run the model on validation data
    print('Tuning base bid for the model...')
    pCTRs = train_model.predict_proba(X_validation)[:, 1]
    if model_type == 'decision_tree':
        f = open('tune_base_bid_decision_tree.csv', 'w')
    elif model_type == 'random_forest':
        f = open('tune_base_bid_random_forest.csv', 'w')
    else:
        print("Cannot save results without model_type")
        return 0
    f.write('basebid,clicks,CTR,spend,avgCPM,avgCPC\n')
    for base_bid in range(1, 201, 1):
        bidding_results = bidding(pCTRs, base_bid)
        for bidding_result in bidding_results:
            f.write(str(bidding_result) + ',')
        f.write('\n')
    f.close()
    return 0
    for index, (term, class_) in enumerate(pos_tags):
        # Add basic NLP features for each sentence term
        X.append(add_basic_features(untag(pos_tags), index))
        y.append(class_)
    return X, y


# Get the right sets
X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(testing_sentences)
X_val, y_val = transform_to_dataset(validation_sentences)

# Fit our DictVectorizer with our set of features
from sklearn.feature_extraction import DictVectorizer
dict_vectorizer = DictVectorizer(sparse=True)
dict_vectorizer.fit(X_train + X_test + X_val)

# Convert dict features to vectors
X_train = dict_vectorizer.transform(X_train)
X_test = dict_vectorizer.transform(X_test)
X_val = dict_vectorizer.transform(X_val)

from sklearn.preprocessing import LabelEncoder

# Fit LabelEncoder with our list of classes
label_encoder = LabelEncoder()
label_encoder.fit(y_train + y_test + y_val)

# Encode class values as integers
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
y_val = label_encoder.transform(y_val)
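# The snippet relies on an `add_basic_features(sentence_terms, index)` helper
# that is not shown here. A hypothetical sketch of what such a word-level
# feature extractor might look like (an assumption, not the original code):
def add_basic_features(sentence_terms, index):
    term = sentence_terms[index]
    return {
        'word': term.lower(),
        'is_capitalized': term[:1].isalpha() and term[:1].upper() == term[:1],
        'prefix-1': term[:1],
        'suffix-1': term[-1:],
        'is_first': index == 0,
        'is_last': index == len(sentence_terms) - 1,
    }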
def main():
    import gp
    from sklearn.feature_extraction import DictVectorizer

    parser = argparse.ArgumentParser()
    parser.add_argument('-g', '--gpu-split', type=float, default=1,
                        help="Num ways we'll split the GPU (how many tabs you running?)")
    parser.add_argument('-n', '--net-type', type=str, default='conv2d',
                        help="(lstm|conv2d) Which network arch to use")
    parser.add_argument('--guess', type=int, default=-1,
                        help="Run the hard-coded 'guess' values first before exploring")
    parser.add_argument('--boost', action="store_true", default=False,
                        help="Use custom gradient-boosting optimization, or bayesian optimization?")
    args = parser.parse_args()

    # Encode features
    hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
    hypers_, hardcoded = hsearch.hypers, hsearch.hardcoded
    hypers_ = {k: v for k, v in hypers_.items() if k not in hardcoded}
    hsearch.close()

    # Build a matrix of features, length = max feature size
    max_num_vals = 0
    for v in hypers_.values():
        l = len(v['vals'])
        if l > max_num_vals:
            max_num_vals = l
    empty_obj = {k: None for k in hypers_}
    mat = pd.DataFrame([empty_obj.copy() for _ in range(max_num_vals)])
    for k, hyper in hypers_.items():
        for i, v in enumerate(hyper['vals']):
            mat.loc[i, k] = v
    mat.ffill(inplace=True)

    # Above is Pandas-friendly stuff; now convert to sklearn-friendly and
    # pipe through the one-hot encoding
    vectorizer = DictVectorizer()
    vectorizer.fit(mat.T.to_dict().values())
    feat_names = vectorizer.get_feature_names()

    # Map TensorForce actions to GP-compatible `domain`
    # (instantiate just to get actions -- get them from hypers above?)
    bounds = []
    for k in feat_names:
        hyper = hypers_.get(k, False)
        if hyper:
            bounded = hyper['type'] == 'bounded'
            min_, max_ = min(hyper['vals']), max(hyper['vals'])
            b = [min_, max_] if bounded else [0, 1]
            bounds.append(b)

    def hypers2vec(obj):
        h = dict()
        for k, v in obj.items():
            if k in hardcoded:
                continue
            if type(v) == bool:
                h[k] = float(v)
            else:
                h[k] = v or 0.
        return vectorizer.transform(h).toarray()[0]

    def vec2hypers(vec):
        # Reverse the encoding
        # https://stackoverflow.com/questions/22548731/how-to-reverse-sklearn-onehotencoder-transform-to-recover-original-data
        # https://github.com/scikit-learn/scikit-learn/issues/4414
        reversed_ = vectorizer.inverse_transform([vec])[0]
        obj = {}
        for k, v in reversed_.items():
            if '=' not in k:
                obj[k] = v
                continue
            if k in obj:
                continue  # we already handled this x=y logic (below)
            # Find the winner (max) option for this key
            score, attr, val = v, k.split('=')[0], k.split('=')[1]
            for k2, score2 in reversed_.items():
                if k2.startswith(attr + '=') and score2 > score:
                    score, val = score2, k2.split('=')[1]
            obj[attr] = val
        # Bools come in as floats. Also, if the result is False they don't
        # come in at all! So we iterate hypers now instead of nesting this
        # logic in the reversed-iteration above.
        for k, v in hypers_.items():
            if v['type'] == 'bool':
                obj[k] = bool(round(obj.get(k, 0.)))
        return obj

    # Specify the "loss" function (which we'll maximize) as a single
    # rl_hsearch instantiate-and-run
    def loss_fn(params):
        hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
        reward = hsearch.execute(vec2hypers(params))
        hsearch.close()
        return [reward]

    guess_i = 0
    while True:
        # Every iteration, re-fetch from the database and pre-train a new
        # model. Acts the same as saving/loading a model to disk, but this
        # allows us to distribute across servers easily.
        conn_runs = data.engine_runs.connect()
        sql = "select hypers, advantages, advantage_avg from runs where flag=:f"
        runs = conn_runs.execute(text(sql), f=args.net_type).fetchall()
        conn_runs.close()
        X, Y = [], []
        for run in runs:
            X.append(hypers2vec(run.hypers))
            Y.append([utils.calculate_score(run)])
        boost_model = print_feature_importances(X, Y, feat_names)

        if args.guess != -1:
            guess = {k: v['guess'] for k, v in hypers_.items()}
            guess.update(utils.guess_overrides[args.guess][guess_i])
            loss_fn(hypers2vec(guess))
            guess_i += 1
            if guess_i > len(utils.guess_overrides[args.guess]) - 1:
                args.guess = -1  # start on GP
            continue

        if args.boost:
            print('Using gradient-boosting')
            boost_optimization(model=boost_model, loss_fn=loss_fn,
                               bounds=np.array(bounds), x_list=X, y_list=Y)
        else:
            # Evidently duplicate values break GP. Many of these are ints, so
            # they're definite duplicates. Either way, tack on some small
            # epsilon to make them different (1e-6 < gp.py's min threshold;
            # make sure that's not a problem). I'm concerned about this since
            # many hypers can go below that epsilon (e.g. learning-rate).
            for x in X:
                for i, v in enumerate(x):
                    x[i] += np.random.random() * 1e-6
            gp.bayesian_optimisation2(loss_fn=loss_fn,
                                      bounds=np.array(bounds),
                                      x_list=X, y_list=Y)
# NOTE: sklearn's Imputer (used below) was deprecated in 0.20 and removed in
# 0.22 in favour of sklearn.impute.SimpleImputer.
imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
for i in num_in:
    train.iloc[:, i] = np.transpose(imp.fit_transform(train.iloc[:, i]))
    test.iloc[:, i] = np.transpose(imp.fit_transform(test.iloc[:, i]))

# Replace nan in the DataFrame, since it gets confused with NaN: here nan is
# actually a string value everywhere except in LotFrontage and MasVnrArea
train.iloc[:, :] = train.iloc[:, :].replace(np.nan, 'nnn')
test.iloc[:, :] = test.iloc[:, :].replace(np.nan, 'nnn')

# One-hot encoding for string data: DictVectorizer needs dictionary input,
# so convert the DataFrame's selected columns to dictionaries
enc = DictVectorizer(sparse=False)
train_dic = train.iloc[:, cat_ind].to_dict(orient='records')
test_dic = test.iloc[:, cat_ind].to_dict(orient='records')
enc.fit(train_dic)
x_train_categorical = enc.transform(train_dic)
x_test_categorical = enc.transform(test_dic)

# Numerical features
x_train_numeric = train.iloc[:, num_in]
x_train_numeric = normalize(x_train_numeric)
x_train = np.concatenate((x_train_numeric, x_train_categorical), axis=1)
x_test_numeric = test.iloc[:, num_in]
x_test_numeric = normalize(x_test_numeric)
x_test = np.concatenate((x_test_numeric, x_test_categorical), axis=1)

# The last column is the target/class variable
y_train = train.iloc[:, 80]
m = x_train.shape[0]
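# A sketch of the modern replacement for the Imputer block above, using
# SimpleImputer (available since scikit-learn 0.20). It imputes column-wise,
# so the per-column transpose dance is unnecessary:
from sklearn.impute import SimpleImputer

imp_new = SimpleImputer(missing_values=np.nan, strategy='mean')
train.iloc[:, num_in] = imp_new.fit_transform(train.iloc[:, num_in])
test.iloc[:, num_in] = imp_new.transform(test.iloc[:, num_in])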
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create the lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4)
             for feat_num in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = {label: num for num, label in
                 enumerate(sorted({label for label in labels
                                   if not isinstance(label, (int, float))}))}
    # Add fake item to vectorizer for None
    label_map[None] = '00000'

    # Get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(convert_dir,
                          '{}_{}{}'.format(feature_name_prefix, i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)]
                 for j in range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs, label_map=label_map).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir,
                      '{}_all{}'.format(feature_name_prefix, to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    else:
        Writer.for_path(train_path, train_fs).write()
def main_gp():
    import gp, GPyOpt
    from sklearn.feature_extraction import DictVectorizer

    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--agent', type=str, default='ppo_agent',
                        help="Agent to use (ppo_agent|dqn_agent|etc)")
    parser.add_argument('-g', '--gpu_split', type=float, default=1,
                        help="Num ways we'll split the GPU (how many tabs you running?)")
    parser.add_argument('-n', '--net_type', type=str, default='lstm',
                        help="(lstm|conv2d) Which network arch to use")
    parser.add_argument('--guess', action="store_true", default=False,
                        help="Run the hard-coded 'guess' values first before exploring")
    parser.add_argument('--gpyopt', action="store_true", default=False,
                        help="Use GPyOpt library, or use basic sklearn GP implementation? "
                             "GPyOpt shows more promise, but has bugs.")
    args = parser.parse_args()

    # Encode features
    hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
    hypers_, hardcoded = hsearch.hypers, hsearch.hardcoded
    hypers_ = {k: v for k, v in hypers_.items() if k not in hardcoded}
    hsearch.close()

    # Build a matrix of features, length = max feature size
    max_num_vals = 0
    for v in hypers_.values():
        l = len(v['vals'])
        if l > max_num_vals:
            max_num_vals = l
    empty_obj = {k: None for k in hypers_}
    mat = pd.DataFrame([empty_obj.copy() for _ in range(max_num_vals)])
    for k, hyper in hypers_.items():
        for i, v in enumerate(hyper['vals']):
            mat.loc[i, k] = v
    mat.ffill(inplace=True)

    # Above is Pandas-friendly stuff; now convert to sklearn-friendly and
    # pipe through the one-hot encoding
    vectorizer = DictVectorizer()
    vectorizer.fit(mat.T.to_dict().values())
    feat_names = vectorizer.get_feature_names()

    # Map TensorForce actions to GPyOpt-compatible `domain`
    # (instantiate just to get actions -- get them from hypers above?)
    bounds = []
    for k in feat_names:
        hyper = hypers_.get(k, False)
        if hyper:
            bounded = hyper['type'] == 'bounded'
            min_, max_ = min(hyper['vals']), max(hyper['vals'])
            if args.gpyopt:
                b = {'name': k, 'type': 'discrete', 'domain': (0, 1)}
                if bounded:
                    b.update(type='continuous', domain=(min_, max_))
            else:
                b = [min_, max_] if bounded else [0, 1]
            bounds.append(b)

    def hypers2vec(obj):
        h = dict()
        for k, v in obj.items():
            if k in hardcoded:
                continue
            if type(v) == bool:
                h[k] = float(v)
            else:
                h[k] = v or 0.
        return vectorizer.transform(h).toarray()[0]

    def vec2hypers(vec):
        # Reverse the encoding
        # https://stackoverflow.com/questions/22548731/how-to-reverse-sklearn-onehotencoder-transform-to-recover-original-data
        # https://github.com/scikit-learn/scikit-learn/issues/4414
        if not args.gpyopt:
            vec = [vec]  # gp.py passes it flat, GPyOpt wrapped
        reversed_ = vectorizer.inverse_transform(vec)[0]
        obj = {}
        for k, v in reversed_.items():
            if '=' not in k:
                obj[k] = v
                continue
            if k in obj:
                continue  # we already handled this x=y logic (below)
            # Find the winner (max) option for this key
            score, attr, val = v, k.split('=')[0], k.split('=')[1]
            for k2, score2 in reversed_.items():
                if k2.startswith(attr + '=') and score2 > score:
                    score, val = score2, k2.split('=')[1]
            obj[attr] = val
        # Bools come in as floats. Also, if the result is False they don't
        # come in at all! So we iterate hypers now instead of nesting this
        # logic in the reversed-iteration above.
        for k, v in hypers_.items():
            if v['type'] == 'bool':
                obj[k] = bool(round(obj.get(k, 0.)))
        return obj

    # Specify the "loss" function (which we'll maximize) as a single
    # rl_hsearch instantiate-and-run
    def loss_fn(params):
        hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
        reward = hsearch.execute(vec2hypers(params))
        hsearch.close()
        return [reward]

    while True:
        conn = data.engine.connect()
        sql = "SELECT hypers, reward_avg FROM runs WHERE flag=:f"
        runs = conn.execute(text(sql), f=args.net_type).fetchall()
        conn.close()
        X, Y = [], []
        for run in runs:
            X.append(hypers2vec(run.hypers))
            Y.append([run.reward_avg])
        print_feature_importances(X, Y, feat_names)

        if args.guess:
            guesses = {k: v['guess'] for k, v in hypers_.items()}
            X.append(hypers2vec(guesses))
            Y.append([None])
            args.guess = False

        if args.gpyopt:
            pretrain = {'X': np.array(X), 'Y': np.array(Y)} if X else {}
            opt = GPyOpt.methods.BayesianOptimization(f=loss_fn,
                                                      domain=bounds,
                                                      maximize=True,
                                                      **pretrain)
            # Using max_iter=1 because of the database setup. Normally you'd
            # go until convergence, but since we're using a database for the
            # runs we can parallelize runs across machines (connected to the
            # same database). Between each run we grab the results from the
            # other machines and merge them with our own; so run once, reset
            # the model-fitting with the full database (which may have
            # grown), and repeat.
            opt.run_optimization(max_iter=1)
        else:
            gp.bayesian_optimisation2(n_iters=1, loss_fn=loss_fn,
                                      bounds=np.array(bounds),
                                      x_list=X, y_list=Y)
class GlobalFeatures(object):
    def __init__(self, word2vec_model=None, cluster_vocabs=None,
                 dict_features=None, cat_names=None, WORD_IDX=0):
        self.word2vec_model = word2vec_model
        self.cluster_vocabs = cluster_vocabs
        self.dict_features = dict_features
        self.WORD_IDX = WORD_IDX
        self.cat_names = cat_names

    def get_global_sequence_features(self, sent, predictions=None):
        features = dict()
        sent_length = len(sent) * 1.
        for word in sent:
            word = word[self.WORD_IDX]
            lookup_key = preprocess_token(word, to_lower=True)
            if self.word2vec_model and lookup_key in self.word2vec_model:
                for i, v in enumerate(self.word2vec_model[lookup_key]):
                    features["_GLOBAL_WORDVEC_%s" % i] = dict.get(
                        features, "_GLOBAL_WORDVEC_%s" % i, 0) + v
            if self.cluster_vocabs and lookup_key in self.cluster_vocabs:
                v = dict.get(self.cluster_vocabs, lookup_key)
                features["_GLOBAL_CLUSTER_=%s" % v] = dict.get(
                    features, "_GLOBAL_CLUSTER_=%s" % v, 0) + 1
        features = {k: v / sent_length for k, v in six.iteritems(features)}
        if predictions:
            for k, prob in six.iteritems(predictions):
                features["_MODEL_=%s" % k] = prob
        return [features for word in sent]

    def tweet_features(self, sent):
        features = {}
        sent_length = len(sent) * 1.
        for widx, word in enumerate(sent):
            word = word[self.WORD_IDX]
            lookup_key = preprocess_token(word, to_lower=True)
            if self.word2vec_model and lookup_key in self.word2vec_model:
                for i, v in enumerate(self.word2vec_model[lookup_key]):
                    features["_GLOBAL_WORDVEC_%s" % i] = dict.get(
                        features, "_GLOBAL_WORDVEC_%s" % i, 0) + v
            if self.cluster_vocabs and lookup_key in self.cluster_vocabs:
                v = dict.get(self.cluster_vocabs, lookup_key)
                features["_GLOBAL_CLUSTER_=%s" % v] = dict.get(
                    features, "_GLOBAL_CLUSTER_=%s" % v, 0) + 1
            if self.dict_features:
                # NOTE: the original used a bare WORD_IDX here, which would
                # raise a NameError; self.WORD_IDX is what's intended
                d_features = self.dict_features.GetDictFeatures(
                    [k[self.WORD_IDX] for k in sent], widx)
                for k in d_features:
                    features[k] = dict.get(features, k, 0) + 1
                d_hashtag_features = self.dict_features.GetHashtagDictFeatures(word)
                for k in d_hashtag_features:
                    features[k] = dict.get(features, k, 0) + 1
        # features = {k: v / sent_length for k, v in six.iteritems(features)}
        return features

    def get_sequence_features(self, sequences):
        features = [self.tweet_features(sent) for sent in sequences]
        return features

    def is_tweet_type(self, sent, cat_type):
        for t in sent:
            if t.tag != "O":
                if t.tag[2:] == cat_type:
                    return 1
        return 0

    def fit_feature_dict(self, sequences):
        train_data = self.get_sequence_features(sequences)
        self.feature2matrix = DictVectorizer()
        self.feature2matrix.fit(train_data)

    def tranform_sequence2feature(self, sequences):
        train_data = self.get_sequence_features(sequences)
        return self.feature2matrix.transform(train_data)

    def fit_model(self, train_sequences, test_sequences=None):
        if test_sequences is None:
            test_sequences = train_sequences
        self.fit_feature_dict(train_sequences)
        tweet_X_train = self.tranform_sequence2feature(train_sequences)
        tweet_X_test = self.tranform_sequence2feature(test_sequences)
        self.models = dict()
        for cat_type in self.cat_names:
            print("Processing: %s" % cat_type)
            y_train = np.array([self.is_tweet_type(sent, cat_type)
                                for sent in train_sequences])
            y_test = np.array([self.is_tweet_type(sent, cat_type)
                               for sent in test_sequences])
            model = LogisticRegression(solver="lbfgs",
                                       multi_class="multinomial")
            model.fit(tweet_X_train, y_train)
            y_pred = model.predict(tweet_X_test)
            print(classification_report(y_test, y_pred))
            self.models[cat_type] = model

    def get_global_predictions(self, sequences):
        predictions = {}
        X_train = self.tranform_sequence2feature(sequences)
        for k, model in six.iteritems(self.models):
            y_pred = model.predict_proba(X_train)[:, 1]
            predictions[k] = y_pred
        keys = predictions.keys()
        predictions = [dict(zip(keys, v)) for v in zip(*predictions.values())]
        return predictions
X, y = list(zip(*data))

# Split and randomize
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25,
                                                    shuffle=True)
print(X_train)
print(y_train)

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer

dict_vectorizer = DictVectorizer()
name_classifier = DecisionTreeClassifier()

# Scikit-Learn models work with arrays, not dicts. We need to train the
# vectorizer so that it knows the format of the dicts.
dict_vectorizer.fit(X_train)

# Vectorize the training data
X_train_vectorized = dict_vectorizer.transform(X_train)

# Train the classifier on vectorized data
name_classifier.fit(X_train_vectorized, y_train)

# Test the model
X_test_vectorized = dict_vectorizer.transform(X_test)

# Compute accuracy (0.75)
print(name_classifier.score(X_test_vectorized, y_test))
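# Worth knowing when fitting the vectorizer on the training split only:
# DictVectorizer silently ignores feature values it never saw during fit(),
# so test-only categories simply drop out of the encoding. A tiny demo:
dv_demo = DictVectorizer(sparse=False)
dv_demo.fit([{'last_letter': 'a'}, {'last_letter': 'n'}])
print(dv_demo.transform([{'last_letter': 'z'}]))  # [[0. 0.]] -- unseen value dropped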
class ModelTrainer(BaseProcessor):
    """
    ModelTrainer

    :param
        _dic_vec: DictVectorizer model for entity features
        _tfidf_vec: TfidfVectorizer model for intent features
        _intent_clf: RandomForestClassifier for intent classification
        _entity_clf: BernoulliNB for entity classification
        _ent_feature: Entity features
        _ent_label: Entity labels
        _int_feature: Intent features
        _int_label: Intent labels
        path: Path to save models

    :Function:
        _check_file_format:
            :description: Checks for YML/YAML file format
            :param file: Filename
            :return status: Whether the file is valid or not

        load:
            :description: Loads the YAML training file and generates the
                training data, using _intent_entity_extractor,
                _entity_label_extract and _get_features from BaseProcessor
                to create the features and labels.
            :param file: Training file

        _intent_entity_extractor:
            :description: Uses TF-IDF for intent feature and label
                generation, and DictVectorizer plus _get_features for
                entity feature and label generation.
                _entity_label_extract helps with entity label extraction.
            :param data: Training file data.

        _entity_label_extract:
            :description: Extracts entity labels following the training
                data schema. The default entity label is 'O'.
            :param question_dict: Training data dictionary
                   token_pos: Token position defined in the training data,
                       i.e. the entity value position

        _persist_helper:
            :description: Saves a model object as a pickle file
            :param filename: File name to save
                   object_: Model to save.

        _persist_models:
            :description: Saves all four models and classifiers.

        train:
            :description: Trains the intent and entity models
    """

    def __init__(self, path):
        super().__init__()
        self._dic_vec = DictVectorizer()
        self._tfidf_vec = TfidfVectorizer()
        self._intent_clf = RandomForestClassifier(n_estimators=200)
        self._entity_clf = BernoulliNB(alpha=0.1, binarize=0.1)
        self._ent_feature = []
        self._ent_label = []
        self._int_feature = []
        self._int_label = []
        self.path = path

    def _check_file_format(self, file):
        if file.split('.')[1] in FILE_FORMAT:
            return True
        return False

    def load(self, file):
        file_format = self._check_file_format(file)
        if not file_format:
            raise FileFormatError("Only YML/YAML file is allowed")
        with open(file, 'r') as f:
            data = yaml.load(f, Loader=yaml.SafeLoader)
        ent_train_list, int_train_dict = self._intent_entity_extractor(data)
        int_feature_arr = np.array(list(int_train_dict.keys()))
        int_labels_arr = np.array(list(int_train_dict.values()))
        self._tfidf_vec.fit(int_feature_arr)
        self._int_feature = self._tfidf_vec.transform(int_feature_arr).toarray()
        self._int_label = int_labels_arr
        self._dic_vec.fit(ent_train_list)
        self._ent_feature = self._dic_vec.transform(ent_train_list).toarray()

    def _intent_entity_extractor(self, data):
        ent_train_list = []
        int_train_dict = {}
        for intent, question_list in tqdm(data.items()):
            for question_dict in question_list:
                token = question_dict['text'].split(' ')
                int_train_dict[question_dict['text']] = intent
                for i, word in enumerate(token):
                    self._entity_label_extract(question_dict, i)
                    ent_train_list.append(self._get_features(i, word, token))
        return ent_train_list, int_train_dict

    def _entity_label_extract(self, question_dict, token_pos):
        try:
            for ent in question_dict['entity']:
                k, v = list(ent.items())[1]
                if ent['pos'] == token_pos:
                    self._ent_label.append(k)
                    break
            else:
                self._ent_label.append('O')
        except (KeyError, IndexError):
            # No entity annotation for this token
            self._ent_label.append('O')

    def _persist_helper(self, filename, object_):
        with open(os.path.join(self.path, filename), 'wb+') as f:
            pickle.dump(object_, f)

    def _persist_models(self):
        self._persist_helper(self.dic_vec_name, self._dic_vec)
        self._persist_helper(self.tfidf_name, self._tfidf_vec)
        self._persist_helper(self.entity_name, self._entity_clf)
        self._persist_helper(self.intent_name, self._intent_clf)

    def train(self):
        self._entity_clf.fit(self._ent_feature, self._ent_label)
        self._intent_clf.fit(self._int_feature, self._int_label)
        self._persist_models()
# NOTE: sklearn.externals.joblib is deprecated; on recent scikit-learn
# versions, use `import joblib` directly instead.
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer


def convierte_a_listas(oracion):
    # Split the sentence and pad it with '-' tokens on both sides
    wap = oracion.split(" ")
    words = ['-', '-']
    for i in wap:
        words.append(i)
    words = words + ['-', '-']
    return words


def prepara_frase(words):
    # Build a context-window feature dict (two words either side) per token
    features = []
    feature = {}
    for i in range(len(words[2:-2])):
        i = i + 2
        feature['0'] = str(words[i - 2]).lower()
        feature['1'] = str(words[i - 1]).lower()
        feature['2'] = str(words[i]).lower()
        feature['3'] = str(words[i + 1]).lower()
        feature['4'] = str(words[i + 2]).lower()
        features.append(feature)
        feature = {}
    return features


lista = convierte_a_listas("Que tal esta es una lista para el mercado algunas "
                           "de las cosas que quiero comprar es jamon queso "
                           "pechuga de pavo y cereal")
print(lista)
features = prepara_frase(lista)
print(features)

v = DictVectorizer(sparse=False)
# X = v.fit_transform(features)
v.fit(features)
joblib.dump(v, 'vectorizer.pkl')
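# A minimal sketch of reusing the persisted vectorizer later (the sentence
# here is made up; assumes the helpers above are importable where this runs):
v2 = joblib.load('vectorizer.pkl')
nueva_frase = prepara_frase(convierte_a_listas("quiero comprar pan y leche"))
print(v2.transform(nueva_frase))  # unseen context words map to all-zero entries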
random.shuffle(dataset)
x_train = [row[0] for row in dataset[:7000]]
y_train = [row[1] for row in dataset[:7000]]
x_test = [row[0] for row in dataset[7000:]]
y_test = [row[1] for row in dataset[7000:]]

# We use DictVectorizer to transform the list of feature dictionaries into
# numerical (one-hot) form
from sklearn.feature_extraction import DictVectorizer
d = DictVectorizer()
d.fit([row[0] for row in dataset])
X_train_count = d.transform(x_train)
X_test_count = d.transform(x_test)

models = get_models()
for name, model in models.items():
    scores = evaluate_model(model)
    # results.append(scores)
    print(name, scores)
    # (note: the slice lengths below don't match the key names; kept as-is
    # from the original)
    'firstThree-letters': name[:8],
    'last-letter': name[-5:],
    'lastTwo-letters': name[-4:],
    'lastThree-letters': name[-3:],
}

features = np.vectorize(features)
Name = features(names[:, 0])
Gender = names[:, 2]

Name_Train, Name_Test, Gender_Train, Gender_Test = train_test_split(
    Name, Gender, test_size=0.3)

vectorizer = DictVectorizer()
vectorizer.fit(Name_Train)

clf = DecisionTreeClassifier()
clf.fit(vectorizer.transform(Name_Train), Gender_Train)

Gender_pred = clf.predict(vectorizer.transform(Name_Test))
print(clf.predict(vectorizer.transform(features([
    "Nguyễn Ánh Dương", "Vũ Tiến Đạt", "Ngô Văn Vĩ",
    "Phạm Ngọc Hà", "Hoàng Mai Hương"
]))))

from sklearn.metrics import accuracy_score
print('Accuracy = ', accuracy_score(Gender_Test, Gender_pred))
def format_data(df0, df_ts):
    # df = shuffle(df0, random_state=0)
    df = df0
    train_size = df.shape[0]
    print(df.head())
    y = df['Criminal']
    df = df.drop('Criminal', axis=1)
    assert isinstance(df, DataFrame)
    df_combined = df.append(df_ts)
    df_combined.fillna('NA', inplace=True)
    if isinstance(df_combined, dict):
        df_to_dict = df_combined
    else:
        df_to_dict = df_combined.to_dict(orient="records")
    vec = DictVectorizer(sparse=False)
    vec.fit(df_to_dict)
    X = vec.transform(df_to_dict)
    print('inside make model after one hot encoding= ', X.shape)
    columns_names = vec.feature_names_
    input_dataframe = DataFrame(data=X, columns=columns_names)

    # This part removes unimportant columns
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10)
    rf_clf.fit(X[0:train_size], y)
    imp = rf_clf.feature_importances_
    threshold_for_features = 0.001
    for index, value in enumerate(imp):
        if value <= threshold_for_features:
            key = columns_names[index]
            input_dataframe = input_dataframe.drop(key, axis=1)
    temp3 = list(input_dataframe)
    for feat in temp3:
        if feat.endswith("=NA") or feat.endswith("=nan") or feat.endswith("=99"):
            # print("dropping feature with no value = ", feat)
            input_dataframe = input_dataframe.drop(feat, axis=1)
    # End of unimportant-column removal

    df_to_dict = input_dataframe.to_dict(orient="records")
    vec = DictVectorizer(sparse=False)
    vec.fit(df_to_dict)
    print(" modified data frame ", input_dataframe.shape)

    input_train_df = input_dataframe[0:train_size]
    input_test_df = input_dataframe[train_size:]
    with open('train_encoded_2.csv', 'w') as infile:
        input_train_df['Criminal'] = y
        print("input df shape to csv ", input_train_df.shape)
        input_train_df.to_csv(infile, index=False)
    with open('test_encoded_2.csv', 'w') as infile:
        print("input df shape to csv ", input_test_df.shape)
        input_test_df.to_csv(infile, index=False)