record.append([w1,dist[w1][w2],w2,dist[w2][w3],w3,dist[w3][w1],w1]) return record if __name__ == '__main__': dirname = ''#'../../..' # dirname = '../MODELS/' fname = 'wiki-news-300d-1M.vec' # fname = 'wiki-news-300d-1M.bin' NDIMS = 300 VOCAB = 1000 nsim = 10 thresh = 0.65 word_vecs = ps.load_embedding(path.join(dirname, fname), VOCAB) word_vecs = ps.normalize(word_vecs, 0, NDIMS) word_vecs = ps.discretize(word_vecs, 0, NDIMS) g = Graph(word_vecs, nsim, thresh) g2 = g.makeGraph() print("graph made") # g2 = g.removeWts() mode = 2 #null mode if(mode==0): mode=input("Enter mode: 1.Prt Grph 2.Prt SCC 3.Chck Tri Ineq") #Print Graph if(mode==1): print(g2)
def experiment_clfs(df, target_col, unused_cols, numeric_cols, clfs, model_lst,
                    grid, test_length=6, is_temporal=True, draw=True,
                    table=True):
    '''
    Experiment with different parameters for classifiers.
    Loop through each model and each parameter combination of its grid,
    fit on the training split, evaluate on the test split and collect one
    result row per successful fit.

    Inputs:
        df: dataframe (joint table)
        target_col: (numpy array) target variable
        unused_cols: (numpy array) unused variables in df
        numeric_cols: (numpy array) numerical variables in df
        clfs: (dictionary) classifiers from create_clfs_params() function
        model_lst: (list of strings) model names to use; each must be a key
            of both clfs and grid
        grid: (dictionary) grid from create_clfs_params() function
        test_length: (positive int) testing window (unit=month)
        is_temporal: (bool) True if use temporal validation to split data;
            False if use random split. If the temporal split returns all
            None, a random split is used instead.
        draw: (bool) True if plot PR curve for each fitted model
        table: (bool) True if output evaluation results to csv
    Outputs:
        PR-curves (when draw is True)
        eval_results/classifiers_eval.csv: csv file that stores evaluation
            results (when table is True); the directory is created if
            missing
    Returns:
        output_df: dataframe with one row per successfully evaluated
            (model, parameters) combination
    '''
    # Local import so this block does not depend on a module-level `import os`.
    import os

    output_cols = ('model', 'parameters', 'train_time', 'test_time',
                   'accuracy', 'F1_score', 'auc', 'p@1', 'p@2', 'p@5', 'p@10',
                   'p@20', 'p@30', 'p@50', 'r@1', 'r@2', 'r@5', 'r@10',
                   'r@20', 'r@30', 'r@50')
    # Everything after the four bookkeeping columns is looked up by the same
    # name in the dict returned by evaluate().
    metric_cols = output_cols[4:]
    output_df = pd.DataFrame(columns=output_cols)

    if is_temporal:
        x_train, x_test, y_train, y_test = utils.split_data(
            df, target_col, unused_cols, test_length)
        if x_train is None and x_test is None and y_train is None and y_test is None:
            print(
                "Temporal split failed. Switch to random split at test size=30%."
            )
            x_train, x_test, y_train, y_test = utils.random_split(
                df, target_col, unused_cols)
    else:
        x_train, x_test, y_train, y_test = utils.random_split(
            df, target_col, unused_cols)

    # discretize numeric cols:
    x_train, x_test = preprocess.discretize(x_train, x_test, numeric_cols)

    # Resolve all classifiers up front so an unknown model name fails before
    # any training time is spent.
    clf_lst = [clfs[x] for x in model_lst]
    for model_name, clf in zip(model_lst, clf_lst):
        print(model_name)
        params = grid[model_name]
        for p in ParameterGrid(params):
            # Best-effort: a parameter combination that fails to fit, score
            # or plot is reported and skipped so the rest of the grid runs.
            try:
                model = clf.set_params(**p)

                start_train = time.time()
                model.fit(x_train, y_train)
                train_time = time.time() - start_train

                # Only predict() is timed; predict_proba() is excluded.
                start_test = time.time()
                y_pred = model.predict(x_test)
                test_time = time.time() - start_test

                y_pred_probs = model.predict_proba(x_test)[:, 1]
                scores = evaluate(y_pred, y_pred_probs, y_test)

                # Row position doubles as the suffix of the plot name.
                index = len(output_df)
                output_df.loc[index] = (
                    [model_name, p, train_time, test_time]
                    + [scores[col] for col in metric_cols]
                )
                if draw:
                    plot_pr_curve(y_test, y_pred_probs,
                                  model_name + str(index), p)
            except Exception as e:
                print(e)
        # NOTE(review): the source formatting was collapsed; this message is
        # assumed to be emitted once per model type - confirm.
        print("1 classifier completed.")

    if table:
        out_path = 'eval_results/classifiers_eval.csv'
        # Create the target directory first: otherwise to_csv raises and the
        # results of the whole grid search are lost at the last step.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        output_df.to_csv(out_path)
    return output_df
def preprocess_features(feat_meta: FeatureMeta, data: pd.DataFrame, split_continuous_category=False):
    r"""Convert raw feature columns into per-row index and value arrays.

    Features are laid out in one shared index space, continuous features first
    and categorical features after them:

    * a continuous feature that is not discretized takes a single slot; its
      value is the output of ``feat.transformation.fit_transform``;
    * a discretized continuous feature takes ``feat.dim`` slots; its index is
      the first result of ``discretize`` plus the feature's start slot and its
      value is 1;
    * a categorical feature is label-encoded and takes one slot per class; its
      value is 1.

    The entries of ``feat_meta`` are updated in place: ``start_idx`` on every
    feature, ``bins`` on discretized features, ``processor`` and ``dim`` on
    categorical features.

    :param feat_meta: The FeatureMeta instance that describes ``data``.
    :param data: The raw data to be transformed; only the columns named in
        ``feat_meta`` are used.
    :param split_continuous_category: Whether to additionally return the
        categorical indices and the continuous values.
    :return: ``(feat_idx, feat_value)``, or
        ``(feat_idx, feat_value, category_index, continuous_value)`` when
        ``split_continuous_category`` is true.
    """
    log = create_console_logger(name='feat_meta')
    write_info_log(log, 'preprocess started')

    def rows_as_arrays(frame):
        # Collapse a frame into one entry per row holding that row's values.
        return frame.apply(lambda row: row.values, axis=1)

    cont_specs = feat_meta.continuous_feats
    cat_specs = feat_meta.categorical_feats
    data = data[[*cont_specs.keys(), *cat_specs.keys()]]
    n_rows = len(data)

    next_slot = 0  # first free position in the shared index space
    feat_idx = pd.DataFrame()

    # ---- continuous features ----
    write_info_log(log, 'transforming continuous features')
    feat_value_continuous = pd.DataFrame()
    for name, spec in cont_specs.items():
        spec.start_idx = next_slot

        if spec.discretize:
            bucket_ids, edges = discretize(data[name], spec.discretize, spec.dim)
            spec.bins = edges
            feat_idx[name] = bucket_ids + next_slot
            feat_value_continuous[name] = pd.Series(np.ones(n_rows))
            next_slot += spec.dim
            continue

        # standardize, normalize or scale: the transformer expects a column
        # vector, the frame expects a flat one
        transformer = spec.transformation
        as_column = np.reshape(data[name].values, (-1, 1))
        transformed = transformer.fit_transform(as_column)
        feat_value_continuous[name] = np.reshape(transformed, -1)
        feat_idx[name] = np.repeat(next_slot, repeats=n_rows)
        next_slot += 1

    # ---- categorical features ----
    write_info_log(log, 'transforming categorical features')
    category_index = pd.DataFrame()
    for name, spec in cat_specs.items():
        encoder = LabelEncoder()
        feat_idx[name] = encoder.fit_transform(data[name]) + next_slot
        category_index[name] = feat_idx[name]
        spec.processor = encoder
        class_count = len(encoder.classes_)
        spec.dim = class_count
        spec.start_idx = next_slot
        next_slot += class_count

    # TODO add multi category features

    feat_idx = rows_as_arrays(feat_idx)
    category_index = rows_as_arrays(category_index)

    # every categorical feature contributes a constant value of 1
    feat_value_category = pd.DataFrame(np.ones((n_rows, len(cat_specs.keys()))))
    feat_value = rows_as_arrays(
        pd.concat([feat_value_continuous, feat_value_category], axis=1)
    )
    continuous_value = rows_as_arrays(feat_value_continuous)

    write_info_log(log, 'preprocess finished')

    if not split_continuous_category:
        return feat_idx, feat_value
    return feat_idx, feat_value, category_index, continuous_value