def test(path, filter=True):
    # read dataset
    ds = dataset(path)
    entity_size = ds.entity_nums + 1  # add 1 to avoid out-of-dict index
    relation_size = ds.relation_nums[0] + 1
    model_path = path + '/model/'
    e_emb, r_emb = np.load(model_path + '_TransE_ent.npy'), \
        np.load(model_path + '_TransE_rel.npy')

    # build the filter list: every entity that appears as a head or tail
    def get_head_tail(pairs, filter_list=None):
        if filter_list is None:
            filter_list = []
        for p in pairs:
            filter_list.append(p[0])
            filter_list.append(p[1])
        return list(set(filter_list))

    filter_l = get_head_tail(ds.train_pair)
    filter_l = get_head_tail(ds.test_pair, filter_l)
    filter_l = get_head_tail(ds.val_pair, filter_l)
    print("filter build done.")

    # evaluate head/tail prediction on the first 100 test triples
    eval_h, eval_t, index_h, index_t = [], [], [], []
    for test_p in ds.test_pair[:100]:
        h, t, r = e_emb[test_p[0]], e_emb[test_p[1]], r_emb[test_p[2]]
        index_h.append(test_p[0])
        index_t.append(test_p[1])
        if filter:
            head_predict_list = [l1_distance(e_emb[i], t, r) for i in filter_l]
            tail_predict_list = [l1_distance(h, e_emb[i], r) for i in filter_l]
        else:
            head_predict_list = [
                l1_distance(e_emb[i], t, r) for i in range(entity_size)
            ]
            tail_predict_list = [
                l1_distance(h, e_emb[i], r) for i in range(entity_size)
            ]
        head_sorted_rank = np.argsort(head_predict_list)
        tail_sorted_rank = np.argsort(tail_predict_list)
        eval_h.append(head_sorted_rank)
        eval_t.append(tail_sorted_rank)

    h_result = eval_ranking(rank_l=eval_h, index_l=index_h), eval_top_k(
        rank_l=eval_h, index_l=index_h)
    t_result = eval_ranking(rank_l=eval_t, index_l=index_t), eval_top_k(
        rank_l=eval_t, index_l=index_t)
    print("result of h predict is {0} (rank, top_10), t predict is {1}.".format(
        h_result, t_result))
    return h_result, t_result
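# l1_distance() is called above but not defined in this file. A minimal sketch,
# assuming it mirrors the L1 energy used during training (h + r ~ t); the name
# and signature come from the calls above, the body is an assumption, and
# numpy is assumed to already be imported as np elsewhere in this module:
def l1_distance(h, t, r):
    """L1 energy of a (head, tail, relation) embedding triple: |h + r - t|."""
    return np.sum(np.abs(h + r - t))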
'''
Create dummy (one-hot) variables for the categorical features.
'''
from util import dataset
import pandas as pd

print('Loading data......')
train = dataset.load('categorical', 'train')
test = dataset.load('categorical', 'test')
cat_col = dataset.load('categorical', 'feature')

for col in cat_col:
    dummies = pd.get_dummies(train[col], prefix=col)
    train = pd.concat([train, dummies], axis=1)
    train.drop([col], axis=1, inplace=True)
    dummies = pd.get_dummies(test[col], prefix=col)
    test = pd.concat([test, dummies], axis=1)
    test.drop([col], axis=1, inplace=True)

print('Saving data......')
dataset(categorical_dummy=train).save('train')
dataset(categorical_dummy=test).save('test')
print('Done!')
'''
Min-max scale the continuous features to [0, 1].
'''
from util import dataset
from sklearn.preprocessing import MinMaxScaler

print('Loading data......')
train = dataset.load('numeric', 'train').astype(float)
test = dataset.load('numeric', 'test').astype(float)
num_col = dataset.load('numeric', 'feature')

scaler = MinMaxScaler()
for col in num_col:
    # fit on the training column only, then apply the same scaling to test
    scaler.fit(train[col].values.reshape(-1, 1))
    train[col] = scaler.transform(train[col].values.reshape(-1, 1))
    test[col] = scaler.transform(test[col].values.reshape(-1, 1))

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_maxmin=train).save('train')
dataset(numeric_maxmin=test).save('test')
print('Done!')
# if y == 0:
#     return 0
# else:
#     return x / y

# df['AverageWorkingYears'] = df[['WorkingYearsBefore',
#                                 'NumCompaniesWorked']].apply(
#                                     average_years, axis=1)

X_train = df[df['source'] == 'train'].copy()
X_test = df[df['source'] == 'test'].copy()
X_train.drop(['source'], axis=1, inplace=True)
X_test.drop(['source'], axis=1, inplace=True)

dataset(new_feature=X_train).save('train')
dataset(new_feature=X_test).save('test')

# dataset(new_numeric=X_train).save('train')
# dataset(new_numeric=X_test).save('test')
# process = NumProcess(X_train, X_test)
# train, test = process.boxcox()
# dataset(new_numeric_boxcox=train).save('train')
# dataset(new_numeric_boxcox=test).save('test')
# train, test = process.log1p()
# dataset(new_numeric_log1p=train).save('train')
# dataset(new_numeric_log1p=test).save('test')
# train, test = process.maxmin()
print('=' * 20)
print(test.isnull().sum())
print('=' * 20)

# encode categorical string columns as integer labels
label_enc = LabelEncoder()
for x in [col for col in cat_col if train.dtypes[col] == 'object']:
    label_enc.fit(train[x])
    train[x] = label_enc.transform(train[x])
    test[x] = label_enc.transform(test[x])

num_col = [x for x in test.columns if x not in cat_col]

# persist the splits as pickle files
print('Saving feature name')
dataset(numeric=num_col).save('feature')
dataset(categorical=cat_col).save('feature')
print('Saving train set')
dataset(train=train).save('all')
print('Saving test set')
dataset(test=test).save('all')
print('Saving categorical data')
dataset(categorical=train[cat_col]).save('train')
dataset(categorical=test[cat_col]).save('test')
np.save('cat.npy', train[cat_col])
print('Saving numeric data')
dataset(numeric=train[num_col]).save('train')
'''
Standardize the continuous features (zero mean, unit variance).
'''
from util import dataset
from sklearn.preprocessing import StandardScaler

print('Loading data......')
train = dataset.load('numeric', 'train').astype(float)
test = dataset.load('numeric', 'test').astype(float)
num_col = dataset.load('numeric', 'feature')

scaler = StandardScaler()
for col in num_col:
    # fit on the training column only, then apply the same scaling to test
    scaler.fit(train[col].values.reshape(-1, 1))
    train[col] = scaler.transform(train[col].values.reshape(-1, 1))
    test[col] = scaler.transform(test[col].values.reshape(-1, 1))

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_stdscale=train).save('train')
dataset(numeric_stdscale=test).save('test')
print('Done!')
    'DistanceFromHome', 'Education', 'PerformanceRating',
    'RelationshipSatisfaction', 'TrainingTimesLastYear'
], axis=1, inplace=True)
test.drop([
    'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsWithCurrManager', 'JobRole', 'StockOptionLevel', 'Gender',
    'DistanceFromHome', 'Education', 'PerformanceRating',
    'RelationshipSatisfaction', 'TrainingTimesLastYear'
], axis=1, inplace=True)
print(train.head())

new_cat = [x for x in train.columns if x in cat_col]
new_num = [x for x in train.columns if x in num_col]
new_ord = [x for x in train.columns if x in ord_col]

print('Saving data...')
dataset(numeric=new_num, categorical=new_cat, order=new_ord).save('feature')
dataset(train=train, test=test).save('all')
dataset(
    categorical=train[new_cat],
    numeric=train[new_num],
    order=train[new_ord]).save('train')
dataset(
    categorical=test[new_cat],
    numeric=test[new_num],
    order=test[new_ord]).save('test')
print('Done!')
def trans_e_model(path):
    # read dataset
    ds = dataset(path)
    entity_size = ds.entity_nums + 1  # add 1 to avoid out-of-dict index
    relation_size = ds.relation_nums[0] + 1
    model_path = path + 'model/'

    # L1 energy of a (h, t, r) triple: |t - h - r|, i.e. TransE's h + r ~ t
    def l1_energy(batch):
        return tf.reduce_sum(
            tf.abs(batch[:, 1, :] - batch[:, 0, :] - batch[:, 2, :]), 1)

    with tf.device('/cpu:0'):
        e_embedding_table = tf.Variable(
            tf.truncated_normal([entity_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)),
            name='e_embed')
        r_embedding_table = tf.Variable(
            tf.truncated_normal([relation_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)),
            name='r_embed')

        positive_sample = tf.placeholder(
            tf.int32, shape=[batch_size, 3], name='p_sample')
        negative_sample = tf.placeholder(
            tf.int32, shape=[batch_size, 3], name='n_sample')

        pos_embed_e = tf.nn.embedding_lookup(e_embedding_table,
                                             positive_sample[:, :2])
        pos_embed_r = tf.nn.embedding_lookup(r_embedding_table,
                                             positive_sample[:, -1:])
        pos_embed = tf.concat([pos_embed_e, pos_embed_r], axis=1)
        neg_embed_e = tf.nn.embedding_lookup(e_embedding_table,
                                             negative_sample[:, :2])
        neg_embed_r = tf.nn.embedding_lookup(r_embedding_table,
                                             negative_sample[:, -1:])
        neg_embed = tf.concat([neg_embed_e, neg_embed_r], axis=1)

        p_loss, n_loss = l1_energy(pos_embed), l1_energy(neg_embed)
        # margin-based ranking loss of TransE
        loss = tf.reduce_sum(tf.nn.relu(margin + p_loss - n_loss))
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    # session
    with tf.Session(config=tf_config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=None)
        print("start training with total {0} epochs and each batch size is {1}".
              format(epochs, batch_size))
        for e in range(epochs):
            for step in range(len(ds.train_pair) // batch_size):
                p, n = ds.get_next_batch(batch_size=batch_size,
                                         corpus=ds.train_pair)
                feed_dict = {positive_sample: p, negative_sample: n}
                loss_val, _, e_emb, r_emb = sess.run(
                    [loss, optimizer, e_embedding_table, r_embedding_table],
                    feed_dict=feed_dict)
                print("step {0}: loss_val {1} at epoch {2}".format(
                    step, loss_val, e))
        saver.save(sess, save_path=model_path + '_TransE.model')
        np.save(model_path + "_TransE_ent.npy", e_emb)
        np.save(model_path + "_TransE_rel.npy", r_emb)
        print("Train Done!")
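# A minimal driver sketch for trans_e_model() and test() (assumed to live in
# the same module). The hyperparameters referenced as globals above
# (embedding_size, margin, learning_rate, epochs, batch_size, tf_config) are
# not defined in this fragment, so the values and the data path below are
# illustrative assumptions rather than settings from the original source:
if __name__ == '__main__':
    embedding_size = 50
    margin = 1.0
    learning_rate = 0.001
    epochs = 10
    batch_size = 128
    tf_config = tf.ConfigProto()
    data_path = './data/'          # hypothetical dataset directory
    trans_e_model(data_path)       # train and dump *_TransE_ent/rel.npy
    test(data_path, filter=True)   # rank head/tail predictions on test triples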
def run_ctc():
    print("cur dir: ", os.path.curdir)
    ckpt = tf.train.get_checkpoint_state('./checkpoint_10/')
    checkpoint_file = ckpt.model_checkpoint_path
    config_file = str('./config_4.json')
    img_dir = str('./predictImg/')
    print("len arg: ", len(sys.argv))

    # optional command-line arguments: [img_dir] [checkpoint_file] [config_file]
    if len(sys.argv) == 1:
        print("Execution without arguments, using default arguments")
        print("checkpoints_file=", checkpoint_file)
        print("config_file=", config_file)
        print("img_dir=", img_dir)
    elif len(sys.argv) == 2:
        print("Execution with some arguments missing, using default arguments")
        print("checkpoints_file=", checkpoint_file)
        print("config_file=", config_file)
        img_dir = str(sys.argv[1])
    elif len(sys.argv) == 3:
        print("Execution with some arguments missing, using default arguments")
        print("config_file=", config_file)
        img_dir = str(sys.argv[1])
        checkpoint_file = str(sys.argv[2])
    elif len(sys.argv) == 4:
        img_dir = str(sys.argv[1])
        checkpoint_file = str(sys.argv[2])
        config_file = str(sys.argv[3])
    else:
        print()
        print("ERROR")
        print("Wrong number of arguments. Execute:")
        print(">> python3 predict.py [img_dir] [checkpoint_file] [config_file]")
        print(
            "e.g. python predict.py ./img_to_predict/ ./checkpoints/model.ckpt_1000 config.json"
        )
        exit(1)

    try:
        config = json.load(open(config_file))
    except FileNotFoundError:
        print()
        print("ERROR")
        print("No such config file : " + config_file)
        exit(1)

    BATCH_SIZE = 4
    std_height = 300
    std_width = 1024
    ctc_input_len = int(config['ctc_input_len'])
    word_len = int(config['word_len'])

    # build the CTC network and unpack its tensors
    net = modelctc1.model(config)
    graph = net[0]
    X = net[1]
    Y = net[2]
    keep_prob = net[3]
    seq_len = net[4]
    optimizer = net[5]
    cost = net[6]
    ler = net[7]
    decoded = net[8]
    wer = net[9]
    #result_test = pd.DataFrame()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allocator_type = 'BFC'
    with tf.Session(graph=graph, config=sess_config) as session:
        #with tf.Session(graph=graph) as session:
        saver = tf.train.Saver()
        saver.restore(session, checkpoint_file)
        print("Loaded Model")

        predict_set = util.dataset(img_dir, BATCH_SIZE, ctc_input_len,
                                   word_len, 0, 0)
        outputs = []
        cont = 1
        while cont > 0:
            pre_inputs, pre_seq_len, img_list = \
                predict_set.extract_predict_data_batch(std_height, std_width)
            #print("img list: ", img_list)
            if len(pre_inputs) > 0:
                predict_feed = {
                    X: pre_inputs,
                    keep_prob: 1,
                    seq_len: pre_seq_len
                }
                result = session.run(decoded[0], predict_feed)
                #print("result: ", result.values)
                #print("result.indices: ", result.indices)
                # convert the sparse CTC output back into word strings
                output = convert_word(result.indices, result.values,
                                      result.dense_shape)
                outputs.extend(output)
                for img_file, word in zip(img_list, output):
                    print("image: " + img_file + " predict: " + str(word))
            else:
                cont = 0
        print("outputs: ", outputs)
        return outputs
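# convert_word() is referenced above but not defined in this file. A minimal
# sketch of what such a decoder might look like, assuming the sparse CTC labels
# index into an alphabet string; ALPHABET below is a hypothetical placeholder,
# not taken from the original source:
ALPHABET = 'abcdefghijklmnopqrstuvwxyz'

def convert_word(indices, values, dense_shape):
    """Group sparse CTC output rows into one decoded string per batch item."""
    words = ['' for _ in range(dense_shape[0])]
    for (row, _col), label in zip(indices, values):
        words[row] += ALPHABET[label]
    return words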
Box-Cox transform the skewed continuous features.
'''
from util import dataset
from scipy import stats

print('Loading data......')
train = dataset.load('numeric', 'train').astype(float)
test = dataset.load('numeric', 'test').astype(float)
num_col = dataset.load('numeric', 'feature')

for col in num_col:
    # only transform columns with noticeable positive skew
    if stats.skew(train[col]) > 0.25:
        values, lam = stats.boxcox(train[col].values + 1)
        train[col] = values
        print(col)
    if stats.skew(test[col]) > 0.25:
        values, lam = stats.boxcox(test[col].values + 1)
        test[col] = values

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_boxcox=train).save('train')
dataset(numeric_boxcox=test).save('test')
print('Done!')
# rank JobRole categories by their mean attrition rate (1 = lowest)
temp = pd.DataFrame({
    'Attrition':
    train.groupby('JobRole')['Attrition'].mean().sort_values(),
    'ranking': np.arange(1, 10)
})
train['JobRole'] = train['JobRole'].map(lambda x: temp.loc[x, 'ranking'])
test['JobRole'] = test['JobRole'].map(lambda x: temp.loc[x, 'ranking'])

# rank MaritalStatus categories the same way
temp = pd.DataFrame({
    'Attrition':
    train.groupby('MaritalStatus')['Attrition'].mean().sort_values(),
    'ranking': np.arange(1, 4)
})
train['MaritalStatus'] = train['MaritalStatus'].map(
    lambda x: temp.loc[x, 'ranking'])
test['MaritalStatus'] = test['MaritalStatus'].map(
    lambda x: temp.loc[x, 'ranking'])

train.drop(['Attrition'], axis=1, inplace=True)
print(train.head())

print('Saving data......')
dataset(custom_label=train).save('train')
dataset(custom_label=test).save('test')
print('Done!')
target = dataset.load('target', 'train')
df = pd.concat([train, target], axis=1)

# discretise each numeric column with ChiMerge against the Attrition target
for col in num_col:
    _, interval_list = chimerge.ChiMerge(df, col, 'Attrition')
    train[col] = train[col].map(lambda x: meger(x, interval_list))
    test[col] = test[col].map(lambda x: meger(x, interval_list))

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_disc=train).save('train')
dataset(numeric_disc=test).save('test')

# one-hot encode the discretised columns on the combined frame
train['source'] = 'train'
test['source'] = 'test'
df = pd.concat([train, test], axis=0)
for col in num_col:
    dummies = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, dummies], axis=1)
    df.drop([col], axis=1, inplace=True)
train = df[df['source'] == 'train'].copy()
test = df[df['source'] == 'test'].copy()
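# meger() is used above but not defined in this fragment; it presumably maps a
# raw value onto the index of the ChiMerge interval it falls into. A minimal
# sketch under that assumption (the behaviour is inferred, not taken from the
# original source):
def meger(x, interval_list):
    """Return the index of the first interval boundary that x falls under."""
    for i, bound in enumerate(interval_list):
        if x <= bound:
            return i
    return len(interval_list)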
@author: miha
"""
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set()

import kmeans
from util import dataset

# three synthetic Gaussian blobs of different spread
points = np.vstack(((np.random.randn(150, 2) * 0.75 + np.array([1, 0])),
                    (np.random.randn(50, 2) * 0.25 + np.array([-0.5, 0.5])),
                    (np.random.randn(50, 2) * 0.5 + np.array([-0.5, -0.5]))))

dataset = dataset(points)
dataset.reduce(5)

plt.scatter(points[:, 0], points[:, 1])
#ax = plt.gca()
#ax.add_artist(plt.Circle(np.array([1, 0]), 0.75/2, fill=False, lw=3))
#ax.add_artist(plt.Circle(np.array([-0.5, 0.5]), 0.25/2, fill=False, lw=3))
#ax.add_artist(plt.Circle(np.array([-0.5, -0.5]), 0.5/2, fill=False, lw=3))

#centroids = kmeans.cluster(points, 3)
centroids, closest = kmeans.cluster(dataset.reduced_data, 3)

# plot the points assigned to the first cluster in green
arg1 = np.argwhere(closest == 0).flatten()
cluster1 = np.array(points[arg1])
print(cluster1)
plt.scatter(cluster1[:, 0], cluster1[:, 1], c='g')
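# kmeans.cluster() is a local module used above and in the next script; it is
# not scikit-learn and its source is not shown here. A minimal sketch of a
# cluster(points, k) that returns (centroids, closest) as the calls above
# assume (an illustrative Lloyd's-algorithm implementation, not the original
# module; numpy is assumed to be imported as np):
def cluster(points, k, iterations=100):
    """Plain k-means: return final centroids and the nearest-centroid index
    for every point."""
    rng = np.random.default_rng(0)
    centroids = points[rng.choice(len(points), k, replace=False)]
    for _ in range(iterations):
        # assign each point to its nearest centroid (Euclidean distance)
        dists = np.linalg.norm(points[:, None, :] - centroids[None, :, :],
                               axis=2)
        closest = dists.argmin(axis=1)
        # move each centroid to the mean of its assigned points
        for j in range(k):
            if np.any(closest == j):
                centroids[j] = points[closest == j].mean(axis=0)
    return centroids, closest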
@author: miha
"""
from util import files
#from util import kmeans
import kmeans
from util import dataset
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

data = files.read_numpy("data.csv")
dataset = dataset(data)
dataset.reduce(10)

row_count = dataset.row_count()
column_count = dataset.column_count()
populated = np.array((row_count, column_count))

# cluster the data with each column held out in turn
for col in range(0, column_count):
    removed = dataset.remove_column(col)
    y = removed[0]
    X = removed[1]
    k = kmeans.get_centroid_count(y)
    centroids, closest = kmeans.cluster(dataset.reduced_data, k)

x = dataset.remove_column(2)
'''
Log-transform (log1p) the continuous features.
'''
from util import dataset
import pandas as pd
import numpy as np

print('Loading data......')
train = dataset.load('numeric', 'train')
test = dataset.load('numeric', 'test')
num_col = dataset.load('numeric', 'feature')

for col in num_col:
    train[col] = np.log1p(train[col])
    test[col] = np.log1p(test[col])

print(train.head())
print('=' * 20)
print(test.head())
print('=' * 20)

print('Saving data......')
dataset(numeric_log1p=train).save('train')
dataset(numeric_log1p=test).save('test')
print('Done!')
train_score = cv_result['train_score'].mean()
test_score = cv_result['test_score'].mean()
result.loc[len(result)] = [
    name, '{} + {}:'.format(num_feature, cat_feature), train_score, test_score
]
print('train score:{:.4f}, test score:{:.4f}'.format(train_score, test_score))

clf.fit(X, y)
y_pred = clf.predict(X_test)
submission = pd.DataFrame({
    'Loan_Status': y_pred,
    'Loan_ID': X_test.index.tolist()
})
# submission['Loan_Status'] = submission['Loan_Status'].map({
#     1: 'Y',
#     0: 'N'
# })
filename = './result/{}_{}_{}.csv'.format(num_feature, cat_feature, name)
submission.to_csv(filename, index=False)

dataset(cv_result=result).save('all')
print('Done!')