def main():
    # load data
    training_data = load_data.read_data("train.csv")
    testing_data = load_data.read_data("test.csv")
    testing_labels = load_data.read_data("submission.csv")

    X_train, X_test = load_data.vectorize_data(training_data, testing_data)
    Y_train = np.array(training_data)[:, -1]
    Y_test = np.array(testing_labels)[:, -1]

    svm(X_train, Y_train, X_test, Y_test)
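# svm() is imported from elsewhere and not shown above. A minimal sketch of
# what such a helper could look like, assuming a scikit-learn SVC and simple
# accuracy reporting (an illustration, not the author's implementation):
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


def svm_sketch(X_train, Y_train, X_test, Y_test):
    # fit a support-vector classifier on the vectorized text features
    clf = SVC(kernel='rbf')
    clf.fit(X_train, Y_train)
    predictions = clf.predict(X_test)
    print("SVM accuracy: %.4f" % accuracy_score(Y_test, predictions))
    return predictions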
def input_data(file_path):
    sentences, tags = read_data(file_path)
    print("sentences length: %s " % len(sentences))
    print("last sentence: ", sentences[-1])

    # ALBERT encoding
    print("start ALBERT encoding")
    x = np.array([f(sent) for sent in sentences])
    print("end ALBERT encoding")

    # pad/truncate y to a uniform length of MAX_SEQ_LEN
    new_y = []
    for seq in tags:
        num_tag = [label_id_dict[_] for _ in seq]
        if len(seq) < MAX_SEQ_LEN:
            num_tag = num_tag + [0] * (MAX_SEQ_LEN - len(seq))
        else:
            num_tag = num_tag[:MAX_SEQ_LEN]
        new_y.append(num_tag)

    # one-hot encode the elements of y
    y = np.empty(shape=(len(tags), MAX_SEQ_LEN, len(label_id_dict.keys()) + 1))
    for i, seq in enumerate(new_y):
        y[i, :, :] = to_categorical(seq, num_classes=len(label_id_dict.keys()) + 1)

    return x, y
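# A quick, runnable illustration of the padding + one-hot step above: with a
# hypothetical MAX_SEQ_LEN of 4 and four classes, a two-tag sentence [2, 1]
# is padded to [2, 1, 0, 0], and to_categorical turns each id into a one-hot
# row (the import path may differ; keras.utils also exports to_categorical):
from tensorflow.keras.utils import to_categorical

demo = to_categorical([2, 1, 0, 0], num_classes=4)
print(demo)
# [[0. 0. 1. 0.]
#  [0. 1. 0. 0.]
#  [1. 0. 0. 0.]
#  [1. 0. 0. 0.]]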
def train_model():
    # read the training, validation and test data
    train_x, train_y = input_data(train_file_path)
    train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, test_size=0.2)
    # dev_x, dev_y = input_data(dev_file_path)
    # test_x, test_y = input_data(test_file_path)

    # train the model
    model = build_model(MAX_SEQ_LEN, len(label_id_dict.keys()) + 1)
    history = model.fit(train_x, train_y, validation_data=(test_x, test_y),
                        batch_size=16, epochs=5)
    model.save("%s_ner.h5" % event_type)

    # plot the loss and accuracy curves
    plt.subplot(2, 1, 1)
    epochs = len(history.history['loss'])
    plt.plot(range(epochs), history.history['loss'], label='loss')
    plt.plot(range(epochs), history.history['val_loss'], label='val_loss')
    plt.legend()

    plt.subplot(2, 1, 2)
    epochs = len(history.history['crf_viterbi_accuracy'])
    plt.plot(range(epochs), history.history['crf_viterbi_accuracy'],
             label='crf_viterbi_accuracy')
    plt.plot(range(epochs), history.history['val_crf_viterbi_accuracy'],
             label='val_crf_viterbi_accuracy')
    plt.legend()
    plt.savefig("%s_loss_acc.png" % event_type)

    # model performance on the test set: predict labels
    y = np.argmax(model.predict(test_x), axis=2)
    pred_tags = []
    for i in range(y.shape[0]):
        pred_tags.append([id_label_dict[_] for _ in y[i] if _])

    # the predicted tag sequence can differ in length from the gold one,
    # so align the predictions to the gold lengths
    test_sents, test_tags = read_data(test_file_path)
    final_tags = []
    for test_tag, pred_tag in zip(test_tags, pred_tags):
        if len(test_tag) == len(pred_tag):
            final_tags.append(pred_tag)
        elif len(test_tag) < len(pred_tag):
            final_tags.append(pred_tag[:len(test_tag)])
        else:
            final_tags.append(pred_tag + ['O'] * (len(test_tag) - len(pred_tag)))

    # evaluate the test set with seqeval
    print(classification_report(test_tags, final_tags, digits=4))
def main():
    # load data
    training_data = load_data.read_data("train.csv")
    testing_data = load_data.read_data("test.csv")
    testing_labels = load_data.read_data("submission.csv")

    X_train, X_test = load_data.vectorize_data(training_data, testing_data)
    Y_train = np.array(training_data)[:, -1]
    Y_test = np.array(testing_labels)[:, -1]

    # uncomment for grid searching
    # params = grid_search_kmeans(X_train, Y_train)
    params = {'n_clusters': 2}

    kmeans(X_train, Y_train, X_test, Y_test, params)
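# kmeans() is defined elsewhere. One plausible reading of it, sketched here
# as an assumption: cluster the training set, assign each cluster its
# majority training label, then score the mapped predictions on the test set.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score


def kmeans_sketch(X_train, Y_train, X_test, Y_test, params):
    model = KMeans(random_state=0, **params)
    train_clusters = model.fit_predict(X_train)
    # majority training label per cluster
    cluster_to_label = {}
    for c in np.unique(train_clusters):
        labels, counts = np.unique(Y_train[train_clusters == c], return_counts=True)
        cluster_to_label[c] = labels[np.argmax(counts)]
    predictions = np.array([cluster_to_label[c] for c in model.predict(X_test)])
    print("k-means accuracy: %.4f" % accuracy_score(Y_test, predictions))
    return predictions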
def main():
    # load data
    training_data = load_data.read_data("train.csv")
    testing_data = load_data.read_data("test.csv")
    testing_labels = load_data.read_data("submission.csv")

    X_train, X_test = load_data.vectorize_data(training_data, testing_data)
    Y_train = np.array(training_data)[:, -1]
    Y_test = np.array(testing_labels)[:, -1]
    print(X_train.shape)
    print(X_test.shape)

    X_train = X_train.toarray()
    X_test = X_test.toarray()

    # reduce data
    X_train, X_test = fld(X_train, Y_train, X_test, 2)
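# fld() is not shown. A compact Fisher linear discriminant sketch under the
# usual definition (between-class vs. within-class scatter); note that with
# two classes only one discriminant direction is informative, so asking for
# 2 dimensions mostly pads with near-zero components. Assumed, not original:
import numpy as np
import scipy.linalg


def fld_sketch(X_train, Y_train, X_test, n_dims):
    d = X_train.shape[1]
    overall_mean = X_train.mean(axis=0)
    S_w = np.zeros((d, d))
    S_b = np.zeros((d, d))
    for c in np.unique(Y_train):
        Xc = X_train[Y_train == c]
        mc = Xc.mean(axis=0)
        S_w += (Xc - mc).T @ (Xc - mc)
        diff = (mc - overall_mean).reshape(-1, 1)
        S_b += len(Xc) * (diff @ diff.T)
    # leading generalized eigenvectors of S_b w = lambda * S_w w
    # (a small ridge keeps S_w invertible for high-dimensional text features)
    vals, vecs = scipy.linalg.eigh(S_b, S_w + 1e-6 * np.eye(d))
    W = vecs[:, ::-1][:, :n_dims]
    return X_train @ W, X_test @ W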
def main():
    # load data
    training_data = load_data.read_data("train.csv")
    testing_data = load_data.read_data("test.csv")
    testing_labels = load_data.read_data("submission.csv")

    X_train, X_test = load_data.vectorize_data(training_data, testing_data)
    Y_train = np.array(training_data)[:, -1]
    Y_test = np.array(testing_labels)[:, -1]
    print(X_train.shape)
    print(X_test.shape)

    X_train = X_train.toarray()
    X_test = X_test.toarray()

    # means = np.mean(X_train.T, axis=1)
    # # center columns
    # cols = X_train - means
    # # print(cols)
    #
    # # cov matrix
    # cov = np.cov(cols.T)
    #
    # # calculate dims needed to be kept based on error rate
    # values, vectors = np.linalg.eig(cov)
    # dim = pca_error_rate(values, 0.2)
    # print("Reduced DIMS to: " + str(dim) + " from " + str(len(training_data[0])))

    # reduce data
    X_train, X_test = pca(X_train, X_test, 2952)
    print(X_train.shape)
    print(X_test.shape)

    params = {
        'activation': 'relu',
        'solver': 'lbfgs',
        'hidden_layer_sizes': (100, 10),
        'learning_rate_init': 0.0009
    }

    bpnn.bpnn(X_train, Y_train, X_test, Y_test, params)
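# pca() here is assumed to fit the projection on the training set only and
# apply it to both splits; a minimal scikit-learn sketch of that contract
# (illustrative, not the author's helper):
from sklearn.decomposition import PCA


def pca_sketch(X_train, X_test, n_components):
    reducer = PCA(n_components=n_components)
    X_train_reduced = reducer.fit_transform(X_train)
    # reuse the train-fitted components so the test set sees the same basis
    X_test_reduced = reducer.transform(X_test)
    return X_train_reduced, X_test_reduced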
def main():
    computer = 'laptop'
    # computer = 'TS'
    if computer == 'laptop':
        data_file = 'C:/local/sandp500/sp470.csv'
        var_filename = 'C:/GoogleDrivePushpakUW/UW/6thYear/CSE546/Project/return_adj.pkl'
    else:
        data_file = 'H:/local/sandp500/sp470.csv'
        var_filename = 'H:/CSE546/Project/return_adj.pkl'

    ret_data, vol_data, comp_list = load_data.read_data()

    ################################################
    # Doing it on a small sample of ten firms
    # ret_data = ret_data.iloc[0:100, 0:50]
    # print(ret_data.shape)
    # print("Return data: \n", ret_data)
    col_names = ret_data.columns.tolist()
    # print("Column names: \n", col_names)
    ################################################

    lagged_ret = ret_data.shift(1).dropna()
    num_obs = lagged_ret.shape[0]
    num_firms = lagged_ret.shape[1]

    # Aligning indices of X and y (each y is a column in Y)
    Y = ret_data[ret_data.index.isin(lagged_ret.index)]

    train_size = int(0.8 * num_obs)
    # print("Num of obs in train set: ", train_size)
    train_index = np.random.choice(num_obs, train_size, replace=False)
    test_index = np.setdiff1d(np.arange(num_obs), train_index)

    # convert pandas dataframe to numpy array
    lagged_ret = lagged_ret.values
    Y = Y.values

    ##############################################################################
    # Do multiprocessing
    # import multiprocessing
    # num_cores = multiprocessing.cpu_count()
    # print("How many cores: ", num_cores)  # num of cores = 4
    inputs = np.arange(num_firms)
    with ProcessPoolExecutor(max_workers=2) as executor:
        res = executor.map(est_coefs, inputs)
    # print("Results: \n" + str(res))
    return res
def train(epoch):
    global epoch_start
    epoch_start = time()
    # number of batches per epoch, e.g. 25000 / 6
    batch_size = int(len(train_list_all) / BATCH_SIZE)
    i_global = 0
    for s in range(batch_size):
        train_x_1, train_x_2, train_y_ = read_data(train_list_all, s)
        start_time = time()
        i_global, _, batch_loss, batch_acc, yy, yyy, gt = sess.run(
            [global_step, optimizer, cost, accuracy, ya, y, y_],
            feed_dict={
                x_1: train_x_1,
                x_2: train_x_2,
                y_: train_y_,
                learning_rate: lr(epoch),
                phase: True,
                dr_rate: 1
            })
        duration = time() - start_time

        if s % 10 == 0:
            percentage = int(round((s / batch_size) * 100))
            bar_len = 29
            filled_len = int((bar_len * int(percentage)) / 100)
            bar = '=' * filled_len + '>' + '-' * (bar_len - filled_len)
            msg = "Global step: {:>5} - [{}] {:>3}% - acc: {:.4f} - loss: {:.4f} - {:.1f} sample/sec - lr: {:.8f}"
            print(msg.format(i_global, bar, percentage, batch_acc, batch_loss,
                             BATCH_SIZE / duration, lr(epoch)))

            # print("saving training result.. :" + SAVE_PATH + '/train/epoch_' + str(epoch) + '_s_' + str(s) + '.bmp')
            # threshold the first predicted map in the batch to a binary image
            thresholded = []
            for yy_row in yy[0, :, :, 0]:
                thresholded.append([255 if yyy > THRESHOLD else 0 for yyy in yy_row])
            cv2.imwrite(SAVE_PATH + '/train/epoch_' + str(epoch) + '_s_' + str(s) + '_thresholded.bmp',
                        np.uint8(thresholded))
            cv2.imwrite(SAVE_PATH + '/train/epoch_' + str(epoch) + '_s_' + str(s) + '_grayscale.bmp',
                        np.uint8(yy[0, :, :, 0] * 255))
            cv2.imwrite(SAVE_PATH + '/train/epoch_' + str(epoch) + '_s_' + str(s) + '_groundtruth.bmp',
                        np.uint8(gt[0] * 255))
            del thresholded

    test_and_save(i_global, epoch)
def main():
    # load data
    training_data = load_data.read_data("train.csv")
    testing_data = load_data.read_data("test.csv")
    testing_labels = load_data.read_data("submission.csv")

    X_train, X_test = load_data.vectorize_data(training_data, testing_data)
    Y_train = np.array(training_data)[:, -1]
    Y_test = np.array(testing_labels)[:, -1]

    # uncomment for grid searching
    # params = grid_search_bpnn(X_train, Y_train)
    params = {
        'activation': 'relu',
        'solver': 'lbfgs',
        'hidden_layer_sizes': (100, 10),
        'learning_rate_init': 0.0009
    }

    bpnn(X_train, Y_train, X_test, Y_test, params)
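# bpnn() takes the same params dict that scikit-learn's MLPClassifier accepts,
# so a minimal back-propagation-network sketch could look like this (an
# assumed implementation, not the original module):
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier


def bpnn_sketch(X_train, Y_train, X_test, Y_test, params):
    clf = MLPClassifier(**params)
    clf.fit(X_train, Y_train)
    predictions = clf.predict(X_test)
    print("BPNN accuracy: %.4f" % accuracy_score(Y_test, predictions))
    return predictions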
def process_labels(label_list, data_file):
    entity_list, token_list, _ = load_data.read_data(data_file)
    label_res_list = []
    assert len(entity_list) == len(token_list) == len(label_list)

    for idx in range(len(label_list)):
        label_res_list.append(get_tag_and_index(label_list[idx], token_list[idx]))

    for idx in range(len(label_res_list)):
        curr_entity = "".join(entity_list[idx])
        label_res_list[idx] = [
            val for val in label_res_list[idx]
            if "".join(val[-1]) != curr_entity
        ]
    return label_res_list
def analyse_data_single_chained(filename):
    emp_cov = load_data.read_data(
        filename,
        nodes=['French', 'Han', 'Karitiana', 'Sardinian', 'Yoruba'],
        noss=True)
    print(emp_cov)
    df = 100
    summaries = [
        summary.s_posterior(),
        summary.s_variable('mhr'),
        summary.s_no_admixes(),
        summary.s_tree_identifier(),
        summary.s_average_branch_length(),
        summary.s_total_branch_length(),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_number_of_ghost_populations, 'ghost_pops', output='integer'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_max_distance_to_root, 'max_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_min_distance_to_root, 'min_root'),
        summary.s_basic_tree_statistics(
            Rtree_operations.get_average_distance_to_root, 'average_root'),
        summary.s_basic_tree_statistics(
            tree_statistics.unique_identifier_and_branch_lengths, 'tree', output='string'),
        summary.s_basic_tree_statistics(
            tree_statistics.majority_tree, 'majority_tree', output='string'),
        summary.s_variable('proposal_type', output='string'),
        summary.s_variable('sliding_regraft_adap_param'),
        summary.s_variable('rescale_adap_param'),
        summary.s_tree_identifier_new_tree()
    ] + [
        summary.s_variable(s, output='double_missing')
        for s in ['prior', 'branch_prior', 'no_admix_prior', 'top_prior']
    ]
    r = simulation_sanity.test_posterior_model(
        None, None, 300000,
        summaries=summaries,
        thinning_coef=20,
        wishart_df=df,
        emp_cov=emp_cov,
        no_leaves_true_tree=5)
def load_photos():
    df = load_data.read_data()

    # Get names
    pizza_names, pizza_eng_names = load_data.get_pizza_names(df)
    print(pizza_eng_names)

    # prepare image paths
    image_paths = []
    for name in pizza_eng_names:
        path = os.path.join(name, name + '3.jpg')
        image_paths.append(path)
    print(image_paths)

    images = load_data.load_images(image_paths)

    # cut pizza from photo
    pizza_imgs = load_data.cut_pizza_from_images(images)

    return pizza_eng_names, pizza_imgs
def ms_to_treemix(filename='tmp.txt', samples_per_pop=20, no_pops=4, n_reps=1,
                  filename2='tmp.treemix_in', treemix_files='tmp'):
    data = []
    with open(filename, 'r') as f:
        for r in f.readlines():
            # print(r[:5])
            data.append([int(c) for c in r.rstrip()])
    m = array(data)
    if n_reps > 1:
        # reorder the data so that there are more SNPs instead of more samples/populations
        # print(m.shape)
        # print('samples, pops, reps', samples_per_pop, no_pops, n_reps)
        m = hstack(vsplit(m, n_reps))
    # print(samples_per_pop)
    sums = tuple([sum(m[(i * samples_per_pop):((i + 1) * samples_per_pop), :], axis=0)
                  for i in range(no_pops)])
    # print(sums, 'sums')
    with open(filename2, 'w') as f:
        f.write(' '.join(get_trivial_nodes(no_pops)) + '\n')
        for s_vec in zip(*sums):
            f.write(' '.join([str(s) + ',' + str(samples_per_pop - s) for s in s_vec]) + '\n')
    filename2_gz = filename2 + '.gz'
    subprocess.call(['gzip', '-f', filename2])
    return read_data(filename2_gz, blocksize=10000, outgroup='s3', noss=True,
                     outfile=treemix_files)
def input_data(file_path):
    sentences, tags = read_data(file_path)
    print("sentences length: %s " % len(sentences))
    print("last sentence: ", sentences[-1])

    # ALBERT encoding
    print("start ALBERT encoding")
    x = []
    pbar = tqdm(sentences)
    for sent in pbar:
        pbar.set_description("Processing bar: ")
        # BERT-encode the sentence first
        x.append(f(sent))
    x = np.array(x)
    print("end ALBERT encoding")

    # pad/truncate y to a uniform length of MAX_SEQ_LEN
    new_y = []
    for seq in tags:
        num_tag = [label_id_dict[_] for _ in seq]
        if len(seq) < MAX_SEQ_LEN:
            # pad with zeros
            num_tag = num_tag + [0] * (MAX_SEQ_LEN - len(seq))
        else:
            num_tag = num_tag[:MAX_SEQ_LEN]
        new_y.append(num_tag)

    # one-hot encode the elements of y,
    # shape: (samples, max sequence length, number of classes)
    y = np.empty(shape=(len(tags), MAX_SEQ_LEN, len(label_id_dict.keys()) + 1))
    # to_categorical performs the one-hot encoding
    for i, seq in enumerate(new_y):
        y[i, :, :] = to_categorical(seq, num_classes=len(label_id_dict.keys()) + 1)

    return x, y
def predict(request, input_tablename, output_tablename):
    # opt = option.Options()
    # weight_path = opt.weight_path
    weight_path = 'C:/Users/bong/project/semiconductor_project/semiconductor_project/web_server/web/predict/test_model_new.pth'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load data
    data_realtime = load_data.read_data(input_tablename, output_tablename)

    # Build the model
    model = modeling.make_model(device, weight_path)

    # Predict layer thickness
    data_realtime = data_realtime.iloc[:, 1:-1]
    data_realtime_numpy = torch.from_numpy(data_realtime.astype(float).values)
    data_realtime_numpy_de = data_realtime_numpy.to(device)
    outputs = model(data_realtime_numpy_de.float()).cpu().detach().numpy()
    # outputs = model(data_realtime_numpy_de.float()).cpu().detach().numpy().round(-1)

    pred_test = pd.DataFrame(outputs)
    pred_test.columns = ['layer_1', 'layer_2', 'layer_3', 'layer_4']

    return pred_test, request
import os
os.system('CLS')

from pylab import plot, show
from numpy import vstack, array
from numpy.random import rand
import numpy as np
from scipy.cluster.vq import kmeans, kmeans2, vq
import pandas as pd
from math import sqrt
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import load_data

ret_data, vol_data, volume_dat, comp_list = load_data.read_data()
# print(ret_data.head())
# print(vol_data.head())

returns = ret_data.mean() * 252  # annualized return
vol = vol_data.mean()
volume = volume_dat.mean()
# print(returns)

data = np.asarray([np.asarray(returns), np.asarray(vol), np.asarray(volume)]).T
X = data

max_cluster = 20
incr = 1
distortions = []
for k in range(incr, max_cluster):
    k_means = KMeans(n_clusters=k, random_state=0)
        nb_eval_examples += ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))


if __name__ == '__main__':
    # Preparing for CPU or GPU usage
    dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained('./{}'.format(BERT_MODEL_DIR))

    # Creating the Dataset and DataLoader for the neural network
    train_sentences, train_labels = read_data(train_file_path)
    train_labels = [[tag2idx.get(l) for l in lab] for lab in train_labels]
    test_sentences, test_labels = read_data(test_file_path)
    test_labels = [[tag2idx.get(l) for l in lab] for lab in test_labels]

    print("TRAIN Dataset: {}".format(len(train_sentences)))
    print("TEST Dataset: {}".format(len(test_sentences)))

    training_set = CustomDataset(tokenizer, train_sentences, train_labels, MAX_LEN)
    testing_set = CustomDataset(tokenizer, test_sentences, test_labels, MAX_LEN)

    train_params = {
        'batch_size': TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
def __init__(self):
    super().__init__()
    self.canvas_show = 'pie'  # or 'sleep'
    self.withdraw()
    splash = Splash(self)
    splash.pgb['maximum'] = 5

    import matplotlib
    import math
    self.math = math
    import matplotlib.pyplot as plt
    self.plt = plt
    from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
    from tkinter.ttk import Progressbar, Style

    splash.pgb['value'] = 1
    splash.label.image = splash.gif1
    splash.update()

    plt.style.use('ggplot')
    matplotlib.rcParams['font.family'] = 'SimHei'
    self.Set3 = plt.cm.Set3(range(10))
    self.Paired = plt.cm.Paired(range(10))

    splash.pgb['value'] = 2
    splash.label.image = splash.gif1
    splash.update()

    # +++++++++++++++
    # +  GUI_setup  +
    # +++++++++++++++
    from tkinter import Frame, Text
    from tkinter import ttk
    from tkinter.ttk import Button
    self.title('Weekery')
    # Style().theme_use('clam')
    '''
    root
    |- Progressbar.bottom
    |- Frame_Left
    |  |- Frame_btn_left.top
    |  |  |- left(btn{d,w,m,y})
    |  |  |- right(btn{sleep,freq_pic,freq_bar})
    |  |  |- Frame_btn_mid
    |  |     |- <.left
    |  |     |- +.left
    |  |     |- >.right
    |  |     |- -.right
    |  |     |- calendar.middle
    |  |     |- label_date
    |  |- fig_up.top
    |  |- fig_down.top
    |- Frame_Right
       |- Frame_btn_right
       |  |- btn_setting.right
       |  |- btn_reload.right
       |- Text.bottom.top expand=both
    '''
    # ====== Frames ======
    self.frame_left = Frame(self)
    self.frame_right = Frame(self)
    self.frame_btn_left = Frame(self.frame_left)
    self.frame_btn_left.config(bg='white')
    self.frame_btn_mid = Frame(self.frame_btn_left)
    self.frame_btn_right = Frame(self.frame_right)
    self.frame_btn_right.config(bg='white')

    # ====== Buttons ======
    ttk.Style().configure("TButton", background='white')
    ttk.Style().configure("symbol.TButton", font=(20))
    # ttk.Style().configure("TButton", foreground='white')
    self.btn_days = Button(self.frame_btn_left, text='日',
                           command=self.days, width=3)
    self.btn_days.config(state='disable')  # bg='white',
    self.btn_weeks = Button(self.frame_btn_left, text='周',
                            command=self.weeks, width=3)
    # self.btn_weeks.config(bg='white')
    self.btn_months = Button(self.frame_btn_left, text='月',
                             command=self.months, width=3)
    # self.btn_months.config(bg='white')
    self.btn_years = Button(self.frame_btn_left, text='年',
                            command=self.years, width=3)
    # self.btn_years.config(bg='white')
    self.btn_switch_freq_pie = Button(self.frame_btn_left, text='饼图',
                                      command=self.pie, width=6)
    # self.btn_switch_freq_pie.config(bg='white')
    self.btn_switch_sleep = Button(self.frame_btn_left, text='睡眠',
                                   command=self.sleep, width=6)
    # self.btn_switch_sleep.config(bg='white')
    self.btn_switch_freq_bar = Button(self.frame_btn_left, text='词频',
                                      command=self.bar, width=6)
    # self.btn_switch_freq_bar.config(bg='white')
    self.btn_previous = Button(self.frame_btn_mid, text='◀', style='symbol.TButton',
                               command=self.previous, width=2)
    # self.btn_previous.config(bg='white')
    self.btn_backward = Button(self.frame_btn_mid, text='▶', style='symbol.TButton',
                               command=self.backward, width=2)
    # self.btn_backward.config(bg='white')
    self.btn_calendar = Button(self.frame_btn_mid, text="▦", style='symbol.TButton',
                               command=self.ask_selected_date, width=2)
    # self.btn_calendar.config(bg='white')
    self.btn_plus = Button(self.frame_btn_mid, text='+', style='symbol.TButton',
                           command=self.plus, width=2)
    # self.btn_plus.config(bg='white')
    self.btn_minus = Button(self.frame_btn_mid, text='-', style='symbol.TButton',
                            command=self.minus, width=2)
    # self.btn_minus.config(bg='white')
    self.btn_reload = Button(self.frame_btn_right, text='重载',
                             command=self.reload, width=6)
    # self.btn_reload.config(bg='white')
    self.btn_settings = Button(self.frame_btn_right, text='设置',
                               command=self.settings, width=6)
    # self.btn_settings.config(bg='white')

    # ====== Others ======
    self.fig_up = plt.figure(figsize=(7, 3))
    self.fig_down = plt.figure(figsize=(7, 3))
    self.canvas_up = FigureCanvasTkAgg(self.fig_up, master=self.frame_left)
    self.canvas_down = FigureCanvasTkAgg(self.fig_down, master=self.frame_left)
    self.pgb = Progressbar(self, orient='horizontal', length=1000, mode='determinate')
    self.notes = Text(self.frame_right, width=50)
    self.notes.config(bg='azure')
    self.label_date = Label(self.frame_btn_mid, text='加载中...', width=15)

    splash.pgb['value'] = 3
    splash.label.image = splash.gif1
    splash.update()

    # ++++++++++++++++++
    # +  GUI Packing   +
    # ++++++++++++++++++
    # level-1
    self.pgb.pack(side='bottom', fill='both')
    # level-1
    self.frame_left.pack(side='left', fill='both', expand='YES')
    # # level-2
    self.frame_btn_left.pack(side='top', fill='both')
    self.canvas_up.get_tk_widget().pack(side='top', fill='both', expand='YES')
    self.canvas_down.get_tk_widget().pack(side='top', fill='both', expand='YES')
    # # # level-3
    self.btn_days.pack(side='left')
    self.btn_weeks.pack(side='left')
    self.btn_months.pack(side='left')
    self.btn_years.pack(side='left')
    self.btn_switch_freq_bar.pack(side='right')
    self.btn_switch_freq_pie.pack(side='right')
    self.btn_switch_sleep.pack(side='right')
    self.frame_btn_mid.pack(side='top')
    # # # # level-4
    self.btn_previous.pack(side='left')
    self.btn_minus.pack(side='left')
    self.btn_backward.pack(side='right')
    self.btn_plus.pack(side='right')
    self.btn_calendar.pack(side='left')
    self.label_date.pack(side='right')
    # level-1
    self.frame_right.pack(side='left', fill='both', expand='YES')
    # # level-2
    self.frame_btn_right.pack(side='top', fill='both')
    # # # level-3
    self.btn_settings.pack(side='right')
    self.btn_reload.pack(side='right')
    # # level-2
    self.notes.pack(side='top', fill='both', expand='YES')

    splash.pgb['value'] = 4
    splash.label.image = splash.gif1
    splash.update()

    # ++++++++++++++++++
    # +  Import class  +
    # ++++++++++++++++++
    import sqlite3
    from config import Config
    from controls import Controls
    from load_data import wiz_week_index, read_data

    splash.pgb['value'] = 5
    splash.label.image = splash.gif1
    splash.update()
    splash.destroy()

    # ============= Show Main GUI ==============
    self.protocol('WM_DELETE_WINDOW', self.close_window)
    self.wm_state('zoomed')  # maximize window
    self.deiconify()

    self.cfg = Config(self)
    # user chose to cancel in configuration
    if self.cfg.cancel:
        return
    self.db_path = self.cfg.cache_dir + '/weekery.db'
    self.conn = sqlite3.connect(self.db_path)
    self.id_filenames, self.id_dates = wiz_week_index(self.cfg)
    if self.cfg.last_read == 20160000:
        read_data(self, self.cfg, self.pgb, self.id_dates, self.id_filenames, 'all')
    else:
        read_data(self, self.cfg, self.pgb, self.id_dates, self.id_filenames, dialog=False)
    self.controls = Controls(self.conn)
    self.conn.commit()
    self.colors = {v: (int(f[4:7]) / 255, int(f[9:12]) / 255, int(f[14:17]) / 255, 1)
                   for f, v in self.cfg.color_kind.items()}
    self._paint()
def reload(self):
    reload_option = ReloadOption()
    self.wait_window(reload_option)
    if reload_option.reload_mode:
        # map each reload mode to (scope passed to read_data, message prefix);
        # the keys are the option labels: reload all / last week / last month /
        # last three months / last half year / last year
        modes = {
            '全部重载': ('all', '全部'),
            '最近一周': (1, '最近一周'),
            '最近一个月': (4, '最近一个月'),
            '最近三个月': (12, '最近三个月'),
            '最近半年': (26, '最近半年'),
            '最近一年': (52, '最近一年'),
        }
        if reload_option.reload_mode in modes:
            scope, label = modes[reload_option.reload_mode]
            read_data(self, self.cfg, self.pgb, self.id_dates,
                      self.id_filenames, scope, dialog=False)
            # e.g. '全部数据重载完成!' ("reload of all data finished")
            showinfo('提示', label + '数据重载完成!')
            self.weeks()
            weights=self.weights,
            biases=self.biases,
            layers=self.layers,
            epochs=self.epochs,
            learning_rate=self.learning_rate
        )

    def load_model(self, filename='model.npz'):
        model = np.load(os.path.join(os.curdir, 'models', filename))
        self.weights = model['weights']
        self.biases = model['biases']
        self.layers = model['layers']
        self.num_layers = len(self.layers)
        self.epochs = model['epochs']
        self.learning_rate = model['learning_rate']
        self.activations = [np.zeros((x, 1)) for x in self.layers]


load_data.download_data()
training_images, training_labels, test_images, test_labels = load_data.read_data()

nn = NNClassifier(n_features=N_FEATURES,
                  layers=[N_FEATURES, 30, 10],
                  l2=0.5,
                  epochs=30,
                  learning_rate=0.001)

nn.fit(training_images, training_labels, test_images, test_labels)
from load_data import read_data, load_last_user_logs, get_num_user_logs

## ========================= 1. Load and clean data ======================== ##
'''
train = pd.read_csv('../01_Data/train.csv')
train = pd.concat((train, pd.read_csv('../01_Data/train_v2.csv')),
                  axis=0, ignore_index=True).reset_index(drop=True)
test = pd.read_csv('../01_Data/sample_submission_v2.csv')
members = pd.read_csv('../01_Data/members_v3.csv')
transactions = pd.read_csv('../01_Data/transactions.csv')
transactions = pd.concat((transactions, pd.read_csv('../01_Data/transactions_v2.csv')),
                         axis=0, ignore_index=True).reset_index(drop=True)
'''
print("\n1. Load and clean data ...\n")
train, test, members, transactions = read_data()

## ========================= 2. Feature engineering ======================== ##
print("\n2. Adding and selecting features ...\n")

# Prepare transactions
current_transactions = transactions.sort_values(by=['transaction_date'],
                                                ascending=[False]).reset_index(drop=True)

# Get features for current transaction
print("\n a) Creating features from most recent transaction ...\n")

# get most recent transaction
current_transactions = current_transactions.drop_duplicates(subset=['msno'], keep='first')

# Calculate discount
current_transactions['discount'] = (current_transactions['plan_list_price']
                                    - current_transactions['actual_amount_paid'])

# Calculate cost per day
current_transactions['payment_plan_days'] = current_transactions['payment_plan_days'].replace(0, 30)
import numpy as np
from sklearn.neighbors import KNeighborsClassifier


def doWork(train, test, labels):
    print("Converting training to matrix")
    train_mat = np.mat(train)
    print("Fitting knn")
    knn = KNeighborsClassifier(n_neighbors=10, algorithm="kd_tree")
    print(knn.fit(train_mat, labels))
    print("Predicting")
    predictions = knn.predict(test)
    print("Writing to file")
    write_to_file(predictions)
    return predictions


def write_to_file(predictions):
    f = open("output-knn-skilearn.csv", "w")
    for p in predictions:
        f.write(str(p))
        f.write("\n")
    f.close()


if __name__ == '__main__':
    from load_data import read_data
    train, labels = read_data("train.csv")
    test, tmpl = read_data("test.csv", test=True)
    predictions = doWork(train, test, labels)
    print(predictions)
# -*- coding: utf-8 -*-
# @Time : 2021/1/26 13:55
# @Author : ztwu4
# @Email : [email protected]
# @File : test.py
# @Software: PyCharm

from load_data import read_data
from model_train import PreProcessInputData
from util import train_file_path, test_file_path

# Test sentence
text = "经过工作人员两天的反复验证、严密测算,记者昨天从上海中心大厦得到确认:被誉为上海中心大厦“定楼神器”的阻尼器,在8月10日出现自2016年正式启用以来的最大摆幅。"
word_labels, seq_types = PreProcessInputData([text])
print(word_labels)
print(seq_types)

input_train, result_train = read_data(train_file_path)
for sent, tag in zip(input_train[:10], result_train[:10]):
    print(sent, tag)
print "Fitting kNN with k=10, kd_tree" knn = KNeighborsClassifier(n_neighbors=10, algorithm="kd_tree") print knn.fit(X_train_reduced, labels) print "Reducing test to %d components" % PCA_COMPONENTS X_test_reduced = pca.transform(test) print "Preddicting numbers" predictions = knn.predict(X_test_reduced) print "Writing to file" write_to_file(predictions) return predictions def write_to_file(predictions): f = open("output-pca-knn-skilearn-v3.csv", "w") for p in predictions: f.write(str(p)) f.write("\n") f.close() if __name__ == '__main__': from load_data import read_data train, labels = read_data("../data/train.csv") test, tmpl = read_data("../data/output3.csv", test=True) print tmpl print doWork(train, labels, test)
        for item in line:
            tag.append(int(label_id_dict[item.strip()]))
        tag.append(0)
        tags.append(tag)

    pad_tags = pad_sequences(tags, maxlen=MAX_SEQ_LEN, padding="post", truncating="post")
    result_tags = np.expand_dims(pad_tags, 2)
    return result_tags


if __name__ == '__main__':
    # Read the training and test data
    input_train, result_train = read_data(train_file_path)
    input_test, result_test = read_data(test_file_path)
    for sent, tag in zip(input_train[:10], result_train[:10]):
        print(sent, tag)
    for sent, tag in zip(input_test[:10], result_test[:10]):
        print(sent, tag)

    # Training set
    input_train_labels, input_train_types = PreProcessInputData(input_train)
    print(input_train_types[0])
    result_train = PreProcessOutputData(result_train)
    # Test set
    input_test_labels, input_test_types = PreProcessInputData(input_test)
    result_test = PreProcessOutputData(result_test)

    # add warmup
    total_steps, warmup_steps = calc_train_steps(
print "Fitting kNN with k=10, kd_tree" knn = KNeighborsClassifier(n_neighbors=10, algorithm="kd_tree") print knn.fit(X_train_reduced, labels) print "Reducing test to %d components" % PCA_COMPONENTS X_test_reduced = pca.transform(test) print "Preddicting numbers" predictions = knn.predict(X_test_reduced) print "Writing to file" write_to_file(predictions) return predictions def write_to_file(predictions): f = open("output-pca-knn-skilearn-v3.csv", "w") for p in predictions: f.write(str(p)) f.write("\n") f.close() if __name__ == '__main__': from load_data import read_data train, labels = read_data("/home/prasad/kaggle-digit-recognizer-master/data/train.csv") test, tmpl = read_data("/home/prasad/kaggle-digit-recognizer-master/data/test.csv", test=True) print doWork(train, labels, test)
# note: adaboost_trian mirrors the (misspelled) name exported by the adaboost module
from adaboost import adaboost_trian, adaClassify
from load_data import read_data

if __name__ == '__main__':
    data, label = read_data()
    classifier_array = adaboost_trian(data, label, 9)
    # print(classifier_array)
    re = adaClassify([[5, 5], [0, 0]], classifier_array)
    print(re)
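# adaClassify() applies the trained ensemble of decision stumps. A sketch in
# the style of classic stump-based AdaBoost, assuming each entry of
# classifier_array stores 'dim', 'thresh', 'ineq' and 'alpha' (an assumption
# about the adaboost module's internals, not its actual code):
import numpy as np


def ada_classify_sketch(data, classifier_array):
    data_arr = np.asarray(data, dtype=float)
    agg_est = np.zeros((data_arr.shape[0], 1))
    for stump in classifier_array:
        # each weak learner votes -1/+1 on one feature against a threshold
        est = np.ones((data_arr.shape[0], 1))
        below = data_arr[:, stump['dim']] <= stump['thresh']
        if stump['ineq'] == 'lt':
            est[below] = -1.0
        else:
            est[~below] = -1.0
        # weight the vote by the stump's alpha and accumulate
        agg_est += stump['alpha'] * est
    return np.sign(agg_est)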
from feature_selection import generate_random_solution
from feature_selection import sort_pop
from feature_selection import random_crossover
from feature_selection import mutation
from feature_selection import half_crossover
from feature_selection import remove_duplicates
# assumed imports (not shown in the original excerpt): read_data, clean_data
# and fitness_function are used below
from load_data import read_data, clean_data
from feature_selection import fitness_function
import pandas as pd
import random
import time
import matplotlib.pyplot as plt

# Set random seed
random.seed(1)
time.sleep(1)

dataset = read_data()
dataset = clean_data(dataset)
print("!!!!!!!!!")
print(len(dataset.index))
print("Fitness if using all features", fitness_function(dataset))

# Use the standard 13 features as a benchmark to measure against
standard_dataset = dataset[[
    "age", "sex", "CP", "trestbps", "chol", "FBS", "restecg", "thalach",
    "exang", "oldpeak", "slope", "ca", "thal", "num"
]]
print("Fitness of the standard features typically included",
      fitness_function(standard_dataset))
    return sorted(a.items(), key=operator.itemgetter(1), reverse=True)[0][0]


def doWorkNumpy(train, test, labels):
    k = 20
    train_mat = np.mat(train)
    # line-buffered so partial results survive an interrupted run
    output_file = open("output-numpy2.csv", "w", 1)
    idx = 0
    size = len(test)
    for test_sample in test:
        idx += 1
        start = time.time()
        # squared Euclidean distance to every training sample, then the
        # indices of the k nearest neighbors
        knn = np.argsort(np.sum(np.power(np.subtract(train_mat, test_sample), 2),
                                axis=1), axis=0)[:k]
        s = time.time()
        prediction = majority_vote(knn, labels)
        output_file.write(str(prediction))
        output_file.write("\n")
        print("Knn: %f, majority %f" % (time.time() - start, time.time() - s))
        print("Done: %f" % (float(idx) / size))
    output_file.close()
    output_file = open("done.txt", "w")
    output_file.write("DONE")
    output_file.close()


if __name__ == '__main__':
    from load_data import read_data
    train, labels = read_data("../data/train.csv")
    test, tmpl = read_data("../data/test.csv", test=True)
    doWorkNumpy(train, test, labels)
import math

import tensorflow as tf

from load_data import read_data
from network import Model

train_files, train_labels, val_files, val_labels = read_data(no_of_train=275)
filenames = tf.constant(train_files, dtype=tf.string)
labels = tf.constant(train_labels, dtype=tf.float32)
val_filenames = tf.constant(val_files, dtype=tf.string)
val_labels = tf.constant(val_labels, dtype=tf.float32)

MIN_VAL = math.inf
EPOCHS = 100
BATCHES = 1
NO_OF_ITERS = int(filenames.get_shape()[0]) // BATCHES
LOG_DIR = '/tmp'
SAVE_DIR = '/tmp/macula-iqa.cpkt'
LEARNING_RATE = 1e-3
DROPOUT_PROB = 0.5

sess = tf.Session()


def _build_dataset(_filenames, _labels, epochs, batches):
    dataset = tf.data.Dataset.from_tensor_slices((_filenames, _labels))
    dataset = dataset.prefetch(100)
    dataset = dataset.map(_parse_function, 10)
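# _parse_function is referenced above but not shown; a typical TF1-style
# parser for an image pipeline, with the file format and target size assumed
# (a sketch, not the project's actual function):
def _parse_function_sketch(filename, label):
    # read and decode the image file, then normalize to [0, 1]
    image_string = tf.read_file(filename)
    image = tf.image.decode_jpeg(image_string, channels=3)
    image = tf.image.resize_images(image, [224, 224])
    image = tf.cast(image, tf.float32) / 255.0
    return image, label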
import numpy as np
from matplotlib.mlab import PCA as mlabPCA
import matplotlib.pyplot as plt
from load_data import read_data
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d

all_samples = read_data("data/train1.csv")
y_train = np.array([x[0] for x in all_samples])
X_train = np.array([x[1:] for x in all_samples])

data_array = X_train
mlab_pca = mlabPCA(data_array)

Class0 = [i for i in range(len(y_train)) if y_train[i] == 0]
Class1 = [i for i in range(len(y_train)) if y_train[i] == 1]

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.plot(mlab_pca.Y[Class0, 0], mlab_pca.Y[Class0, 1], mlab_pca.Y[Class0, 2],
        'o', markersize=8, color='blue', alpha=0.5, label='class1')
ax.plot(mlab_pca.Y[Class1, 0], mlab_pca.Y[Class1, 1], mlab_pca.Y[Class1, 2],
        '^', markersize=8, alpha=0.5, color='red', label='class2')
# plt.plot(mlab_pca.Y[Class0, 0], mlab_pca.Y[Class0, 1], mlab_pca.Y[Class0, 2], 'o', markersize=7, color='blue', alpha=0.5, label='class1')
# plt.plot(mlab_pca.Y[Class1, 0], mlab_pca.Y[Class1, 1], mlab_pca.Y[Class1, 2], '^', markersize=7, color='red', alpha=0.5, label='class2')

plt.show()
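# Note: matplotlib.mlab.PCA was deprecated and later removed from matplotlib;
# an equivalent projection with scikit-learn, assumed here to be a drop-in
# source for the three components plotted above:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
Y_proj = pca.fit_transform(X_train)  # rows align with y_train, like mlab_pca.Y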
def further_process_labels(label_list, data_file):
    entity_list, token_list, tag_list = load_data.read_data(data_file)
    for idx in range(len(label_list)):
        manual_rule(entity_list[idx], label_list[idx])
    return label_list
##### User input: choose a dataset here #####
dataset = "13"
training = False  # change to False if there is no vicon data
##############################################

if __name__ == '__main__':
    start = timeit.default_timer()  # start timer
    ifile = "imu/imuRaw" + dataset + ".p"
    if training:
        vfile = "vicon/viconRot" + dataset + ".p"

    # load data
    ts = load_data.tic()
    imud = load_data.read_data(ifile)
    if training:
        vicd = load_data.read_data(vfile)
    load_data.toc(ts, "Data import")

    # format data
    imu_vals = imud['vals']
    imu_ts = imud['ts']
    imu_n = len(imu_ts[0])
    if training:
        vic_vals = vicd['rots']
        vic_ts = vicd['ts']
        vic_n = len(vic_ts[0])

    bias, scale = ukf_lib.bias_scale(imu_vals, 100)
    imu_vals = ukf_lib.unbias_reorder(imu_vals, imu_n, bias, scale)