def download_data(kic):
    try:
        folder_path = utils.download_files(kic)
        utils.process_data(folder_path)
    except Exception as e:
        print(e)
        return e

def get(kic):
    try:
        folder_path = utils.download_files(kic)
        utils.process_data(folder_path)
        data = read_csv(kic)
        return data
    except Exception as e:
        print(e)
        return e

def prepare_data(self):
    test, train, val = utils.load_test_train_val(self.data_num)  # df
    train_texts = list(train.posts)

    glove = Glove()
    glove.create_custom_embedding([word for text in train_texts for word in text.split()])

    self.train_tuple = utils.process_data(train, glove, self.max_words, self.max_posts)
    self.test_tuple = utils.process_data(test, glove, self.max_words, self.max_posts)
    self.val_tuple = utils.process_data(val, glove, self.max_words, self.max_posts)

def train_specialists(pretrain):
    for setting in SPECIALIST_SETTINGS:
        cols = setting["columns"]
        X, y = process_data(TRAIN_PATH, cols)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_PROP)

        if pretrain:
            model = model_from_json(open(MODEL_PATH).read())
            model.load_weights(WEIGHTS_PATH)
        else:
            model = build_model()

        # Replace the final dense layer with one sized for this specialist's columns.
        model.layers.pop()
        model.outputs = [model.layers[-1].output]
        model.layers[-1].outbound_nodes = []
        model.add(Dense(len(cols), name="dense_3"))

        flipgen = FlippedImageDataGenerator()
        flipgen.flip_idxs = setting["flip_idxs"]

        sgd = SGD(lr=0.08, decay=1e-4, momentum=0.9, nesterov=True)
        model.compile(loss="mse", optimizer=sgd)
        early_stop = EarlyStopping(monitor="val_loss", patience=100, mode="min")

        print("Training {}...".format(cols[0]))
        model.fit_generator(flipgen.flow(X_train, y_train),
                            samples_per_epoch=X_train.shape[0],
                            nb_epoch=1000,
                            validation_data=(X_val, y_val),
                            callbacks=[early_stop])

        model_path = "data/model_{}.json".format(cols[0])
        weights_path = "data/weights_{}.h5".format(cols[0])
        print("Saving model to ", model_path)
        print("Saving weights to ", weights_path)
        open(model_path, 'w').write(model.to_json())
        model.save_weights(weights_path, overwrite=True)

def evaluate(model, data_source, cuda=args.cuda):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    model.criterion.loss_type = 'full'
    eval_loss = 0
    total_length = 0
    t = tt = 0.0
    with torch.no_grad():
        for data_batch in tqdm(data_source):
            data, target, length = process_data(data_batch, cuda=cuda, sep_target=sep_target)
            l1, l2 = model.forward_normalized(data, target, length)
            cur_length = int(length.data.sum())
            eval_loss += l1.sum().item()
            t += torch.exp(l2 - l1).sum().item()
            tt += (torch.exp(l2 - l1) ** 2).sum().item()
            total_length += cur_length
    mean = (t / total_length)
    variance = tt / total_length - mean * mean
    model.criterion.loss_type = args.loss
    return math.exp(eval_loss / total_length), mean, variance

def train(model, data_source, lr=1.0, weight_decay=1e-5, momentum=0.9):
    params = model.parameters()
    optimizer = optim.SGD(params=params, lr=lr, momentum=momentum, weight_decay=weight_decay)
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    for num_batch, data_batch in enumerate(corpus.train):
        optimizer.zero_grad()
        data, target, length = process_data(data_batch, cuda=args.cuda)
        loss = model(data, target, length)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(params, args.clip)
        optimizer.step()

        total_loss += loss.data[0]

        if num_batch % args.log_interval == 0 and num_batch > 0:
            if args.prof:
                break
            cur_loss = total_loss / args.log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, num_batch, len(corpus.train), lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0
    print('-' * 87)

def train_model(pretrain):
    X, y = process_data(TRAIN_PATH)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_PROP)

    if pretrain:
        model = model_from_json(open(MODEL_PATH).read())
        model.load_weights(WEIGHTS_PATH)
    else:
        model = build_model()

    flipgen = FlippedImageDataGenerator()
    sgd = SGD(lr=0.08, decay=1e-4, momentum=0.9, nesterov=True)
    model.compile(loss="mse", optimizer=sgd)
    early_stop = EarlyStopping(monitor="val_loss", patience=100, mode="min")

    model.fit_generator(flipgen.flow(X_train, y_train),
                        samples_per_epoch=X_train.shape[0],
                        nb_epoch=5000,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stop])

    print("Saving model to ", MODEL_PATH)
    print("Saving weights to ", WEIGHTS_PATH)
    open(MODEL_PATH, 'w').write(model.to_json())
    model.save_weights(WEIGHTS_PATH, overwrite=True)

    mse = model.evaluate(X_val, y_val, batch_size=BATCH_SIZE)
    print("MSE: ", mse)
    print("RMSE: ", np.sqrt(mse) * IMG_SIZE)

def main():
    file_name = 'data/processed_digits.csv'
    df = create_dataframe(file_name)
    X_train, y_train, X_valid, y_valid, X_test, y_test = process_data(df)

    DigitNN = DigitNeuralNetwork(epochs=100, batch_size=32)
    DigitNN.fit(X_train, y_train, X_valid, y_valid, X_test, y_test)

def main():
    """
    Main function
    """
    # Read the dataset
    raw_data = pd.read_csv(dataset_path, usecols=sel_cols)

    # Inspect the dataset
    utils.insepct_data(raw_data)

    # Process the dataset
    proc_data = utils.process_data(raw_data)

    # Visualise loan amount by category
    utils.visualise_loan_amnt(proc_data, col_name='term',
                              title='Loan term vs loan amount',
                              xlabel='Loan term',
                              save_path='./output/term_amnt.png')
    utils.visualise_loan_amnt(proc_data, col_name='loan_status',
                              title='Loan status vs loan amount',
                              xlabel='Loan status',
                              save_path='./output/status_amnt.png')
    utils.visualise_loan_amnt(proc_data, col_name='purpose',
                              title='Loan purpose vs loan amount',
                              xlabel='Loan purpose',
                              save_path='./output/purpose_amnt.png')
    utils.visualise_loan_amnt(proc_data, col_name='addr_state',
                              title='State vs loan amount',
                              xlabel='State',
                              save_path='./output/state_amnt.png')

    # Visualise the share of each loan purpose
    utils.visualise_loan_purpose_percent(proc_data['purpose'], './output/purpose_percent.png')

    # Visualise relationships between variables
    utils.visualise_relation(proc_data, './output/var_relation.png')

def __getitem__(self, index):
    file = h5py.File(
        self.rootdir + "train" + str(index // self.filelen) + ".hdf5", "r")
    im_a, im_b, label = self.getimhdf5(file, index % self.filelen)
    if type(label) == np.uint8:
        label = np.expand_dims(label, -1)
    return process_data(im_a, im_b, label, self.preprocess)

def run_evaluate(self, fold, seed):
    test_ = process_data(self.test)
    feature_cols = [c for c in test_.columns if c not in ['sig_id']]
    x_test = test_[feature_cols].values

    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(
        testdataset, batch_size=self.cfg.batch_size, shuffle=False)

    target_cols = self.target.drop('sig_id', axis=1).columns.values.tolist()

    model = Model_old(
        num_features=len(feature_cols),
        num_targets=len(target_cols),
        hidden_size=self.cfg.hidden_size,
    )
    """
    model.load_state_dict(torch.load(os.path.join(
        self.load_path, f"seed{seed}", f"FOLD{fold}_.pth"),
        map_location=torch.device(self.cfg.device)))
    """
    model.load_state_dict(
        torch.load(os.path.join(self.load_path, f"SEED{seed}_FOLD{fold}_scored.pth"),
                   map_location=torch.device(self.cfg.device)))
    model.to(self.cfg.device)

    predictions = np.zeros((len(test_), self.target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, self.cfg.device)

    return predictions

def train(model, data_source, epoch, lr=1.0, weight_decay=1e-5, momentum=0.9):
    optimizer = optim.SGD(params=model.parameters(), lr=lr,
                          momentum=momentum, weight_decay=weight_decay)
    # Turn on training mode which enables dropout.
    model.train()
    model.criterion.loss_type = args.loss
    total_loss = 0
    pbar = tqdm(data_source, desc='Training PPL: ....')
    for num_batch, data_batch in enumerate(pbar):
        optimizer.zero_grad()
        data, target, length = process_data(data_batch, cuda=args.cuda, sep_target=sep_target)
        loss = model(data, target, length)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += loss.item()
        if args.prof:
            break
        if num_batch % args.log_interval == 0 and num_batch > 0:
            cur_loss = total_loss / args.log_interval
            ppl = math.exp(cur_loss)
            logger.debug('| epoch {:3d} | {:5d}/{:5d} batches '
                         '| lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                             epoch, num_batch, len(corpus.train), lr, cur_loss, ppl))
            pbar.set_description('Training PPL %.1f' % ppl)
            total_loss = 0

def train(ENV, args):
    processed_train_data_path = os.path.join(ENV.processed_data_path, 'processed_train.pkl')
    processed_test_data_path = os.path.join(ENV.processed_data_path, 'processed_test.pkl')

    if os.path.exists(processed_train_data_path) and os.path.exists(processed_test_data_path):
        # Pickle files must be opened in binary mode.
        processed_train_data = pickle.load(open(processed_train_data_path, 'rb'))
        processed_test_data = pickle.load(open(processed_test_data_path, 'rb'))
    else:
        train_wav_files, train_phn_files = load_data(ENV.train_data)
        print('Process train data...')
        processed_train_data = process_data(train_wav_files, train_phn_files)

        test_wav_files, test_phn_files = load_data(ENV.test_data)
        print('Process test data...')
        processed_test_data = process_data(test_wav_files, test_phn_files)

        pickle.dump(processed_train_data, open(processed_train_data_path, 'wb'))
        pickle.dump(processed_test_data, open(processed_test_data_path, 'wb'))

    # print(processed_train_data[0][1])
    print("Define graph...")
    train_model(ENV, processed_train_data, processed_test_data)

def __getitem__(self, index):
    file_index = str(index // 100000)
    if file_index != self.current_index:
        self.current_file.close()
        self.current_index = file_index
        self.current_file = tables.open_file(
            self.rootdir + "train" + file_index + ".hdf5", driver="H5FD_CORE")
    im_a, im_b, label = self.getimhdf5(index % 100000)
    if type(label) == np.uint8:
        label = np.expand_dims(label, -1)
    return process_data(im_a, im_b, label, self.preprocess)

def main(unused_argv):
    # Load data
    data, _ = process_data(path_to_data=DATA_PATH, vocabulary=vocabulary)
    train_data, train_labels, validation_data, validation_labels = split_data(
        data, seq_size)

    # Create the Estimator
    classifier = tf.estimator.Estimator(model_fn=char_rnn_model_fn, model_dir=MODEL_DIR)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={'x': train_data},
                                                        y=train_labels,
                                                        batch_size=batch_size,
                                                        num_epochs=None,
                                                        shuffle=True)

    # Test the model and print results
    validate_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': validation_data}, y=validation_labels, num_epochs=1, shuffle=False)

    best_model_path = None
    best_loss = 100.0
    degradation_block_cnt = 0
    for _ in range(20):
        classifier.train(input_fn=train_input_fn, steps=100)
        intermediate_results = classifier.evaluate(input_fn=validate_input_fn)
        current_loss = intermediate_results['loss']
        if current_loss >= best_loss:
            degradation_block_cnt += 1
            print('\nDegradation detected: last {} blocks loss increases. '
                  'Best: {}, current: {}\n'.format(degradation_block_cnt, best_loss, current_loss))
        else:
            best_loss = current_loss
            print('\nLoss decreases: now best is {}\n'.format(best_loss))
            degradation_block_cnt = 0
            best_model_path = classifier.export_savedmodel(
                MODEL_DIR, serving_input_receiver_fn=serving_input_receiver_fn)
        if degradation_block_cnt >= EARLY_STOPPING_THRESHOLD:
            print('\nEarly stopped because degradation block count exceeded threshold. '
                  'Best model has loss {} and is located under {}\n'.format(
                      best_loss, best_model_path))
            break

    final_results = generate_with_model_located_in(best_model_path)
    print('\nBest model located under {}. Generated text: \n {}'.format(
        best_model_path, final_results))

def main():
    """Main block of code. Reads the data, constructs the tokeniser and trains the model"""
    args = parse_cli_args()
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TF warnings

    X, y = load_data(
        dataset_path=args.data_path,
        feature_field=args.features_field,
        target_field=args.target_field,
    )
    y = np.array(y)
    print("Dataset loaded {0} examples".format(len(X)))

    model_config = yaml.safe_load(open(args.model_config, "r"))
    X_wide, X_deep = process_data(text_feature=X,
                                  vec_path=args.vectoriser_path,
                                  vocab_size=model_config["vocab_size"])

    X_wide_train, X_wide_test, X_deep_train, X_deep_test, y_train, y_test = \
        train_test_split(X_wide, X_deep, y, test_size=0.2)
    print("Train data contains", y_train.shape[0], "examples and test data contains",
          y_test.shape[0], "examples")

    print("Constructing Keras model")
    model = get_wide_deep_model(
        num_wide_features=X_wide.shape[1],
        num_deep_features=X_deep_train.shape[1],
        **model_config,
    )

    print("Training...")
    model.fit(
        x=[X_wide_train, X_deep_train],
        y=y_train,
        epochs=model_config["epochs"],
        batch_size=model_config["batch_size"],
        verbose=1,
    )

    print("Evaluating...")
    mse = model.evaluate(x=[X_wide_test, X_deep_test],
                         y=y_test,
                         batch_size=model_config["batch_size"],
                         verbose=1)
    print("Evaluation MSE:", mse)

    print("Saving ML model")
    model.save_weights(args.model_path)

def evaluate(model, data_source, cuda=args.cuda):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    eval_loss = 0
    total_length = 0
    data_source.batch_size = eval_batch_size
    for data_batch in data_source:
        data, target, length = process_data(data_batch, cuda=cuda, eval=True)
        loss = model(data, target, length)
        cur_length = length.sum()
        eval_loss += loss.data[0] * cur_length
        total_length += cur_length
    return math.exp(eval_loss / total_length)

def generate_with_model_located_in(dir, init_seq='Разрешите мне присесть?', count=100):
    vocabulary = '\n !"(),-.0123456789:;?NАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШЩЬЭЯабвгдежзийклмнопрстуфхцчшщъыьэюя'
    text_encoded, int_to_vocab = process_data(vocabulary=vocabulary, content=init_seq)
    for _ in range(count):
        generate_fn = predictor.from_saved_model(dir)
        answer = generate_fn({'x': [text_encoded]})
        # symbol_code = np.argmax(answer['probabilities'][0])
        symbol_code = pick_top_n(answer['probabilities'][0], len(vocabulary))
        text_encoded = np.append(text_encoded, symbol_code)
    text = '\n===\n'
    for code in text_encoded:
        text += int_to_vocab[code]
    return text

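# The snippet above calls a `pick_top_n` helper that is not shown here. Below is a
# minimal sketch of such a top-n sampler, assuming it receives the softmax
# probabilities for the next character and the vocabulary size; the default
# `top_n` value is an assumption, not taken from the original code.
import numpy as np

def pick_top_n(probabilities, vocab_size, top_n=5):
    p = np.squeeze(probabilities).astype(np.float64)
    # Keep only the top-n most probable characters and renormalise.
    p[np.argsort(p)[:-top_n]] = 0.0
    p = p / p.sum()
    # Sample a single character index from the truncated distribution.
    return np.random.choice(vocab_size, p=p)
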
def evaluate(model, data_source, cuda=args.cuda):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    # GRU does not support ce mode right now
    eval_loss = 0
    total_length = 0
    with torch.no_grad():
        for data_batch in data_source:
            data, target, length = process_data(data_batch, cuda=cuda, sep_target=sep_target)
            loss = model(data, target, length)
            cur_length = length.sum().item()
            eval_loss += loss.data.item() * cur_length
            total_length += cur_length
    return math.exp(eval_loss / total_length)

def main():
    pwd = os.getcwd()  # current working directory
    file_name = '0930-2_NOK_20200929114544.csv'  # name of the data file

    """Build the dataset"""
    feature = utils.process_data(file_name)
    feature = torch.tensor(feature, dtype=torch.float32)

    """Load the model"""
    model_load = utils.SimpleNet()
    checkpoint = torch.load(os.path.join(pwd, 'model_save', 'model.pth.tar'))  # load the trained model
    model_load.load_state_dict(checkpoint['state_dict'])

    outputs = model_load(feature.reshape(1, 12, 8, 8))
    predict = torch.max(outputs, dim=1)[1]
    if predict == 0:
        print("Riveting result: NOK")
    else:
        print("Riveting result: OK")

def train(model, data_source, epoch, lr=1.0, weight_decay=1e-5, momentum=0.9):
    # Turn on training mode which enables dropout.
    model.train()
    model.criterion.loss_type = args.loss
    total_loss = 0.0
    total_real_loss = 0.0
    pbar = tqdm(data_source, desc='Training PPL: ....')
    # pbar = data_source
    total_num_words = 0.0
    for num_batch, data_batch in enumerate(pbar):
        progress = num_batch / len(pbar) + epoch - 1
        optimizer.zero_grad()
        data, target, length = process_data(data_batch, cuda=args.cuda, sep_target=sep_target)
        total_num_words += length.sum().item()
        loss, real_loss = model(data, target, length)  # / total_num_words
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        total_loss += loss.item()
        total_real_loss += real_loss.item()
        if args.prof:
            break
        if num_batch % args.log_interval == 0 and num_batch > 0:
            cur_loss = total_loss / total_num_words
            cur_real_loss = total_real_loss / total_num_words
            ppl = 100000
            if True or cur_real_loss < math.log(ppl):
                ppl = math.exp(cur_real_loss)
            logger.debug('| epoch {:3d} | {:5d}/{:5d} batches '
                         '| lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                             epoch, num_batch, len(corpus.train), lr, cur_loss, ppl))
            info_str = ('Training loss %.4f, PPL %.4f' % (cur_loss, ppl))
            # print('Progress %.4f, Training loss %.4f, PPL %.4f' % (progress, cur_loss, ppl))
            pbar.set_description(info_str)
            total_loss = 0.0
            total_real_loss = 0.0
            total_num_words = 0.0

def main(argv):
    data = nlp.load_dataset("tiny_shakespeare")
    train_data = data["train"][0]["text"]
    valid_data = data["test"][0]["text"]

    tokenize = Tokenizer()
    vocabulary = Vocab()
    train_data, valid_data, vocab_size = process_data(train_data, valid_data,
                                                      tokenize, vocabulary,
                                                      FLAGS.batch_size)

    charnn = model.create_model(
        seed=FLAGS.seed,
        batch_size=FLAGS.batch_size,
        seq_len=FLAGS.batch_size,
        model_kwargs=dict(
            vocab_size=vocab_size,
            embedding_size=FLAGS.embedding_size,
            hidden_size=FLAGS.hidden_size,
            output_size=vocab_size,
        ),
    )

    trained_model = train_model(
        model=charnn,
        learning_rate=FLAGS.learning_rate,
        num_epochs=FLAGS.num_epochs,
        seed=FLAGS.seed,
        train_data=train_data,
        valid_data=valid_data,
        batch_size=FLAGS.batch_size,
    )

    generated_text = generate_text(
        trained_model,
        vocabulary,
        max_length=100,
        temperature=0.8,
        top_k=3,
        start_letter="T",
    )
    print("Hello Shakespeare: ", generated_text)

def data_generator(batch_size, seed):
    # Our dataset is small, we can pack it as numpy, then load all data into memory
    # Line, Cond(label), Shade
    x_data, c_data, y_data = load_data('./data.npy')
    print('Load {} data pairs'.format(len(x_data)))

    counts = 0
    while True:
        np.random.seed(seed + counts)
        idx = np.random.randint(0, x_data.shape[0], batch_size)
        x_batch, c_batch, p_batch, y_batch = process_data(x_data[idx], c_data[idx],
                                                          y_data[idx], seed=(seed + counts))
        counts += batch_size
        # Line, Cond(label), Pos, Shade
        yield x_batch, c_batch, p_batch, y_batch

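# Hedged usage sketch for data_generator above: it yields batches indefinitely, so
# it can drive either a manual loop or a Keras-style fit_generator call. BATCH_SIZE
# and the step count below are illustrative assumptions, not values from the
# original project.
BATCH_SIZE = 32
gen = data_generator(batch_size=BATCH_SIZE, seed=42)
for _ in range(10):  # pull a few batches
    x_batch, c_batch, p_batch, y_batch = next(gen)
    # ... feed the four arrays to the model's training step here ...
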
def evaluate(model, data_source, cuda=args.cuda):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    model.criterion.loss_type = 'full'
    eval_loss = 0
    total_length = 0
    with torch.no_grad():
        for data_batch in data_source:
            data, target, length = process_data(data_batch, cuda=cuda, sep_target=sep_target)
            loss = model(data, target, length)
            cur_length = int(length.data.sum())
            eval_loss += loss.item() * cur_length
            total_length += cur_length
    model.criterion.loss_type = args.loss
    return math.exp(eval_loss / total_length)

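# The evaluate() functions above return exp(total loss / total tokens), i.e.
# perplexity as the exponential of the average per-token negative log-likelihood.
# A minimal numeric sketch (the value below is an illustrative assumption):
import math

avg_nll = 4.6                   # mean cross-entropy per token, in nats
perplexity = math.exp(avg_nll)  # ~99.5: roughly as uncertain as a uniform
                                # choice over ~100 tokens
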
def main():
    data_train, data_val, data_test, char_to_index, index_to_char = process_data(
        look_back=30, batch_size=1024, split=[0.7, 0.2, 0.1], debug=DEBUG)
    vocab_size = len(char_to_index)

    model = LSTMModel(vocab_size, look_back=30, hidden_dim=400,
                      batch_size=1024, lr=1, nb_layers=3)
    model.build_graph()
    model.train(data_train, 1)
    model.create_story(
        index_to_char, char_to_index,
        "how are you my pretty Baobei, are you having a good day?")

def train_model(pretrain):
    X, y = process_data(TRAIN_PATH)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_PROP)

    if pretrain:
        model = model_from_json(open(MODEL_PATH).read())
        model.load_weights(WEIGHTS_PATH)
    else:
        model = build_model()

    model.compile(loss="categorical_crossentropy", optimizer="adadelta", metrics=["accuracy"])
    early_stop = EarlyStopping(monitor="val_loss", patience=8, mode="min")
    # Fit on the training split only, so the validation set stays held out.
    model.fit(X_train, y_train,
              batch_size=BATCH_SIZE,
              nb_epoch=100,
              validation_data=(X_val, y_val),
              callbacks=[early_stop])

    print("Saving model to ", MODEL_PATH)
    print("Saving weights to ", WEIGHTS_PATH)
    open(MODEL_PATH, "w").write(model.to_json())
    model.save_weights(WEIGHTS_PATH, overwrite=True)

    accuracy = model.evaluate(X_val, y_val, batch_size=BATCH_SIZE)
    print("Accuracy: ", accuracy)

def plot_prediction(model, sensorId, startDate, endDate, mins_max):
    test_rows = generate_test_rows()
    test_processed_data, _ = process_data(test_rows)
    train_labels = model.predict(test_processed_data).flatten()

    days_of_week = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
    x = np.empty(len(train_labels))
    for i in range(len(train_labels)):
        x[i] = i
    y = train_labels
    frequency = 96

    plt.ylabel('Total volume')
    plt.xlabel('Days of week')
    plt.xticks(x[48::frequency], days_of_week)
    # plt.yticks(np.arange(y.min(), y.max(), 0.005))
    plt.plot(x, y)
    plt.plot(x, [el[1] for el in mins_max])
    plt.plot(x, [el[2] for el in mins_max])
    plt.grid(axis='y', linestyle='-')
    plt.title('Sensor id {}, time period: {} to {}'.format(str(sensorId), '2016-04-11', '2016-04-17'))
    plt.savefig('one_week_volume_9005.png')
    plt.show()

def main():
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TF warnings

    print("Loading data...")
    X, y = load_wine_data(DATA_PATH, "points")

    # Choose five random examples to score
    random_idx = (np.random.rand(5) * len(X)).astype(int)
    X = [X[idx] for idx in random_idx]
    y = [y[idx] for idx in random_idx]

    X_wide_deep = [
        process_data(X, count_vec=pickle.load(open(vectoriser, "rb")))
        for vectoriser in VEC_PATH
    ]

    print("Constructing Keras models...")
    prediction_models = [
        get_wide_deep_model(
            num_wide_features=X[0].shape[1],
            num_deep_features=X[1].shape[1],
            **yaml.safe_load(open(model_conf, "r")),
        )
        for X, model_conf in zip(X_wide_deep, MODEL_CONFIG)
    ]
    for weights, model in zip(MODEL_PATH, prediction_models):
        model.load_weights(weights)

    print("Predicting...")
    predictions = [
        model.predict([X[0], X[1]], verbose=0)
        for X, model in zip(X_wide_deep, prediction_models)
    ]

    for pred_idx, description, target in zip(range(len(X)), X, y):
        print("=" * 100)
        print("Wine review:\n", description)
        print("Reviewer score:", target)
        for model_idx in range(len(prediction_models)):
            print("Model", model_idx, "prediction:", predictions[model_idx][pred_idx])

def main():
    """Main block of code. Loads the data, model and vectoriser and shows a demo"""
    args = parse_cli_args()
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TF warnings

    print("Loading data...")
    X, y = load_data(
        dataset_path=args.data_path,
        feature_field=args.features_field,
        target_field=args.target_field,
    )

    # Choose five random examples to show
    random_idx = (np.random.rand(5) * len(X)).astype(int)
    X = [X[idx] for idx in random_idx]
    y = [y[idx] for idx in random_idx]

    X_wide_deep = process_data(
        text_feature=X,
        count_vec=pickle.load(open(args.vectoriser_path, "rb")),
    )

    print("Constructing Keras model")
    prediction_model = get_wide_deep_model(
        num_wide_features=X_wide_deep[0].shape[1],
        num_deep_features=X_wide_deep[1].shape[1],
        **yaml.safe_load(open(args.model_config, "r")),
    )
    prediction_model.load_weights(args.model_path)

    print("Predicting...")
    predictions = prediction_model.predict([X_wide_deep[0], X_wide_deep[1]], verbose=0)

    for prediction, text, target in zip(predictions, X, y):
        print("=" * 100)
        print("Text:\n", text)
        print("Target:", target)
        print("Model's prediction:", prediction)

def preprocess():
    df = pd.read_csv("input/lish-moa/train_features.csv")
    df = utils.process_data(df)
    folds = pd.read_csv("input/folds/train_folds.csv")

    # Create aux target
    # `nsc_labels` means # of labels found in non-scored train set
    non_scored_df = pd.read_csv("input/lish-moa/train_targets_nonscored.csv")
    targets_non_scored = non_scored_df.drop("sig_id", axis=1).to_numpy().sum(axis=1)
    non_scored_df.loc[:, "nsc_labels"] = targets_non_scored
    drop_cols = [c for c in non_scored_df.columns if c not in ("nsc_labels", "sig_id")]
    non_scored_df = non_scored_df.drop(drop_cols, axis=1)
    folds = folds.merge(non_scored_df, on="sig_id", how="left")

    targets = folds.drop(["sig_id", "kfold"], axis=1).columns
    features = df.drop("sig_id", axis=1).columns

    df = df.merge(folds, on="sig_id", how="left")
    df.to_csv("input/folds/train.csv", index=False)

    # Serialize column names
    with open("input/folds/targets", "w") as f:
        f.write("\n".join(targets))
    with open("input/folds/features", "w") as f:
        f.write("\n".join(features))

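# Hedged counterpart sketch: the column lists serialized above can be read back as
# plain newline-separated files during training. This helper is an illustration
# only and is not part of the original preprocess() code.
def load_column_list(path):
    with open(path) as f:
        return f.read().splitlines()

# e.g.
# target_cols = load_column_list("input/folds/targets")
# feature_cols = load_column_list("input/folds/features")
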
    grid_search.best_params_
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        logger.info(
            "random forest with grid search gave score \n %s for parameters %s"
            % (str(np.sqrt(-mean_score)), str(params))
        )

    feature_importances = grid_search.best_estimator_.feature_importances_
    sorted(zip(feature_importances, X.columns), reverse=True)

    final_model = grid_search.best_estimator_
    joblib.dump(
        final_model, os.path.join(MODEL_PATH, "random_forest_grid_search.pkl")
    )


if __name__ == "__main__":
    housing_prepared, housing_labels = process_data(is_train=True)
    logger = create_logger(LOGGING_PATH, "train.log")
    os.makedirs(MODEL_PATH, exist_ok=True)

    train_linear_regression(housing_prepared, housing_labels, logger)
    train_decision_trees(housing_prepared, housing_labels, logger)
    train_RFR_random_search(housing_prepared, housing_labels, logger)
    train_RFR_grid_search(housing_prepared, housing_labels, logger)

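# Hedged sketch of the matching inference step: the grid-search model persisted
# above can be reloaded with joblib and applied to held-out data. The call
# `process_data(is_train=False)` and its return values are assumptions based on
# the `is_train=True` call in the snippet, not confirmed by the original code.
import os
import joblib

final_model = joblib.load(os.path.join(MODEL_PATH, "random_forest_grid_search.pkl"))
housing_test_prepared, housing_test_labels = process_data(is_train=False)
predictions = final_model.predict(housing_test_prepared)
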
        doc_text = doc_dict[doc_id]
        top_doc_word_list += doc_text

    # find the most frequent words
    freq_dist = nltk.FreqDist(word for word in top_doc_word_list)
    best_words = [word for word, _ in freq_dist.most_common(num_words)]

    # add to the query
    new_query = query_text + best_words

    # recalculate tfidf score and add to score dictionary
    for doc_id, tfidf_score in calculate_tfidf(new_query, doc_dict, average_doc_length, k):
        score_dict[query_id, doc_id] = tfidf_score

    return score_dict


if __name__ == "__main__":
    query_dict = utils.process_data('data/qrys.txt')
    doc_dict = utils.process_data('data/docs.txt')

    standard_tfidf_scores = standard_tfidf(query_dict, doc_dict)
    with open('results/tfidf.top', 'w') as output_file:
        output_file = utils.write_result(standard_tfidf_scores, output_file)

    tfidf_with_prf_scores = tfidf_pseudo_relevance_feedback(query_dict, doc_dict)
    with open('results/best.top', 'w') as output_file:
        output_file = utils.write_result(tfidf_with_prf_scores, output_file)

if args.DB_NAME == "dbpedia":
    print("training model on dbpedia")
    DB_START, DB_END = [1, 141], [101, 166]
    base = 25
    skip_num = 40
    db_base = 0
elif args.DB_NAME == "lmdb":
    print("training model on lmdb")
    DB_START, DB_END = [101, 166], [141, 176]
    base = 10
    skip_num = 25
    db_base = 100

DB_DIR = path.join(DATADIR, args.DB_NAME)

# load data
data, _, label, _, _ = utils.process_data(args.DB_NAME, DB_START, DB_END, args.top_n, args.file_n)
entity2vec, pred2vec, entity2ix, pred2ix = utils.load_transE(args.DB_NAME)
pred2ix_size = len(pred2ix)
hidden_size = args.transE_dim + args.pred_embedding_dim

# train
## cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("cuda or cpu: {}".format(device))

## loss function
if args.loss_function == "BCE":
    criterion = torch.nn.BCELoss()
elif args.loss_function == "MSE":
    criterion = torch.nn.MSELoss()
else:

    specialists = OrderedDict()
    for setting in SPECIALIST_SETTINGS:
        cols = setting["columns"]
        model_path = "data/model_{}.json".format(cols[0])
        model = model_from_json(open(model_path).read())
        weights_path = "data/weights_{}.h5".format(cols[0])
        model.load_weights(weights_path)
        specialists[cols] = model
    return specialists


if __name__ == "__main__":
    lookup, feature_index = parse_lookup_table(LOOKUP_PATH)
    X = process_data(TEST_PATH, mode="TEST")
    specialists = load_specialists()

    predictions = {}
    for cols, model in specialists.items():
        spec_predictions = model.predict(X, batch_size=BATCH_SIZE)
        # Map predictions from the normalised [-1, 1] range back to pixel coordinates.
        spec_predictions *= IMG_SIZE // 2
        spec_predictions += IMG_SIZE // 2
        for i, col in enumerate(cols):
            predictions[col] = spec_predictions[:, i]

    submission_values = []
    for i in range(len(lookup)):
        img_id = lookup["ImageId"][i] - 1
        feature = lookup["FeatureName"][i]
        submission_values.append(predictions[feature][img_id])
