def read_dataset(filenames, *args, **kwargs):
    """Read the training data and run pre-processing."""
    training_file = filenames[0]
    sentences, output_labels, max_length = prd.read_data(training_file)

    # Build word, label and character vocabularies.
    word_to_index = prd.get_vocabulory(sentences)
    label_to_index, index_to_label = prd.prepare_outputs(output_labels)
    char_index = prd.get_vocabulory(word_to_index)

    max_length = configs.MAX_SEQ_LEN
    char_indices = prd.get_chars(sentences, max_length, char_index)
    vocab_size = len(word_to_index)

    # Load pre-trained GloVe vectors and build the embedding matrix.
    glove_vectors = prd.read_glove_vecs(configs.GLOVE_EMBEDDINGS)
    word_embeddings = prd.get_preTrained_embeddings(word_to_index, glove_vectors,
                                                    vocab_size)

    # Persist the dictionaries and embeddings for use at prediction time.
    with open(configs.DICT_FILE, 'w', encoding='utf-8') as file:
        file.write(str(word_to_index))
        file.write("\n")
        file.write(str(label_to_index))
        file.write("\n")
        file.write(str(max_length))
    with open(configs.EMBEDDINGS_FILE, 'wb') as file:
        np.save(file, word_embeddings)

    # Input and output sequences for the model.
    train_indeces = prd.get_sequence_indices(sentences, word_to_index, max_length)
    labels = prd.get_sequence_indices(output_labels, label_to_index, max_length)
    no_of_classes = len(label_to_index)
    no_of_examples = len(sentences)
    print('Total no of input sequences:', no_of_examples)
    assert len(train_indeces) == len(labels), "length of I/O sequences doesn't match"

    # Validation samples/examples.
    sentences_v, output_labels_v, max_length_v = prd.read_data(
        configs.VALIDATION_FILE)
    indeces_v = prd.get_sequence_indices(sentences_v, word_to_index, max_length)
    labels_v = prd.get_sequence_indices(output_labels_v, label_to_index, max_length)
    char_indices_v = prd.get_chars(sentences_v, max_length, char_index)
    assert len(indeces_v) == len(labels_v), "length of I/O sequences doesn't match"

    return [
        word_embeddings, char_index, max_length, char_indices, no_of_classes,
        train_indeces, labels, indeces_v, char_indices_v, labels_v,
        index_to_label, output_labels_v
    ]
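# A hedged usage sketch (not repository code): unpack the list returned by
# read_dataset and hand the pieces to mdl.get_model, mirroring how predict()
# consumes them below. configs.TRAINING_FILE and the fit() call are assumptions
# (a Keras-style model with labels already in the shape it expects), not facts
# taken from the source.
if __name__ == "__main__":
    (word_embeddings, char_index, max_length, char_indices, no_of_classes,
     train_indeces, labels, indeces_v, char_indices_v, labels_v,
     index_to_label, output_labels_v) = read_dataset([configs.TRAINING_FILE])
    model = mdl.get_model(word_embeddings, max_length, len(char_index),
                          no_of_classes, False)  # final flag assumed: False = training mode
    model.fit([train_indeces, char_indices], labels,
              validation_data=([indeces_v, char_indices_v], labels_v),
              batch_size=32, epochs=5)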
def build_inputs(self):
    """Input prefetching, preprocessing and batching.

    Outputs:
      self.images
      self.input_seqs
      self.target_seqs (training and eval only)
      self.input_mask (training and eval only)
    """
    if self.mode == "inference":
        image_feed = tf.placeholder(dtype=tf.string, shape=[], name="image_feed")
        input_feed = tf.placeholder(dtype=tf.int64,
                                    shape=[None],  # batch_size
                                    name="input_feed")

        # Process image and insert batch dimensions.
        image = self.process_image(image_feed)
        images = tf.expand_dims(image, 0)

        # No target sequences or input mask in inference mode.
        input_seqs = tf.expand_dims(input_feed, 1)
        target_seqs = None
        input_mask = None
    else:
        # Prefetch serialized SequenceExample protos into an input queue.
        queue = prepare_data.read_data(
            reader=self.reader,
            file_pattern=self.config.input_file_pattern,
            is_training=True,
            batch_size=self.config.batch_size,
            values_per_shard=self.config.values_per_input_shard,
            input_queue_capacity_factor=self.config.input_queue_capacity_factor,
            num_reader_threads=self.config.num_input_reader_threads)

        # Decode and preprocess images in parallel threads.
        data = []
        for thread_id in range(self.config.num_preprocess_threads):
            seq_example = queue.dequeue()
            encoded_img, caption = prepare_data.parse_sequence_example(seq_example)
            image = self.process_image(encoded_img, thread_id=thread_id)
            data.append([image, caption])

        queue_capacity = (2 * self.config.num_preprocess_threads *
                          self.config.batch_size)
        images, input_seqs, target_seqs, input_mask = (
            prepare_data.prepare_batch(data=data,
                                       batch_size=self.config.batch_size,
                                       queue_capacity=queue_capacity))

    self.images = images
    self.input_seqs = input_seqs
    self.target_seqs = target_seqs
    self.input_mask = input_mask
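# A hedged inference-mode sketch (not repository code): once the graph owning
# build_inputs() has been built with mode == "inference", the two placeholders
# defined above can be fed by name. Only "image_feed" and "input_feed" come from
# the code above; "output:0" stands in for a hypothetical downstream tensor.
import tensorflow as tf

def feed_one_image(sess, image_path, start_token_id):
    """Feed one encoded image and a one-token caption prefix into the graph."""
    with tf.gfile.GFile(image_path, "rb") as f:
        encoded_image = f.read()
    return sess.run("output:0",  # hypothetical tensor produced later in the graph
                    feed_dict={"image_feed:0": encoded_image,
                               "input_feed:0": [start_token_id]})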
def main(args):
    data = read_data()
    train_data, test_data, deep_columns_idx, embedding_columns_dict = feature_engine(
        data)
    data_wide = train_data[0]

    # Wrap the (wide, deep, label) arrays into tensors and dataset objects.
    train_data = (torch.from_numpy(train_data[0].values),
                  torch.from_numpy(train_data[1].values),
                  torch.from_numpy(train_data[2].values))
    train_data = trainset(train_data)
    test_data = (torch.from_numpy(test_data[0].values),
                 torch.from_numpy(test_data[1].values),
                 torch.from_numpy(test_data[2].values))
    test_data = trainset(test_data)
    trainloader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
    testloader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False)
    device = to_device()

    # Parameter settings for the deep and wide components.
    deep_model_params = {
        'deep_columns_idx': deep_columns_idx,
        'embedding_columns_dict': embedding_columns_dict,
        'hidden_layers': args.hidden_layers,
        'dropouts': args.dropouts,
        'deep_output_dim': args.deep_out_dim
    }
    wide_model_params = {
        'wide_input_dim': data_wide.shape[1],
        'wide_output_dim': args.wide_out_dim
    }

    activation, criterion = set_method(args.method)
    widedeep = WideDeep(wide_model_params, deep_model_params, activation)
    widedeep = widedeep.to(device)
    optimizer = torch.optim.Adam(widedeep.parameters(), lr=args.lr)
    train(widedeep,
          trainloader,
          testloader,
          optimizer,
          criterion,
          device,
          epochs=args.epochs,
          print_step=args.print_step,
          validation=args.validation)
    save_model(widedeep, "wide_deep_model_{}.pkl".format(time.time()))
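# The trainset wrapper used above is defined elsewhere in the repository; a
# minimal stand-in consistent with how main() uses it (indexing into three
# aligned tensors: wide features, deep features, target) might look like this
# sketch. The class name and field names here are hypothetical.
import torch
from torch.utils.data import Dataset

class TrainsetSketch(Dataset):
    """Hypothetical stand-in for trainset: yields (wide, deep, label) triples."""

    def __init__(self, data):
        self.wide, self.deep, self.label = data

    def __len__(self):
        return self.wide.shape[0]

    def __getitem__(self, idx):
        return self.wide[idx], self.deep[idx], self.label[idx]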
def predict(*args, **kwargs):
    def remove_sents(sentences_t, labels_t, max_length):
        # Drop sentences longer than the maximum sequence length.
        remove_idxs = []
        for i, sentence in enumerate(sentences_t):
            if len(sentence) > max_length:
                remove_idxs.append(i)
        # Pop from the end so earlier indices stay valid.
        for i in reversed(remove_idxs):
            sentences_t.pop(i)
            labels_t.pop(i)

    # Restore the dictionaries and max length saved at training time
    # (DICT_FILE holds str()-serialized dicts, one per line).
    with open(configs.DICT_FILE, 'r', encoding='utf-8') as file:
        dicts = file.read().split("\n")
    word_index = eval(dicts[0])
    label_index = eval(dicts[1])
    max_length = eval(dicts[2])
    with open(configs.EMBEDDINGS_FILE, 'rb') as file:
        word_embeddings = np.load(file)

    # Loading test sequences.
    sentences_t, labels_t, max_length_t = prd.read_data(configs.TEST_FILE)
    remove_sents(sentences_t, labels_t, max_length)
    print('Total no of test sequences: ', len(sentences_t))

    char_index = prd.get_vocabulory(word_index)
    char_idxs = prd.get_chars(sentences_t, max_length, char_index)
    label_idxs = prd.get_sequence_indices(labels_t, label_index, max_length)
    seq_idxs = prd.get_sequence_indices(sentences_t, word_index, max_length)
    assert len(seq_idxs) == len(label_idxs), "length of I/O sequences doesn't match"

    # Invert the label dictionary: index -> label string.
    index_labels = {}
    for item, i in label_index.items():
        index_labels[i] = item

    model_t = mdl.get_model(word_embeddings, max_length, len(char_index),
                            len(index_labels), True)

    # Predict labels for test data.
    pred_label = np.asarray(model_t.predict([seq_idxs, char_idxs]))
    pred_label = np.argmax(pred_label, axis=-1)

    # Skip padded positions and map indices back to label strings.
    pred_label = prd.get_orig_labels(pred_label, index_labels, labels_t)
    print("Predicted Labels--->\n", pred_label[0])

    # Write word / true label / predicted label triples to the output file.
    outputfile = configs.OUTPUT_FILE
    with open(outputfile, 'w', encoding='utf-8') as f:
        f.write('WORD' + ' ' + 'TRUE_LABEL' + ' ' + 'PRED_LABEL')
        f.write('\n')
        f.write('\n')
        for i in range(len(pred_label)):
            cur_sentences = sentences_t[i]
            cur_labels = labels_t[i]
            cur_pred = pred_label[i]
            for j in range(len(cur_sentences)):
                f.write(cur_sentences[j] + ' ' + cur_labels[j] + ' ' + cur_pred[j])
                f.write('\n')
            f.write('\n')
            f.write('\n')
    with open(outputfile, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            print(line)

    return [labels_t, pred_label]
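# A small follow-up sketch (not in the source) showing one way to score the
# output of predict(): token-level accuracy over the true/predicted label
# sequences it returns. Assumes both lists hold aligned, equal-length sequences.
true_labels, predicted_labels = predict()
correct = total = 0
for true_seq, pred_seq in zip(true_labels, predicted_labels):
    for t, p in zip(true_seq, pred_seq):
        correct += int(t == p)
        total += 1
print("Token-level accuracy: {:.3f}".format(correct / max(total, 1)))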
                   type=int, default=[64, 32, 16])
# Dropout rates are fractions, so parse them as floats.
parse.add_argument("--dropouts", nargs='+', type=float, default=[0.5, 0.5])
parse.add_argument("--deep_out_dim", default=1, type=int)
parse.add_argument("--wide_out_dim", default=1, type=int)
parse.add_argument("--batch_size", default=32, type=int)
parse.add_argument("--lr", default=0.01, type=float)
parse.add_argument("--print_step", default=200, type=int)
parse.add_argument("--epochs", default=10, type=int)
parse.add_argument("--validation", default=True, type=bool)
parse.add_argument("--method",
                   choices=['multiclass', 'binary', 'regression'],
                   default='binary',
                   type=str)
args = parse.parse_args()

data = read_data()
train_data, test_data, deep_columns_idx, embedding_columns_dict = feature_engine(
    data)
data_wide = train_data[0]
# Input format for the prediction data; here a single sample is predicted.
t = (torch.from_numpy(train_data[0].values[0].reshape(
    -1, train_data[0].values.shape[1])),
     torch.from_numpy(train_data[1].values[0].reshape(
         -1, train_data[1].values.shape[1])))
print(t)
# Parameter settings.
deep_model_params = {
    'deep_columns_idx': deep_columns_idx,
    'embedding_columns_dict': embedding_columns_dict,
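# A hedged follow-up sketch: once the wide/deep parameter dicts above are
# complete, the single-sample tensors t could be pushed through a restored
# network. The saved file name, torch.load on the pickle produced by
# save_model, and WideDeep's forward signature (wide, deep) with these dtypes
# are all assumptions, not repository facts.
import torch

model = torch.load("wide_deep_model_1234567890.12.pkl")  # hypothetical file name
model.eval()
with torch.no_grad():
    wide_x, deep_x = t
    score = model(wide_x.float(), deep_x.long())  # assumed input dtypes
print(score)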