Example #1
    def read_dataset(filenames, *args, **kwargs):
        # Read the training data and run pre-processing
        training_file = filenames[0]
        sentences, output_labels, max_length = prd.read_data(training_file)
        word_to_index = prd.get_vocabulory(sentences)
        label_to_index, index_to_label = prd.prepare_outputs(output_labels)
        char_index = prd.get_vocabulory(word_to_index)
        max_length = configs.MAX_SEQ_LEN
        char_indices = prd.get_chars(sentences, max_length, char_index)
        vocab_size = len(word_to_index)
        glove_vectors = prd.read_glove_vecs(configs.GLOVE_EMBEDDINGS)
        word_embeddings = prd.get_preTrained_embeddings(
            word_to_index, glove_vectors, vocab_size)

        with open(configs.DICT_FILE, 'w', encoding='utf-8') as file:
            file.write(str(word_to_index))
            file.write("\n")
            file.write(str(label_to_index))
            file.write("\n")
            file.write(str(max_length))

        with open(configs.EMBEDDINGS_FILE, 'wb') as file:
            np.save(file, word_embeddings)

        # Input and output sequences for the model
        train_indeces = prd.get_sequence_indices(sentences, word_to_index,
                                                 max_length)
        labels = prd.get_sequence_indices(output_labels, label_to_index,
                                          max_length)
        no_of_classes = len(label_to_index)
        no_of_examples = len(sentences)
        print('Total no of input sequences:', no_of_examples)
        assert (len(train_indeces) == len(labels)
                ), "length of I/O sequences doesn't match"

        # Validation samples/examples
        sentences_v, output_labels_v, max_length_v = prd.read_data(
            configs.VALIDATION_FILE)
        indeces_v = prd.get_sequence_indices(sentences_v, word_to_index,
                                             max_length)
        labels_v = prd.get_sequence_indices(output_labels_v, label_to_index,
                                            max_length)
        char_indices_v = prd.get_chars(sentences_v, max_length, char_index)
        assert (len(indeces_v) == len(labels_v)
                ), "length of I/O sequences doesn't match"

        return [
            word_embeddings, char_index, max_length, char_indices,
            no_of_classes, train_indeces, labels, indeces_v, char_indices_v,
            labels_v, index_to_label, output_labels_v
        ]
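
A brief usage sketch (not part of the original snippet): the call below assumes a hypothetical configs.TRAIN_FILE attribute and treats read_dataset as a plain function, just to illustrate the order of the returned values.

# Hypothetical invocation; configs.TRAIN_FILE is assumed, not shown above.
(word_embeddings, char_index, max_length, char_indices, no_of_classes,
 train_indeces, labels, indeces_v, char_indices_v, labels_v,
 index_to_label, output_labels_v) = read_dataset([configs.TRAIN_FILE])
print('Classes:', no_of_classes, '| max sequence length:', max_length)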
Example #2
    def build_inputs(self):
        """Input prefetching, preprocessing and batching.

           Outputs:
             self.images
             self.input_seqs
             self.target_seqs (training and eval only)
             self.input_mask (training and eval only)
        """
        if self.mode == "inference":
            image_feed = tf.placeholder(dtype=tf.string,
                                        shape=[],
                                        name="image_feed")
            input_feed = tf.placeholder(
                dtype=tf.int64,
                shape=[None],  # batch_size
                name="input_feed")

            # Process image and insert batch dimensions.
            image = self.process_image(image_feed)
            images = tf.expand_dims(image, 0)

            # No target sequences or input mask in inference mode.
            input_seqs = tf.expand_dims(input_feed, 1)
            target_seqs = None
            input_mask = None
        else:
            queue = prepare_data.read_data(
                reader=self.reader,
                file_pattern=self.config.input_file_pattern,
                is_training=True,
                batch_size=self.config.batch_size,
                values_per_shard=self.config.values_per_input_shard,
                input_queue_capacity_factor=self.config.input_queue_capacity_factor,
                num_reader_threads=self.config.num_input_reader_threads)
            data = []
            for thread_id in range(self.config.num_preprocess_threads):
                seq_example = queue.dequeue()
                encoded_img, caption = prepare_data.parse_sequence_example(
                    seq_example)
                image = self.process_image(encoded_img, thread_id=thread_id)
                data.append([image, caption])

            queue_capacity = (2 * self.config.num_preprocess_threads *
                              self.config.batch_size)

            images, input_seqs, target_seqs, input_mask = (
                prepare_data.prepare_batch(data=data,
                                           batch_size=self.config.batch_size,
                                           queue_capacity=queue_capacity))
        self.images = images
        self.input_seqs = input_seqs
        self.target_seqs = target_seqs
        self.input_mask = input_mask
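
A minimal, self-contained sketch (TensorFlow 1.x assumed) of the inference-mode feeding pattern above: it reproduces only the two placeholders and the expand_dims step with dummy inputs, not the full captioning model.

import numpy as np
import tensorflow as tf

# Placeholders named as in build_inputs(): a scalar string for the encoded
# image and a 1-D int64 vector for the partial caption.
image_feed = tf.placeholder(dtype=tf.string, shape=[], name="image_feed")
input_feed = tf.placeholder(dtype=tf.int64, shape=[None], name="input_feed")
input_seqs = tf.expand_dims(input_feed, 1)  # [seq_len] -> [seq_len, 1]

with tf.Session() as sess:
    seqs = sess.run(input_seqs,
                    feed_dict={image_feed: b"raw-jpeg-bytes",
                               input_feed: np.array([1, 5, 7], dtype=np.int64)})
    print(seqs.shape)  # (3, 1)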
Example #3
def main(args):
    data = read_data()
    train_data, test_data, deep_columns_idx, embedding_columns_dict = feature_engine(
        data)
    data_wide = train_data[0]
    train_data = (torch.from_numpy(train_data[0].values),
                  torch.from_numpy(train_data[1].values),
                  torch.from_numpy(train_data[2].values))
    train_data = trainset(train_data)
    test_data = (torch.from_numpy(test_data[0].values),
                 torch.from_numpy(test_data[1].values),
                 torch.from_numpy(test_data[2].values))
    test_data = trainset(test_data)
    trainloader = DataLoader(train_data,
                             batch_size=args.batch_size,
                             shuffle=True)
    testloader = DataLoader(test_data,
                            batch_size=args.batch_size,
                            shuffle=False)
    device = to_device()
    # Parameter settings
    deep_model_params = {
        'deep_columns_idx': deep_columns_idx,
        'embedding_columns_dict': embedding_columns_dict,
        'hidden_layers': args.hidden_layers,
        'dropouts': args.dropouts,
        'deep_output_dim': args.deep_out_dim
    }
    wide_model_params = {
        'wide_input_dim': data_wide.shape[1],
        'wide_output_dim': args.wide_out_dim
    }
    activation, criterion = set_method(args.method)
    widedeep = WideDeep(wide_model_params, deep_model_params, activation)
    widedeep = widedeep.to(device)
    optimizer = torch.optim.Adam(widedeep.parameters(), lr=args.lr)
    train(widedeep,
          trainloader,
          testloader,
          optimizer,
          criterion,
          device,
          epochs=args.epochs,
          print_step=args.print_step,
          validation=args.validation)
    save_model(widedeep, "wide_deep_model_{}.pkl".format(time.time()))
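
main() above relies on a trainset wrapper around the (wide, deep, target) tensor triple; a minimal sketch of such a Dataset under that assumption (the repository's actual class may differ):

from torch.utils.data import Dataset

class trainset(Dataset):
    """Wraps a (wide, deep, target) tuple of tensors so that DataLoader
    can index and batch the three parts together."""

    def __init__(self, data):
        self.wide, self.deep, self.target = data

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        return self.wide[idx], self.deep[idx], self.target[idx]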
Example #4
    def predict(*args, **kwargs):
        def remove_sents(sentences_t, labels_t, max_length):
            remove_idxs = []
            for i, sentence in enumerate(sentences_t):
                if len(sentence) > max_length:
                    remove_idxs.append(i)
            # Pop from the end so earlier indices stay valid.
            for i in sorted(remove_idxs, reverse=True):
                sentences_t.pop(i)
                labels_t.pop(i)

        word_index = {}
        label_index = {}
        max_length = 0
        with open(configs.DICT_FILE, 'r', encoding='utf-8') as file:
            dicts = file.read()
            dicts = dicts.split("\n")
            word_index = eval(dicts[0])
            label_index = eval(dicts[1])
            max_length = eval(dicts[2])

        with open(configs.EMBEDDINGS_FILE, 'rb') as file:
            word_embeddings = np.load(file)

        # Load the test sequences
        sentences_t, labels_t, max_length_t = prd.read_data(configs.TEST_FILE)

        remove_sents(sentences_t, labels_t, max_length)
        print('Total no of test sequences: ', len(sentences_t))
        char_index = prd.get_vocabulory(word_index)
        char_idxs = prd.get_chars(sentences_t, max_length, char_index)
        label_idxs = prd.get_sequence_indices(labels_t, label_index,
                                              max_length)
        seq_idxs = prd.get_sequence_indices(sentences_t, word_index,
                                            max_length)
        assert (len(seq_idxs) == len(label_idxs)
                ), "length of I/O sequences doesn't match"

        index_labels = {}
        for item, i in label_index.items():
            index_labels[i] = item

        model_t = mdl.get_model(word_embeddings, max_length, len(char_index),
                                len(index_labels), True)
        # Predict labels for test data
        pred_label = np.asarray(model_t.predict([seq_idxs, char_idxs]))
        pred_label = np.argmax(pred_label, axis=-1)
        # Skip padded sequences
        pred_label = prd.get_orig_labels(pred_label, index_labels, labels_t)
        print("Predicted Labels--->\n", pred_label[0])

        outputfile = configs.OUTPUT_FILE
        with open(outputfile, 'w', encoding='utf-8') as f:
            f.write('WORD' + ' ' + 'TRUE_LABEL' + ' ' + 'PRED_LABEL')
            f.write('\n')
            f.write('\n')
            for i in range(len(pred_label)):
                cur_sentences = sentences_t[i]
                cur_labels = labels_t[i]
                cur_pred = pred_label[i]
                for j in range(len(cur_sentences)):
                    f.write(cur_sentences[j] + ' ' + cur_labels[j] + ' ' +
                            cur_pred[j])
                    f.write('\n')
                f.write('\n')
            f.write('\n')

        with open(outputfile, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                print(line)

        return ([labels_t, pred_label])
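
Since predict() returns the true and predicted label sequences, a quick token-level accuracy check is straightforward; a sketch, assuming both are lists of per-sentence label lists and that predict can be called as a plain function:

true_labels, predicted_labels = predict()
correct = total = 0
for true_seq, pred_seq in zip(true_labels, predicted_labels):
    for t, p in zip(true_seq, pred_seq):
        correct += int(t == p)
        total += 1
print("Token-level accuracy: {:.3f}".format(correct / total))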
Example #5
import argparse

parse = argparse.ArgumentParser()
# The original snippet begins mid-call; judging from args.hidden_layers in
# Example #3, this first argument presumably sets the hidden layer sizes.
parse.add_argument("--hidden_layers",
                   nargs='+',
                   type=int,
                   default=[64, 32, 16])
parse.add_argument("--dropouts", nargs='+', type=int, default=[0.5, 0.5])
parse.add_argument("--deep_out_dim", default=1, type=int)
parse.add_argument("--wide_out_dim", default=1, type=int)
parse.add_argument("--batch_size", default=32, type=int)
parse.add_argument("--lr", default=0.01, type=float)
parse.add_argument("--print_step", default=200, type=int)
parse.add_argument("--epochs", default=10, type=int)
parse.add_argument("--validation", default=True, type=bool)
parse.add_argument("--method",
                   choices=['multiclass', 'binary', 'regression'],
                   default='binary',
                   type=str)
args = parse.parse_args()
data = read_data()
train_data, test_data, deep_columns_idx, embedding_columns_dict = feature_engine(
    data)
data_wide = train_data[0]

# Input format for prediction; here we predict a single example
t = (torch.from_numpy(train_data[0].values[0].reshape(
    -1, train_data[0].values.shape[1])),
     torch.from_numpy(train_data[1].values[0].reshape(
         -1, train_data[1].values.shape[1])))
print(t)

# Parameter settings
deep_model_params = {
    'deep_columns_idx': deep_columns_idx,
    'embedding_columns_dict': embedding_columns_dict,