    def train(self):

        data_train, target_train = read_file('train')
        data_valid, target_valid = read_file('valid')
        train_losses = []
        valid_losses = []
        train_accs = []
        valid_accs = []
        temp_high = 0.0
        temp_high_epoch = 0
        for epoch in range(self.epoch_num):
            losses = []
            accs = []
            data_size = len(data_train)
            iteration = (data_size + self.batch_size - 1) // self.batch_size  # ceiling division; avoids a trailing empty batch
            for i in range(iteration):
                # idx = np.random.choice(data_size, batch_size, replace=False)
                # inputs = data_train[idx]
                # targets = vola_train[idx]
                start = i * self.batch_size
                end = min((i + 1) * self.batch_size, data_size)
                inputs = data_train[start:end]
                targets = target_train[start:end]
                loss, acc, _ = self.sess.run(
                    [self.loss_op, self.accuracy, self.optimize],
                    feed_dict={
                        self.input: inputs,
                        self.label: targets,
                        self.dropout_rate: self.dropout
                    })
                losses.append(loss)
                accs.append(acc)

            train_losses.append(np.mean(losses))
            train_accs.append(np.mean(accs))
            valid_loss, valid_acc = self.sess.run(
                [self.loss_op, self.accuracy],
                feed_dict={
                    self.input: data_valid,
                    self.label: target_valid,
                    self.dropout_rate: 1.0
                })
            valid_losses.append(valid_loss)
            valid_accs.append(valid_acc)

            if valid_accs[-1] > temp_high:
                temp_high = valid_accs[-1]
                temp_high_epoch = epoch
                print("new high! Epoch {}: acc {} train loss {}".format(
                    (epoch + 1), valid_accs[-1], valid_losses[-1]))
                self.save_model()
            elif epoch == temp_high_epoch + 20:
                print("early stop: no improvement for 20 epochs!")
                break
            elif (epoch + 1) % 1 == 0:  # log every epoch; increase the modulus to log less often
                print("Epoch {}: acc {} valid loss {}".format(
                    (epoch + 1), valid_accs[-1], valid_losses[-1]))

        return train_losses, valid_losses

    def predict(self):

        data_test, target_test = read_file('test')
        test_loss, test_acc, test_pred = self.sess.run(
            [self.loss_op, self.accuracy, self.pred_label],
            feed_dict={
                self.input: data_test,
                self.label: target_test,
                self.dropout_rate: 1.0
            })
        print(test_pred.shape)

        print("testing...")
        print("acc {}  loss {}".format(test_acc, test_loss))
Example #3
    def __init__(self, image_path, mode='train', split_rate=None, transform=None, label_path=None):
        self.image_list = read_file(image_path)

        self.mode = mode
        if self.mode != 'test':
            self.label_list = read_file(label_path)
            assert (len(self.image_list) == len(self.label_list)), "Invalid image and label length"

        self.split_rate = split_rate
        self.transform = transform
        self.total_length = len(self.image_list)
        self.images = self.image_list
        if self.mode != 'test':
            # default labels to the full list; refined below when split_rate is set
            self.labels = self.label_list

        if self.split_rate is not None:
            self.train_length = int(self.total_length * self.split_rate)

            if mode == 'train':
                self.images = self.image_list[:self.train_length]
                self.labels = self.label_list[:self.train_length]
            elif mode == 'validation':
                self.images = self.image_list[self.train_length:]
                self.labels = self.label_list[self.train_length:]
            else:
                # 'test' mode keeps the full image list
                pass
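
The split_rate argument above carves a single image/label list into train and validation subsets. A minimal usage sketch, with a hypothetical class name and placeholder file paths (the enclosing class is not shown in this snippet):

# Hypothetical usage; the class name `ListDataset` and the paths are illustrative only.
train_set = ListDataset('images.txt', mode='train',
                        split_rate=0.8, label_path='labels.txt')
valid_set = ListDataset('images.txt', mode='validation',
                        split_rate=0.8, label_path='labels.txt')
# The first 80% of the list feeds training; the remaining 20% feeds validation.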
Example #4
def main():
    # Load the data from the csv file.
    X_data, y_data = data_utils.read_file("data/skin/hmnist_28_28_RGB.csv")
    # Change type to float64
    X_data = X_data.astype('float64')

    # Split the data into testing and training.
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data,
                                                        test_size=config.TEST_SIZE,
                                                        random_state=42)

    # Split the data into validation and training.
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                      test_size=config.VALIDATION_SIZE,
                                                      random_state=42)
    if config.USE_OVERSAMPLING:
        # Oversample the training set.
        X_train, y_train = data_utils.oversample(X_train, y_train)

        # Oversample the validation set.
        X_val, y_val = data_utils.oversample(X_val, y_val)

        # Oversample the test set.
        X_test, y_test = data_utils.oversample(X_test, y_test)

    skin_cancer_classifier = SkinCancerClassifier(X_train, y_train, X_val, y_val)

    # Evaluate the model on the testing dataset.
    stats = skin_cancer_classifier.model.evaluate(X_test, y_test, verbose=2)
    print("Testing loss", stats[0])
    print("Testing accuracy", stats[1])
    if len(stats) == 3:
        print("F1 score", stats[2])

    y_pred = skin_cancer_classifier.model.predict(X_test)
    # Decode the one-hot vector.
    y_pred = np.argmax(y_pred, axis=1)

    # Print the confusion matrix for the testing dataset.
    print(confusion_matrix(y_true=y_test, y_pred=y_pred))
    target_names = ['Actinic Keratoses', 'Basal cell carcinoma', 'Benign keratosis',
                    'Dermatofibroma', 'Melanocytic nevi', 'Melanoma', 'Vascular skin lesions']
    # Print the recall, precision and f1 scores for the testing set.
    print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names))

    if config.PLOT_MODEL:
        draw.plot_performance(skin_cancer_classifier.history)
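
Note: data_utils.oversample is not included in this snippet. A common way to implement it is random over-sampling of every class up to the size of the largest class; the sketch below is an assumption, not the project's actual helper.

import numpy as np

def oversample(X, y):
    # Assumed implementation: resample each class (with replacement) up to the
    # size of the largest class so the label distribution becomes uniform.
    X, y = np.asarray(X), np.asarray(y)
    classes, counts = np.unique(y, return_counts=True)
    max_count = counts.max()
    X_out, y_out = [], []
    for cls in classes:
        idx = np.where(y == cls)[0]
        picked = np.random.choice(idx, size=max_count, replace=True)
        X_out.append(X[picked])
        y_out.append(y[picked])
    return np.concatenate(X_out), np.concatenate(y_out)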
Example #5
File: main.py Project: cy94/ghack
def main():
    # read acc, gps, veh det for multiple drivers, scenes
    X_dfs, Y_dfs = [], []

    data_path = osp.join(DATA_DIR, DATA_FILE)
    print(data_path)

    df = read_file(data_path)

    X = df.iloc[:, :-1].values.astype('float32')
    labels = df.iloc[:, -1]

    enc = LabelBinarizer()
    Y = enc.fit_transform(labels.values)
    n_classes = len(enc.classes_)
    print('Number of classes:', n_classes)

    print("X shape:", X.shape)
    print("Y shape:", Y.shape)

    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)

    seq_len, stride = 50, 1

    X_seq = X_to_seq(X, seq_len, stride)
    Y = Y[seq_len:]

    X_tr, X_ts, Y_tr, Y_ts = train_test_split(X_seq, Y, test_size=0.2)

    # train
    print("X Train shape:", X_tr.shape)
    print("Y Train shape:", Y_tr.shape)

    print("X test shape:", X_ts.shape)
    print("Y test shape:", Y_ts.shape)

    n_features = X_tr.shape[-1]

    train_model(X_tr, Y_tr, seq_len, n_features, n_classes)

    loss = test_model(X_ts, Y_ts)
    print(loss)
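
Note: X_to_seq is not defined in this snippet. A minimal sliding-window version consistent with how it is called here (seq_len consecutive rows per sample, so that Y[seq_len:] lines up with the windows) could look like the following; it is an assumption, not the project's actual helper.

import numpy as np

def X_to_seq(X, seq_len, stride=1):
    # Assumed implementation: stack seq_len consecutive rows into one sample.
    windows = [X[i:i + seq_len] for i in range(0, len(X) - seq_len, stride)]
    return np.array(windows)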
Example #6
def evaluate_line():
    config_path = os.path.join(FLAGS.config_path, 'config')
    test_config = load_config(config_path)

    _, word_to_id = read_vocab(test_config['vocab_file'])
    categories, cat_to_id = read_category()
    contents, labels = read_file('data/cnews.val2.txt')
    model = Model(test_config)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    # load the saved model
    checkpoint_path = os.path.join(FLAGS.checkpoints_path)
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_path)
    saver.restore(session, checkpoint_file)

    while True:
        line = input("Enter a test sentence: ")
        x_input = [[word_to_id[x] for x in line if x in word_to_id]]
        x_pad = kr.preprocessing.sequence.pad_sequences(x_input, 600)
        predict = model.evaluate(session, x_pad)
        print(categories[predict[0][0]])
Example #7
def get_train_config():
    train_contents, train_labels = read_file(FLAGS.train_file)
    # 1. Build the vocabulary from the training data first
    if not os.path.exists(FLAGS.vocab_file):
        words = build_vocab(train_contents, FLAGS.vocab_file)
    else:
        words, _ = read_vocab(FLAGS.vocab_file)
    # 2. Get the category data, build the category-to-id mapping, and save it to file
    categories, cat_to_id = read_category()
    # 3. Generate the training config file
    vocab_size = len(words)
    num_classes = len(categories)
    # too large a sequence length will run out of memory
    # seq_len = max([len(content) for content in train_contents])
    seq_len = 600
    filter_sizes = [int(i) for i in FLAGS.filter_sizes.split(',')]
    # create the required output directories
    make_path(FLAGS)
    config_path = os.path.join(FLAGS.config_path, 'config')
    if not os.path.isfile(config_path):
        train_config = config_model(seq_len, vocab_size, num_classes,
                                    filter_sizes)
        save_config(train_config, config_path)
    else:
        # reuse the saved config so train_config is always defined
        train_config = load_config(config_path)
    return train_config
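
Note: config_model is not shown here. Judging from how the config is consumed elsewhere (e.g. test_config['vocab_file'] in the evaluation example above), it presumably returns a plain dict of hyperparameters; the keys in this sketch are assumptions.

def config_model(seq_len, vocab_size, num_classes, filter_sizes):
    # Assumed structure; the real config_model may use different keys.
    return {
        'seq_length': seq_len,
        'vocab_size': vocab_size,
        'num_classes': num_classes,
        'filter_sizes': filter_sizes,
        'vocab_file': FLAGS.vocab_file,
    }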
Example #8
def main():
    # read acc, gps, veh det for multiple drivers, scenes
    X_dfs, Y_dfs = [], []
    driver_dir = 'D1'

    for drive_dir in os.listdir(osp.join(DATA_DIR, driver_dir)):
        drive_path = osp.join(DATA_DIR, driver_dir, drive_dir)
        print(drive_path)

        acc = read_file(osp.join(drive_path, ACC_FILE))
        gps = read_file(osp.join(drive_path, GPS_FILE))
        veh = read_file(osp.join(drive_path, VEHDET_FILE))

        score = read_file(osp.join(drive_path, SCORE_FILE))
        datasets = [acc, gps, veh, score]
        n_rows = min(map(len, datasets))

        # sample high frequency data to lowest frequency
        for i in range(len(datasets)):
            # drop time column
            datasets[i].drop(0, axis=1, inplace=True)

            if len(datasets[i]) > n_rows:
                step = len(datasets[i]) // n_rows
                ndx = list(range(0, n_rows * step, step))
                datasets[i] = datasets[i].iloc[ndx]
                datasets[i] = datasets[i].reset_index(drop=True)

        score_df = datasets[-1]
        datasets = datasets[:-1]
        # use the resampled score frame so the Y rows stay aligned with X rows
        Y_df = score_df.loc[:, SCORE_COLUMNS]

        # create dataset
        X_df = pd.concat(datasets, axis=1, ignore_index=True)
        X_df.fillna(0, inplace=True)
        print "X:", X_df.shape
        print "Y:", score_df.shape

        X_dfs.append(X_df)
        Y_dfs.append(Y_df)

    # preprocess
    X_df = pd.concat(X_dfs, ignore_index=True)
    X = X_df.values.astype('float32')
    Y = pd.concat(Y_dfs, ignore_index=True).values

    print "X shape:", X.shape
    print "Y shape:", Y.shape

    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)

    X_tr, X_ts, Y_tr, Y_ts = train_test_split(X, Y, test_size=0.2)

    # train
    print "X Train shape:", X_tr.shape
    print "Y Train shape:", Y_tr.shape

    print "X test shape:", X_ts.shape
    print "Y test shape:", Y_ts.shape

    seq_len = 16

    X_tr_seq = X_to_seq(X_tr, seq_len, 1)
    Y_tr = Y_tr[seq_len:]

    X_ts_seq = X_to_seq(X_ts, seq_len, 1)
    Y_ts = Y_ts[seq_len:]

    #train_model(X_tr, Y_tr)

    loss = test_model(X_ts_seq, Y_ts)
    print(loss)
Example #9
def load_data(coref_data, incoref_data, window, model_path, testing_count, batch_size, n_epochs, offset=0, write_testset=False):
   
    with open(coref_data,'r', encoding="utf8",errors="replace") as f:
        sentences_coref = f.readlines()
        
    with open(incoref_data,'r', encoding="utf8",errors="replace") as f:
        sentences_incoref = f.readlines()
        
    tensor_path = os.path.join(CACHE_DIR, 'tensors.pkl')
    
    print("Reading data from: ",coref_data," and ", incoref_data )
    sentences = {'coref': data_utils.read_file(sentences_coref),'incoref':data_utils.read_file(sentences_incoref)}
    
    textloader = data_utils.TextLoader(sentences, window, model_path, tensor_path)
    x_coref = textloader.features['coref']
    y_coref = textloader.labels['coref']
    xt_incoref = textloader.features['incoref']
    yt_incoref = textloader.labels['incoref']
    raw_data_coref = textloader.raw_data['coref']
    raw_datat_incoref = textloader.raw_data['incoref']
    
    x_incoref,y_incoref,raw_data_incoref = [],[],[]
    #randomize incoref_data
    new_ids = list(range(len(xt_incoref)))
    random.seed(100)
    random.shuffle(new_ids)

    for i in new_ids:
        x_incoref.append(xt_incoref[i])
        y_incoref.append(yt_incoref[i])
        raw_data_incoref.append(raw_datat_incoref[i])
        
    
    print("--------------- DATA STATS --------------- ")
    print("Total extracted positive data: ", len(y_coref))
    print("Total extracted negative data: ", len(y_incoref))
    print()
    # Separation into testing and training
    # counting the number of coref data in each doc
    docs_count = {}
    max_doc_id = 0 
    for m in raw_data_coref:
        doc_id = m[0]['doc_id']
        if doc_id in docs_count:
            docs_count[doc_id] = docs_count[doc_id] + 1
        else:
            docs_count[doc_id] = 1
        if doc_id > max_doc_id:
            max_doc_id = doc_id
            
    # shuffling according to the number of docs
    docs_seq_ind = list(range(max_doc_id + 1))  # include the highest doc id
    random.seed(400)
    random.shuffle(docs_seq_ind)
    
    # to get the number of test cases
    count = 0
    offset_c = 0
    start = 0
    for x in range(len(docs_seq_ind)):
        if count + 10 >= testing_count:
            if offset_c >= offset: # allowing a window of 10 chains
                print("testing documents:", docs_seq_ind[start:x])
                break
            else:
                count = 0
                start = x
                offset_c = offset_c + 1
        if docs_seq_ind[x] in docs_count:
            count = count + docs_count[docs_seq_ind[x]]
    
    docs_seq_ind = docs_seq_ind[start:x]
    
    x_test,x_train,y_test,y_train = [],[],[],[]
    raw_data = []
    for (x,y,r) in zip(x_coref,y_coref,raw_data_coref):
        if r[0]['doc_id'] in docs_seq_ind:
            x_test.append(x)
            y_test.append(y)
            raw_data.append(r)
        else:
            x_train.append(x)
            y_train.append(y)
    coref_test_size = len(x_test)
    coref_train_size = len(x_train)
    
    for (x,y,r) in zip(x_incoref,y_incoref,raw_data_incoref):
        if r[0]['doc_id'] in docs_seq_ind:
            if len(x_test) - coref_test_size < coref_test_size:
                x_test.append(x)
                y_test.append(y)
                raw_data.append(r)
        else:
            if len(x_train) - coref_train_size < coref_train_size:
                x_train.append(x)
                y_train.append(y)
    # End of separation into test and training
    if write_testset:
        with open("devset_" + str(testing_count) +"_" + \
                  coref_data.split("/")[1],"w+",encoding="utf-8") as f:
            for chain in raw_data:
                for link in chain:
                    f.write(link['raw_data']+"\n")
    
        
    train_batches = data_utils.batch_iter(
        list(zip(x_train, y_train)), batch_size, n_epochs)
    test_data = {'x': x_test, 'y': y_test}
    print()
    print("Total testing data: ", len(y_test), " coref:", coref_test_size)
    print("Total training data: ", len(y_train), " coref:", coref_train_size)
    print()
    
    return (train_batches, test_data, textloader.features_size)
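
Note: data_utils.batch_iter is called with (data, batch_size, n_epochs) but its implementation is not part of this snippet. A typical generator matching that call, written here as an assumption:

import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    # Assumed implementation: yield shuffled mini-batches for num_epochs passes.
    data = list(data)
    data_size = len(data)
    num_batches = (data_size + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        order = np.random.permutation(data_size) if shuffle else range(data_size)
        shuffled = [data[i] for i in order]
        for b in range(num_batches):
            yield shuffled[b * batch_size:(b + 1) * batch_size]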
Example #10
def train_model(args):
    """Train sequence to sequence model with training files."""

    # Parse model parameters
    model_params = {arg: val for arg, val in vars(args).items()
                    if arg in get_Seq2Seq_model_param_names()}

    # Model
    model = Seq2Seq(model_dir=args.model_dir,
                    dict_path=args.dict_path,
                    **model_params)

    # Train
    if args.train_data_path is not None:

        # Train files
        files = get_path_files(args.train_data_path)

        if args.shuffle_files:
            np.random.shuffle(files)

        # Batch generators
        # File batches
        file_gen = read_files_cycled(
            filenames=files,
            max_file_pool_size=args.max_file_pool_size,
            file_batch_size=args.file_batch_size,
            file_batch_shuffle=False)

        # Train batches
        train_gen = rebatch(file_gen,
                            in_batch_size_limit=args.file_batch_size *
                            args.max_file_pool_size,
                            out_batch_size=args.batch_size,
                            shuffle=args.shuffle_file_batches,
                            flatten=True)

        if args.validation_data_path is not None:
            valid_data = read_file(args.validation_data_path,
                                   nrows=args.validate_n_rows)
            valid_source_docs, valid_target_docs = zip(*valid_data)

        # Train
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        for batch_nb, batch in enumerate(train_gen):
            source_docs, target_docs = zip(*batch)
            loss, global_step = model.train(
                source_docs,
                target_docs,
                dropout_rate=args.dropout_rate,
                optimizer=args.optimizer,
                learning_rate=args.learning_rate,
                max_gradient_norm=args.max_gradient_norm,
                max_seq_len=args.max_seq_len,
                save_every_n_batch=args.save_every_n_batch)

            # Print progress
            end = time.perf_counter()
            samples = global_step * args.batch_size
            print('[{}] Training step: {} - Samples: {} - Loss: {:<.3f} - Time {:<.3f}'\
                .format(str(datetime.now()), global_step, samples, loss, round(end-start,3)))
            start = end

            # Validation
            if args.validation_data_path is not None:
                if batch_nb % args.validate_every_n_batch == 0 and batch_nb > 0:
                    loss, global_step = model.eval(valid_source_docs,
                                                   valid_target_docs)
                    end = time.perf_counter()
                    print('[{}] Validation step: {} - Samples: {} - Loss: {:<.3f} - Time {:<.3f}'\
                        .format(str(datetime.now()), global_step, samples, loss, round(end-start, 3)))
                    start = end

    else:
        print('Model created, but no training files were provided!')
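
Note: get_path_files is not included in this snippet. A plausible version, assumed here, would accept either a single file or a directory of training files:

import os

def get_path_files(path):
    # Assumed helper: return a list of file paths for a file or a directory.
    if os.path.isdir(path):
        return sorted(os.path.join(path, name) for name in os.listdir(path))
    return [path]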