def run_cv(fold_iterator, logger, params_dict, upsample=True):
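    # data_dir and checkpoint_dir are assumed to be module-level globals here
    # (compare run_global_cv further below, which takes them as arguments).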
    for traindirs, testdirs in fold_iterator:
        # TRAIN LOCAL PREDICTION MODEL
        # Generators
        logger.info('############ FOLD #############')
        logger.info('Training folders are {}'.format(traindirs))
        training_generator = DataLoader(data_dir,
                                        traindirs,
                                        32,
                                        width_template=params_dict['width'],
                                        upsample=upsample)
        validation_generator = DataLoader(data_dir,
                                          testdirs,
                                          32,
                                          width_template=params_dict['width'],
                                          type='val',
                                          upsample=upsample)

        # Design model
        model = create_model(params_dict['width'] + 1,
                             params_dict['h1'],
                             params_dict['h2'],
                             params_dict['h3'],
                             embed_size=params_dict['embed_size'],
                             drop_out_rate=params_dict['dropout_rate'],
                             use_batch_norm=params_dict['use_batchnorm'])
        # Train model on training dataset
        '''
        model.fit_generator(generator=training_generator,
                            validation_data=validation_generator,
                            use_multiprocessing=True,
                            epochs=params_dict['n_epochs'],
                            workers=6)
        '''
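        # Reuse cached weights when a checkpoint is available; otherwise train
        # from scratch and save the new weights.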
        try:
            model.load_weights(os.path.join(checkpoint_dir, 'model22.h5'))
        except OSError:
            print('No saved weights found; training from scratch.')
            model.fit_generator(generator=training_generator,
                                validation_data=validation_generator,
                                use_multiprocessing=True,
                                epochs=params_dict['n_epochs'],
                                workers=4,
                                max_queue_size=20)
            model.save_weights(os.path.join(checkpoint_dir, 'model.h5'))
        metrics = model.evaluate_generator(generator=validation_generator,
                                           workers=4,
                                           max_queue_size=20)
        logger.info(metrics)
Example No. 2
def starcraft_sp_test():

    # Create DataLoader instance to load and format data
    dataLoader = DataLoader()

    logging.info("Program started")

    logging.info("Loading starcraft data")
    # Read skillcraft dataset, the class index is the second column
    dataLoader.read(filename="data/SkillCraft1_Dataset.csv",
                    classIndex=1,
                    numOfFeatures=15)

    # Normalize data values from 0 - 1
    #dataLoader.normalize()

    # Create new labels to fit into binary classification
    dataLoader.scaleToBinary(5)

    # Spectral Clustering

    # Binary
    clustering(dataLoader.x_train,
               dataLoader.y_train,
               writer_starcraft,
               'starcraft-binary',
               multiple=True,
               binary=True)

    # Multiclass
    #clustering(dataLoader.x_train, dataLoader.multi_y_train, writer_starcraft, 'starcraft-multiclass', multiple=True, binary=False)

    # Write all the results
    writer_starcraft.save()
Example No. 3
def load_data(filePath: str,
              label_txt_filePath: str,
              shuffle: bool = True,
              seq_length: int = 3000,
              batch_size: int = 64,
              training: bool = True):
    voc = Vocab()
    dataLoader = DataLoader()

    # Full dataset
    dataLoader.sequences = dataLoader.read_fasta_file(fasta_file_path=filePath)
    # Training split
    dataLoader.train_seq = dataLoader.sequences[:900]
    # Test split
    dataLoader.test_seq = dataLoader.sequences[900:1000]
    # Labels (0/1)
    dataLoader.labels = dataLoader.read_label_txt(
        label_file_path=label_txt_filePath)
    # Vectorized training sequences
    dataLoader.train_vectorized_seq = voc.sequences_to_ids(
        dataLoader.train_seq)
    # Vectorized test sequences
    dataLoader.test_vectorized_seq = voc.sequences_to_ids(dataLoader.test_seq)

    # print(dataLoader.train_vectorized_seq)
    # print(dataLoader.test_vectorized_seq)
    # x_batch, y_batch = dataLoader.get_batch(shuffle=shuffle, seq_length=seq_length, batch_size=batch_size, training=training)
    # print("x_batch.shape={}, y_batch.shape={}".format(x_batch.shape, y_batch.shape))
    # print("x_batch[0]:{}".format(x_batch[0]))
    # print("y_batch[0]:{}".format(y_batch[0]))

    return voc, dataLoader
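

# Minimal usage sketch (hypothetical file paths, not part of the original example;
# it mirrors the commented-out get_batch call above):
if __name__ == '__main__':
    voc, loader = load_data('data/sequences.fasta', 'data/labels.txt')
    x_batch, y_batch = loader.get_batch(shuffle=True, seq_length=3000,
                                        batch_size=64, training=True)
    print("x_batch.shape={}, y_batch.shape={}".format(x_batch.shape, y_batch.shape))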
Example No. 4
def main():
    file_name = 'input/BioASQ-task6bPhaseB-testset3.json'
    file_name = 'input/BioASQ-trainingDataset6b.json'
    file_name = 'input/BioASQ-trainingDataset5b.json'
    file_name = 'input/phaseB_5b_05.json'
    save_model_file_name = 'weights_2'
    ranker = SVMRank(save_model_file_name)
    data = DataLoader(file_name)
    data.load_ner_entities()
    ans_file = 'output/factoid_list_%s.json' % data.name

    questions = data.get_questions_of_type(C.FACTOID_TYPE)
    for i, question in enumerate(tqdm(questions)):
        ranked_sentences = question.ranked_sentences()
        X, candidates = get_only_features(question, ranked_sentences)
        top_answers = ranker.classify_from_feed(X, candidates, i)
        question.exact_answer = [[answer] for answer in top_answers[:5]]
        # question.exact_answer = [answer for answer in top_answers]
        # print question.exact_answer_ref
        # print '\n'
        # print top5
        # print '\n'
        # print '\n\n\n'
    questions = data.get_questions_of_type(C.LIST_TYPE)
    for i, question in enumerate(tqdm(questions)):
        ranked_sentences = question.ranked_sentences()
        X, candidates = get_only_features(question, ranked_sentences)
        top_answers = ranker.classify_from_feed(X, candidates, i)
        question.exact_answer = [[answer] for answer in top_answers[:10]]

    data.save_factoid_list_answers(ans_file)
Example No. 5
def get_data_loaders(data_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0
    for phase in ["train", "test"]:
        train = phase == "train"
        X = FileSourceDataset(
            RawAudioDataSource(data_root,
                               speaker_id=speaker_id,
                               train=train,
                               test_size=hparams.test_size,
                               test_num_samples=hparams.test_num_samples,
                               random_state=hparams.random_state))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(data_root,
                                  speaker_id=speaker_id,
                                  train=train,
                                  test_size=hparams.test_size,
                                  test_num_samples=hparams.test_num_samples,
                                  random_state=hparams.random_state))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = DataLoader(dataset,
                                 batch_size=hparams.batch_size,
                                 num_workers=hparams.num_workers,
                                 sampler=sampler,
                                 shuffle=shuffle,
                                 collate_fn=collate_fn,
                                 pin_memory=hparams.pin_memory)

        speaker_ids = {}
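        # Tally how many samples carry each global-conditioning id g, as a
        # quick speaker-distribution sanity check before training.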
        for idx, (x, c, g) in enumerate(dataset):
            if g is not None:
                try:
                    speaker_ids[g] += 1
                except KeyError:
                    speaker_ids[g] = 1
        if len(speaker_ids) > 0:
            print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
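

# Usage sketch (assumes hparams and the dataset sources above are configured;
# the data root is a placeholder, not from the original snippet):
# loaders = get_data_loaders('./processed_data', speaker_id=None)
# train_loader, test_loader = loaders['train'], loaders['test']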
Example No. 6
    def _build_data(self, data_dir='train_dir', num_classes=10, mode='train'):
        loader = DataLoader(data_dir=data_dir, num_classes=num_classes,
                            mode=mode, height=self.height, width=self.width)

        dataset = tf.data.Dataset.from_generator(generator=loader.generator,
                                                 output_types=(tf.float32,
                                                               tf.int32),
                                                 output_shapes=(tf.TensorShape([self.height, self.width, 3]),
                                                                tf.TensorShape([self.num_classes])))
        return dataset
Example No. 7
def _reading_data():
    print(config.USER)

    # step2 the way to load_data
    # load data contains :
    # the way to load data
    # the way to preprocess with data
    # doing some special data cleaning process
    trainFilepath = os.path.join(os.getcwd(), "data", config.FILENAME)
    trainDataLoader = DataLoader(trainFilepath)
    train_data = trainDataLoader.load_data(useSpark=False, interactive=False)

    train_data.save_data(os.getcwd())
Example No. 8
def main():
    ranker = SVMRank()
    file_name = 'input/BioASQ-trainingDataset6b.json'
    data = DataLoader(file_name)
    data.load_ner_entities()
    questions = data.get_questions_of_type(C.FACTOID_TYPE)[:419]

    for i, question in enumerate(questions):
        ranked_sentences = question.ranked_sentences()
        X, y = get_features(question, ranked_sentences)
        ranker.feed(X, y, i)

    ranker.train_from_feed()
    ranker.save('weights_2')
Example No. 9
def test_real_dataset(create_obj_func,
                      src_name=None,
                      trg_name=None,
                      show=False,
                      block_figure_on_end=False):
    print('Running {} ...'.format(os.path.basename(__file__)))

    if src_name is None:
        if len(sys.argv) > 2:
            src_name = sys.argv[2]
        else:
            raise Exception('Source dataset not specified')
    if trg_name is None:
        if len(sys.argv) > 3:
            trg_name = sys.argv[3]
        else:
            raise Exception('Target dataset not specified')

    np.random.seed(random_seed())
    tf.set_random_seed(random_seed())
    tf.reset_default_graph()

    print("========== Test on real data ==========")
    users_params = dict()
    users_params = parse_arguments(users_params)
    data_format = 'mat'

    if 'format' in users_params:
        data_format, users_params = extract_param('format', data_format,
                                                  users_params)

    data_loader = DataLoader(src_domain=src_name,
                             trg_domain=trg_name,
                             data_path=data_dir(),
                             data_format=data_format,
                             cast_data=users_params['cast_data'])

    assert users_params['batch_size'] % data_loader.num_src_domain == 0
    print('users_params:', users_params)

    learner = create_obj_func(users_params)
    learner.dim_src = data_loader.data_shape
    learner.dim_trg = data_loader.data_shape

    learner.x_trg_test = data_loader.trg_test[0][0]
    learner.y_trg_test = data_loader.trg_test[0][1]
    learner._init(data_loader)
    learner._build_model()
    learner._fit_loop()
Example No. 10
    def test(self,
             data_dir='test_b',
             model_dir=None,
             output_dir=None,
             threshold=0.5):
        print("testing starts.")

        loader = DataLoader(data_dir=data_dir,
                            mode='test',
                            height=self.height,
                            width=self.width,
                            label_value=self.label_values)
        testset = tf.data.Dataset.from_generator(
            generator=loader.generator,
            output_types=(tf.string, tf.int32, tf.float32),
            output_shapes=(tf.TensorShape([]), tf.TensorShape([2]),
                           tf.TensorShape([self.height, self.width, 3])))

        testset = testset.batch(1)
        testset = testset.prefetch(10)
        test_init = self.it.make_initializer(testset)

        saver = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            saver.restore(sess, model_dir)
            sess.run(test_init)
            queue = multiprocessing.Queue(maxsize=30)
            writer_process = multiprocessing.Process(
                target=writer,
                args=[output_dir, self.label_values, queue, 'stop'])
            writer_process.start()
            print('writing predictions...')
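            # Stream predictions to the writer process; the ('stop', ...) tuple
            # is a sentinel that tells it to finish once the dataset is exhausted.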
            try:
                while True:
                    img, path, size, output_image = sess.run(
                        [self.img, self.path, self.size, self.logits])
                    queue.put(('continue', path, size, img, output_image))
            except tf.errors.OutOfRangeError:
                queue.put(('stop', None, None, None, None))

        print('testing finished.')
Example No. 11
def prepare_data_loader_train_10_splits(texture_train_data_set_path,
                                        texture_train_label_set_path,
                                        texture_val_data_set_path,
                                        texture_val_label_set_path,
                                        texture_batch_size, num_workers,
                                        device):
    data_loader_list = []
    for i in range(10):
        idx = i + 1
        print("Split: {0}".format(idx))
        # Format the per-split paths into new variables; reassigning the
        # template arguments would drop the '{0}' placeholder after split 1.
        train_data_path = texture_train_data_set_path.format(idx)
        train_label_path = texture_train_label_set_path.format(idx)
        val_data_path = texture_val_data_set_path.format(idx)
        val_label_path = texture_val_label_set_path.format(idx)

        dL = DataLoader()
        texture_train_set, train_set_size = dL.get_tensor_set(
            train_data_path, train_label_path, device)
        texture_val_set, val_set_size = dL.get_tensor_set(
            val_data_path, val_label_path, device)
        print("Train set size: {0}".format(train_set_size))
        print("Val set size: {0}".format(val_set_size))

        texture_train_data_loader = torch.utils.data.DataLoader(
            texture_train_set,
            batch_size=texture_batch_size,
            shuffle=True,
            num_workers=num_workers)
        texture_val_data_loader = torch.utils.data.DataLoader(texture_val_set,
                                                              num_workers=1,
                                                              shuffle=False,
                                                              pin_memory=True)

        data_loader_dict = {
            "train": texture_train_data_loader,
            "val": texture_val_data_loader
        }
        data_loader_list.append(data_loader_dict)

    return data_loader_list
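

# Usage sketch (hypothetical path templates, not from the original snippet;
# '{0}' is replaced by the split index 1..10):
# loaders = prepare_data_loader_train_10_splits(
#     'data/train_data_split_{0}.pt', 'data/train_label_split_{0}.pt',
#     'data/val_data_split_{0}.pt', 'data/val_label_split_{0}.pt',
#     texture_batch_size=32, num_workers=4, device='cuda')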
Example No. 12
def prepare_data_loader_test_10_splits(texture_test_data_set_path,
                                       texture_test_label_set_path, device):
    data_loader_list = []
    for i in range(10):
        idx = i + 1
        print("Split: {0}".format(idx))
        # Format the per-split paths into new variables to keep the '{0}'
        # placeholder intact for later splits.
        test_data_path = texture_test_data_set_path.format(idx)
        test_label_path = texture_test_label_set_path.format(idx)

        dL = DataLoader()
        texture_test_set, test_set_size = dL.get_tensor_set(
            test_data_path, test_label_path, device)
        print("Test set size: {0}".format(test_set_size))

        test_data_loader = torch.utils.data.DataLoader(texture_test_set,
                                                       num_workers=1,
                                                       shuffle=False,
                                                       pin_memory=True)

        data_loader_list.append(test_data_loader)

    return data_loader_list
Example No. 13
    def test(self, data_dir='test', model_dir=None, output_dir='result', batch_size=10):
        print("testing starts.")

        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        # load test data
        loader = DataLoader(data_dir=data_dir, num_classes=self.num_classes,
                            mode='test', height=self.height, width=self.width)

        testset = tf.data.Dataset.from_generator(generator=loader.generator,
                                                 output_types=(tf.string,
                                                               tf.float32),
                                                 output_shapes=(tf.TensorShape([]),
                                                                tf.TensorShape([self.height, self.width, 3])))
        testset = testset.shuffle(100)
        testset = testset.batch(batch_size)
        testset = testset.prefetch(20)
        test_init = self.it.make_initializer(testset)

        saver = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            saver.restore(sess, model_dir)
            sess.run(test_init)
            queue = multiprocessing.Queue(maxsize=30)
            writer_process = multiprocessing.Process(target=writer, args=[output_dir, batch_size, queue, 'stop'])
            writer_process.start()
            print('writing predictions...')
            try:
                while True:
                    img_name, pre_label = sess.run([self.img_name, self.prediction_value])
                    queue.put(('continue', img_name, pre_label))
            except tf.errors.OutOfRangeError:
                queue.put(('stop', None, None))

        print('testing finished.')
Example No. 14
def integrated_benchmark(dataset_path):
    """
    Variables:
    Dataset size: number of columns
    Dataset distribution: column length distribution
    threshold
    query column
    """
    loader = DataLoader("")
    dataset = loader.load_dataset(dataset_path)

    bf_lists, lsh_list = init(dataset)
    print("""
Benchmark 1 
Goal: Measure scalability of different methods
Variable: 
    the size of datasets. size: 400, 600, 800, 1000
Fix:
    threshold = 0.6
    query column = median col
Output:
    Runtime
    precision, recall, f1
""")
    labels = ["bloom filter", "lsh", "lsh ensemble", "lsh + bloom filter"]
    time_for_each_size = np.empty((len(dataset), len(labels)), dtype=float)
    x_axis = np.empty(len(dataset), dtype=int)

    for i, cols in enumerate(dataset):
        candidate_index = len(cols) // 2  # median col
        brute_force_result = brute_force(candidate_index, cols, 0.6)
        print("brute_force finished\n")
        time = benchmark(cols, candidate_index, 0.6, bf_lists[i], lsh_list[i],
                         brute_force_result,
                         "Benchmark-1-cols-size-" + str(len(cols)))
        time_for_each_size[i] = time
        x_axis[i] = len(cols)

    fig, ax = plt.subplots()
    for i in range(len(labels)):
        ax.plot(x_axis, time_for_each_size[:, i], 'o-', label=labels[i])
    ax.legend()
    ax.set_title("Benchmark-1-cols-size")
    ax.set_xticks(x_axis)
    ax.set_xlabel("size")
    ax.set_ylabel("time(s)")
    fig.tight_layout()
    # plt.show()
    fig.savefig("./bench_results/Benchmark-1-cols-size")

    print("""
Benchmark 2
Goal: Measure the effect of threshold
Variable:
   threshold: 0.1 0.3 0.5 0.7 0.9
Fix:
    dataset size = median col
Output
    Runtime
    precision, recall, f1
""")
    threshold_list = [0.1, 0.3, 0.5, 0.7, 0.9]
    time_for_each_threshold = np.empty((len(threshold_list), len(labels)),
                                       dtype=float)
    x_axis = np.empty(len(threshold_list), dtype=float)

    cols_index = len(dataset) // 2
    cols = dataset[cols_index]
    for i in range(len(threshold_list)):
        threshold = threshold_list[i]
        candidate_index = len(cols) // 2  # median col
        brute_force_result = brute_force(candidate_index, cols, threshold)
        print("brute_force finished\n")
        time = benchmark(
            cols, candidate_index, threshold, bf_lists[cols_index],
            lsh_list[cols_index], brute_force_result,
            "Benchmark-2-threshold-" + str(int(threshold * 100)) + "%")
        time_for_each_threshold[i] = time
        x_axis[i] = threshold

    fig, ax = plt.subplots()
    for i in range(len(labels)):
        ax.plot(x_axis, time_for_each_threshold[:, i], 'o-', label=labels[i])
    ax.legend()
    ax.set_title("Benchmark-2-threshold")
    ax.set_xticks(x_axis)
    ax.set_xlabel("threshold")
    ax.set_ylabel("time(s)")
    fig.tight_layout()
    # plt.show()
    fig.savefig("./bench_results/Benchmark-2-threshold")

    print("""
Benchmark 3
Goal: Measure the effect of query column
Variable:
    query column = small col, median col, large col
Fix:
    dataset size = median size cols
    threshold = 0.6
Output
    Runtime
    precision, recall, f1
""")
    cols_index = len(dataset) // 2
    cols = dataset[cols_index]
    label = ["small-col", "median-col", "large-col"]
    for i, candidate_index in enumerate([0, len(cols) // 2, len(cols) - 1]):
        brute_force_result = brute_force(candidate_index, cols, 0.6)
        benchmark(cols, candidate_index, 0.6, bf_lists[cols_index],
                  lsh_list[cols_index], brute_force_result,
                  "Benchmark-3-candidate-" + label[i])
Example No. 15
from dataLoader import DataLoader

loader = DataLoader()
loader.loadAll()

fileobj = open("csv/subjectAreaDump.csv", 'w')

for id, paper in loader.papers.items():
    if paper.accepted:
        fileobj.write("%s|%d|%s" %
                      (paper.primarySpecificSubjectArea, id, paper.title))
        for subj in paper.specificSubjectAreas:
            fileobj.write("|" + subj)
        fileobj.write("\n")
fileobj.close()
Example No. 16
from pathlib import Path

from flask import Flask, render_template, make_response, jsonify, request, send_from_directory

import configurations
from analyzeResults import AnalyzeResults
from dataLoader import DataLoader
from hitCounter import HitCounter
import numpy as np

from vistDataset import VistDataset
import base64
import time

app = Flask(__name__)
data_loader = DataLoader(root_path=configurations.root_data)
hit_counter = HitCounter(root_path=configurations.root_data,
                         story_max_hits=configurations.max_story_submit)
vist_dataset = VistDataset(root_path=configurations.root_data,
                           hit_counter=hit_counter,
                           samples_num=configurations.samples)
analyze_results = AnalyzeResults(data_root=configurations.root_data,
                                 data_loader=data_loader,
                                 vist_dataset=vist_dataset)


@app.route('/api/images/<image_id>', methods=['GET'])
def serve_image(image_id):
    print("Requested image file: {}".format(image_id))
    image_path = data_loader._find_file(image_id)
    if image_path is None:
Example No. 17

# Called via register_forward_hook (like an event handler): records the needed intermediate-layer output while the ResNet forward pass runs.
def hook_feature(module, input, output):
    features_blobs.append(output.data.cpu().numpy())


# Name of the intermediate layer to capture; it is declared in the __init__ of resnet_for_vis.
finalconv_name = 'layer3'
# print(model._modules)
# model._modules.layer3.register_forward_hook(hook_feature)
# model._modules['module'].layer3.register_forward_hook(hook_feature)
model._modules.get(finalconv_name).register_forward_hook(hook_feature)

# Data Loader
loader = DataLoader(args_dataset, batch_size=args_batch_size)
dataloaders, dataset_sizes = loader.load_data()
labels_name = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')
colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']


def visulize(train_data, labels):
    print('PCA Embedding')
    tsne = TSNE(n_components=2)
    embed_feature = []
    batch_size = 10000
    slices = 0
    while slices + batch_size <= len(train_data):
        print('processing %d/%d' % (slices, len(train_data)))
        tsne.fit_transform(train_data[slices:slices + batch_size])
Example No. 18
def train(traindirs,
          data_dir,
          upsample,
          params_dict,
          checkpoint_dir,
          logger,
          validation_gen=None):
    if logger is not None:
        logger.info('Training folders are {}'.format(traindirs))
    else:
        print('Training folders are {}'.format(traindirs))
    training_generator = DataLoader(data_dir,
                                    traindirs,
                                    32,
                                    width_template=params_dict['width'],
                                    upsample=upsample)
    earl = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    # Design model
    model = create_model(params_dict['width'] + 1,
                         params_dict['h1'],
                         params_dict['h2'],
                         params_dict['h3'],
                         embed_size=params_dict['embed_size'],
                         drop_out_rate=params_dict['dropout_rate'],
                         use_batch_norm=params_dict['use_batchnorm'])
    # Train local Net
    if validation_gen is None:
        model.fit_generator(generator=training_generator,
                            use_multiprocessing=True,
                            epochs=params_dict['n_epochs'],
                            workers=4,
                            max_queue_size=20,
                            callbacks=[earl])
    else:
        model.fit_generator(generator=training_generator,
                            validation_data=validation_gen,
                            use_multiprocessing=True,
                            epochs=params_dict['n_epochs'],
                            workers=4,
                            max_queue_size=20)
    if logger is not None:
        logger.info('Local Net trained')
        logger.info('Stopped epoch {}'.format(earl.stopped_epoch))
    else:
        print('Local Net trained')
        print('Stopped epoch {}'.format(earl.stopped_epoch))
    # Train the temporal model
    for folder in traindirs:
        if logger is not None:
            logger.info('Getting temporal training set for {}'.format(folder))
        else:
            print('Getting temporal training set for {}'.format(folder))
        img_dir = os.path.join(data_dir, folder, 'Data')
        annotation_dir = os.path.join(data_dir, folder, 'Annotation')
        list_label_files = [
            os.path.join(annotation_dir, dI)
            for dI in os.listdir(annotation_dir)
            if (dI.endswith('txt') and not dI.startswith('.'))
        ]
        try:
            img_init = np.asarray(
                Image.open(os.path.join(img_dir, "{:04d}.png".format(1))))
        except FileNotFoundError:
            img_init = np.asarray(
                Image.open(os.path.join(img_dir, "{:05d}.png".format(1))))
        list_imgs = [
            os.path.join(img_dir, dI) for dI in os.listdir(img_dir)
            if (dI.endswith('png') and not dI.startswith('.'))
        ]
        n_obs = len(list_imgs)
        X0, X1, X2, X3, X4, X5 = [], [], [], [], [], []
        Y0, Y1, Y2, Y3, Y4, Y5 = [], [], [], [], [], []
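        # Build lagged coordinate series: X0..X4 / Y0..Y4 hold five consecutive
        # past positions, X5 / Y5 the position to predict one step ahead.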
        for label in list_label_files:
            print(label)
            # list_label_files already holds full paths, so read them directly
            df = pd.read_csv(label,
                             header=None,
                             names=['id', 'x', 'y'],
                             sep=r'\s+')
            c1_interpolate = np.interp(np.arange(1, n_obs + 1), df.id.values,
                                       df.x.values)
            c2_interpolate = np.interp(np.arange(1, n_obs + 1), df.id.values,
                                       df.y.values)
            n = len(c1_interpolate)
            X0 = np.append(X0, c1_interpolate)
            X1 = np.append(X1, c1_interpolate[1:n])
            X2 = np.append(X2, c1_interpolate[2:n])
            X3 = np.append(X3, c1_interpolate[3:n])
            X4 = np.append(X4, c1_interpolate[4:n])
            X5 = np.append(X5, c1_interpolate[5:n])
            Y0 = np.append(Y0, c2_interpolate)
            Y1 = np.append(Y1, c2_interpolate[1:n])
            Y2 = np.append(Y2, c2_interpolate[2:n])
            Y3 = np.append(Y3, c2_interpolate[3:n])
            Y4 = np.append(Y4, c2_interpolate[4:n])
            Y5 = np.append(Y5, c2_interpolate[5:n])
    l = len(X5)
    fullX = np.transpose(
        np.vstack([X0[0:l], X1[0:l], X2[0:l], X3[0:l], X4[0:l]]))
    fullY = np.transpose(
        np.vstack([Y0[0:l], Y1[0:l], Y2[0:l], Y3[0:l], Y4[0:l]]))
    c1_label = X5
    c2_label = Y5
    est_c1 = RidgeCV()
    est_c2 = RidgeCV()
    scores_c1 = cross_validate(est_c1,
                               fullX,
                               c1_label,
                               cv=5,
                               scoring=('r2', 'neg_mean_squared_error'))
    scores_c2 = cross_validate(est_c2,
                               fullY,
                               c2_label,
                               cv=5,
                               scoring=('r2', 'neg_mean_squared_error'))
    if logger is not None:
        logger.info('c1')
        logger.info(scores_c1['test_neg_mean_squared_error'])
        logger.info('c2')
        logger.info(scores_c2['test_neg_mean_squared_error'])
    else:
        print('c1')
        print(scores_c1['test_neg_mean_squared_error'])
        print('c2')
        print(scores_c2['test_neg_mean_squared_error'])
    # Fit on the whole training set
    est_c1.fit(fullX, c1_label)
    est_c2.fit(fullY, c2_label)

    # Save the local Net and the temporal model
    if logger is not None:
        logger.info('Saving trained models to {}'.format(checkpoint_dir))
    else:
        print('Saving trained models to {}'.format(checkpoint_dir))
    model.save_weights(os.path.join(checkpoint_dir, 'model.h5'))
    dump(est_c1, os.path.join(checkpoint_dir, 'est_c1.joblib'))
    dump(est_c2, os.path.join(checkpoint_dir, 'est_c2.joblib'))
    return model, est_c1, est_c2
Example No. 19
from dataLoader import DataLoader
from crfBrandDetector import CrfBrandDetector

if __name__ == '__main__':
    print('Preparing Data...')
    df = DataLoader().get_data()
    print('Building Model...')
    crf_model = CrfBrandDetector()
    print('Fitting...')
    x_train, x_test, y_train, y_test = crf_model.train_test_split(df)
    crf_model.fit(x_train, y_train)
    crf_model.report_classification(x_test, y_test)
    print('Accuracy: {}'.format(crf_model.evaluate(x_test, y_test)))
    pred = crf_model.predict(x_test)
    pred.to_csv('./pred.csv', index=False)
Example No. 20
def run_global_cv(fold_iterator,
                  data_dir,
                  checkpoint_dir,
                  logger,
                  params_dict,
                  upsample=True):
    eucl_dist_per_fold = []
    pixel_dist_per_fold = []
    for traindirs, testdirs in fold_iterator:
        # TRAIN LOCAL PREDICTION MODEL
        # Generators
        logger.info('############ FOLD #############')
        logger.info('Training folders are {}'.format(traindirs))
        training_generator = DataLoader(data_dir,
                                        traindirs,
                                        32,
                                        width_template=params_dict['width'],
                                        upsample=upsample)
        validation_generator = DataLoader(data_dir,
                                          testdirs,
                                          32,
                                          width_template=params_dict['width'],
                                          type='val',
                                          upsample=upsample)
        model, est_c1, est_c2 = train(traindirs, data_dir, upsample,
                                      params_dict, checkpoint_dir, logger,
                                      validation_generator)
        # PREDICT WITH GLOBAL MATCHING + LOCAL MODEL ON TEST SET
        curr_fold_dist = []
        curr_fold_pix = []
        for k, testfolder in enumerate(testdirs):
            res_x, res_y = training_generator.resolution_df.loc[
                training_generator.resolution_df['scan'] == testfolder,
                ['res_x', 'res_y']].values[0]
            annotation_dir = os.path.join(data_dir, testfolder, 'Annotation')
            img_dir = os.path.join(data_dir, testfolder, 'Data')
            list_imgs = [
                os.path.join(img_dir, dI) for dI in os.listdir(img_dir)
                if (dI.endswith('png') and not dI.startswith('.'))
            ]

            list_label_files = [
                dI for dI in os.listdir(annotation_dir)
                if (dI.endswith('txt') and not dI.startswith('.'))
            ]
            print(list_label_files)
            try:
                img_init = np.asarray(
                    Image.open(os.path.join(img_dir, "{:04d}.png".format(1))))
            except FileNotFoundError:
                img_init = np.asarray(
                    Image.open(os.path.join(img_dir, "{:05d}.png".format(1))))
            img_init = prepare_input_img(img_init, res_x, res_y, upsample)

            for j, label_file in enumerate(list_label_files):
                print(label_file)
                img_current = img_init
                df = pd.read_csv(os.path.join(annotation_dir, label_file),
                                 header=None,
                                 names=['id', 'x', 'y'],
                                 sep=r'\s+')
                if upsample:
                    df['x_newres'] = df['x'] * res_x / 0.4
                    df['y_newres'] = df['y'] * res_y / 0.4
                else:
                    df['x_newres'] = df['x']
                    df['y_newres'] = df['y']
                c1_init, c2_init = df.loc[
                    df['id'] == 1, ['x_newres', 'y_newres']].values[0, :]
                a, b = np.nonzero(img_init[:, 20:(len(img_init) - 20)])
                if upsample:
                    list_centers = [[
                        c1_init * 0.4 / res_x, c2_init * 0.4 / res_y
                    ]]
                else:
                    list_centers = [[c1_init, c2_init]]
                xax, yax = find_template_pixel(c1_init, c2_init,
                                               params_dict['width'],
                                               img_init.shape[1],
                                               img_init.shape[0])
                template_init = img_init[np.ravel(yax),
                                         np.ravel(xax)].reshape(
                                             1, len(yax), len(xax))
                c1, c2 = c1_init, c2_init
                stop_temporal = False
                k = 0
                for i in range(2, len(list_imgs) + 1):
                    if i % 100 == 0:
                        print(i)
                    img_prev = img_current
                    try:
                        img_current = np.asarray(
                            Image.open(
                                os.path.join(img_dir, "{:04d}.png".format(i))))
                    except FileNotFoundError:
                        img_current = np.asarray(
                            Image.open(
                                os.path.join(img_dir, "{:05d}.png".format(i))))
                    img_current = prepare_input_img(img_current, res_x, res_y,
                                                    upsample)
                    if i > 5:
                        tmp = list_centers[-10:].reshape(-1, 2)
                        assert tmp.shape[0] == 5
                        c1, c2, stop_temporal, k = get_next_center(
                            k, stop_temporal, c1, c2, img_prev, img_current,
                            params_dict, model, template_init, c1_init,
                            c2_init, logger, est_c1, est_c2,
                            tmp[:, 0], tmp[:, 1])
                    else:
                        c1, c2, stop_temporal, k = get_next_center(
                            k, stop_temporal, c1, c2, img_prev, img_current,
                            params_dict, model, template_init, c1_init,
                            c2_init, logger)
                    # project back in init coords
                    if upsample:
                        c1_orig_coords = c1 * 0.4 / res_x
                        c2_orig_coords = c2 * 0.4 / res_y
                    else:
                        c1_orig_coords = c1
                        c2_orig_coords = c2
                    list_centers = np.append(list_centers,
                                             [c1_orig_coords, c2_orig_coords])
                    if i in df.id.values:
                        true = df.loc[df['id'] == i, ['x', 'y']].values[0]
                        diff_x = np.abs(c1_orig_coords - true[0])
                        diff_y = np.abs(c2_orig_coords - true[1])
                        if upsample:
                            dist = np.sqrt(diff_x**2 + diff_y**2)
                            logger.info(
                                'ID {} : euclidean dist diff {}'.format(
                                    i, dist * 0.4))
                        else:
                            dist = np.sqrt((res_x * diff_x)**2 +
                                           (diff_y * res_y)**2)
                            logger.info(
                                'ID {} : euclidean dist diff {}'.format(
                                    i, dist))
                        if dist > 10:
                            # logger.info(
                            #     'Bad dist - maxNCC was {}'.format(maxNCC))
                            logger.info('True {},{}'.format(true[0], true[1]))
                            logger.info('Pred {},{}'.format(
                                c1_orig_coords, c2_orig_coords))
                idx = df.id.values.astype(int)
                list_centers = list_centers.reshape(-1, 2)
                df_preds = list_centers[idx - 1]
                df_true = df[['x', 'y']].values
                absolute_diff = np.mean(np.abs(df_preds - df_true))
                pix_dist = np.mean(
                    np.sqrt((df_preds[:, 0] - df_true[:, 0])**2 +
                            (df_preds[:, 1] - df_true[:, 1])**2))
                dist = compute_euclidean_distance(df_preds, df_true)
                curr_fold_dist.append(dist)
                curr_fold_pix.append(pix_dist)
                logger.info(
                    '======== Test Feature {} ======='.format(label_file))
                logger.info('Pixel distance is {}'.format(pix_dist))
                logger.info('Euclidean distance in mm {}'.format(dist))
                logger.info('Mean absolute difference in pixels {}'.format(
                    absolute_diff))
                pred_df = pd.DataFrame()
                pred_df['idx'] = range(1, len(list_centers) + 1)
                pred_df['c1'] = list_centers[:, 0]
                pred_df['c2'] = list_centers[:, 1]
                pred_df.to_csv(os.path.join(checkpoint_dir,
                                            '{}.txt'.format(label_file)),
                               header=False,
                               index=False)
        eucl_dist_per_fold = np.append(eucl_dist_per_fold,
                                       np.mean(curr_fold_dist))
        pixel_dist_per_fold = np.append(pixel_dist_per_fold,
                                        np.mean(curr_fold_pix))
        logger.info('EUCLIDEAN DISTANCE CURRENT FOLD {}'.format(
            eucl_dist_per_fold[-1]))
        logger.info('PIXEL DISTANCE CURRENT FOLD {}'.format(
            pixel_dist_per_fold[-1]))
    logger.info('================= END RESULTS =================')
    logger.info('Mean euclidean distance in mm {} (std {})'.format(
        np.mean(eucl_dist_per_fold), np.std(eucl_dist_per_fold)))
Example No. 21

args = get_args()
setup_seed(args.seed)
device = args.device
checkpoint_dir = args.checkpoint_dir
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

data = np.load(args.dataset_path)

model = net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

dataLoader = DataLoader(data['X'],
                        data['Y'],
                        train_val_split=[0.7, 0.15, 0.15],
                        batch_size=args.batch_size,
                        device=device)
loader_train, loader_val, loader_test = dataLoader.get_loader()
trainer = Trainer(model, optimizer)
loss_fn = torch.nn.functional.cross_entropy
trainer.train_with_val(loss_fn,
                       loader_train=loader_train,
                       loader_val=loader_val,
                       epochs=args.epochs,
                       save_path=os.path.join(checkpoint_dir, 'model.pth'),
                       save_best_only=True,
                       monitor_on='acc')
trainer.test(loader_test, loss_fn, info='Test ')
Example No. 22
def learn_test(expr):
    loader = DataLoader()
    dataset = loader.loadData(
        dataset=expr)  # dataset options: electricity, traffic, BLE
    pastObserve = pastObserves[expr]
    o_columns = dataset.columns
    predCol = o_columns[-1]
    lenAll = len(dataset)
    lenx = int(lenAll * .75)

    test_orig = []
    mean_errors = []
    error_stds = []
    all_errors = []

    all_predictions = []

    values = dataset.values
    origData = values

    # normalize
    parameters = dataset.values.shape[1]
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)
    reframed = series_to_supervised(scaled, pastObserve, 1)

    # drop columns we don't want to predict
    droppings = []
    for i in range(1, pastObserve + 1):
        x = [a for a in range(parameters * (i - 1), parameters * i - 1)]
        droppings.extend(x)
    reframed.drop(reframed.columns[droppings], axis=1, inplace=True)
    valuesTrans = reframed.values
    test = valuesTrans

    # split into input and outputs
    train_X_all, train_y_all = valuesTrans[:, :-1], valuesTrans[:, -1]
    test_X, test_y = test[:, :-1], test[:, -1]

    trainingModels = []
    for i in range(modelsNo):
        deepModel = create_model(parameters, pastObserve)
        trainingModels.append(deepModel)

    dy = 0
    sparsity = 3
    for model in trainingModels:
        # fit network
        partsLen = int(len(train_X_all) / sparsity) * sparsity
        a = np.arange(partsLen)
        a = a.reshape(sparsity, int(partsLen / sparsity))
        ixs = []
        # train each model on just one of the contiguous chunks (a 1/sparsity fraction of the data)
        for t in range(sparsity):
            if (t == dy):
                ixs.append(a[t])
        # ixs.append(a[t+1]) # for considering 40% sparsity
        # ixs.append(a[t+2]) # for considering 60% sparsity
        ixs = np.array(ixs)
        train_ixs = ixs.flatten()
        train_X, train_y = train_X_all[train_ixs], train_y_all[train_ixs]
        model.fit(train_X, train_y, epochs=20, batch_size=20, verbose=2)
        dy += 1
        # calculate predictions
        predictions = model.predict(test_X)
        predictions = predictions.reshape((len(predictions), 1))

        pads = np.zeros(len(test_y) * (parameters - 1))
        pads = pads.reshape(len(test_y), parameters - 1)

        inv_yhat = concatenate((pads, predictions), axis=1)
        inv_yhat = scaler.inverse_transform(inv_yhat)
        inv_yhat = inv_yhat[:, -1]
        inv_yhat = np.around(inv_yhat, decimals=2)

        # invert scaling for actual
        test_y = test_y.reshape((len(test_y), 1))
        inv_test = concatenate((test_X[:, pastObserve:], test_y), axis=1)
        test_orig = scaler.inverse_transform(inv_test)

        origY = test_orig[:, -1]
        meanErr, std, errors = report_errors(origY, inv_yhat, errorType[expr])

        mean_errors.append(meanErr)
        error_stds.append(std)

        all_errors.append(errors)
        all_predictions.append(inv_yhat)

        print(min(origY), max(origY))
        print(min(inv_yhat), max(inv_yhat))
        print('Test Mean Error: %.3f ' % meanErr)

    p_cols = []
    df = DataFrame(test_orig, columns=o_columns)
    for k in range(len(all_predictions)):
        colName = 'predict_' + str(k + 1)
        p_cols.append(colName)
        df[colName] = all_predictions[k]
    for k in range(len(all_predictions)):
        errName = 'error_' + str(k + 1)
        df[errName] = all_errors[k]

    print(errorType[expr])
    print(mean_errors)

    if not os.path.exists(models_output_folder):
        os.makedirs(models_output_folder)

    outDetails_filename = models_output_folder + 'predictions_details_%s.csv' % expr
    out_filename = models_output_folder + 'predictions_output_%s.csv' % expr

    df.to_csv(outDetails_filename, index=False)

    models_prediction_cols = p_cols
    models_prediction_cols.append(predCol)
    df_modelOutput = df[models_prediction_cols]
    df_modelOutput.to_csv(out_filename, index=False)
Example No. 23
def train():
    parser = OptionParser()
    parser.add_option("--train_inputDir",
                      dest="train_inputDir",
                      help="Input directory",
                      metavar="DIRECTORY")
    parser.add_option("--train_inputFile",
                      dest="train_inputFile",
                      help="Input file",
                      metavar="FILE")
    parser.add_option("--train_type",
                      dest="train_type",
                      help="Training type, 1|2|3|4.",
                      metavar="VALUE",
                      default=2)
    parser.add_option("--particle_number",
                      dest="train_number",
                      help="Number of positive samples to train.",
                      metavar="VALUE",
                      default=-1)
    parser.add_option("--mrc_number",
                      dest="mrc_number",
                      help="Number of mrc files to be trained.",
                      metavar="VALUE",
                      default=-1)
    parser.add_option(
        "--coordinate_symbol",
        dest="coordinate_symbol",
        help="The symbol of the coordinate file, like '_manualPick'",
        metavar="STRING")
    parser.add_option("--particle_size",
                      dest="particle_size",
                      help="the size of the particle.",
                      metavar="VALUE",
                      default=-1)
    parser.add_option("--validation_ratio",
                      dest="validation_ratio",
                      help="the ratio.",
                      metavar="VALUE",
                      default=0.1)
    parser.add_option(
        "--model_retrain",
        action="store_true",
        dest="model_retrain",
        help="train the model using the pre-trained model for parameter initialization.",
        default=False)
    parser.add_option("--model_load_file",
                      dest="model_load_file",
                      help="pre-trained model",
                      metavar="FILE")
    parser.add_option("--model_save_dir",
                      dest="model_save_dir",
                      help="save the model to this directory",
                      metavar="DIRECTORY",
                      default="../trained_model")
    parser.add_option("--model_save_file",
                      dest="model_save_file",
                      help="save the model to file",
                      metavar="FILE")
    parser.add_option("--pos_list",
                      dest="pos_list",
                      help="",
                      metavar="VALUE",
                      default="")
    parser.add_option("--neg_list",
                      dest="neg_list",
                      help="",
                      metavar="VALUE",
                      default="")

    parser.add_option("--mixup",
                      dest="mixup",
                      help="",
                      metavar="VALUE",
                      default="0")
    (opt, args) = parser.parse_args()

    model_input_size = [128, 64, 64, 1]
    num_class = 2
    batch_size = model_input_size[0]
    # define input parameters
    train_type = int(opt.train_type)
    train_inputDir = opt.train_inputDir
    train_inputFile = opt.train_inputFile
    protein_number = len(train_inputFile.split(';'))
    train_number = float(opt.train_number)
    mrc_number = int(opt.mrc_number)
    dropout_rate = 0.5
    coordinate_symbol = opt.coordinate_symbol
    debug_dir = '../train_output'  # output dir
    particle_size = int(opt.particle_size)
    validation_ratio = float(opt.validation_ratio)
    # define the save model
    model_retrain = opt.model_retrain
    model_load_file = opt.model_load_file
    model_save_dir = opt.model_save_dir
    model_save_file = os.path.join(model_save_dir, opt.model_save_file)
    pos_list = opt.pos_list
    neg_list = opt.neg_list
    mixup = int(opt.mixup)
    print("MIXUP=======", mixup)
    if not os.access(model_save_dir, os.F_OK):
        os.mkdir(model_save_dir)
    if not os.access(debug_dir, os.F_OK):
        os.mkdir(debug_dir)
    dataLoader = DataLoader()

    train_number = int(train_number)
    if train_type == 1:
        # load train data from mrc file dir
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_mrcFileDir(
            train_inputDir, particle_size, model_input_size, validation_ratio,
            coordinate_symbol, mrc_number, train_number)
    elif train_type == 2:
        # load train data from numpy data struct
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_ExtractedDataFile(
            train_inputDir, train_inputFile, model_input_size,
            validation_ratio, train_number)
    elif train_type == 3:
        # load train data from prepicked results
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_PrePickedResults(
            train_inputDir, train_inputFile, particle_size, model_input_size,
            validation_ratio, train_number)
    elif train_type == 4:
        # load train data from relion .star file
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_RelionStarFile(
            train_inputFile, particle_size, model_input_size, validation_ratio,
            train_number)
    elif train_type == 5:
        # load train data from class2d .star file
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_Class2dStarFile(
            train_inputDir, train_inputFile, model_input_size,
            validation_ratio, train_number)
    elif train_type == 6:
        left = 0
        right = 50
        get_partition = lambda x, y: (x + y) / 2
        '''
        # load train data from auto_filter_class .star file
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_AutoClass2dStarFile(
                train_inputDir, train_inputFile, model_input_size, validation_ratio, train_number)
        '''
    else:
        print("ERROR: invalid value of train_type:", train_type)
    try:
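        # Referencing train_data raises NameError when no loading branch ran;
        # train_type 6 defers loading until the training loop below.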
        train_type == 6 or train_data
    except NameError:
        print("Error: in function load.loadInputTrainData.")
        return None
    else:
        print("Load training data successfully!")

    idx = 0
    good_enough = False
    while not good_enough:
        best_eval_error_rate = 100
        all_error = []
        finetune = False if train_type == 6 else False
        dropout_rate = 0.5 if train_type == 6 else dropout_rate
        deepModel = DeepModel(particle_size,
                              model_input_size,
                              num_class,
                              dropout_rate=dropout_rate,
                              finetune=finetune)
        if train_type == 6:
            deepModel.learning_rate = deepModel.learning_rate / 10.0
            deepModel.decay_steps *= 2
            if good_enough:
                partition = partition + 1
            else:
                partition = get_partition(left, right)
            print "PARTITOIN --->>>", partition
            partition = 9
            good_enough = True  #Set this=True to run while for just once!!!
            #train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_AutoClass2dStarFile(train_inputDir, train_inputFile, model_input_size, validation_ratio, train_number, partition)
            train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_AutoClass2dStarFile(
                train_inputDir, train_inputFile, model_input_size,
                validation_ratio, train_number, partition, pos_list, neg_list)
        train_data, train_label = shuffle_in_unison_inplace(
            train_data, train_label)
        print("label_shape = ", np.array(train_label).shape)
        '''
        mix_data, mix_label = [], []
        if mixup:
            mixnum = len(train_data)
            #for cnt in tqdm(range(mixnum)):
            for cnt in range(mixnum):
            #for cnt in range(mixnum):
                L = np.random.beta(0.2, 0.2)
                i1, i2 = np.random.randint(mixnum, size=2)
                if train_data[i1].shape[1] == train_data[i2].shape[1]:
                    new_data = (1-L) * train_data[i1] + L * train_data[i2]
                    new_label = (1-L) * train_label[i1][1] + L * train_label[i2][1]
                    mix_data.append(new_data)
                    mix_label.append([1.0-new_label, new_label])
        train_data = train_data + mix_data
        train_label = train_label + mix_label
        '''
        print("label_shape = ", np.array(train_label).shape)
        #eval_data, eval_label = shuffle_in_unison_inplace(eval_data, eval_label)
        bs2train = {}
        bs2eval = {}
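        # Bucket sample indices by particle width (shape[1]) so that every
        # training batch is drawn from same-shaped particles.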
        for idx, t in enumerate(train_data):
            if t.shape[1] not in bs2train.keys():
                bs2train[t.shape[1]] = [idx]
            else:
                bs2train[t.shape[1]].append(idx)
        for idx, t in enumerate(eval_data):
            if t.shape[1] not in bs2eval.keys():
                bs2eval[t.shape[1]] = [idx]
            else:
                bs2eval[t.shape[1]].append(idx)
        train_size = len(train_data)
        eval_size = len(eval_data)
        print("train size=%d, eval_size=%d" % (train_size, eval_size))
        print("batch_size=%d" % batch_size)
        print("dropout=%.2f" % dropout_rate)
        if train_size < 1000:
            print("NOTE: no enough training data!\n<Failed>! ")
            exit()
        '''
        if eval_size < model_input_size[0]: #TODO
            tile_size = model_input_size[0] // eval_size + 1
            eval_data = np.array(eval_data)
            eval_data = np.tile(eval_data, [tile_size,1,1,1])
            print ("tiled eval_data !!!!", tile_size)
        '''
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=30)

        start_time = time.time()
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.26)
        train_error = []
        valid_error = []
        eval_time = 0
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options, log_device_placement=False)) as sess:
            tf.initialize_all_variables().run()
            if model_load_file:
                print(model_load_file)
                saver.restore(sess, model_load_file)
            max_epochs = 200
            best_eval_error_rate = 100
            toleration_patience = 10
            toleration_patience_flag = 0
            eval_frequency = train_size // batch_size
            print("total_step=%d" %
                  (int(max_epochs * train_size) // batch_size))
            #fout = open('trainingcurve%d_%s_test2_block1_lr0.1.txt'%(protein_number, deepModel.arch), 'w')
            #fout = open('trainingcurve%d_%s_lr0.1.txt'%(protein_number, deepModel.arch), 'w')
            #fout = open('trainingcurve%d_resnet.txt'%protein_number, 'w')
            idx += 1
            batch_type = bs2train.keys()
            batch_type_number = len(batch_type)
            po = {}
            for k in range(batch_type_number):
                po[k] = 0
            batch_type_idx = 0
            train_error_list = []
            print(
                "==================================================================="
            )
            #for step in xrange(int(max_epochs * train_size) // batch_size):
            eval_prediction = deepModel.evaluation(eval_data,
                                                   sess,
                                                   label=eval_label)
            eval_error_rate = error_rate(eval_prediction, eval_label)
            eval_before_retrain = eval_error_rate
            print('valid error before training: %.6f%%' % eval_error_rate)
            print(
                "==================================================================="
            )
            for epoch in range(int(max_epochs)):
                start_time = time.time()
                #for s in tqdm(range(eval_frequency)):
                for s in range(eval_frequency):
                    step = epoch * eval_frequency + s
                    # get the batch training data
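                    # Cycle round-robin over the shape buckets; `po` keeps a cursor per
                    # bucket and wraps it around once the bucket is exhausted.
                    # (`offset` below is unused now that the slicing further down is
                    # commented out.)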
                    offset = (step * batch_size) % (train_size - batch_size)
                    batch_type_idx = (batch_type_idx + 1) % batch_type_number
                    batch = batch_type[batch_type_idx]
                    if po[batch_type_idx] + batch_size >= len(bs2train[batch]):
                        po[batch_type_idx] = 0
                    p = po[batch_type_idx]
                    idxs = bs2train[batch][p:(p + batch_size)]
                    batch_data = []
                    batch_label = []
                    for ix in idxs:
                        batch_data.append(train_data[ix])
                        batch_label.append(train_label[ix])
                    po[batch_type_idx] = po[batch_type_idx] + batch_size
                    #batch_data = train_data[offset:(offset+batch_size)]
                    #batch_label = train_label[offset:(offset+batch_size)]
                    '''
                    batch_data_shape = batch_data[0].shape
                    con = False
                    for bb in batch_data:
                        if bb.shape != batch_data_shape:
                            con = True
                            break
                    if con:
                        continue
                    '''
                    # online augmentation
                    #batch_data = DataLoader.preprocess_particle_online(batch_data)
                    loss_value, lr, train_prediction = deepModel.train_batch(
                        batch_data, batch_label, sess)
                    train_error_list.append(
                        error_rate(train_prediction, batch_label))

                    # do the computation
                    #if step % eval_frequency == 0:
                    #if step % 50 == 0:
                #TODO:display after each epoch
                stop_time = time.time() - start_time
                eval_prediction = deepModel.evaluation(eval_data,
                                                       sess,
                                                       label=eval_label)
                eval_error_rate = error_rate(eval_prediction, eval_label)
                #best_eval_error_rate = min(best_eval_error_rate, eval_error_rate)
                #print('>> epoch: %.2f , %.2f ms' % (step * batch_size /train_size, 1000 * stop_time / eval_frequency))
                train_error_mean = np.mean(train_error_list)
                print(
                    '>> epoch: %d, train loss: %.2f, lr: %.6f, toleration:%d, train error: %.2f%%, valid error: %.2f%%'
                    % (epoch, loss_value, lr, toleration_patience,
                       train_error_mean, eval_error_rate))
                #print >>fout, step, train_error_mean, eval_error_rate
                train_error.append(train_error_mean)
                valid_error.append(eval_error_rate)
                eval_time += 1
                train_error_list = []
                all_error.append(eval_error_rate)

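                # Early stopping: checkpoint whenever validation error improves;
                # after epoch 50, each non-improving epoch costs one unit of
                # patience, and training stops once the counter hits zero.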
                if eval_error_rate < best_eval_error_rate:
                    best_eval_error_rate = eval_error_rate
                    toleration_patience = 10
                    saver.save(sess, model_save_file)
                else:
                    if epoch > 50:
                        toleration_patience = toleration_patience - 1

                if toleration_patience == 0:
                    break
        good_enough = True
        '''
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        plt.title('Training curve')
        plt.ylabel('Error(%)')
        plt.xlabel('Epoch')
        axes = plt.gca()
        axes.set_ylim([0, 60])
        plt.plot(range(eval_time), train_error, label='training')
        plt.plot(range(eval_time), valid_error, label='validation')
        plt.legend(loc='upper right')
        plt.show()
        #plt.savefig('pickercurve.png')
        '''
        print("Accuracy: before retrain: %.2f%%, after retrain: %.2f%%" %
              (100.0 - eval_before_retrain, 100.0 - best_eval_error_rate))
        print("Retrain <Successful>!")
Exemplo n.º 24
from dataLoader import DataLoader

#Global variables to be accessed between files.
global data_loader
data_loader = DataLoader()

global locations
locations = data_loader.setup_locations()

global distances
distances = data_loader.setup_distances()

global all_packages
all_packages = data_loader.setup_packages(locations)
Exemplo n.º 25
def starcraft_svm_test():

    # Create DataLoader instance to load and format data
    dataLoader = DataLoader()

    logging.info("Program started")

    logging.info("Loading starcraft data")
    # Read skillcraft dataset, the class index is the second column
    dataLoader.read(filename="data/SkillCraft1_Dataset.csv",
                    classIndex=1,
                    numOfFeatures=15)
    multi_label_count = dataLoader.labelCount(8)

    # Creates plots for a few of the data features
    # dataLoader.visualize()

    # Normalize data values from 0 - 1
    #dataLoader.normalize()

    # Create new labels to fit into binary classification
    dataLoader.scaleToBinary(5)
    label_count = dataLoader.binaryLabelCount(5)
    logging.info("Number of examples per class")
    logging.info("Casual - (1):           " + str(label_count[0]))
    logging.info("Hardcore - (-1):           " + str(label_count[1]))

    label_count = dataLoader.labelCount(8)
    logDataCount(label_count)
    """
    # Create SVM
    svm = SVM()

    # Train and predict for binary svm
    logging.info("Running SVM for binary classification")
    # Train for binary single run with these objects
    logging.info("Single binary SVM")
    svm.train(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test)

    # Train and test binary svm multiple times for all available binary variables
    logging.info("Multiple runs with different parameters - binary SVM")
    svm.train(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test, iterate=True)

    # Save binary results to excel sheet
    logging.info("Saving binary SVM results")
    svm.results.to_excel(writer_starcraft, sheet_name='binary-svm')


    # MULTI CLASS SVM
    logging.info("Running SVM for multiclass classification")


    # Train and predict for multi-class data using the linear svm from liblinear implementation
    logging.info("Running SVM for multiclass classification with liblinear implementation")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, binary=False)
    logging.info("Saving multiclass liblinear results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-liblinear')

    # Train for multi-class single run with these objects using the libsvm implementation
    logging.info("Running SVM for multiclass classification with libsvm implementation")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, binary=False, linear=False)
    logging.info("Saving multiclass libsvm results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-libsvm')

    # Train and test multi-class svm multiple times for all available multi-class variables
    logging.info("Running SVM for multiclass classification for all available multi-class variables")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, iterate=True, binary=False)
    logging.info("Saving multiclass multiple-runs results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-multiple-variables')

    # Train and test multi-class svm multiple times with KPCA-LDA
    logging.info("Running SVM for multiclass classification with KPCA-LDA")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, iterate=True, binary=False, decomposition=True)
    logging.info("Saving multiclass multiple-runs results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-kpca-lda')

    # KNN and NC
    nearest(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test, dataLoader.multi_y_train, dataLoader.multi_y_test, writer_starcraft)
    """

    clustering(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test,
               dataLoader.y_test)

    # Write all the results
    writer_starcraft.save()
Exemplo n.º 26
def Run_SRNN_NormalCase(args, no_dataset):

    data_path, graph_path = Data_path(no_dataset)
    log_path = Log_path(no_dataset)

    # Construct the DataLoader object that loads data
    dataloader = DataLoader(args)
    dataloader.load_data(data_path)

    # Construct the ST-graph object that reads graph
    stgraph = ST_GRAPH(args)
    stgraph.readGraph(dataloader.num_sensor, graph_path)

    # Initialize net
    net = SRNN(args)
    net.setStgraph(stgraph)

    print('- Number of trainable parameters:',
          sum(p.numel() for p in net.parameters() if p.requires_grad))

    # optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    # optimizer = torch.optim.RMSprop(net.parameters(), lr=args.learning_rate, momentum=0.0001, centered=True)
    optimizer = torch.optim.Adagrad(net.parameters())

    best_eval_loss = 10000
    best_epoch = 0

    print('')
    print('---- Train and Evaluation ----')

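    # One (epoch, eval_loss) row per epoch, plus a final row filled in after
    # training with the best epoch and its loss.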
    eval_loss_res = np.zeros((args.num_epochs + 1, 2))
    for e in range(args.num_epochs):
        epoch = e + 1

        ####  Training ####
        print('-- Training, epoch {}/{}'.format(epoch, args.num_epochs))
        loss_epoch = 0

        # For each batch
        for b in range(dataloader.num_batches_train):
            batch = b + 1
            start = time.time()

            # Get batch data
            x = dataloader.next_batch_train()

            # Loss for this batch
            loss_batch = 0

            # For each sequence in the batch
            for sequence in range(dataloader.batch_size):

                # put node and edge features
                stgraph.putSequenceData(x[sequence])

                # get data to feed
                data_nodes, data_temporalEdges, data_spatialEdges = stgraph.getSequenceData(
                )

                # put a sequence to net
                loss_output, data_nodes, outputs = forward(
                    net, optimizer, args, stgraph, data_nodes,
                    data_temporalEdges, data_spatialEdges)
                loss_output.backward()
                loss_batch += loss_RMSE(data_nodes[-1], outputs[-1],
                                        dataloader.scaler)

                # Clip gradients
                torch.nn.utils.clip_grad_norm_(net.parameters(),
                                               args.grad_clip)

                # Update parameters
                optimizer.step()

            end = time.time()
            loss_batch = loss_batch / dataloader.batch_size
            loss_epoch += loss_batch

            print('Train: {}/{}, train_loss = {:.3f}, time/batch = {:.3f}'.
                  format(e * dataloader.num_batches_train + batch,
                         args.num_epochs * dataloader.num_batches_train,
                         loss_batch, end - start))
        # Compute loss for the entire epoch
        loss_epoch /= dataloader.num_batches_train
        print('(epoch {}), train_loss = {:.3f}'.format(epoch, loss_epoch))

        # Save the model after each epoch
        save_path = Save_path(no_dataset, epoch)
        print('Saving model to ' + save_path)
        torch.save(
            {
                'epoch': epoch,
                'state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }, save_path)

        #### Evaluation ####
        print('-- Evaluation, epoch {}/{}'.format(epoch, args.num_epochs))
        loss_epoch = 0
        for b in range(dataloader.num_batches_eval):
            batch = b + 1
            start = time.time()

            # Get batch data
            x = dataloader.next_batch_eval()

            # Loss for this batch
            loss_batch = 0

            for sequence in range(dataloader.batch_size):

                # put node and edge features
                stgraph.putSequenceData(x[sequence])

                # get data to feed
                data_nodes, data_temporalEdges, data_spatialEdges = stgraph.getSequenceData(
                )

                # put a sequence to net
                _, data_nodes, outputs = forward(net, optimizer, args, stgraph,
                                                 data_nodes,
                                                 data_temporalEdges,
                                                 data_spatialEdges)

                loss_batch += loss_RMSE(data_nodes[-1], outputs[-1],
                                        dataloader.scaler)

            end = time.time()
            loss_batch = loss_batch / dataloader.batch_size
            loss_epoch += loss_batch

            print(
                'Eval: {}/{}, eval_loss = {:.3f}, time/batch = {:.3f}'.format(
                    e * dataloader.num_batches_eval + batch,
                    args.num_epochs * dataloader.num_batches_eval, loss_batch,
                    end - start))
        loss_epoch /= dataloader.num_batches_eval
        eval_loss_res[e] = (epoch, loss_epoch)

        # Update best validation loss until now
        if loss_epoch < best_eval_loss:
            best_eval_loss = loss_epoch
            best_epoch = epoch

        print('(epoch {}), eval_loss = {:.3f}'.format(epoch, loss_epoch))

    # Record the best epoch and best validation loss overall
    print('Best epoch: {}, Best evaluation loss {:.3f}'.format(
        best_epoch, best_eval_loss))
    eval_loss_res[-1] = (best_epoch, best_eval_loss)
    np.savetxt(log_path, eval_loss_res, fmt='%d, %.3f')
    print('- Eval result has been saved in ', log_path)
    print('')
Exemplo n.º 27
def train():
    parser = OptionParser()
    parser.add_option("--train_good",
                      dest="train_good",
                      help="Input good particles ",
                      metavar="FILE")
    parser.add_option("--train_bad",
                      dest="train_bad",
                      help="Input bad particles",
                      metavar="FILE")
    parser.add_option("--particle_number",
                      type="int",
                      dest="train_number",
                      help="Number of positive samples to train.",
                      metavar="VALUE",
                      default=-1)
    parser.add_option("--bin_size",
                      type="int",
                      dest="bin_size",
                      help="image size reduction",
                      metavar="VALUE",
                      default=3)

    parser.add_option(
        "--coordinate_symbol",
        dest="coordinate_symbol",
        help="The symbol of the coordinate file, like '_manualPick'",
        metavar="STRING")
    parser.add_option("--particle_size",
                      type="int",
                      dest="particle_size",
                      help="the size of the particle.",
                      metavar="VALUE",
                      default=-1)
    parser.add_option("--validation_ratio",
                      type="float",
                      dest="validation_ratio",
                      help="the ratio.",
                      metavar="VALUE",
                      default=0.1)
    parser.add_option(
        "--model_retrain",
        action="store_true",
        dest="model_retrain",
        help=
        "train the model using the pre-trained model as parameters initialization .",
        default=False)
    parser.add_option("--model_load_file",
                      dest="model_load_file",
                      help="pre-trained model",
                      metavar="FILE")
    parser.add_option("--logdir",
                      dest="logdir",
                      help="directory of logfiles",
                      metavar="DIRECTORY",
                      default="Logfile")
    parser.add_option("--model_save_file",
                      dest="model_save_file",
                      help="save the model to file",
                      metavar="FILE")
    (opt, args) = parser.parse_args()

    np.random.seed(1234)

    # define the input size of the model
    model_input_size = [100, 64, 64, 1]
    num_classes = 2  # the number of output classes
    batch_size = model_input_size[0]

    if not os.access(opt.logdir, os.F_OK):
        os.mkdir(opt.logdir)

    # load training dataset
    dataLoader = DataLoader()
    train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_RelionStarFile(
        opt.train_good, opt.particle_size, model_input_size,
        opt.validation_ratio, opt.train_number, opt.bin_size)

    # Check if train_data exists
    try:
        train_data
    except NameError:
        print("ERROR: in function load.loadInputTrainData.")
        return None
    else:
        print("Load training data successfully!")
    # shuffle training data
    train_data, train_label = shuffle_in_unison_inplace(
        train_data, train_label)
    eval_data, eval_label = shuffle_in_unison_inplace(eval_data, eval_label)

    train_x = train_data.reshape(train_data.shape[0], 64, 64, 1)
    test_x = eval_data.reshape(eval_data.shape[0], 64, 64, 1)
    print("shape of training data: ", train_x.shape, test_x.shape)
    train_y = to_categorical(train_label, 2)
    test_y = to_categorical(eval_label, 2)
    print(train_y.shape, test_y.shape)
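    # On-the-fly augmentation: feature-wise standardization plus random rotations
    # and horizontal/vertical flips; the normalization statistics are fitted on
    # the training set below.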
    datagen = ImageDataGenerator(featurewise_center=True,
                                 featurewise_std_normalization=True,
                                 rotation_range=20,
                                 width_shift_range=0.0,
                                 height_shift_range=0.0,
                                 horizontal_flip=True,
                                 vertical_flip=True)
    datagen.fit(train_x)

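    # A compact CNN binary classifier: three conv/max-pool blocks followed by a
    # 1024-unit dense layer and a softmax over the two classes.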
    model = Sequential()
    model.add(
        Conv2D(32,
               kernel_size=(8, 8),
               strides=(1, 1),
               activation='relu',
               input_shape=(64, 64, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, kernel_size=(8, 8), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    for layer in model.layers:
        print(layer.name, layer.output_shape)

    logdir = opt.logdir + '/' + datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = TensorBoard(log_dir=logdir)
    checkpoint = ModelCheckpoint('best_model.h5',
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 period=1)
    reduce_lr_plateau = ReduceLROnPlateau(monitor='val_acc',
                                          patience=10,
                                          verbose=1)
    callbacks = [checkpoint, reduce_lr_plateau, tensorboard_callback]
    model.compile(optimizer=SGD(0.01),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    model.fit_generator(datagen.flow(train_x, train_y, batch_size=batch_size),
                        steps_per_epoch=len(train_x) // batch_size,
                        epochs=30,
                        validation_data=(test_x, test_y),
                        callbacks=callbacks)
    model.save(opt.model_save_file)
    accuracy = model.evaluate(x=test_x, y=test_y, batch_size=batch_size)
    print("Accuracy:", accuracy[1])
Exemplo n.º 28
import numpy as np
import tensorflow as tf

from myModel import MyModel
from dataLoader import DataLoader

if __name__ == '__main__':
    ENABLE_SAVE_MODEL = True
    MODEL_NAME = 'mini'
    # 4 mnist
    # H, W, C = 28, 28, 1
    # (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

    # 4 ukiyoe
    DATA_PATH = './data/'
    H, W, C = 224, 224, 3
    RH, RW = 224, 224
    x_train, y_train, x_test, y_test = DataLoader(0.2).load(DATA_PATH)
    if C == 1:
        x_train = np.sum(x_train, axis=-1) / 3
        x_test = np.sum(x_test, axis=-1) / 3

    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255
    x_train = x_train.reshape(x_train.shape[0], H, W, C)
    x_test = x_test.reshape(x_test.shape[0], H, W, C)

    model = MyModel()

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam()

    train_loss = tf.keras.metrics.Mean(name='train_loss')
Exemplo n.º 29
                'loss': loss,
            }, os.path.join(args.exp_dir , 'unfinished_model.pt'))
        epoch += 1

    cost_time = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(cost_time // 60,
                                                        cost_time % 60))
    print('Best Train Acc is {:.4f}'.format(best_train_acc))
    print('Best Val Acc is {:.4f}'.format(best_acc))
    model.load_state_dict(best_model)
    return model, cost_time, best_acc, best_train_acc


if __name__ == '__main__':
    print('DataSets: ' + args.dataset)
    print('ResNet Depth: ' + str(args.depth))
    loader = DataLoader(args.dataset, batch_size=args.batch_size)
    dataloaders, dataset_sizes = loader.load_data()
    num_classes = 10
    if args.dataset == 'cifar-10':
        num_classes = 10
    if args.dataset == 'cifar-100':
        num_classes = 100

    model = resnet_cifar(depth=args.depth, num_classes=num_classes)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
                                momentum=0.9, nesterov=True, weight_decay=1e-4)

    # define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    scheduler = MultiStepLR(
        optimizer,
        milestones=[args.epoch * 0.4, args.epoch * 0.6, args.epoch * 0.8],
        gamma=0.1)
Exemplo n.º 30
            update_checkpoint_link([('epoch_%d.pt' % best_epoch, 'best.pt'),
                                    ('epoch_%d.pt' % epoch, 'last.pt')])

        epoch += 1

    cost_time = time.time() - since
    print('Training complete in {:.0f}h{:.0f}m{:.0f}s'.format(
        (cost_time // 60) // 60, (cost_time // 60) % 60, cost_time % 60))

    return model, cost_time, best_acc, best_train_acc


if __name__ == '__main__':

    loader = DataLoader(args.dataset,
                        batch_size=args.batch_size,
                        seed=args.seed)
    dataloaders, dataset_sizes = loader.load_data(args.img_size)

    num_classes = 10
    if args.dataset == 'cifar-10':
        num_classes = 10
    if args.dataset == 'cifar-100':
        num_classes = 100
    if args.dataset == 'VOCpart':
        num_classes = len(dataloaders['train'].dataset.classes)

    assert args.img_size == 128, 'only supports --img_size 128'
    model = resnet_std(depth=args.depth,
                       num_classes=num_classes,
                       ifmask=args.ifmask,