Example #1
def main(_):
    with tf.Session() as sess:
        train_dataset = dataset.Dataset(
            os.path.join(_DATA_DIRECTORY, 'train.data'))
        test_dataset = dataset.Dataset(
            os.path.join(_DATA_DIRECTORY, 'test.data'))

        model = model_lib.Model(
            sequence_length=train_dataset.GetSequenceLength(),
            mode=tf.contrib.learn.ModeKeys.TRAIN,
            learning_rate=FLAGS.learning_rate,
            momentum_rate=FLAGS.momentum_rate)

        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        for i in range(FLAGS.max_steps):
            train_batch = train_dataset.GetBatch(FLAGS.train_batch_size)
            feed_dict = {
                model.sequences_placeholder: train_batch[0],
                model.true_labels_placeholder: train_batch[1],
            }
            model.mode = tf.contrib.learn.ModeKeys.TRAIN
            _ = sess.run([model.train_op, model.loss_op], feed_dict=feed_dict)

            model.mode = tf.contrib.learn.ModeKeys.EVAL
            loss = sess.run([model.loss_op], feed_dict=feed_dict)
            print(loss)
Example #2
File: train.py Project: vzhong/gazp
def main(args):
    args.gpu = torch.cuda.is_available()
    utils.manual_seed(args.seed)
    Model = utils.load_module(args.model)
    cache_file = args.fcache or (os.path.join(
        'cache', 'data_{}_{}.debug.pt'.format(args.model, args.dataset)
        if args.debug else 'data_{}_{}.pt'.format(args.model, args.dataset)))
    splits, ext = torch.load(cache_file, map_location=torch.device('cpu'))
    splits = {k: dataset.Dataset(v) for k, v in splits.items()}
    splits['train'] = Model.prune_train(splits['train'], args)
    splits['dev'] = Model.prune_dev(splits['dev'], args)

    if args.model == 'nl2sql':
        Reranker = utils.load_module(args.beam_rank)
        ext['reranker'] = Reranker(args, ext)
    m = Model(args, ext).place_on_device()

    d = m.get_file('')
    if not os.path.isdir(d):
        os.makedirs(d)

    pprint.pprint(m.get_stats(splits, ext))

    if not args.test_only:
        if not args.skip_upperbound:
            print('upperbound')
            pprint.pprint(m.compute_upperbound(splits['train'][:1000]))
        if args.aug:
            augs = []
            for a in args.aug:
                augs.extend(torch.load(a))
            aug = dataset.Dataset(augs)
            splits['aug'] = Model.prune_train(aug, args)[:args.aug_lim]
            print('aug upperbound')
            pprint.pprint(m.compute_upperbound(aug[:10]))
            # aug_args = copy.deepcopy(args)
            # if 'consistent' not in args.aug:
            #     aug_args.epoch = 10
            # aug_dev = dataset.Dataset(random.sample(splits['train'], 3000))
            # m.run_train(aug, aug_dev, args=aug_args)
        pprint.pprint(m.get_stats(splits, ext))
        m.run_train(dataset.Dataset(splits['train'] + splits.get('aug', [])),
                    splits['dev'],
                    args=args)

    if args.resume:
        m.load_save(fname=args.resume)
    elif args.resumes:
        m.average_saves(args.resumes)
    if args.interactive_eval:
        dev_preds = m.run_interactive_pred(splits['dev'], args, verbose=True)
    else:
        dev_preds = m.run_pred(splits['dev'], args, verbose=True)

    if args.write_test_pred:
        with open(args.write_test_pred, 'wt') as f:
            json.dump(dev_preds, f, indent=2)
        print('saved test preds to {}'.format(args.write_test_pred))

    pprint.pprint(m.compute_metrics(splits['dev'], dev_preds))
Example #3
    def test_dataset_conversion(self):
        ds = dataset.Dataset()
        ds.readFromFile('../data/tiny.dat')
        vds = dataset.VerticalDataset()
        vds.readFromDataset(ds)
        ds2 = dataset.Dataset()
        ds2.readFromDataset(vds)

        self.assertEqual(ds.rows, ds2.rows)

        ds.readFromFile('../data/chess_tiny.dat')
        vds = dataset.VerticalDataset()
        vds.readFromDataset(ds)
        ds2 = dataset.Dataset()
        ds2.readFromDataset(vds)

        self.assertEqual(ds.rows, ds2.rows)

        ds.readFromFile('../data/chess.dat')
        vds = dataset.VerticalDataset()
        vds.readFromDataset(ds)
        ds2 = dataset.Dataset()
        ds2.readFromDataset(vds)

        self.assertEqual(ds.rows, ds2.rows)
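
The three blocks above repeat the same round-trip check. For reference, a less repetitive sketch of the same test could factor the check into a helper; it uses only the dataset API shown in the snippet, and only the helper name _assert_roundtrip is new:

    def _assert_roundtrip(self, path):
        # read a dataset, convert to vertical and back, and compare row counts
        ds = dataset.Dataset()
        ds.readFromFile(path)
        vds = dataset.VerticalDataset()
        vds.readFromDataset(ds)
        ds2 = dataset.Dataset()
        ds2.readFromDataset(vds)
        self.assertEqual(ds.rows, ds2.rows)

    def test_dataset_conversion(self):
        for path in ('../data/tiny.dat', '../data/chess_tiny.dat',
                     '../data/chess.dat'):
            self._assert_roundtrip(path)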
Example #4
def run():
    path = datautils.Path('../input/train_data')
    image_files = datautils.get_images(path)
    train_paths, valid_paths = train_test_split(image_files,
                                                test_size=config.VALID_SPLIT,
                                                random_state=42)
    print(len(train_paths), len(valid_paths))
    encoder = joblib.load('label_encoder.pkl')
    train_ds = dataset.Dataset(train_paths,
                               get_labels=datautils.get_label,
                               label_enc=encoder,
                               size=(1200, 600))
    num_classes = len(encoder.classes_)

    train_dl = torch.utils.data.DataLoader(train_ds,
                                           batch_size=config.TRAIN_BATCH_SIZE,
                                           num_workers=8,
                                           shuffle=True)

    valid_ds = dataset.Dataset(valid_paths,
                               get_labels=datautils.get_label,
                               label_enc=encoder,
                               size=(1200, 600))

    valid_dl = torch.utils.data.DataLoader(valid_ds,
                                           batch_size=config.TRAIN_BATCH_SIZE * 2,
                                           num_workers=8,
                                           shuffle=False)

    ocr_model = model.Model(len(encoder.classes_))
    ocr_model.to(config.DEVICE)

    total_steps = len(train_dl) * config.N_EPOCHS

    opt = torch.optim.Adam(ocr_model.parameters(), config.MAX_LR)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(opt,
                                                    config.MAX_LR,
                                                    total_steps=total_steps)

    for epoch in range(config.N_EPOCHS):
        engine.train_loop(train_dl, ocr_model, opt, scheduler, None,
                          config.DEVICE)
        losses, output = engine.eval_loop(valid_dl, ocr_model, None,
                                          config.DEVICE)

        print(torch.tensor(losses).mean().item())

    save_dict = {
        'label_encoding': encoder,
        'model_dict': ocr_model.state_dict()
    }
    torch.save(save_dict, f'ocr_model_{config.N_EPOCHS}')
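
As a hedged follow-up, the checkpoint saved above could be reloaded for inference roughly as follows; the model and config module names are assumed to be the same ones used in the training script, so treat this as a sketch rather than the project's own loader:

import torch
import model   # the project's model module used above (assumption)
import config  # assumed to expose N_EPOCHS as in the training script

ckpt = torch.load(f'ocr_model_{config.N_EPOCHS}', map_location='cpu')
encoder = ckpt['label_encoding']          # the fitted label encoder saved above
ocr_model = model.Model(len(encoder.classes_))
ocr_model.load_state_dict(ckpt['model_dict'])
ocr_model.eval()                          # switch to inference mode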
Example #5
 def readcspfile(self):
     try:
         import dataset  #NEEDED for reading and writing cryosparc cs files
     except:
         response = messagebox.showerror(
             "ERROR", "Cannot import required cryosparc library")
     else:
         self.particleset = dataset.Dataset().from_file(self.cspfile.get())
         self.passthruset = dataset.Dataset().from_file(
             self.passthrufile.get())
         if 'ctf/exp_group_id' in self.passthruset.data.keys():
             self.particleset = self.passthruset
             response = messagebox.showinfo(
                 "IMPORTANT",
                 "CTF information is in the passthrough file.\n Replace that file in cryosparc directory"
             )
         elif 'ctf/exp_group_id' in self.particleset.data.keys():
             response = messagebox.showinfo(
                 "IMPORTANT",
                 "CTF information is in the particleset file.\n Replace that file in cryosparc directory"
             )
         else:
             response = messagebox.showerror(
                 "ERROR", "No CTF information found! Cannot group")
         groupdata = {}
         for i, dataline in enumerate(self.stardata):
             filename = os.path.basename(dataline[self.micnameindex])
             groupdata[filename] = self.grouplabels[i]
         keys = groupdata.keys()
         if 'location/micrograph_path' in self.passthruset.data.keys():
             numbad = 0
             for i in range(len(self.particleset.data)):
                 filename = self.passthruset.data[
                     'location/micrograph_path'][i]
                 basename = os.path.basename(filename)
                 if basename in keys:
                     self.particleset.data['ctf/exp_group_id'][
                         i] = groupdata[basename]
                 else:  #no appion ctf data for this micrograph, put them all in their own tilt group
                     #print ("error, no key found for " + basename)
                     self.particleset.data['ctf/exp_group_id'][i] = len(
                         self.grouplabels) + 1
                     numbad += 1
             if (numbad > 0):
                 response = messagebox.showwarning(
                     "WARNING",
                     "Number of particles without good appion ctf data:\n" +
                     str(numbad))
         else:
             response = messagebox.showerror(
                 "ERROR",
                 "No Micrograph filename information found! Cannot group")
Example #6
    def setUp(self):
        self.input_labels = ['label1', 'label2']
        self.samples = [
            {
                'label1': 1.0,
                'label2': 2.0,
                'output': 3.0,
                'ignored': 1.0
            },
            {
                'label1': 4.0,
                'label2': 5.0,
                'output': 6.0,
                'ignored': 1.0
            },
            {
                'label1': 7.0,
                'label2': 8.0,
                'output': 9.0,
                'ignored': 1.0
            },
        ]
        self.samples_with_strings = [
            {
                'label1': 1.0,
                'label2': 'foo',
                'output': 3.0,
                'ignored': 1.0
            },
            {
                'label1': 4.0,
                'label2': 'foo',
                'output': 6.0,
                'ignored': 1.0
            },
            {
                'label1': 7.0,
                'label2': '',
                'output': 9.0,
                'ignored': 1.0
            },
        ]
        self.output_generators = collections.OrderedDict([
            ('times10', lambda x: 10.0 * x['output']),
            ('filterGt3', lambda x: -1 if x['output'] > 3.0 else 42.0)
        ])

        self.dataset = dataset.Dataset(self.samples, self.input_labels,
                                       self.output_generators)
        self.dataset_with_strings = dataset.Dataset(self.samples_with_strings,
                                                    self.input_labels,
                                                    self.output_generators)
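
To make the fixture concrete, here is an illustrative, standalone snippet (not project code) showing one plausible way such output generators could be applied to each sample; it reuses only the dictionaries and lambdas from the setUp above:

import collections

samples = [{'label1': 1.0, 'label2': 2.0, 'output': 3.0, 'ignored': 1.0}]
output_generators = collections.OrderedDict([
    ('times10', lambda x: 10.0 * x['output']),
    ('filterGt3', lambda x: -1 if x['output'] > 3.0 else 42.0),
])

generated = [{name: fn(sample) for name, fn in output_generators.items()}
             for sample in samples]
print(generated)  # [{'times10': 30.0, 'filterGt3': 42.0}]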
Example #7
    def __init__(self):
        self.ds_train = dataset.Dataset(config.trainPath, config.dictPath)
        self.ds_valid = dataset.Dataset(config.validPath, config.dictPath)
        self.ds_test = dataset.Dataset(config.testPath, config.dictPath)

        vocSize = len(self.ds_train.word2id)
        maxSeqLen = max([len(idLine) for idLine in self.ds_train.idData])
        padId = self.ds_train.word2id['<PAD>']
        self.model = lm.LM(vocSize, maxSeqLen, padId)
        # vocSize+1 for the padding index

        self.loss_log = logger.Logger('../result/loss.log')
        self.eval_log = logger.Logger('../result/eval.log')
Example #8
def k_nearest(filters, n_neighbors, weights, verbose=False):
    """K nearest neighbors classifier experiment."""
    # Dataset folder operations
    if verbose:
        print('Getting dataset images paths...')
    paths = dataset.get_images_paths('../data/training/*.jpg')

    # Loading dataset
    if verbose:
        print('Loading images dataset...')
    input_data = dataset.Dataset(paths, filters=filters, use_mean=True)

    # Generating synthetic images
    if verbose:
        print('Generating synthetic dataset...')
    input_data.generate_sintetic_dataset()

    # Extract labels
    if verbose:
        print('Extracting labels...')
    labels = input_data.labels_array()

    # Input data matrix
    if verbose:
        print('Generating data matrix...')
    data_matrix = input_data.compute_data_matrix()

    # Compute PCA
    if verbose:
        print('Making principal component analysis...')
    pca = PCA(n_components=25)
    pca.fit_transform(data_matrix, y=labels)

    # Instantiate k-nearest neighbors classifier model
    if verbose:
        print('Instantiating model...')
    classifier = KNeighborsClassifier(n_jobs=-1,
                                      n_neighbors=n_neighbors,
                                      weights=weights)

    # Training the k-nearest neighbors classifier and computing cross-validated score
    if verbose:
        print('Training and computing score...')
    shuffle_split = ShuffleSplit(n_splits=15,
                                 test_size=0.15,
                                 train_size=0.85,
                                 random_state=0)
    scores = cross_val_score(estimator=classifier,
                             X=data_matrix,
                             y=labels,
                             cv=shuffle_split,
                             n_jobs=-1)

    # Reporting results
    mean_score = np.mean(scores)
    std_deviation = np.std(scores)
    f = mean_score - std_deviation
    print('\nMean score: ', mean_score)
    print('Standard deviation: ', std_deviation)
    print('F: ', f)
Example #9
    def get_track_dataset(
        self,
        name,
        src_ibr_dir,
        tgt_ibr_dir,
        n_nbs,
        im_size=None,
        pad_width=16,
        patch=None,
        nbs_mode="argmax",
        train=False,
    ):
        logging.info(f"  create dataset for {name}")

        src_im_paths = sorted(src_ibr_dir.glob(f"im_*.png"))
        src_im_paths += sorted(src_ibr_dir.glob(f"im_*.jpg"))
        src_im_paths += sorted(src_ibr_dir.glob(f"im_*.jpeg"))
        src_dm_paths = sorted(src_ibr_dir.glob("dm_*.npy"))
        src_Ks = np.load(src_ibr_dir / "Ks.npy")
        src_Rs = np.load(src_ibr_dir / "Rs.npy")
        src_ts = np.load(src_ibr_dir / "ts.npy")

        tgt_im_paths = sorted(tgt_ibr_dir.glob(f"im_*.png"))
        tgt_im_paths += sorted(tgt_ibr_dir.glob(f"im_*.jpg"))
        tgt_im_paths += sorted(tgt_ibr_dir.glob(f"im_*.jpeg"))
        if len(tgt_im_paths) == 0:
            tgt_im_paths = None
        tgt_dm_paths = sorted(tgt_ibr_dir.glob("dm_*.npy"))
        count_paths = sorted(tgt_ibr_dir.glob("count_*.npy"))
        counts = []
        for count_path in count_paths:
            counts.append(np.load(count_path))
        counts = np.array(counts)
        tgt_Ks = np.load(tgt_ibr_dir / "Ks.npy")
        tgt_Rs = np.load(tgt_ibr_dir / "Rs.npy")
        tgt_ts = np.load(tgt_ibr_dir / "ts.npy")

        dset = dataset.Dataset(
            name=name,
            tgt_im_paths=tgt_im_paths,
            tgt_dm_paths=tgt_dm_paths,
            tgt_Ks=tgt_Ks,
            tgt_Rs=tgt_Rs,
            tgt_ts=tgt_ts,
            tgt_counts=counts,
            src_im_paths=src_im_paths,
            src_dm_paths=src_dm_paths,
            src_Ks=src_Ks,
            src_Rs=src_Rs,
            src_ts=src_ts,
            im_size=im_size,
            pad_width=pad_width,
            patch=patch,
            n_nbs=n_nbs,
            nbs_mode=nbs_mode,
            bwd_depth_thresh=self.bwd_depth_thresh,
            invalid_depth_to_inf=self.invalid_depth_to_inf,
            train=train,
        )
        return dset
Example #10
 def setUp(self):
     instances = [dataset.Instance(0, {0:.2, 1:1.0}, label=-1.0),
                  dataset.Instance(1, {0:.2, 1:.7}, label=-1.0),
                  dataset.Instance(2, {0:.5, 1:.5}, label=1.0),
                  dataset.Instance(3, {0:.7, 1:.7}, label=1.0)]
     inst_dict = dict(zip(range(4), instances))
     self.data = dataset.Dataset(instances=inst_dict)
Example #11
def save_bottleneck_features(session, network, dataset):
    def transform(name, x, y):
        num_batches, batches = batches_mod.make_batches(x, y)

        transformed_results = []
        for batch_idx, (batch_x, _) in enumerate(batches):
            transformed_results.append(
                session.run(network.bottleneck_out,
                            feed_dict={network.x: batch_x}))

            print(f"{name}: {batch_idx}/{num_batches}")
            if batch_idx == 10: break

        return np.concatenate(transformed_results, axis=0)

    transform_train_x = transform("train_x", dataset.train_x, dataset.train_y)
    transform_valid_x = transform("valid_x", dataset.valid_x, dataset.valid_y)

    fname = f"../data/bottleneck_{network.name}_{dataset.name}.p"
    with open(fname, "wb") as f:
        pickle.dump(
            dataset_mod.Dataset(train_x=transform_train_x,
                                train_y=dataset.train_y,
                                valid_x=transform_valid_x,
                                valid_y=dataset.valid_y,
                                num_classes=dataset.num_classes,
                                name=dataset.name), f)
Example #12
    def waveform_capture_dataset(self):
        if len(self.capturedDataBuffer) > 0:
            self.signalsNames, self.wfm_data, self.time_vector = self.capturedDataBuffer[0]
        else:
            self.ts.log_error('Did not capture data!')

        ds = dataset.Dataset()
        masterlist = self.analog_channels + self.digital_channels
        if len(self.signalsNames) == len(masterlist):
            ds.points.append('TIME')
            ds.data.append(self.time_vector[0::self.subsampling_rate])
            chan_count = 0
            for c in masterlist:
                ds.points.append(wfm_typhoon_channels[c])
                ds.data.append(
                    self.wfm_data[chan_count][0::self.subsampling_rate])
                chan_count += 1

        else:
            self.ts.log_error(
                'Number of channels returned from waveform capture is unexpected. '
                'Expected %s. Got: %s' %
                (self.channelSettings, self.signalsNames))

        return ds
Example #13
    def test_fit_complex(self):
        tf.set_random_seed(1)
        np.random.seed(1)
        flags = testing_flags.FLAGS
        flags.input_dim = 1
        flags.hidden_dim = 32
        flags.num_hidden_layers = 2
        flags.output_dim = 1
        flags.batch_size = 32
        flags.num_epochs = 8
        flags.learning_rate = .002
        flags.l2_reg = 0.0
        flags.verbose = False
        flags.save_weights_every = 100000
        flags.snapshot_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), os.pardir, 'data',
                         'snapshots', 'test'))

        x = np.random.randn(1000).reshape(-1, 1)
        y = np.ones(x.shape)
        y[x < 0] = 0
        data = {'x_train': x, 'y_train': y, 'x_val': x, 'y_val': y}
        d = dataset.Dataset(data, flags)
        with tf.Session() as session:
            network = ffnn.FeedForwardNeuralNetwork(session, flags)
            network.fit(d)
            actual = network.predict(x)
            actual[actual < .5] = 0
            actual[actual >= .5] = 1
            np.testing.assert_array_almost_equal(y, actual, 8)
Example #14
def main():

    # Basic set
    patchsize = 40

    # Load networks
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_obj = OBJ_CNN().to(DEVICE)
    model_obj.load_state_dict(
        torch.load('./Model parameter/fire/obj_model_init.pkl'))

    # Training path setting
    dataDir = './'
    trainingDir = dataDir + 'training/'  # If you want to experiment on the test dataset, change it
    trainingSets = util.getSubPaths(trainingDir)

    # Initialize training datasets
    trainingDataset = dataset.Dataset()
    trainingDataset.readFileNames(trainingSets[0])
    trainingDataset.SetObjID(1)

    i = 523  # If you want to experiment on another RGB image, change it
    imgBGR = trainingDataset.getBGR(i)

    # Do every pixel sample
    sample = [[idx % 640, idx // 640] for idx in range(480 * 640)]
    sample_with_patch = [idx for idx in sample if patchsize / 2 <= idx[0] < 640 - patchsize / 2 and \
                                                  patchsize / 2 <= idx[1] < 480 - patchsize / 2]
    sample_with_patch = np.array(sample_with_patch).reshape([440, 600, 2])
    pred_coord = cnn.getCoordImg(colorData=imgBGR,
                                 sampling=sample_with_patch,
                                 patchsize=patchsize,
                                 model=model_obj) / 1000.0
    return pred_coord
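
The per-pixel sample grid above can equivalently be built with numpy; this small sketch reproduces the same [440, 600, 2] array for a 640x480 image and patchsize 40:

import numpy as np

patchsize = 40
xs = np.arange(patchsize // 2, 640 - patchsize // 2)      # 600 valid columns
ys = np.arange(patchsize // 2, 480 - patchsize // 2)      # 440 valid rows
grid_x, grid_y = np.meshgrid(xs, ys)                      # each of shape [440, 600]
sample_with_patch = np.stack([grid_x, grid_y], axis=-1)   # [440, 600, 2], last dim = (x, y)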
Example #15
    def data_capture(self, enable=True, channels=None):
        """
        Enable/disable data capture.

        If sample_interval == 0, there will be no autonomous data captures and self.data_sample should be used to add
        data points to the capture
        """
        if enable is True:
            if self._capture is False:
                self._ds = dataset.Dataset(self.data_points)
                self._last_datarec = []
                if self.sample_interval > 0:
                    if self.sample_interval < MINIMUM_SAMPLE_PERIOD:
                        raise DASError('Sample period too small: %s' %
                                       (self.sample_interval))
                    self._timer = self.ts.timer_start(
                        float(self.sample_interval) / 1000,
                        self._timer_timeout,
                        repeating=True)
                self._capture = True
        elif enable is False:
            if self._capture is True:
                if self._timer is not None:
                    self.ts.timer_cancel(self._timer)
                self._timer = None
                self._capture = False
        self.device.data_capture(enable)
Example #16
def main():
    print("loading data...")
    ds = dataset.Dataset(classes=classes)
    train_X, train_y = ds.load_data('train')

    train_X = ds.preprocess_inputs(train_X)
    train_Y = ds.reshape_labels(train_y)
    print("input data shape...", train_X.shape)
    print("input label shape...", train_Y.shape)

    test_X, test_y = ds.load_data('test')
    test_X = ds.preprocess_inputs(test_X)
    test_Y = ds.reshape_labels(test_y)
    print("creating model...")
    model = SegNet(input_shape=input_shape, classes=classes)
    model.compile(loss="categorical_crossentropy",
                  optimizer='adadelta',
                  metrics=["accuracy"])

    model.fit(train_X,
              train_Y,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              class_weight=class_weighting,
              validation_data=(test_X, test_Y),
              shuffle=True)

    model.save('seg.h5')
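
As a hedged follow-up (not part of the original script), the saved file could be reloaded for prediction roughly like this, reusing test_X and batch_size from main(); whether custom_objects is required depends on how SegNet is defined, so that part is an assumption:

from keras.models import load_model

segnet = load_model('seg.h5')  # may need custom_objects={...} if SegNet uses custom layers
preds = segnet.predict(test_X, batch_size=batch_size)
print('prediction shape...', preds.shape)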
Example #17
    def read_func_data(self, func_lst_in_loop):
        # ------------start(retrieve the target function data)------------------------
        function_data_file = func_lst_in_loop[0] + ".dat"
        function_data_path = os.path.join(self.output_dir, function_data_file)
        # result_path = os.path.join(self.output_dir, 'data_batch_result.pkl')
        if os.path.exists(function_data_path):
            with open(function_data_path, 'rb') as f:  # binary mode for pickle
                data_batch = pickle.load(f)
            print('read the function data !!! ... %s' % function_data_path)
        else:
            my_data = dataset.Dataset(self.data_folder, func_lst_in_loop,
                                      self.embed_path, self.process_num,
                                      self.embed_dim, self.num_classes,
                                      self.tag, self.int2insn_path)
            data_batch = my_data.get_batch(batch_size=self.batch_size)
            with open(function_data_path, 'wb') as f:  # binary mode for pickle
                pickle.dump(data_batch, f)
            print('Save the function_data_path !!! ... %s' %
                  function_data_path)

        # *******start(used to predict the label of this data_batch)********
        # keep_prob = 1.0
        # feed_batch_dict1 = {
        #     'data': data_batch['data'],
        #     'label': data_batch['label'],
        #     'length': data_batch['length'],
        #     'keep_prob_pl': keep_prob
        # }
        # print "type of feed_batch_dict1['data']", type(feed_batch_dict1['data'])
        # print "len of feed_batch_dict1['data']", len(feed_batch_dict1['data'])
        # print "data of feed_batch_dict1['data']", feed_batch_dict1['data']
        # eval_predict.main(feed_batch_dict1)
        # ******* end (used to predict the label of this data_batch)********
        # ------------ end (retrieve the target function data)------------------------
        return data_batch
Example #18
def evaluate(args):
    """
    Evaluate the classification model
    """
    logger = logging.getLogger("alibaba")
    logger.info("Load data_set , vocab and label config...")
    if args.pretrained_embedding:
        word_vocab_ = PretrainedVocab(args)

    else:
        with open(os.path.join(args.vocab_dir, "vocab.data"), "rb") as fin:
            word_vocab_ = pickle.load(fin)
    with open(os.path.join(args.vocab_dir, "vocab_character.data"),
              "rb") as fin:
        vocab_character_ = pickle.load(fin)
    data = dataset.Dataset(args)
    logger.info("Convert word to id...")
    data.convert_to_ids(word_vocab_, set_name='test')
    logger.info("Convert character to id...")
    data.convert_to_ids(vocab_character_, character=True, set_name='test')
    logger.info("Build Model...")
    model_ = model.Model(args,
                         word_vocab=word_vocab_,
                         character_vocab=vocab_character_)
    model_.restore(model_dir=args.model_dir, model_prefix=args.class_model)
    logger.info("Evaluating the model on dev set...")
    dev_batchs = data.get_mini_batchs(batch_size=args.batch_size,
                                      set_name="test",
                                      predict=True)
    _ = model_.predictiton(
        batch_data=dev_batchs,
        result_file=args.result_file,
        save_predict_label=True,
    )
    logger.info("Predicted labels are saved to {}".format(args.result_file))
Example #19
    def begin(self, edge, strategy):
        data = dataset.Dataset('data/data.csv')

        self.train_results_len = int(TRAINING_SET_SPLIT *
                                     len(data.processed_results))
        self.train_results = data.processed_results[:self.train_results_len]
        self.test_results = data.processed_results[self.train_results_len:]
        self.test_results_info = [[] for i in range(7)]
        for i in range(3):
            self.test_results_info[i] = data.result_info[i][self.train_results_len:]

        def map_results(results):
            features = {}

            for result in results:
                for key in result.keys():
                    if key not in features:
                        features[key] = []

                    features[key].append(result[key])

            for key in features.keys():
                features[key] = np.array(features[key])
            return features, features['result']

        self.train_features, self.train_labels = map_results(
            self.train_results)
        self.test_features, self.test_labels = map_results(self.test_results)

        return self.learn(edge, strategy)
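
A toy illustration of the map_results transform above, on made-up match dicts (the field names are hypothetical): a list of per-result dicts becomes a dict of numpy feature arrays plus the label array.

import numpy as np

results = [{'home_goals': 2, 'away_goals': 1, 'result': 1},
           {'home_goals': 0, 'away_goals': 0, 'result': 0}]

features = {}
for result in results:
    for key, value in result.items():
        features.setdefault(key, []).append(value)
features = {key: np.array(values) for key, values in features.items()}
labels = features['result']   # array([1, 0])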
Example #20
    def setup(self, path):
        """Prepare experiment.

        Args:
            path (str): path to experiment configuration file.
        Returns:
            None.

        """
        if self.verbose:
            print('\n-- Starting experiment')

        self.config = self.read_config_file(path)

        training_paths, test_paths = dataset.get_images_paths(
            self.config['training_path'], self.config['dataset_mode'])

        if self.verbose:
            print('\n-- Loading dataset \n')
        self.dataset = dataset.Dataset(paths=training_paths,
                                       config=self.config,
                                       verbose=self.verbose)

        self.labels = self.dataset.labels_array()

        self.data_matrix = self.dataset.compute_data_matrix()

        if self.verbose:
            print('\n-- Making principal component analysis')
        pca = PCA(int(self.config['n_components']))
        pca.fit_transform(self.data_matrix, y=self.labels)
Example #21
    def test_fit_basic(self):
        """
        Description:
            - overfit a debug dataset using the 'fit' method
        """
        tf.set_random_seed(1)
        np.random.seed(1)
        flags = testing_flags.FLAGS
        flags.input_dim = 3
        flags.hidden_dim = 32
        flags.num_hidden_layers = 2
        flags.output_dim = 2
        flags.batch_size = 16
        flags.num_epochs = 200
        flags.learning_rate = .05
        flags.l2_reg = 0.0
        flags.save_weights_every = 100000
        flags.snapshot_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), os.pardir, 'data',
                         'snapshots', 'test'))

        x = np.vstack((np.ones(
            (flags.batch_size // 2, flags.input_dim)), -1 * np.ones(
                (flags.batch_size // 2, flags.input_dim))))
        y = np.vstack((np.zeros((flags.batch_size // 2, flags.output_dim)),
                       np.ones((flags.batch_size // 2, flags.output_dim))))
        data = {'x_train': x, 'y_train': y, 'x_val': x, 'y_val': y}
        d = dataset.Dataset(data, flags)
        with tf.Session() as session:
            network = ffnn.FeedForwardNeuralNetwork(session, flags)
            network.fit(d)
            actual = network.predict(x)
            np.testing.assert_array_almost_equal(y, actual, 8)
Example #22
def train(args):
    """
    Training the classification model
    """
    logger = logging.getLogger("alibaba")
    logger.info("Load data_set , vocab and label config...")
    if args.pretrained_embedding:
        word_vocab_ = PretrainedVocab(args)

    else:
        with open(os.path.join(args.vocab_dir, "vocab.data"), "rb") as fin:
            word_vocab_ = pickle.load(fin)
    with open(os.path.join(args.vocab_dir, "vocab_character.data"),
              "rb") as fin:
        vocab_character_ = pickle.load(fin)
    data = dataset.Dataset(args)
    logger.info("Convert word to id...")
    data.convert_to_ids(word_vocab_)
    logger.info("Convert character to id...")
    data.convert_to_ids(vocab_character_, character=True)
    logger.info("Build Model...")
    model_ = model.Model(args,
                         word_vocab=word_vocab_,
                         character_vocab=vocab_character_)
    logger.info("Training the model...")
    model_.train(
        data,
        args.epochs,
        args.batch_size,
        save_dir=args.model_dir,
        save_prefix=args.class_model,
    )
    logger.info("Done with training...")
Example #23
def err_analyze(dst, mat, twtf, plcf, col):
    """output csv for mat"""
    twt_lst = dataset.Dataset()
    with open(twtf) as ftwt:
        for line in ftwt:
            twt_lst.append(json.loads(line))

    places = dataset.DataItem()
    with open(plcf) as fplc:
        for line in fplc:
            place = json.loads(line)
            places[place[col]] = place

    with open(dst, 'w') as fdst:
        print >>fdst, '"Ref POI", "Hyp POI", "Text", "Ref Genre", "Hyp Genre", "Ref SGenre", "Hyp SGenre"'
        for i in mat:
            for j in mat:
                #if i != j:
                    for item in mat[i][j]:
                        #              ref    hyp  text  rcat  hcat   rsc   hsc
                        try:
                            print >>fdst, '"{0}","{1}","{2}","{3}","{4}","{5}","{6}"' \
                                .format(csv_filter(places[i]['name']),csv_filter(places[j]['name']), \
                                fourq_filter(csv_filter(twt_lst[item]['text'])), \
                                places[i]['category'],places[j]['category'], \
                                places[i]['super_category'], places[j]['super_category'])
                        except: pass
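
The loop above is Python 2 (print >> with a hand-built format string). A Python 3 sketch of the same CSV output using csv.writer, reusing the snippet's variables and helper functions, might look like this:

import csv

with open(dst, 'w', newline='') as fdst:
    writer = csv.writer(fdst, quoting=csv.QUOTE_ALL)
    writer.writerow(['Ref POI', 'Hyp POI', 'Text', 'Ref Genre', 'Hyp Genre',
                     'Ref SGenre', 'Hyp SGenre'])
    for i in mat:
        for j in mat:
            for item in mat[i][j]:
                try:
                    writer.writerow([
                        csv_filter(places[i]['name']),
                        csv_filter(places[j]['name']),
                        fourq_filter(csv_filter(twt_lst[item]['text'])),
                        places[i]['category'], places[j]['category'],
                        places[i]['super_category'], places[j]['super_category'],
                    ])
                except KeyError:
                    # the original swallows every exception; a missing place key is the likely case
                    pass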
Example #24
    def __init__(self, config):
        self.config = config
        self.label = config.label
        self.path = config.get_result_path()
        self.dataset = dataset.Dataset(config.get_dataset_filename())
        self.dataset.load()
        self.df = self.load_results()
        self.data = self.load_data()
        self.selected_features = self.load_selected_features()
        weight_filename = self.dataset.get_target_filename(config.weight)
        self.weight = np.memmap(weight_filename, 'float32')
        target_filename = self.dataset.get_target_filename(config.target)
        self.target = np.memmap(target_filename, 'float32')
        train_data_filename = self.dataset.get_train_filename()
        self.train_data_index = np.memmap(train_data_filename, 'int32')
        test_data_filename = self.dataset.get_test_filename()
        self.test_data_index = np.memmap(test_data_filename, 'int32')
        #if self.config.loss == "gamma":
        self.test_data_index = np.intersect1d(self.test_data_index,
                                              self.weight.nonzero())
        self.train_data_index = np.intersect1d(self.train_data_index,
                                               self.weight.nonzero())
        self.train_data = self.data[self.train_data_index, :]
        self.test_data = self.data[self.test_data_index, :]

        self.df_coeffs = self.load_coeffs()
        self.gini_curve = self.load_gini_curve()
        self.nb_features = self.gini_curve.shape[0]
Example #25
def cate_smooth(twt_lst, ratio, sel, lmd):
    """Smoothing the dataset by place category"""
    rst_lst = dataset.Dataset()
    pid_lst = twt_lst.distinct('place_id')
    twt_dist = twt_lst.groupfunc('place_id', len)
    tid_set = set(twt_lst.distinct('place_id'))
    pid_set = set(pid_lst)

    for pid in pid_lst:
        plc = dataset.loadrows(GEOTWEET, ('id', 'lat', 'lng', 'super_category'), \
            ('id = \'{0}\''.format(pid),), 'place')
        plc_type = plc[0]['super_category']
        tmp_lst = list()
        cand = dataset.type_random(plc_type)

        for twt in cand:
            if twt['id'] not in tid_set and twt['place_id'] not in pid_lst:
                if sel(twt, plc):
                    twt['place_id'] = pid
                    tid_set.add(twt['id'])
                    pid_set.add(twt['place_id'])
                    tmp_lst.append(twt)
                if len(tmp_lst) >= ratio * twt_dist[pid]: break
        rst_lst.extend(tmp_lst)

    rst_lst.extend(twt_lst)

    return rst_lst
Example #26
 def benchmark(self, dd):
     """
     This method benchmarks Raha.
     """
     d = dataset.Dataset(dd)
     sampling_range = [self.LABELING_BUDGET]
     aggregate_results = {s: [] for s in sampling_range}
     for r in range(self.RUN_COUNT):
         print("Run {}...".format(r))
         for s in sampling_range:
             self.LABELING_BUDGET = s
             correction_dictionary = self.run(dd)
             er = d.get_data_cleaning_evaluation(correction_dictionary)[:3]
             aggregate_results[s].append(er)
     results_string = "\\addplot[error bars/.cd,y dir=both,y explicit] coordinates{(0,0.0)"
     for s in sampling_range:
         mean = numpy.mean(numpy.array(aggregate_results[s]), axis=0)
         std = numpy.std(numpy.array(aggregate_results[s]), axis=0)
         print("Raha on {}".format(d.name))
         print("Labeled Tuples Count = {}".format(s))
         print("Precision = {:.2f} +- {:.2f}".format(mean[0], std[0]))
         print("Recall = {:.2f} +- {:.2f}".format(mean[1], std[1]))
         print("F1 = {:.2f} +- {:.2f}".format(mean[2], std[2]))
         print("--------------------")
         results_string += "({},{:.2f})+-(0,{:.2f})".format(s, mean[2], std[2])
     results_string += "}; \\addlegendentry{Raha}"
     print(results_string)
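
To isolate the reporting step, this standalone sketch shows the mean/std aggregation the benchmark prints, using dummy (precision, recall, F1) tuples rather than real Raha results:

import numpy

runs = [(0.81, 0.70, 0.75), (0.79, 0.72, 0.75), (0.83, 0.69, 0.76)]   # dummy numbers
mean = numpy.mean(numpy.array(runs), axis=0)
std = numpy.std(numpy.array(runs), axis=0)
print("Precision = {:.2f} +- {:.2f}".format(mean[0], std[0]))
print("Recall = {:.2f} +- {:.2f}".format(mean[1], std[1]))
print("F1 = {:.2f} +- {:.2f}".format(mean[2], std[2]))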
Example #27
def load(view):
    """Load the UIUC Car Dataset"""
    assert view in {"train", "test"}

    folder = "CarData"

    if view == "train":
        positive_image_filenames = [
            "%s/TrainImages/pos-%d.pgm" % (folder, i)
            for i in xrange(550)  # specified by readme
        ]
        negative_image_filenames = [
            "%s/TrainImages/neg-%d.pgm" % (folder, i)
            for i in xrange(500)  # specified by readme
        ]

    records = [
        dataset.DatasetRecord(filename, None, "positive")
        for filename in positive_image_filenames
    ] + [
        dataset.DatasetRecord(filename, None, "negative")
        for filename in negative_image_filenames
    ]

    return dataset.Dataset(_name="UIUC Cars", _folder=folder, _records=records)
Example #28
    def getColumnDataset(self,
                         seriesNumber,
                         column,
                         error=None,
                         errorFn=None,
                         autoLabel=True,
                         name=None,
                         units=None):
        df = self.series[seriesNumber]
        col = df[df.columns[column]].dropna()

        if autoLabel:
            if name is not None or units is not None:
                _warnings.warn(
                    'autoLabel selected and manual name/units parameters set. Defaulting to manual name parameters where available.'
                )

            n, u = self.parseColumnName(df.columns[column])

            name = n if name is None else name
            units = u if units is None else units

        return _ds.Dataset(
            _np.array([
                s.replace(self.decimal, '.') if isinstance(s, str) else s
                for s in col
            ], 'float64'), error, errorFn, name, units)
Example #29
def main(argv=None):
    # custom parse of flags for list input
    compression_flags.custom_parse_flags(FLAGS)

    # set random seeds
    np.random.seed(FLAGS.random_seed)
    tf.set_random_seed(FLAGS.random_seed)

    # load dataset
    input_filepath = FLAGS.dataset_filepath
    data = dataset_loaders.risk_dataset_loader(
        input_filepath,
        shuffle=True,
        train_split=.9,
        debug_size=FLAGS.debug_size,
        timesteps=FLAGS.timesteps,
        num_target_bins=FLAGS.num_target_bins,
        balanced_class_loss=FLAGS.balanced_class_loss,
        target_index=FLAGS.target_index)

    if FLAGS.use_priority:
        d = priority_dataset.PrioritizedDataset(data, FLAGS)
    else:
        if FLAGS.balanced_class_loss:
            d = dataset.WeightedDataset(data, FLAGS)
        else:
            d = dataset.Dataset(data, FLAGS)

    print('means:\n{}\n{}'.format(np.mean(d.data['y_train'], axis=0),
                                  np.mean(d.data['y_val'], axis=0)))
    y = copy.deepcopy(d.data['y_val'])
    y[y == 0.] = 1e-8
    y[y == 1.] = 1 - 1e-8
    compression_metrics.regression_score(y, np.mean(y, axis=0), 'baseline')
    compression_metrics.regression_score(y, y, 'correct')

    # fit the model
    with tf.Session(config=tf.ConfigProto(
            log_device_placement=False)) as session:
        # if the timestep dimension is > 1, use recurrent network
        if FLAGS.timesteps > 1:
            network = rnn.RecurrentNeuralNetwork(session, FLAGS)
        else:
            if FLAGS.task_type == 'classification':
                if FLAGS.balanced_class_loss:
                    network = ffnn.WeightedClassificationFeedForwardNeuralNetwork(
                        session, FLAGS)
                else:
                    network = ffnn.ClassificationFeedForwardNeuralNetwork(
                        session, FLAGS)
            else:
                network = ffnn.FeedForwardNeuralNetwork(session, FLAGS)
        network.fit(d)

        # save weights to a julia-compatible weight file
        neural_networks.utils.save_trainable_variables(
            FLAGS.julia_weights_filepath, session, data)

        # evaluate the fit
        compression_metrics.evaluate_fit(network, data, FLAGS)
Example #30
    def __init__(self,
                 reg=0.0025,
                 learning_rate=0.05,
                 annealing=1.,
                 init_sigma=1,
                 k=32,
                 **kwargs):
        self.name = 'FM'
        self.dataset = dataset.Dataset()
        self.feature_dim = self.dataset.num_users + 2 * self.dataset.num_items + self.dataset.item_feature_dim

        self.reg = reg
        self.learning_rate = learning_rate  # self.learning_rate will change due to annealing.
        self.init_learning_rate = learning_rate  # self.init_learning_rate keeps the original value (for filename)
        self.annealing_rate = annealing
        self.init_sigma = init_sigma
        self.metrics = {
            'recall': {
                'direction': 1
            },
            'precision': {
                'direction': 1
            },
            'user_coverage': {
                'direction': 1
            },
            'item_coverage': {
                'direction': 1
            },
            'ndcg': {
                'direction': 1
            },
            # 'blockbuster_share' : {'direction': -1}
        }
        self.k = k  # dimension of the factorization (number of latent factors)