Example #1
    def testReader1(self):
        # Create DataReader instance
        data_reader = DataReader(self.filename, self.batch_size,
                                 self.seq_length)

        print(data_reader.vocab)
        print(data_reader.get_tensor(self.filename))
Example #2
    def readconfig(self, config, name, template):
        """ get this reader module configuration from config file """
        DataReader.readconfig(self, config, name, template)

        self._getopt('doublequote', config, name, template, True)
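        # the option may come back from the config file as a string, so any
        # non-default value is coerced to a real boolean below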
        if self.doublequote is not True:
            self.doublequote = self.doublequote == 'True'

        self._getopt('escapechar', config, name, template, None)
        if self.escapechar is not None:
            self.escapechar = self.escapechar[0]

        self._getopt('quotechar', config, name, template, '"')
        self.quotechar = self.quotechar[0]

        self._getopt('skipinitialspace', config, name, template, False)
        if self.skipinitialspace is not False:
            self.skipinitialspace = self.skipinitialspace == 'True'

        self._getopt('field_size_limit', config, name, template, -1, "mem")

        for opt in [
                'doublequote', 'escapechar', 'quotechar', 'skipinitialspace',
                'field_size_limit'
        ]:

            self.log.debug("reader.readconfig %s: '%s'" \
                           % (opt, self.__dict__[opt]))
Example #3
def evaluate():
    places = fluid.CUDAPlace(0)
    exe = fluid.Executor(places)

    [eval_prog, feed_target_names,
     fetch_targets] = fluid.io.load_inference_model(dirname=os.path.join(
         config.train['checkpoint_path'], 'infer_meteor'),
                                                    executor=exe)
    exe = fluid.ParallelExecutor(use_cuda=True, main_program=eval_prog)
    batch_size = config.train['batch_size']
    dr = DataReader()
    dr = dr.get_reader(batch_size, 'test')
    bleu_score = [0] * 5
    bleu_vec = ([1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1])
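    # each weight vector puts all weight on a single n-gram order, so slots 0-3
    # accumulate BLEU-1..BLEU-4 and slot 4 is later filled with their average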
    sentence_said = set()
    for l, data in enumerate(dr()):
        img, real_cap = zip(*data)
        cp = exe.run(
            feed={feed_target_names[0]: np.array(img, dtype='float32')},
            fetch_list=fetch_targets)[0]
        for idx, vec in enumerate(bleu_vec):
            bleu_score[idx] += calc_bleu(cp, real_cap, vec)
        if config.evaluate['sentence_statistics']:
            for p in cp:
                p = words2sentence(filter(p))
                sentence_said.add(p)
    for i in range(len(bleu_score)):
        bleu_score[i] /= l + 1
    bleu_score[4] = sum(bleu_score[:-1]) / 4
    print('BLEU [{:.7f}, {:.7f}, {:.7f}, {:.7f}] {:.7f}'.format(*bleu_score))
    if config.evaluate['sentence_statistics']:
        print('模型一共说了{}句不同的话'.format(len(sentence_said)))  # i.e. "the model produced {} distinct sentences in total"
Example #4
    def readconfig(self, config, name, template):
        """ get this reader module configuration from config file """
        DataReader.readconfig(self, config, name, template)

        self._getopt('doublequote', config, name, template, True)
        if self.doublequote is not True:
            self.doublequote = self.doublequote == 'True'
        
        self._getopt('escapechar', config, name, template, None)
        if self.escapechar is not None:
            self.escapechar = self.escapechar[0]

        self._getopt('quotechar', config, name, template, '"')
        self.quotechar = self.quotechar[0]

        self._getopt('skipinitialspace', config, name, template, False)
        if self.skipinitialspace is not False:
            self.skipinitialspace = self.skipinitialspace == 'True'

        self._getopt('field_size_limit', config, name, template, -1, "mem")

        for opt in ['doublequote', 'escapechar',
                    'quotechar', 'skipinitialspace', 'field_size_limit']:
            
            self.log.debug("reader.readconfig %s: '%s'" \
                           % (opt, self.__dict__[opt]))
Example #5
def show_summary():
    train_labels = DataReader.read_training_labels()
    test_labels = DataReader.read_test_labels()
    for label_ix in range(3):
        print('{} labels:'.format(LABEL_TO_METHANOMETER[label_ix]))
        train_counts = Counter(train_labels[:, label_ix])
        test_counts = Counter(test_labels[:, label_ix])
        print('Train -> ' + ' '.join('{} {}'.format(key, value) for key, value in train_counts.items()))
        print('Test -> ' + ' '.join('{} {}'.format(key, value) for key, value in test_counts.items()))
Example #6
    def __init__(self, log, db, reject, filename, input_encoding,
                 table, columns, newline_escapes = None):
        """ init textreader with a newline_escapes parameter """
        DataReader.__init__(self, log, db, reject,
                            filename, input_encoding, table, columns)

        if 'newline_escapes' not in self.__dict__:
            self.newline_escapes = newline_escapes

        self.log.debug('reader.__init__: newline_escapes %s' \
                       % self.newline_escapes)
Example #7
    def testReader2(self):
        # Create DataReader instance
        data_reader = DataReader(self.filename, self.batch_size,
                                 self.seq_length)

        tensor = data_reader.get_tensor(self.filename)

        data_reader.generate_batches(tensor)
        x = data_reader.x_batches
        y = data_reader.y_batches

        print(x)
        print(y)
Example #8
    def _transform_data_to_features(self):
        X_train_partials = []
        for X_train_partial in DataReader.iter_train_files_data():
            X_train_partials.append(X_train_partial)
        X_train = np.concatenate(X_train_partials, axis=0)
        rows = sum(partial.shape[0] for partial in X_train_partials)
        assert X_train.shape == (rows, DataReader.SENSOR_NUM * DataReader.SENSOR_DATA_COUNT_IN_ROW)
        train_features = self.transformer.transform(X_train)
        feature_names = np.asarray(self.transformer.get_feature_names())
        assert train_features.shape == (rows, len(feature_names))
        X_test = DataReader.read_test_data()
        assert X_test.shape == (X_test.shape[0], DataReader.SENSOR_NUM * DataReader.SENSOR_DATA_COUNT_IN_ROW)
        test_features = self.transformer.transform(X_test)
        assert test_features.shape == (test_features.shape[0], len(feature_names))
        return train_features, test_features, feature_names
Example #9
    def readconfig(self, config, name, template):
        """ get this reader module configuration from config file """
        DataReader.readconfig(self, config, name, template)

        # this will be called twice if templates are in used, so we
        # have to protect ourselves against removing already read
        # configurations while in second run.

        self._getopt('field_count', config, name, template, None, 'int')
        self._getopt('trailing_sep', config, name, template, False)
        if self.trailing_sep is not False:
            self.trailing_sep = self.trailing_sep == 'True'

        self.log.debug('reader.readconfig: field_count %s', self.field_count)
        self.log.debug('reader.readconfig: trailing_sep %s', self.trailing_sep)
Example #10
def train(args):
    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)

    writer = SummaryWriter("log")
    torch.cuda.set_device(args.device_id)

    model = CrossModal(vocab_size=args.vocab_size, 
            pretrain_path=args.pretrain_path).cuda()
    #model = torch.nn.DataParallel(model).cuda()
    criterion = RankLoss()
    optimizer = torch.optim.Adam(model.parameters(), 
            lr=args.learning_rate)

    step = 0
    for epoch in range(args.epochs):
        train_reader = DataReader(args.vocab_path, args.train_data_path, args.image_path, 
                args.vocab_size, args.batch_size, is_shuffle=True)
        print("train reader load succ......")
        for train_batch in train_reader.batch_generator():
            query = torch.from_numpy(train_batch[0]).cuda()
            pos = torch.stack(train_batch[1], 0).cuda()
            neg = torch.stack(train_batch[2], 0).cuda()
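            # each batch yields a (query, positive image, negative image) triple for the ranking loss below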

            optimizer.zero_grad()
        
            left, right = model(query, pos, neg)
            loss = criterion(left, right).cuda()

            loss.backward()
            optimizer.step()
            if step == 0:
                writer.add_graph(model, (query, pos, neg))

            if step % 100 == 0:
                writer.add_scalar('Train/Loss', loss.item(), step)    

            if step % args.eval_interval == 0:
                print('Epoch [{}/{}], Step [{}] Loss: {:.4f}'.format(epoch + 1, 
                            args.epochs, step, loss.item()), flush=True)

            if step % args.save_interval == 0:
                # Save the model checkpoint
                torch.save(model.state_dict(), '%s/model.ckpt' % args.model_path)
            step += 1
Example #11
def train(args):
    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)

    torch.cuda.set_device(args.device_id)
    model = CrossModal(vocab_size=args.vocab_size,
                       pretrain_path=args.pretrain_path).cuda()
    model.load_state_dict(torch.load(args.model_path + "/model.ckpt"))
    model.eval()

    train_reader = DataReader(args.vocab_path,
                              "./data/query.txt",
                              args.image_path,
                              args.vocab_size,
                              args.batch_size,
                              is_shuffle=False)
    for train_batch in train_reader.extract_emb_generator():
        query = torch.from_numpy(train_batch).cuda()
        vec_list = model.query_emb(query)
        for vec in vec_list:
            print(" ".join(
                [str(round(x, 4)) for x in vec.cpu().detach().numpy()]))
Example #12
class RealDataWorker:
    def __init__(self, pipe_recv, pipe_send):
        self.pipe_recv, self.pipe_send = pipe_recv, pipe_send

        self.queue_data = multiprocessing.Queue()
        self.queue_recv = multiprocessing.Queue()
        self.queue_send = multiprocessing.Queue()

        self.thread_data = threading.Thread(target=self.data_worker)
        self.thread_recv = threading.Thread(target=self.receiver)
        self.thread_send = threading.Thread(target=self.sender)

        self.reader = DataReader()
        self.thread_data.start()
        self.thread_recv.start()
        self.thread_send.start()

        logging.info(u"RealDataWorker - init finish")

    def __del__(self):
        pass

    def receiver(self):
        logging.info(u"start receiver thread")
        while True:
            c = self.pipe_recv.recv()
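            # a "stop" message pushes a None sentinel into every queue so the worker threads can shut down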
            if c == "stop":
                self.queue_recv.put(None)
                self.queue_send.put(None)
                self.queue_data.put(None)
            else:
                self.queue_recv.put(c)

    def sender(self):
        logging.info(u"start sender thread")
        try:
            while True:
                to_send = self.queue_send.get()
                if to_send is None:
                    return
                self.pipe_send.send(to_send)
        except:
            return

    def data_worker(self):
        logging.info(u"start DataReader thread")
        while True:
            obj = self.queue_recv.get()
            if obj is None:
                return
            self.queue_send.put(self.reader.process(obj))
Example #13
def main(params):
    # Arguments passed down from the parser
    download_data_path = params['input_data_path']
    data_basepath = params['output_data_path']
    logs_path = params['logs_path']
    plots_path = params['plots_path']
    contour_type = params['contour_type']
    toggle_plot = params['toggle_plot']
    mini_batch_size = params['mini_batch_size']

    # Set up logging
    _setup_logging(logs_path)

    # Meat of the python program
    logging.info(
        'Started running preprocessor for the following parameters: {}'.format(
            params))
    reader = DataReader(download_data_path=download_data_path,
                        data_basepath=data_basepath,
                        logs_path=logs_path,
                        plots_path=plots_path,
                        contour_type=contour_type,
                        save_plot=toggle_plot)
    images, masks, metadata = reader.load_samples(reader.sample_tuples)
    loader = DataLoader(output_dir=data_basepath,
                        images=images,
                        masks=masks,
                        metadata=metadata,
                        mini_batch_size=mini_batch_size)
    minibatches = loader.random_mini_batches()

    # If user enabled the toggle_plot to evaluate the reader and loader modules
    if toggle_plot:
        # Check out the overall view of all samples (dicoms, masks) with no shuffle and no partitioning
        logging.debug(
            'Plotting the overall view of all (dicom, mask) samples...')
        reader.plot_samples(images, masks, metadata,
                            'data-reader_no-shuffle_batchset.jpg')

        # Check out first minibatch to see whether it matches the ones in 'data-reader_no-shuffle_batchset.jpg' with same label
        logging.debug(
            'Extracting and plotting the first minibatch to validate DataLoader against the previous plot from DataReader...'
        )
        for i, minibatch in enumerate(minibatches):
            if i > 1:
                break
            minibatch_image, minibatch_mask, minibatch_metadata = minibatch

        # minibatch_image (8,256,256), minibatch_mask (8,256,256), minibatch_metadata (8,)
        reader.plot_samples(minibatch_image, minibatch_mask,
                            minibatch_metadata,
                            'data-loader_shuffled_batchset.jpg')
        logging.info('Finished running preprocessor...')
Example #14
class BitmexData:
    def __init__(self, data_dir, initial_date):
        self.reader = DataReader(data_dir, initial_date)

    def _idtoprice(self, ID, symbolIdx=88, ticksize=0.01):
        price = ((100000000 * symbolIdx) - ID) * ticksize
        return price
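    # e.g. with the default symbolIdx=88 and ticksize=0.01, a hypothetical
    # ID of 8799132000 maps to (8800000000 - 8799132000) * 0.01 == 8680.0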

    def _zip_orderBookL2(self, bulk_orderBookL2):
        # return form: data [{side: 'sell', price: '8685', size: '11223'}]
        zip_data = []
        for tick in bulk_orderBookL2:
            side = tick['side']
            price = self._idtoprice(tick['id'])
            size = tick['size']
            zip_data.append({'side': side, 'price': price, 'size': size})
        return zip_data

    def _zip_trade(self, bulk_trade):
        zip_data = []
        for trade in bulk_trade:
            side = trade['side']
            price = trade['price']
            size = trade['size']
            zip_data.append({'side': side, 'price': price, 'size': size})
        return zip_data

    def _zip_liquid(self, bulk_liquid):
        zip_data = []
        for trade in bulk_liquid:
            side = trade['side']
            price = trade['price']
            size = trade['size']
            zip_data.append({'side': side, 'price': price, 'size': size})
        return zip_data

    def next_data(self):
        bulk_data = self.reader.next_file()
        data = {'orderBookL2': [], 'trade': [], 'liquid': []}
        for each_data in bulk_data:
            if each_data['table'] == 'orderBookL2':
                data['orderBookL2'].append(
                    self._zip_orderBookL2(each_data['data']))
            elif each_data['table'] == 'trade':
                data['trade'].append(self._zip_trade(each_data['data']))
            else:
                data['liquid'].append(self._zip_liquid(each_data['data']))

        return data
Example #15
    def __init__(self, pipe_recv, pipe_send):
        self.pipe_recv, self.pipe_send = pipe_recv, pipe_send

        self.queue_data = multiprocessing.Queue()
        self.queue_recv = multiprocessing.Queue()
        self.queue_send = multiprocessing.Queue()

        self.thread_data = threading.Thread(target=self.data_worker)
        self.thread_recv = threading.Thread(target=self.receiver)
        self.thread_send = threading.Thread(target=self.sender)

        self.reader = DataReader()
        self.thread_data.start()
        self.thread_recv.start()
        self.thread_send.start()

        logging.info(u"RealDataWorker - init finish")
Example #16
    def make_feature_transformer_pipeline(sensor_group_count, n_jobs):
        assert sensor_group_count % 60 == 0
        feature_transformers = [
            ('max', SensorTransformer(np.max)),
            ('min', SensorTransformer(np.min)),
            ('first_location_of_maximum', SensorTransformer(first_location_of_maximum)),
            ('last_location_of_maximum', SensorTransformer(last_location_of_maximum)),
            ('binned_entropy_5', SensorTransformer(binned_entropy, max_bins=5)),
            ('mean', SensorTransformer(np.mean)),
            ('median', SensorTransformer(np.median)),
            ('variance', SensorTransformer(np.var)),
            ('std', SensorTransformer(np.std)),
            ('sum_values', SensorTransformer(np.sum)),
            ('mean_change', SensorTransformer(mean_change)),
            ('mean_abs_change', SensorTransformer(mean_abs_change)),
            ('absolute_sum_of_changes', SensorTransformer(absolute_sum_of_changes)),
            ('abs_energy', SensorTransformer(abs_energy)),
            ('percentile_10', SensorTransformer(np.percentile, q=10)),
            ('percentile_20', SensorTransformer(np.percentile, q=20)),
            ('percentile_80', SensorTransformer(np.percentile, q=80)),
            ('percentile_90', SensorTransformer(np.percentile, q=90)),
            # ('fft_coefficent', SensorMultiTransformer(
            #     fft_coefficient,
            #     param=[{'coeff': coeff} for coeff in range(5)]
            # )),
            # ('cwt_coeff', SensorMultiTransformer(
            #      cwt_coefficients,
            #      param=[{'coeff': coeff, 'widths': (2, 5, 10, 20), 'w': w}
            #             for coeff in range(15) for w in (2, 5, 10, 20)]
            #  ))
        ]
        sensor_names = DataReader.get_sensor_names()
        for _, feature_transformer in feature_transformers:
            feature_transformer.sensor_names = sensor_names
            feature_transformer.sensor_group_minutes_interval = sensor_group_count // 60
        return SensorPipeline([
            ('groups', SensorGroupingTransformer(
                sensor_data_count=DataReader.SENSOR_DATA_COUNT_IN_ROW,
                sensor_group_count=sensor_group_count
            )),
            ('features', SensorFeatureUnion(feature_transformers, n_jobs=n_jobs)),
        ])
Example #17
def update(input_to_update: pd.DataFrame):
    feature_cols = [
        'dire_score', 'radiant_score', 'duration', 'patch', 'region',
        'radiant_team_id', 'dire_team_id'
    ]
    y_cols = ['radiant_win']
    x_cols = [
        'avg_dire_score', 'avg_radiant_score', 'avg_duration', 'patch',
        'region'
    ]
    x_cols += ['radiant_team_id', 'dire_team_id']
    #x_cols += [f'radiant_player_{j}' for j in range(1, 6)] + [f'dire_player_{j}' for j in range(1, 6)]

    data_reader = DataReader('../Datasets/BaseDataset/dota2_dataset.pickle',
                             feature_cols, y_cols, x_cols)
    data_reader.read_preprocessed(
        '../Datasets/BaseDataset/dota2_dataset_preprocessed.pickle')
    input_to_update = data_reader.add_observations(input_to_update)
    data_reader.write_data(
        '../Datasets/BaseDataset/dota2_dataset_preprocessed.pickle')

    radiant_wr = np.where(data_reader.preprocessed_data[y_cols])[0].shape[0] / \
                 data_reader.preprocessed_data[y_cols].shape[0]
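    # radiant_wr is the share of matches won by Radiant; it and its complement below serve as class cost weights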

    cost_weigths = np.asarray([radiant_wr, 1. - radiant_wr])
    lr = 1e-5

    model, x, y = build_model(cost_weigths_=cost_weigths, learning_rate=lr)

    train_x = np.expand_dims(input_to_update[x_cols], axis=-1)
    train_y = np.hstack((input_to_update[y_cols], 1 - input_to_update[y_cols]))

    print(train_y)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, "model.ckpt")
        _, c = sess.run([model.optimize(), model.cost()],
                        feed_dict={
                            x: train_x,
                            y: train_y
                        })
        saver.save(sess, "model.ckpt")
Example #18
def main(_):
  vocab = load_vocabulary(FLAGS.data_dir)
  data_reader = DataReader(FLAGS.data_dir)

  model = Model(total_users=data_reader.total_users, total_items=data_reader.total_items,
                global_rating=data_reader.global_rating, num_factors=FLAGS.num_factors,
                img_dims=[196, 512], vocab_size=len(vocab), word_dim=FLAGS.word_dim,
                lstm_dim=FLAGS.lstm_dim, max_length=FLAGS.max_length, dropout_rate=FLAGS.dropout_rate)

  update_rating, update_review, global_step = train_fn(model)

  saver = tf.compat.v1.train.Saver(max_to_keep=10)

  log_file = open('log.txt', 'w')
  test_step = 0

  config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
  config.gpu_options.allow_growth = True
  with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(1, FLAGS.num_epochs + 1):
      log_info(log_file, "\nEpoch: {}/{}".format(epoch, FLAGS.num_epochs))

      count = 0
      sum_rating_loss = 0
      sum_review_loss = 0

      # Training
      for users, items, ratings in data_reader.read_train_set(FLAGS.batch_size, rating_only=True):
        count += 1

        fd = model.feed_dict(users=users, items=items, ratings=ratings, is_training=True)
        _step, _, _rating_loss = sess.run([global_step, update_rating, model.rating_loss], feed_dict=fd)
        sum_rating_loss += _rating_loss

        review_users, review_items, _, photo_ids, reviews = get_review_data(users, items, ratings,
                                                                            data_reader.train_review)
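        # map the photo ids attached to these reviews onto rows of the pre-extracted image feature matrix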
        img_idx = [data_reader.train_id2idx[photo_id] for photo_id in photo_ids]
        images = data_reader.train_img_features[img_idx]

        fd = model.feed_dict(users=review_users, items=review_items, images=images,
                             reviews=reviews, is_training=True)
        _, _review_loss = sess.run([update_review, model.review_loss], feed_dict=fd)
        sum_review_loss += _review_loss

        if _step % FLAGS.display_step == 0:
          data_reader.iter.set_postfix(rating_loss=(sum_rating_loss / count),
                                       review_loss=(sum_review_loss / count))

      # Testing
      review_gen_corpus = defaultdict(list)
      review_ref_corpus = defaultdict(list)

      photo_bleu_scores = defaultdict(list)
      photo_rouge_scores = defaultdict(list)

      review_bleu_scores = defaultdict(list)
      review_rouge_scores = defaultdict(list)

      sess.run(model.init_metrics)
      for users, items, ratings in data_reader.read_test_set(FLAGS.batch_size, rating_only=True):
        test_step += 1

        fd = model.feed_dict(users, items, ratings)
        sess.run(model.update_metrics, feed_dict=fd)

        review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(users, items, ratings,
                                                                                         data_reader.test_review)
        img_idx = [data_reader.test_id2idx[photo_id] for photo_id in photo_ids]
        images = data_reader.test_img_features[img_idx]

        fd = model.feed_dict(users=review_users, items=review_items, images=images)
        _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas], feed_dict=fd)

        gen_reviews = decode_reviews(_reviews, vocab)
        ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]

        for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
          review_gen_corpus[(user, item)].append(gen)
          review_ref_corpus[(user, item)] += refs

          bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            photo_bleu_scores[order].append(score)

          rouge_scores = rouge([gen], refs)
          for metric, score in rouge_scores.items():
            photo_rouge_scores[metric].append(score)

      _mae, _rmse = sess.run([model.mae, model.rmse])
      log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

      log_info(log_file, '\nReview generation results:')
      log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(photo_bleu_scores[1]).mean() * 100, np.array(photo_bleu_scores[2]).mean() * 100,
        np.array(photo_bleu_scores[3]).mean() * 100, np.array(photo_bleu_scores[4]).mean() * 100))

      for user_item, gen_reviews in review_gen_corpus.items():
        references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]

        user_item_bleu_scores = defaultdict(list)
        for gen in gen_reviews:
          bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            user_item_bleu_scores[order].append(score)
        for order, scores in user_item_bleu_scores.items():
          review_bleu_scores[order].append(np.array(scores).mean())

        user_item_rouge_scores = defaultdict(list)
        for gen in gen_reviews:
          rouge_scores = rouge([gen], references)
          for metric, score in rouge_scores.items():
            user_item_rouge_scores[metric].append(score)
        for metric, scores in user_item_rouge_scores.items():
          review_rouge_scores[metric].append(np.array(scores).mean())

      log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(review_bleu_scores[1]).mean() * 100, np.array(review_bleu_scores[2]).mean() * 100,
        np.array(review_bleu_scores[3]).mean() * 100, np.array(review_bleu_scores[4]).mean() * 100))

      for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
        log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
        log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))

      save_path = saver.save(sess, f"tmp/model{epoch}.ckpt")
      log_info(log_file, '')
Example #19
    def __init__(self, data_dir, initial_date):
        self.reader = DataReader(data_dir, initial_date)
Example #20
def train(args):
    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)
    #tf.reset_default_graph()
    model = CrossModel(vocab_size=args.vocab_size)
    # optimizer
    train_step = tf.contrib.opt.LazyAdamOptimizer(
        learning_rate=args.learning_rate).minimize(model.loss)
    saver = tf.train.Saver()
    loss_summary = tf.summary.scalar("train_loss", model.loss)
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init)
        #variables_to_restore = slim.get_variables_to_restore()
        #restore_fn = slim.assign_from_checkpoint_fn(args.pretrain_path, variables_to_restore)
        #restore_fn(sess)
        #sess.run(tf.global_variables_initializer())
        init_variables_from_checkpoint(args.pretrain_path)

        _writer = tf.summary.FileWriter(args.logdir, sess.graph)
        # init embedding
        embedding = load_embedding(args.emb_path, args.vocab_size, 256)
        _ = sess.run(model.embedding_init,
                     feed_dict={model.embedding_in: embedding})
        print("loading pretrain emb succ.")

        # summary
        summary_op = tf.summary.merge([loss_summary])
        step = 0
        for epoch in range(args.epochs):
            train_reader = DataReader(args.vocab_path,
                                      args.train_data_path,
                                      args.image_data_path,
                                      args.vocab_size,
                                      args.batch_size,
                                      is_shuffle=True)
            print("train reader load succ.")
            for train_batch in train_reader.batch_generator():
                query, pos, neg = train_batch

                _, _loss, _summary = sess.run(
                    [train_step, model.loss, summary_op],
                    feed_dict={
                        model.text: query,
                        model.img_pos: pos,
                        model.img_neg: neg
                    })
                _writer.add_summary(_summary, step)
                step += 1

                # test
                sum_loss = 0.0
                iters = 0
                summary = tf.Summary()
                if step % args.eval_interval == 0:
                    print("Epochs: {}, Step: {}, Train Loss: {:.4}".format(
                        epoch, step, _loss))

                    test_reader = DataReader(args.vocab_path,
                                             args.test_data_path,
                                             args.image_data_path,
                                             args.vocab_size, args.batch_size)
                    for test_batch in test_reader.batch_generator():
                        query, pos, neg = test_batch
                        _loss = sess.run(model.loss,
                                         feed_dict={
                                             model.text: query,
                                             model.img_pos: pos,
                                             model.img_neg: neg
                                         })
                        sum_loss += _loss
                        iters += 1
                    avg_loss = sum_loss / iters
                    summary.value.add(tag="test_loss", simple_value=avg_loss)
                    _writer.add_summary(summary, step)
                    print("Epochs: {}, Step: {}, Test Loss: {:.4}".format(
                        epoch, step, sum_loss / iters))
                if step % args.save_interval == 0:
                    save_path = saver.save(sess,
                                           "{}/model.ckpt".format(
                                               args.model_path),
                                           global_step=step)
                    print("Model save to path: {}/model.ckpt".format(
                        args.model_path))
Example #21
                        'rawfilename' in config['APP'] and 'fun' in config['APP']\
                            and 'predictionfilename' in config['APP'])

    data_columns = config['COVID_DATA']['datacolumns'].split(",")

    base_dir_report = config['APP']['report_dir']

    filereportname = config['APP']['filereportname']
    rawfilename = config['APP']['rawfilename']
    predictionfilename = config['APP']['predictionfilename']

    funname = config['APP']['fun']
    #############  #############

    ## PREPARE ##
    reader = DataReader(config['COVID_DATA']['url'],
                        config['APP']['reader_mode'])
    dataextract = ExtractCovidData(data_columns)
    fittingclass = CovidFitFunctions()
    statcalc = ComputeStat()
    prediction = CovidPrediction()

    _idelab_ = datetime.now().strftime("%Y%m%d_%H%M%S")
    _reportdir_ = path.join(base_dir_report, _idelab_)
    if not path.exists(base_dir_report):
        mkdir(base_dir_report)
    _reportlog_ = []
    if not path.exists(_reportdir_):
        mkdir(_reportdir_)

    _reportlog_.append(
        LOGROW.format(dt=str(datetime.now()), tx="Start Process " + _idelab_))
Example #22
    def __init__(self):
        self.dr = DataReader()
Example #23
def train(args):
    config = ParameterConfig()

    data_reader = DataReader(args['data'], config.batch_size, config.seq_length)
    config.vocab_size = data_reader.vocab_size
    
    if not os.path.exists(args['model_dir']):
        os.makedirs(args['model_dir'])
        
    with open(os.path.join(args['model_dir'], 'config.pkl'), 'wb') as f:
        cPickle.dump(config, f)
    with open(os.path.join(args['model_dir'], 'vocab.pkl'), 'wb') as f:
        cPickle.dump((data_reader.tokens, data_reader.vocab), f)

    training_model = RNNModel(config=config)
            
    with tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,config.init_scale)
        
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())
        
        #Run a single epoch of training
        for epoch in range(config.total_max_epoch):
            current_state = session.run(training_model.initial_state)
                    
            learning_rate_decay = config.lr_decay ** max(epoch - config.max_epoch, 0.0)
            training_model.assign_learningRate(session, config.learning_rate * learning_rate_decay)
                    
            total_cost = 0.0
            total_seq = 0
                    
            data_reader.reset_batch_pointer()
            for batch in range(data_reader.num_batches):
                start = time.time()
                x,y = data_reader.next_batch()
                feed_dict = {training_model.input_data: x, training_model.targets: y, 
                             training_model.initial_state: current_state}
                  
                cost, current_state, _ = session.run([training_model.cost, training_model.final_state, training_model.train_op], 
                                                     feed_dict) 
                 
                total_cost += cost
                total_seq += config.seq_length
                 
                perplexity = np.exp(total_cost / total_seq)
                end = time.time()                 
                
                print("{}/{} (epoch {}), perplexity = {:.3f}, time/batch = {:.3f}" \
                    .format(epoch * data_reader.num_batches + batch,
                            config.total_max_epoch * data_reader.num_batches,
                            epoch, perplexity, end - start))
                sys.stdout.flush()

                if ((epoch * data_reader.num_batches + batch) % 1000 == 0 \
                        or (epoch == config.total_max_epoch - 1 and batch == data_reader.num_batches - 1)):
                    
                    checkpoint_path = os.path.join(args['model_dir'], 'model.ckpt')
                    saver.save(session, checkpoint_path, global_step = epoch * data_reader.num_batches + batch)
                    print("Model saved to {}".format(checkpoint_path))
                    sys.stdout.flush()

    session.close()
Example #24
import numpy as np
import os
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from paddle import fluid

import config
from reader import DataReader

index_word = DataReader().index_word
stop_tag = config.data['stop_idx']
padding_tag = config.data['padding_idx']


def filter(p):
    """
        Convert a list of word indices into a list of words
    """
    result = []
    for idx in p:
        if idx == stop_tag:
            break
        if idx == padding_tag: continue
        result.append(index_word[idx])
    return result
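# e.g. with hypothetical tag values stop_tag=2 and padding_tag=0, filter([5, 0, 7, 2, 9])
# skips the padding index, stops at the stop index and returns [index_word[5], index_word[7]]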


def calc_bleu(pred, real, weights=(0.25, 0.25, 0.25, 0.25)):
    if isinstance(pred, np.ndarray):
        if pred.dtype == 'float32':
            pred = np.rint(pred).astype('int32')
Example #25
import config
import evaluate
from tools import util
from tools.logger import Logger
from model.model_adaAttention_aic import ImageCaptionModel
from reader import DataReader

seed = config.train['seed']
decoder_config = config.md['decoder']
encoder_config = config.md['encoder']
batch_size = config.train['batch_size']
capacity = config.train['data_loader_capacity']

logger = Logger()
data_reader = DataReader()
random.seed(seed)
np.random.seed(seed)


def get_optimizer():
    base_lr = config.train['learning_rate']
    strategy = config.train['lr_decay_strategy']
    lr = util.get_lr(strategy, base_lr, config.data['sample_count'], config.train['batch_size'])

    return fluid.optimizer.Adam(lr), lr


def training_net():
    startup_prog, train_prog = fluid.Program(), fluid.Program()
    train_prog.random_seed = 0  # must be 0, otherwise dropout misbehaves
Example #26
    self._algo.train(*args)

  def eval(self):
    self._algo.eval()




if __name__ == '__main__':
  import random
  random.seed(1)
  filename = 'data_banknote_authentication.csv'
  max_depth = 3
  min_size = 10
  nCut = 20
  r =DataReader()
  r.load_csv(filename)
  r.str_column_to_float()
  r.cross_validation_split()

  #run simple tree
  a = SimpleTree()
  dtc = DecisionTreeClassifier()
  dtc.set_reader(r)
  dtc.set_algo(a)
  dtc.train(max_depth, min_size, nCut)
  dtc.eval()


  #run random forest
  a = RandomForest()
Example #27
def train(args):
    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)
    tf.reset_default_graph()
    model = TextClassification(vocab_size=args.vocab_size, 
            encoder_type=args.encoder_type, max_seq_len=args.max_seq_len)
    # optimizer
    train_step = tf.contrib.opt.LazyAdamOptimizer(learning_rate=args.learning_rate).minimize(model.loss)
    saver = tf.train.Saver()
    loss_summary = tf.summary.scalar("train_loss", model.loss)
    init = tf.group(tf.global_variables_initializer(), 
            tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init)
        # feeding embedding
        _writer = tf.summary.FileWriter(args.logdir, sess.graph)

        # summary
        summary_op = tf.summary.merge([loss_summary])
        step = 0
        for epoch in range(args.epochs):
            train_reader = DataReader(args.vocab_path, args.train_data_path, 
                    args.vocab_size, args.batch_size, args.max_seq_len)
            for train_batch in train_reader.batch_generator():
                text, label = train_batch
                _, _loss, _summary, _logits = sess.run([train_step, model.loss, summary_op, model.logits],
                        feed_dict={model.label_in: label, model.text_in: text})
                _writer.add_summary(_summary, step)
                step += 1


                # test
                summary = tf.Summary()
                if step % args.eval_interval == 0:
                    acc, acc_op = tf.metrics.accuracy(labels=tf.argmax(label, 1),
                            predictions=tf.argmax(_logits, 1))
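                    # tf.metrics.accuracy keeps running counters in local variables, so they are
                    # re-initialised here to score only the current batch of logits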
                    sess.run(tf.local_variables_initializer())
                    _, _acc = sess.run([acc, acc_op])
                    summary.value.add(tag="train_accuracy", simple_value=_acc)
                    print("Epochs: {}, Step: {}, Train Loss: {}, Acc: {}".format(epoch, step, _loss, _acc))

                    test_reader = DataReader(args.vocab_path, args.test_data_path, 
                            args.vocab_size, args.batch_size, args.max_seq_len)
                    sum_loss = 0.0
                    sum_acc = 0.0
                    iters = 0
                    for test_batch in test_reader.batch_generator():
                        text, label = test_batch
                        _loss, _logits = sess.run([model.loss, model.logits],
                                feed_dict={model.label_in: label, model.text_in: text})
                        acc, acc_op = tf.metrics.accuracy(labels=tf.argmax(label, 1),
                                predictions=tf.argmax(_logits, 1))
                        sess.run(tf.local_variables_initializer())
                        _, _acc = sess.run([acc, acc_op])
                        sum_acc += _acc
                        sum_loss += _loss
                        iters += 1
                    avg_loss = sum_loss / iters
                    avg_acc = sum_acc / iters
                    summary.value.add(tag="test_accuracy", simple_value=avg_acc)
                    summary.value.add(tag="test_loss", simple_value=avg_loss)
                    _writer.add_summary(summary, step)
                    print("Epochs: {}, Step: {}, Test Loss: {}, Acc: {}".format(epoch, step, avg_loss, avg_acc))
                if step % args.save_interval == 0:
                    save_path = saver.save(sess, "{}/birnn.lm.ckpt".format(args.model_path), global_step=step)
                    print("Model save to path: {}/birnn.lm.ckpt".format(args.model_path))
Example #28
class TextUI:
    __prefix_data = "./data/"
    __data_indices = [2, 11, 18, 23, 42, 44, 46, 50, 52]

    __files: List[AnyStr]

    def __init__(self):
        self.dr = DataReader()

    def find_data_files(self):
        self.__files = []
        for file in os.listdir(self.__prefix_data):
            if file.endswith(".csv") and "Signals" in file:
                if any((("{0:0>2}".format(i) in file)
                        for i in self.__data_indices)):
                    self.__files.append(file)

        self.__files.sort()

    def run(self):
        model = None
        scaler = None
        while True:
            self.find_data_files()

            print(
                "Enter:\nt - to train the model, \ne - to test a trained model, \nl - to load a pre-trained model\nq - to quit"
            )
            print(
                "IMPORTANT: Always train or load a model before testing it!\n")
            choice = input("Your choice: ")
            if choice is "l":
                model = load_model("model.h5")
            elif choice is "t" or choice is "e":
                print("Data files:\n")
                for i, file in enumerate(self.__files):
                    print("{} - {}".format(i + 1, file))

                if choice is "t":
                    number = input(
                        "\nPlease select the file to train the model on: ")
                else:
                    number = input(
                        "\nPlease select the file to test the model on: ")

                index = int(number)
                index -= 1

                if 0 <= index < len(self.__files):
                    data = self.dr.read_set(self.__data_indices[index])

                    pp = Preprocess()
                    data, scaler = pp.clean_up(data)
                    data = pp.convert_to_supervised(data, sample_shift=0)
                    if choice is "t":
                        train, test = pp.prepare_sets(data, 0.2)
                        train_X, train_y = pp.make_input_output(
                            train, remove_resp_from_input=True)
                        test_X, test_y = pp.make_input_output(
                            test, remove_resp_from_input=True)
                        trainer = RespRatePredictor()
                        self.dr.plot(data)
                        model = trainer.make_network(
                            input_shape=(train_X.shape[1], train_X.shape[2]))
                        model = trainer.fit_network(model, train_X, train_y,
                                                    test_X, test_y)
                        model.save("model_{0:0>2}.h5".format(
                            self.__data_indices[index - 1]))
                    else:
                        all_X, all_y = pp.make_input_output(
                            data.drop("Time [s]", axis=1),
                            remove_resp_from_input=True)
                        predict_y = model.predict(all_X, batch_size=640)
                        # min_ = scaler.min_[1]
                        # scale_ = scaler.scale_[1]

                        # predict_y = (predict_y - min_) / scale_
                        predicted = pnd.DataFrame(
                            {"RESP_PREDICTED": predict_y.flatten()})

                        fused = pnd.concat([data, predicted], axis=1)
                        self.dr.plot(fused)
                        self.dr.plot_detail(fused)
                else:
                    continue
            else:
                break
Example #29
def readTaxiData(filename):
	TaxiDataList = DataReader(filename).dataProcess
	return TaxiDataList
Example #30
def main(_):
  vocab = load_vocabulary(FLAGS.data_dir)
  if FLAGS.generating:
    data_reader = DataReader(FLAGS.data_dir, n_reviews=5, generating=True)
  else:
    data_reader = DataReader(FLAGS.data_dir)
  model = Model(total_users=data_reader.total_users, total_items=data_reader.total_items,
                global_rating=data_reader.global_rating, num_factors=FLAGS.num_factors,
                img_dims=[196, 512], vocab_size=len(vocab), word_dim=FLAGS.word_dim,
                lstm_dim=FLAGS.lstm_dim, max_length=FLAGS.max_length, dropout_rate=FLAGS.dropout_rate)

  saver = tf.compat.v1.train.Saver(max_to_keep=10)

  log_file = open('log.txt', 'w')
  test_step = 0

  config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
  config.gpu_options.allow_growth = True

  with tf.Session(config=config) as sess:
      saver.restore(sess, FLAGS.ckpt_dir)
      print('Model successfully restored')
      # Testing
      review_gen_corpus = defaultdict(list)
      review_ref_corpus = defaultdict(list)

      photo_bleu_scores = defaultdict(list)
      photo_rouge_scores = defaultdict(list)

      review_bleu_scores = defaultdict(list)
      review_rouge_scores = defaultdict(list)

      sess.run(model.init_metrics)
      for users, items, ratings in data_reader.read_real_test_set(FLAGS.batch_size, rating_only=True):
        test_step += 1

        fd = model.feed_dict(users, items, ratings)
        sess.run(model.update_metrics, feed_dict=fd)

        review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(users, items, ratings,
                                                                                         data_reader.real_test_review)
        img_idx = [data_reader.real_test_id2idx[photo_id] for photo_id in photo_ids]
        images = data_reader.real_test_img_features[img_idx]

        fd = model.feed_dict(users=review_users, items=review_items, images=images)
        _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas], feed_dict=fd)

        gen_reviews = decode_reviews(_reviews, vocab)
        ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]

        if FLAGS.generating:
          for gen, ref in zip(gen_reviews, ref_reviews):
            gen_str = "GENERATED:\n"+" ".join(gen)
            ref_str = "REFERENCE:\n"+" ".join([" ".join(sentence) for sentence in ref])+"\n"
            log_info(log_file,gen_str)
            log_info(log_file,ref_str)

        for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
          review_gen_corpus[(user, item)].append(gen)
          review_ref_corpus[(user, item)] += refs

          bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            photo_bleu_scores[order].append(score)

          rouge_scores = rouge([gen], refs)
          for metric, score in rouge_scores.items():
            photo_rouge_scores[metric].append(score)

      _mae, _rmse = sess.run([model.mae, model.rmse])
      log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

      log_info(log_file, '\nReview generation results:')
      log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(photo_bleu_scores[1]).mean() * 100, np.array(photo_bleu_scores[2]).mean() * 100,
        np.array(photo_bleu_scores[3]).mean() * 100, np.array(photo_bleu_scores[4]).mean() * 100))

      for user_item, gen_reviews in review_gen_corpus.items():
        references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]

        user_item_bleu_scores = defaultdict(list)
        for gen in gen_reviews:
          bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
          for order, score in bleu_scores.items():
            user_item_bleu_scores[order].append(score)
        for order, scores in user_item_bleu_scores.items():
          review_bleu_scores[order].append(np.array(scores).mean())

        user_item_rouge_scores = defaultdict(list)
        for gen in gen_reviews:
          rouge_scores = rouge([gen], references)
          for metric, score in rouge_scores.items():
            user_item_rouge_scores[metric].append(score)
        for metric, scores in user_item_rouge_scores.items():
          review_rouge_scores[metric].append(np.array(scores).mean())

      log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
        np.array(review_bleu_scores[1]).mean() * 100, np.array(review_bleu_scores[2]).mean() * 100,
        np.array(review_bleu_scores[3]).mean() * 100, np.array(review_bleu_scores[4]).mean() * 100))

      for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
        log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
        log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
          metric,
          np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
          np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
Example #31
__author__ = 'Ahmed Hani Ibrahim'
from reader import DataReader
from analysis import Analyzer

file_path = './data/data_science_dataset_wuzzuf.csv'

reader = DataReader(file_path)
data = reader.read_data()
analyzer = Analyzer(data)

analyzer.trending_category()

x = 0