def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
            train_ratio=0.8)

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)
            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []

            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):

                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)

                batch_loss.append(loss)

            losses.append(np.mean(batch_loss))

        plt.plot(losses, label='loss')
        plt.legend()
        plt.show()
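
A note on the shared interface: several of the seq2seq examples on this page call the same data_utils contract, where process_data returns (token_ids, seq_lens, vocab_dict, rev_vocab_dict) and split_data slices those arrays by train_ratio. The sketch below is a hypothetical minimal version of that contract, for orientation only; each project's real process_data differs in tokenization and special-token handling.

import pickle

import numpy as np

PAD_ID, GO_ID, EOS_ID, UNK_ID = 0, 1, 2, 3  # assumed special-token ids


def process_data(path, max_vocab_size=5000, target_lang=False):
    """Hypothetical sketch: map pickled, pre-tokenized sentences to ids."""
    with open(path, 'rb') as f:
        sentences = pickle.load(f)  # assumed: a list of token lists
    counts = {}
    for tokens in sentences:
        for tok in tokens:
            counts[tok] = counts.get(tok, 0) + 1
    words = ['_PAD', '_GO', '_EOS', '_UNK'] + \
        sorted(counts, key=counts.get, reverse=True)
    vocab_dict = {w: i for i, w in enumerate(words[:max_vocab_size])}
    rev_vocab_dict = {i: w for w, i in vocab_dict.items()}
    token_ids, seq_lens = [], []
    for tokens in sentences:
        ids = [vocab_dict.get(tok, UNK_ID) for tok in tokens]
        if target_lang:
            ids.append(EOS_ID)  # decoder targets are assumed to end with EOS
        token_ids.append(ids)
        seq_lens.append(len(ids))
    return token_ids, np.array(seq_lens), vocab_dict, rev_vocab_dict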
Example #2
def sample(FLAGS):

    # Load the data needed to convert your sentence
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Change FLAGS parameters
    FLAGS.batch_size = 1
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1 # GO token

    # Process sample sentence
    inference_sentence = ["I like to play tennis and eat sandwiches."]
    # Split into tokens
    tokenized = []
    for i in xrange(len(inference_sentence)):
        tokenized.append(basic_tokenizer(inference_sentence[i]))
    # Convert data to token ids
    data_as_tokens, sample_en_seq_lens = data_to_token_ids(
        tokenized, en_vocab_dict, target_lang=False, normalize_digits=True)

    # make dummy_sp_inputs
    dummy_sp_inputs = np.array([[GO_ID]*FLAGS.sp_max_len])
    sample_sp_seq_lens = np.array([len(dummy_sp_inputs)])

    print data_as_tokens
    print sample_en_seq_lens
    print dummy_sp_inputs
    print sample_sp_seq_lens

    with tf.Session() as sess:

        # Load trained model
        model = create_model(sess, FLAGS, forward_only=True)

        y_pred = model.step(sess, FLAGS, batch_encoder_inputs=data_as_tokens,
            batch_decoder_inputs=dummy_sp_inputs, batch_targets=None,
            batch_en_seq_lens=sample_en_seq_lens,
            batch_sp_seq_lens=sample_sp_seq_lens,
            dropout=0.0, forward_only=True, sampling=True)

        # compose the predicted sp sentence
        sp_sentence = []
        for idx in y_pred[0]:
            sp_sentence.append(sp_rev_vocab_dict[idx])
        print " ".join([word for word in sp_sentence])
Example #3
    def __init__(self):
        self.df = load_oxford_data()
        self.min_date = self.df["Date"].unique().min()
        self.max_date = self.df["Date"].unique().max()
        self.num_date = self.df["Date"].nunique()
        # merge with Hopkins data
        hopkins = Hopkins()
        hdf = hopkins.data
        hdf_max_date = hdf['Date'].max()

        self.df = self.df[self.df['Date'] <= hdf_max_date]
        self.df = self.df.merge(hdf, on=['Date', 'Country_Code'], how='left')
        print("- merged Oxford and Hopkins", self.df.shape)

        self.min_date = self.df["Date"].unique().min()
        self.max_date = self.df["Date"].unique().max()
        self.num_date = self.df["Date"].nunique()
        print("- dates from {} to {} total {} days".format(
            self.min_date, self.max_date, self.num_date))
        self.mdf = self.df[[
            "Country_Code", "CountryName", "Date", "DateTime",
            "ConfirmedCases", "ConfirmedDeaths", "Recovered", "StringencyIndex"
        ]].copy()

        self.mdf = process_data(self.mdf, self.df)
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode",
                        choices={"train", "chat"},
                        default="train",
                        help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.exists(os.path.join(config.DATA_PATH, "test_ids.dec")):
        data_utils.process_data()
    print("Data ready!")
    # create checkpoints folder if there isn't one already
    data_utils.make_dir(config.CPT_PATH)

    if args.mode == "train":
        train()
    elif args.mode == "chat":
        chat()
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode',
                        choices={'train', 'chat'},
                        default='train',
                        help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data_utils.prepare_raw_data()
        data_utils.process_data()
    print('Data ready!')
    data_utils.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()
Example #6
def train(params):
    hindi_token_ids, hindi_seq_lens, hindi_vocab_dict, hindi_rev_vocab_dict = \
        process_data('../data/hindi_dump.p', max_vocab_size=100000,
                     target_lang=False)
    bengali_token_ids, bengali_seq_lens, bengali_vocab_dict, bengali_rev_vocab_dict = \
        process_data('../data/bengali_dump.p', max_vocab_size=100000,
                     target_lang=True)

    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_hindi_seq_lens, train_bengali_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_hindi_seq_lens, valid_bengali_seq_lens = \
        split_data(hindi_token_ids, bengali_token_ids, hindi_seq_lens,
                   bengali_seq_lens, train_ratio=0.8)

    params.hindi_vocab_size = len(hindi_vocab_dict)
    params.bengali_vocab_size = len(bengali_vocab_dict)

    print params.hindi_vocab_size, params.bengali_vocab_size

    with tf.Session() as sess:
        _model = model(params)
        sess.run(tf.global_variables_initializer())
        losses = []
        accs = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_hindi_seq_lens,
                               train_bengali_seq_len, params.num_epochs,
                               params.batch_size)):

            print "EPOCH: %i" % (epoch_num)
            sess.run(tf.assign(_model.lr, 0.01 * (0.99 ** epoch_num)))

            batch_loss = []
            batch_acc = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_hindi_seq_lens,
                            batch_bengali_seq_lens) in enumerate(epoch):
                loss, _, acc = _model.step(sess, params, batch_encoder_inputs,
                                           batch_decoder_inputs, batch_targets,
                                           batch_hindi_seq_lens,
                                           batch_bengali_seq_lens,
                                           params.dropout)
                batch_loss.append(loss)
                batch_acc.append(acc)

            losses.append(np.mean(batch_loss))
            accs.append(np.mean(batch_acc))
            print "Training Loss:", losses[-1]
            print "Training Accuracy:", accs[-1]

        plt.plot(losses, label='loss')
        plt.legend()
        plt.title('Plot for Training Error versus Epochs', fontsize=20,
                  style='oblique')
        plt.xlabel('Epochs', fontsize=16, color='green')
        plt.ylabel('Training Error', fontsize=16, color='green')
        plt.savefig('../output/plot.png')
        plt.show()

        acc = _model.test(sess, params, valid_encoder_inputs,
                          valid_decoder_inputs, valid_targets,
                          valid_hindi_seq_lens, valid_bengali_seq_lens,
                          params.dropout)
        print acc
Example #7
def read_h5_file(h5_path):
    h5_file = h5py.File(h5_path, 'r')
    y = np.array(h5_file['annotations'])
    X = np.array(h5_file['features'])
    X, y = data_utils.process_data(X, y)
    h5_file.close()
    '''
  X = non_maximal_suppression(X, range(34, 50), supp_th=0.5)
  X = non_maximal_suppression(X, range(15, 17), supp_th=(0.01*180)/np.pi)
  X = non_maximal_suppression(X, range(56, 57), supp_th=1)  # 1 degree
  '''

    return X, y
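
For context, read_h5_file above only assumes that the HDF5 file contains two datasets named 'features' and 'annotations'. A minimal sketch of writing a compatible file with h5py follows; the shapes, dtypes and file name are placeholders, not taken from the original project.

import h5py
import numpy as np


def write_h5_file(h5_path, features, annotations):
    # Hypothetical writer producing the layout read_h5_file expects.
    with h5py.File(h5_path, 'w') as h5_file:
        h5_file.create_dataset('features', data=features)
        h5_file.create_dataset('annotations', data=annotations)


# Example: 100 frames of 148 per-frame features with integer labels.
write_h5_file('sample.h5',
              features=np.random.rand(100, 148).astype(np.float32),
              annotations=np.random.randint(0, 11, size=100))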
Example #8
    def __init__(self, data_path, max_length, max_vocab_size, min_freq,
                 eos_token, pad_token, unk_token, embed_dim, special_tokens,
                 threshold, pre_trained=False):
        """
        Args:
            data_path (str): path to data file
            max_length (int): maximum length of each sentence, including <eos>
            max_vocab_size (int): maximum number of words allowed in vocabulary
            min_freq (int): minimum frequency to add word to vocabulary
            eos_token (str): end of sentence token (tells decoder to start or stop)
            pad_token (str): padding token
            unk_token (str): unknown word token
            embed_dim (int): dimension of embedding vectors
            special_tokens (list of str): other tokens to add to vocabulary
            threshold (int): count of unknown words required to prune sentence
            pre_trained (Vector): pre-trained word embeddings
        """
        special_tokens = [pad_token, unk_token, eos_token] + special_tokens
        # the value 0 will be regarded as padding
        assert special_tokens[0] == pad_token
        inputs, targets, counter, xlen = process_data(data_path, max_length,
                                                      eos_token, pad_token)
        self.vocab = vocab.Vocab(counter=counter, max_size=max_vocab_size,
                                 min_freq=min_freq, specials=special_tokens)
        if pre_trained is not False:
            self.vocab.load_vectors(pre_trained)
        assert len(inputs) == len(targets) and len(inputs) == len(xlen)

        self.nwords = len(self.vocab)
        self.max_len = max_length
        self.eos_idx = self.vocab.stoi[eos_token]
        self.pad_idx = self.vocab.stoi[pad_token]
        self.unk_idx = self.vocab.stoi[unk_token]
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.embed_dim = embed_dim
        self.unk_count = 0  # number of unknown words in dataset
        self.total_tokens = 0  # number of tokens in dataset not counting padding
        self.special_tokens = special_tokens
        self.x_lens = xlen
        self.x_data = np.zeros((len(inputs), max_length), dtype=np.int32)
        self.y_data = np.zeros((len(targets), max_length), dtype=np.int32)

        convert_to_index(inputs, self, self.x_data)
        convert_to_index(targets, self, self.y_data)
        self.x_data, self.y_data, self.x_lens = prune_data(self.x_data, self.y_data,
                                                           self.x_lens, self, threshold)
        self.x_data = torch.from_numpy(self.x_data)
        self.y_data = torch.from_numpy(self.y_data)
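
For orientation, the constructor documented above would be instantiated roughly as below. The module and class names (dataset, Dataset) and the concrete token strings are placeholders, since the snippet does not show them; only the argument names and the attributes printed at the end come from the code above.

from dataset import Dataset  # hypothetical module and class name

corpus = Dataset(
    data_path='data/corpus.txt',
    max_length=30,                # maximum sentence length, including <eos>
    max_vocab_size=20000,
    min_freq=2,
    eos_token='<eos>',
    pad_token='<pad>',
    unk_token='<unk>',
    embed_dim=300,
    special_tokens=['<go>'],
    threshold=3,                  # prune sentences with this many unknown words
    pre_trained=False)
print(corpus.nwords, corpus.pad_idx, corpus.x_data.shape)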
Example #9
def run(args):
    files = [INPUT_DIRECTORY + '/' + f for f in listdir(INPUT_DIRECTORY) if isfile(join(INPUT_DIRECTORY, f))]
    files.sort()

    if not os.path.exists(OUTPUT_DIRECTORY) or not os.path.isdir(OUTPUT_DIRECTORY):
        os.mkdir(OUTPUT_DIRECTORY)

    latex_output = ''

    for filename in files:
        latex_part = "\\paragraph{"
        latex_part += filename.split('/')[-1].split('.pdf.txt.txt')[0]
        latex_part += "}\n\\begin{enumerate}\n"

        f = open(filename,"r",encoding='utf-8', errors='ignore')
        sentences = f.readlines()
        sentences = [sentence.replace('\n', '') for sentence in sentences]

        for question in QUESTIONS:
            latex_part += "\\item " + question + "\\\\\n"
            latex_part += "$\\longrightarrow$ "
            reset_dict()
            testS, testQ, testA = process_data(sentences, question)
            answer, answer_probability, mem_probs = get_pred(testS, testQ)
            memory_probabilities = np.round(mem_probs, 4)

            best_sentence_index = 0
            best_sentence_score = 0
            # print(len(memory_probabilities.tolist()))
            for index, mem in enumerate(memory_probabilities.tolist()):
                if mem[2] > best_sentence_score:
                    best_sentence_index = index
                    best_sentence_score = mem[2]

            words_l = []
            for idw in testS[0][best_sentence_index]:
                if idw == 0:
                    break
                words_l.append(decode(idw))
            sentence = ' '.join(words_l)
            sentence = sentence.replace('%', '\\%')
            sentence = sentence.replace('_', '\\_')

            latex_part += sentence + "\n"
        latex_part += "\\end{enumerate}"
        latex_output += "\n" + latex_part
    with open(join(OUTPUT_DIRECTORY, 'latex_out.txt'), 'w') as f:
        f.write(latex_output)
Example #10
def create_most_informative_hist(gest_seq_h5_filepath, openface_dir,
    cpm_dir, user, gesture):
  gest_seq_h5 = h5py.File(gest_seq_h5_filepath, 'r') 
  per_user_var_stats, per_user_max_vel_stats = {}, {}
  final_feat_names = None

  for g in range(10):
    for target_group in ['train', 'test']:
      for target_user in gest_seq_h5[target_group].keys():
        if len(gest_seq_h5[target_group][target_user][str(g)].shape) <= 1:
          continue
        gest_seq = np.array(gest_seq_h5[target_group][target_user][str(g)])

        target_openface_h5_path = os.path.join(openface_dir, target_user) 
        target_cpm_h5_path = os.path.join(cpm_dir, target_user)
        X, y, cpm_X = data_utils.get_all_features(
            target_openface_h5_path, target_cpm_h5_path)
        X, _ = data_utils.process_data(X, y, cpm_X)
        X_filt, feat_names = filter_features(X)
        if final_feat_names is None:
          final_feat_names = feat_names

        var_stats, max_vel_stats = calculate_feat_variance(
            X_filt, feat_names, gest_seq)

        if per_user_var_stats.get(str(g)) is None:
          per_user_var_stats[str(g)] = {}
          per_user_max_vel_stats[str(g)] = {}

        per_user_var_stats[str(g)][target_user] = var_stats
        per_user_max_vel_stats[str(g)][target_user] = max_vel_stats
  gest_seq_h5.close()


  create_feature_gesture_distribution(per_user_var_stats, final_feat_names)
  
  # Plot the histogram for a given gesture
  sorted_filenames = sorted(per_user_var_stats[gesture].keys())

  start_idx = 4 * PLOT_WIDTH*PLOT_HEIGHT
  plot_histograms(per_user_var_stats[gesture], feat_names,
      sorted_filenames[start_idx:start_idx+PLOT_WIDTH*PLOT_HEIGHT])
Example #11
#!/usr/bin/env python

from argparse import ArgumentParser
from data_utils import process_data
from keras.models import load_model


if __name__ == '__main__':
    arg_parser = ArgumentParser(description='load and evaluate model')
    arg_parser.add_argument('--train', help='HDF5 file with training '
                            'data')
    arg_parser.add_argument('--test', help='HDF5 file with test data')
    arg_parser.add_argument('model_file', help='HDF5 file containing '
                                               'the model')
    arg_parser.add_argument('--verbose', type=int, default=1,
                            help='verbosity level of evaluation')
    options = arg_parser.parse_args()
    model = load_model(options.model_file)
    if options.train:
        x_train, y_train = process_data(options.train)
        loss, accuracy = model.evaluate(x_train, y_train,
                                        verbose=options.verbose)
        print(f'training: loss = {loss:.4f}, accuracy = {accuracy:.4f}')
    if options.test:
        x_test, y_test = process_data(options.test)
        loss, accuracy = model.evaluate(x_test, y_test,
                                        verbose=options.verbose)
        print(f'test: loss = {loss:.4f}, accuracy = {accuracy:.4f}')
Example #12
"""
Setup scripts for downloading AS data
and pre-processing for analysis
"""

import data_utils as dutil

if __name__ == '__main__':
    # print("Downloading data from AWS")
    # download_data()

    print('Processing data')
    dutil.process_data()
Example #13
from argparse import ArgumentParser
from data_utils import process_data
from keras.models import load_model

if __name__ == '__main__':
    arg_parser = ArgumentParser(description='load and evaluate model')
    arg_parser.add_argument('--train', help='HDF5 file with training data')
    arg_parser.add_argument('--test', help='HDF5 file with test data')
    arg_parser.add_argument('model_file',
                            help='HDF5 file containing '
                            'the model')
    arg_parser.add_argument('--verbose',
                            type=int,
                            default=1,
                            help='verbosity level of evaluation')
    options = arg_parser.parse_args()
    model = load_model(options.model_file)
    if options.train:
        x_train, y_train = process_data(options.train)
        loss, accuracy = model.evaluate(x_train,
                                        y_train,
                                        verbose=options.verbose)
        print(f'training: loss = {loss:.4f}, accuracy = {accuracy:.4f}')
    if options.test:
        x_test, y_test = process_data(options.test)
        loss, accuracy = model.evaluate(x_test,
                                        y_test,
                                        verbose=options.verbose)
        print(f'test: loss = {loss:.4f}, accuracy = {accuracy:.4f}')
Example #14
def train(create_data, log_file):
    """Trains a english to simple english translation model.
    
    Args:
        create_data: whether to load data from the databases on startup.
        log_file: where to store training data outputs.
    """

    if os.path.isfile('./' + log_file):
        raise ValueError('log file already exists')

    if create_data:
        data_utils.process_data()

    with tf.Session() as sess, open(log_file, 'w+') as log:
        print 'Opening log file'
        fields = [
            'step', 'step-time', 'batch-loss', 'batch-perplexity', 'learnrate',
            'val-loss'
        ]
        log.write(','.join(fields) + '\n')

        print 'creating model'
        model = create_model(sess, False)

        print 'reading data'
        train, valid = read_data(dc.NORMAL_IDS_PATH, dc.SIMPLE_IDS_PATH)

        print 'entering training loop'
        step_time, loss = 0.0, 0.0
        current_step = 0
        prev_losses = []

        # Training loop
        while current_step < dc.NUM_STEPS or dc.IGNORE_STEPS:
            start_time = time.time()
            encoder_in, decoder_in, target_weights = model.get_batch(train)
            step_loss, _ = model.step(sess, encoder_in, decoder_in,
                                      target_weights, False)
            step_time += (time.time() - start_time) / dc.STEPS_PER_CHECKPOINT

            loss += step_loss / dc.STEPS_PER_CHECKPOINT
            current_step += 1

            if current_step < dc.STEPS_PER_CHECKPOINT:
                print "Step: %f" % current_step
                print "Loss: %f" % step_loss
                print "Learning: %f" % model.learning_rate.eval()

            # Every some amount of steps, output stats and check validation loss
            if current_step % dc.STEPS_PER_CHECKPOINT == 0:
                learnrate = model.learning_rate.eval()
                perplex = math.exp(float(loss)) if loss < 300 else float("inf")
                step = model.global_step.eval()
                print "step %d loss %f plex: %f learnrate %f step-time: %f" % (
                    step, loss, perplex, learnrate, step_time)

                if len(prev_losses) > 2 and loss > max(
                        prev_losses[-1 * dc.DECAY_POINT:]):
                    sess.run(model.learning_rate_decay_op)
                prev_losses.append(loss)
                checkpoint_path = os.path.join(dc.CKPT_PATH, 'simplify.ckpt')
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)

                encoder_in, decoder_in, target_weights = model.get_batch(valid)
                val_loss, outputs = model.step(sess, encoder_in, decoder_in,
                                               target_weights, True)

                if dc.DEBUG:
                    print "ENCODER LEN"
                    print len(encoder_in[0])
                    print "OUTPUT LENs"
                    print len(outputs)
                    print len(outputs[0])
                    print len(outputs[0][0])
                outputs = [
                    int(np.argmax(logit, axis=1)[0]) for logit in outputs
                ]

                if dc.DEBUG:
                    print outputs

                print "validation loss: %f" % val_loss

                fields = [step, step_time, loss, perplex, learnrate, val_loss]
                log.write(','.join(map(str, fields)) + '\n')

                step_time, loss = 0.0, 0.0
            sys.stdout.flush()
Example #15
    def save(self, step):
        self.saver.save(self.sess,
                        self.config.ckpt_path + '.ckpt',
                        global_step=step)

    def restore(self):
        # get checkpoint state
        ckpt = tf.train.get_checkpoint_state(self.config.ckpt_path)
        # restore session
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)


if __name__ == '__main__':
    K.set_learning_phase(1)
    graph = tf.Graph()
    sess = tf.Session()
    config = Config()
    model = Model(config, sess, graph)
    train_data, validation_data, test_data = du.process_data()
    batches = du.generate_train_batches(train_data, config.batch_size)
    batch = du.get_next_batch(batches)
    batch_images, batch_labels = map(list, zip(*batch))
    batch_images = np.array(batch_images)
    batch_labels = np.array(batch_labels)
    batch_images = batch_images.reshape(-1, config.image_size,
                                        config.image_size, config.channels)
    pred, loss = model.predict(batch_images, batch_labels)
    print(loss)
Example #16
def predict(data_path='data/predict/',
            model_path='pretrained/model.h5',
            use_prophet=False):
    """ Main function to run the prediction

    """

    print('Loading and processing data ...')
    # get the processed dataframe
    df = process_data(data_path)

    # predict using only last 14 days of the time series data
    df = get_last_nday(df, args.num_day)
    print('done!')

    gh_list = list(df.geohash6.unique())
    chunk_size = len(gh_list) // (args.num_thread - 1)
    gh_chunks = [
        gh_list[chunk_size * i:chunk_size * (i + 1)]
        for i in range(args.num_thread)
    ]
    gh_chunks = [ch for ch in gh_chunks if ch]

    pool = Pool(args.num_thread)

    if use_prophet:
        # use facebook's prophet to predict
        p_prophet_predict = partial(prophet_predict, df)

        predictions = []
        preds = pool.map(p_prophet_predict, gh_chunks)
        for pred in preds:
            predictions.extend(pred)

        pred_df = pd.concat(predictions, ignore_index=True)

        pred_df.loc[:, 'demand'] = pred_df['demand'].clip(lower=0, upper=1)

        return pred_df
    else:
        print('Predict using wavenet model...')
        # load model
        model = load_model(model_path)

        # extract feature for all locations
        print('Extracting features ...')
        p_extract_feature = partial(extract_feature, df)

        features = []

        fts = pool.map(p_extract_feature, gh_chunks)
        for ft in fts:
            features.extend(ft)
        print('done!')

        # split features into batches
        feature_chunks = [
            features[args.batch_size * i:args.batch_size * (i + 1)]
            for i in range(int(len(features) / args.batch_size) + 1)
        ]
        feature_chunks = [ch for ch in feature_chunks if ch]

        # run prediction using keras model
        print('Predicting ...')
        predictions = []
        for each_chunk in feature_chunks:
            print(len(each_chunk))
            batch = np.concatenate(each_chunk, axis=0)
            pred = model.predict(batch)
            pred = np.reshape(pred, (pred.shape[0], pred.shape[1]))
            predictions.extend(list(pred))
        print('done!')
        print(len(predictions))

        # create result dataframe from prediction results
        print('Constructing result dataframe ...')
        pred_df = []
        for i, pred in enumerate(predictions):
            res = build_result_dataframe(gh_list[i], pred, df)
            pred_df.append(res)

        pred_df = pd.concat(pred_df, ignore_index=True)
        print('done!')

        pred_df.loc[:, 'demand'] = pred_df['demand'].clip(lower=0, upper=1)

        return pred_df
Example #17
def main(_):

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    logging.info(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way

    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, context_data_chars, question_data_chars, question_uuid_data = prepare_dev(
        dev_dirname, dev_filename, vocab)

    context_data, context_lengths = process_data(context_data,
                                                 FLAGS.output_size)
    question_data, question_lengths = process_data(question_data,
                                                   FLAGS.max_question_length)

    # TODO: use process_data()
    for question_token in question_data_chars:
        question_token.extend(
            [[qa_data.PAD_ID] * FLAGS.max_word_length] *
            (FLAGS.max_question_length - len(question_token)))

    for context_token in context_data_chars:
        context_token.extend([[qa_data.PAD_ID] * FLAGS.max_word_length] *
                             (FLAGS.output_size - len(context_token)))

    dataset = (context_data, context_lengths, question_data, question_lengths,
               context_data_chars, question_data_chars, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    config = Config(FLAGS)
    encoder = Encoder(config)
    if FLAGS.model in ('baseline', 'baseline-v2', 'baseline-v3',
                       'baseline-v4', 'baseline-v5'):
        decoder = Decoder(config)
    else:
        decoder = HMNDecoder(config)
    mixer = Mixer(config)

    qa = QASystem(encoder, decoder, mixer, embed_path, config, FLAGS.model)

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
Example #18
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/tst2013.en', max_vocab_size=30000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/tst2013.tr', max_vocab_size=30000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
            train_ratio=0.8)
    
    output = open('data/vocab_en.pkl', 'wb')
    pickle.dump(en_vocab_dict, output)
    output.close()
    output = open('data/vocab_sp.pkl', 'wb')
    pickle.dump(sp_vocab_dict, output)
    output.close()

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)

    print 'len(en_vocab_dict)', len(en_vocab_dict)
    print 'len(sp_vocab_dict)', len(sp_vocab_dict)
    
    # Start session
    with tf.Session() as sess:
        model = None
        # Create new model or load old one
        f = checkpoint_path + ".index"
        if os.path.isfile(f):
            model = restore_model(sess)
        else:
            model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(generate_epoch(train_encoder_inputs,
            train_decoder_inputs, train_targets,
            train_en_seq_lens, train_sp_seq_len,
            FLAGS.num_epochs, FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)
            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []

            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                batch_targets, batch_en_seq_lens,
                batch_sp_seq_lens) in enumerate(epoch):

                loss, _ = model.step(sess, FLAGS,
                    batch_encoder_inputs, batch_decoder_inputs, batch_targets,
                    batch_en_seq_lens, batch_sp_seq_lens,
                    FLAGS.dropout)
                print loss
                batch_loss.append(loss)
            print 'mean: ', np.mean(batch_loss)

            print "Saving the model."
            model.saver.save(sess, checkpoint_path)
Example #19
def create_data_augmentation_2(
        fdir,
        gest_seq_h5,
        new_filepath,
        win_sizes=[16, 32, 64],
        labels=[6, 7, 8, 9, 10],
        aug_type=OpenfaceAugmentationType.LANDMARKS_ONLY):
    h5_f = h5py.File(gest_seq_h5, 'r')
    h5_train = h5_f['train']
    new_h5 = h5py.File(new_filepath, 'w')
    all_aug_map = {}
    count = 0
    for f in h5_train.keys():
        all_aug_map[f] = {}
        v = h5_train[f]
        fpath = fdir + '/' + f
        currf_h5 = h5py.File(fpath, 'r')
        num_augmentations = 16
        if aug_type.aug_type == OpenfaceAugmentationType.LANDMARKS_ONLY:
            X = data_utils.trim_extra_landmarks(currf_h5['features'])
        elif aug_type.aug_type == OpenfaceAugmentationType.LANDMARKS_AND_VELOCITY:
            X, _ = data_utils.process_data(currf_h5['features'],
                                           currf_h5['annotations'])
        elif aug_type.aug_type == OpenfaceAugmentationType.ALL_LANDMARKS_AND_POSE:
            X = np.array(currf_h5['features'])
            X = X[:, :148]
            num_augmentations = 16
        else:
            assert (False)

        for i in labels:
            str_i = str(i)
            gest_seq = v[str(i)]
            all_aug_map[f][str_i] = {}
            for seq in gest_seq:
                if not isinstance(seq, np.ndarray):
                    continue
                gest_len = seq[1] - seq[0]
                gest_start = seq[0] + (gest_len // 5)
                gest_end = seq[1] - (gest_len // 5)
                for t in range(gest_start, gest_end + 1, WIN_STEP):
                    str_t = str(t)
                    all_aug_map[f][str_i][str_t] = {}
                    for win_size in win_sizes:
                        seq_augmentation = get_all_seq_augmentations_4(
                            X,
                            t,
                            win_size,
                            num_augmentations=num_augmentations)
                        all_aug_map[f][str_i][str_t][str(
                            win_size)] = seq_augmentation
                        count = count + 1
                        if count % 300 == 0:
                            print('Did get seq augmentation for file: {}, label: {}, ' \
                                  't: {}, win_size: {}'.format(f, i, t, win_size))

        print('Did process file {}'.format(f))
        data_utils.recursively_save_dict_contents_to_group(
            new_h5, str('/' + f + '/'), all_aug_map[f])
        print('Did write {} augmentations'.format(f))
        new_h5.flush()
        all_aug_map[f] = {}

    new_h5.flush()
    new_h5.close()
Example #20
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
            train_ratio=0.8)

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1 # GO token

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS, forward_only=False)

        # Training begins
        train_losses = []
        valid_losses = []
        for epoch_num, epoch in enumerate(generate_epoch(train_encoder_inputs,
            train_decoder_inputs, train_targets,
            train_en_seq_lens, train_sp_seq_len,
            FLAGS.num_epochs, FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)
            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []

            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                batch_targets, batch_en_seq_lens,
                batch_sp_seq_lens) in enumerate(epoch):

                y_pred, loss, _ = model.step(sess, FLAGS,
                    batch_encoder_inputs, batch_decoder_inputs, batch_targets,
                    batch_en_seq_lens, batch_sp_seq_lens,
                    FLAGS.dropout, forward_only=False)

                batch_loss.append(loss)
            train_losses.append(np.mean(batch_loss))

            for valid_epoch_num, valid_epoch in enumerate(generate_epoch(valid_encoder_inputs,
                valid_decoder_inputs, valid_targets,
                valid_en_seq_lens, valid_sp_seq_len,
                num_epochs=1, batch_size=FLAGS.batch_size)):

                batch_loss = []

                for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                    batch_targets, batch_en_seq_lens,
                    batch_sp_seq_lens) in enumerate(valid_epoch):

                    loss = model.step(sess, FLAGS,
                        batch_encoder_inputs, batch_decoder_inputs, batch_targets,
                        batch_en_seq_lens, batch_sp_seq_lens,
                        dropout=0.0, forward_only=True, sampling=False)

                    batch_loss.append(loss)
                valid_losses.append(np.mean(batch_loss))

        # Save checkpoint.
        if not os.path.isdir(FLAGS.ckpt_dir):
            os.makedirs(FLAGS.ckpt_dir)
        checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
        print "Saving the model."
        model.saver.save(sess, checkpoint_path,
                         global_step=model.global_step)

        plt.plot(train_losses, label='train_loss')
        plt.plot(valid_losses, label='valid_loss')
        plt.legend()
        plt.show()
Example #21
dir_results = "./results/"
if not os.path.isdir(dir_trained):
    os.mkdir(dir_trained)
if not os.path.isdir(dir_results):
    os.mkdir(dir_results)

config = Config()
config.set_params_parser()

data, idxs = read_data(use_loaded=True)
idxs_train, idxs_dev, idxs_test = idxs
X, y, emb, tokenizer, label_encoder = preprocess_data(data=data,
                                                      use_loaded=True)
X_sup_train, y_sup_train = process_data(
    get_supplementation(data.iloc[idxs_train], 'train', use_loaded=True),
    tokenizer, label_encoder, max_len_seq=35)
X_sup_dev, y_sup_dev = process_data(
    get_supplementation(data.iloc[idxs_dev], 'dev', use_loaded=True),
    tokenizer, label_encoder, max_len_seq=35)
X_eec, y_eec, idxs_identity = read_eec(tokenizer, label_encoder)
debias_weights = read_weights()

data_official_test = read_official_test()
X_official_test, _ = process_data(data_official_test, tokenizer, label_encoder)

acc_dev_list, auc_dev_list, acc_test_list, auc_test_list = [], [], [], []
Example #22
def _main(args):
    data_path = os.path.expanduser(args.data_path)
    classes_path = os.path.expanduser(args.classes_path)
    anchors_path = os.path.expanduser(args.anchors_path)
    result_path = os.path.expanduser(args.result_path)
    test_path = os.path.expanduser(args.test_path)
    model_prefix = os.path.expanduser(args.model_prefix)
    num_frozen = int(args.num_frozen)
    num_trials = int(args.num_trials)
    num_epochs = int(args.num_epochs)
    shuffle_input = bool(int(args.shuffle))

    class_names = get_classes(classes_path)

    data = np.load(data_path)  # custom data saved as a numpy file.
    #  has 2 arrays: an object array 'boxes' (variable length of boxes in each image)
    #  and an array of images 'images'

    anchors = get_anchors(anchors_path)
    anchors = YOLO_ANCHORS

    for trial in range(num_trials):

        # Reprocess data to populate image_data_gen. Sacrifice latency for memory
        image_data_gen, boxes = data_utils.process_data(
            iter(data['images']),
            data['images'].shape[2],
            data['images'].shape[1],
            data['boxes'],
            dim=608)
        detectors_mask, matching_true_boxes = get_detector_mask(boxes, anchors)

        model_name = model_prefix + "-" + str(num_frozen) + "fr-trial" + str(
            trial)
        print "Training model:", model_name

        train(class_names,
              anchors,
              image_data_gen,
              boxes,
              detectors_mask,
              matching_true_boxes,
              model_name,
              num_frozen,
              num_epochs,
              shuffle_input=shuffle_input)

        if test_path != "" and result_path != "":
            mAP, precision, recalls = run_inference(
                model_name + ".h5",
                anchors,
                classes_path,
                test_path,
                None,  # output_path
                1,  # mode
                0.5,  # score_threshold
                0.5,  # iou_threshold
                0.5)  # mAP_iou_threshold
            with open(result_path, "a+") as f:
                line = "%d,%d,%.6g,%.6g,%.6g,%d,%s\n" % (
                    trial, num_frozen, mAP, np.average(precision),
                    np.average(recalls), num_epochs, model_name + ".h5")
                f.write(line)
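
The comment in _main describes the expected .npz layout: an 'images' array plus an object array 'boxes' holding a variable number of boxes per image. A sketch of packaging data that way is shown below; the image size and per-box column order are assumptions, and with recent NumPy the object array must be read back via np.load(..., allow_pickle=True).

import numpy as np

# Three dummy images, assumed (N, H, W, C) order.
images = np.zeros((3, 416, 416, 3), dtype=np.uint8)

boxes = np.empty(3, dtype=object)  # one variable-length box array per image
boxes[0] = np.array([[0, 10, 20, 100, 120]])   # assumed [class, x_min, y_min, x_max, y_max]
boxes[1] = np.array([[1, 5, 5, 50, 60],
                     [2, 30, 40, 90, 110]])
boxes[2] = np.zeros((0, 5))                    # an image with no boxes

np.savez('training_data.npz', images=images, boxes=boxes)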
Example #23
# Import necessary libraries
import string
import math
from collections import Counter  # To find different characters between two sentences
from difflib import SequenceMatcher
import json
import data_utils as dt
import matplotlib.pyplot as plt
import nltk

# input: an essay with plain_text and markup
# output: a list of similar word-choice errors with the number of occurrences of each,
# the related words for that error, and the indices (in markup) of that error
# Getting data
data = dt.process_data('Data/tai-documents-v3/tai-documents-v3.json')


def word_choice(input):
    output = []
    check = False
    for i in range(len(input['markup'])):
        error = input['markup'][i]
        check = False
        if error['type'] == 'word choice':
            # Check if we see this error before, update the error_count
            for item in output:
                if ((error['old_text'] in item['words'])
                        or (error['new_text'] in item['words'])):
                    check = True  # Set this error already marked
                    item['index'].append(i)
                    item['words'].add(error['old_text'])
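
The truncated word_choice function above expects each essay dict to provide a 'markup' list whose entries carry at least 'type', 'old_text' and 'new_text' keys, alongside the essay's 'plain_text'. A hypothetical input illustrating that shape (the sentence and corrections are made up):

sample_essay = {
    'plain_text': 'He maked a big mistake in teh report.',
    'markup': [
        {'type': 'word choice', 'old_text': 'maked', 'new_text': 'made'},
        {'type': 'spelling', 'old_text': 'teh', 'new_text': 'the'},
    ],
}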
Example #24
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/my_en.txt', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/my_sp.txt', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
            train_ratio=0.8)

    output = open('data/vocab_en.pkl', 'wb')
    pickle.dump(en_vocab_dict, output)
    output.close()
    output = open('data/vocab_sp.pkl', 'wb')
    pickle.dump(sp_vocab_dict, output)
    output.close()

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)

    print 'len(en_vocab_dict)', len(en_vocab_dict)
    print 'len(sp_vocab_dict)', len(sp_vocab_dict)

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)
            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate * \
                (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []

            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):

                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)

                batch_loss.append(loss)

            losses.append(np.mean(batch_loss))

        checkpoint_path = "/tmp/model.ckpt"
        print "Saving the model."
        model.saver.save(sess, checkpoint_path)
        plt.plot(losses, label='loss')
        plt.legend()
        plt.savefig('seq_01.png')