Example #1
def get_pairs():
    """
        Create a list with tuples with pairs of airports. Like:
            [("BCN", "CAG"), ("GRO", "CAG")]

        This is created using all possible combinations from:
            congig.cfg/AIRPORTS/ORIGINDS

        And might be limited the size using congig.cfg/AIRPORTS/LIMIT
    """

    # My airports
    airports = config["AIRPORTS"]["ORIGINS"].split(",")

    pairs = []

    for origin in airports:
        for dest in airports:
            if dest != origin:
                # Append flights from both directions
                pairs.append((dest, origin))
                pairs.append((origin, dest))

    log.info(f"There are {len(pairs)} airport pairs")

    limit = int(config["AIRPORTS"]["LIMIT"])

    if limit > 0:

        pairs = pairs[:limit]
        log.info(f"Limiting the query to {limit} pairs")

    return pairs
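For context, a minimal sketch of the configuration this snippet assumes; the section and key names come from the code above, while the values are hypothetical:

import configparser

# Hypothetical config.cfg contents matching the keys used by get_pairs()
config = configparser.ConfigParser()
config.read_string("""
[AIRPORTS]
ORIGINS = BCN,CAG,GRO
LIMIT = 4
""")

# get_pairs() would then produce both directions of every combination,
# e.g. [('CAG', 'BCN'), ('BCN', 'CAG'), ('GRO', 'BCN'), ('BCN', 'GRO'), ...]
# and truncate the list to the first 4 pairs because LIMIT = 4.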
Example #2
def eval_step(input_ids, target_ids):

    # Teacher forcing: feed the target sequence shifted right by one position
    target_inp = target_ids[:, :-1]
    _, combined_mask, dec_padding_mask = create_masks(input_ids, target_inp)
    (draft_predictions, draft_attention_weights,
     refine_predictions, refine_attention_weights) = Model(
                                                         input_ids,
                                                         dec_padding_mask=dec_padding_mask,
                                                         target_ids=target_inp,
                                                         look_ahead_mask=combined_mask,
                                                         training=False
                                                         )
    loss, target = loss_function(target_ids,
                                 draft_predictions,
                                 refine_predictions,
                                 Model)
    train_loss(loss)
    Model.summary(print_fn=log.info)  # summary() returns None, so route its lines through the logger
    if config.save_initial_weights:
        initial_weights = os.path.join(config.initial_weights,'initial_weights')
        Model.save_weights(initial_weights)

    return loss
Example #3
def save_evaluate_monitor(ck_pt_mgr, val_dataset, target_tokenizer,
                          predictions, target_ids, step, start_time,
                          bert_f1_score):

    ckpt_save_path = ck_pt_mgr.save()
    # print the detokenized training output of a single sample
    train_sanity_check(target_tokenizer, predictions, target_ids, log)
    # Run evaluation only if the train_loss is less than validate_when_train_loss_is
    if train_loss.result() < config.validate_when_train_loss_is:
        (validation_perplexity, val_bert_score, draft_attention_weights,
         refine_attention_weights) = evaluate_validation_set(val_dataset)
        early_stop_training = monitor_eval_metrics(ckpt_save_path,
                                                   validation_perplexity,
                                                   val_bert_score,
                                                   train_loss.result(), step,
                                                   log, config)
    else:
        log.info(
            'Not running evaluation since loss is not less than config.validate_when_train_loss_is'
        )
        (validation_perplexity, val_bert_score) = (0, 0)
        early_stop_training = False
        draft_attention_weights = None
        refine_attention_weights = None

    training_results(step, train_loss.result(), bert_f1_score.numpy(),
                     validation_perplexity, val_bert_score,
                     (time.time() - start_time), ckpt_save_path, log, config)
    train_loss.reset_states()
    return (early_stop_training, draft_attention_weights,
            refine_attention_weights)
Example #4
def main():
    """ Read the airport data from internet and store it as a pickle """

    df = get_airports()
    df = fix_encodings(df)

    uri = f"{config['PATHS']['DATA']}airports.pickle"
    df.to_pickle(uri)
    log.info(f"Airport data exported to '{uri}'")
Example #5
def train_sanity_check(tokenizer, predictions, target_id):
    # use the last sample in the batch
    predicted, target = detokenize(
        tokenizer, tf.squeeze(tf.argmax(predictions, axis=-1)[-1:]),
        tf.squeeze(target_id[:, :-1][-1:]))
    log.info(f'the true output_sequence is {target}')
    log.info('the predicted output_seq with teacher forcing is '
             f'{predicted if predicted else "empty hence evaluation will be skipped"}')
    return predicted
Example #6
def check_ckpt(checkpoint_path):
    ckpt = tf.train.Checkpoint(Model=Model, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_path,
                                              max_to_keep=10)
    if tf.train.latest_checkpoint(checkpoint_path):
        ckpt.restore(ckpt_manager.latest_checkpoint)
        log.info(ckpt_manager.latest_checkpoint + ' restored')
    else:
        log.info('No checkpoint found')
    return ckpt_manager
Example #7
    def run(self):
        # Store start time and task name
        self.name = self.__class__.__name__
        self.t_data["name"] = self.name
        self.start_time = time.time()
        self.t_data["start_time"] = datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")

        # Run the task and store the results
        log.info(f"Starting {self.name}")
        self.run_std()
        self.save_result()
Example #8
def check_ckpt(checkpoint_path):

    ckpt = tf.train.Checkpoint(Model=Model, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_path,
                                              max_to_keep=10)
    if tf.train.latest_checkpoint(checkpoint_path):
        ckpt.restore(ckpt_manager.latest_checkpoint)
        log.info(ckpt_manager.latest_checkpoint + ' restored')
    else:
        log.warning('No checkpoint found so using the initialized_weights')

    return ckpt_manager
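A minimal usage sketch, assuming Model and optimizer are already defined at module level as the snippet does; the checkpoint directory path is hypothetical:

# Restore the latest checkpoint (if any) and keep the manager around
# so the training loop can periodically call ck_pt_mgr.save().
ck_pt_mgr = check_ckpt('checkpoints/run_1')  # hypothetical path

# ...inside the training loop...
# ckpt_save_path = ck_pt_mgr.save()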
Example #9
def main(
    uri_in=f"{config['PATHS']['DATA']}flights/{date.today():%Y_%m_%d}/*.csv",
    uri_out=f"{config['PATHS']['DATA']}flights.parquet",
):
    """ Retreive airports """

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.read.option("header", "true").csv(uri_in)

    log.info(f"Merging {sdf.count()} rows of flights data")

    sdf.write.partitionBy("Inserted").parquet(uri_out, mode="overwrite")
    log.info(f"File '{uri_out}' exported")
Example #10
def main(path=f"{config['PATHS']['DATA']}flights/{date.today():%Y_%m_%d}/"):
    """ Check data there is data in the flights folder """

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.read.option("header", "true").csv(f"{path}*.csv")

    n_rows = sdf.count()

    if n_rows == 0:
        msg = f"There is no data in {path}"
        log.info(msg)
        raise ValueError(msg)

    log.info(f"There are {n_rows} rows of flights data")
Example #11
def save_url(url):
    blocked_urls = select(b.value for b in Block
                          if b.type == "Url" and b.active)[:]

    if not any(x for x in blocked_urls if x in url):
        result = select(p for p in Url if p.url == url).count()
        if result == 0:
            if ".onion" in url:
                url_object = Url(url=url, date_added=datetime.now())
            else:
                print("{} is blocked".format(url))
    else:
        log.info("URL: {} is blocked".format(url))

    commit()
Example #12
def _embedding_from_bert():

    with tf.device("CPU:0"):
        input_pretrained_bert = TFAutoModel.from_pretrained(
            config.input_pretrained_model,
            trainable=False,
            name=config.input_pretrained_model)
        target_pretrained_bert = TFAutoModel.from_pretrained(
            config.target_pretrained_model,
            trainable=False,
            name=config.target_pretrained_model
        ) if config['task'] == 'translate' else input_pretrained_bert
    decoder_embedding = target_pretrained_bert.get_weights()[0]
    log.info(f"Decoder_Embedding matrix shape '{decoder_embedding.shape}'")

    return (decoder_embedding, input_pretrained_bert, target_pretrained_bert)
Example #13
def monitor_run(ckpt_save_path,
                bert_score,
                rouge_score,
                train_loss,
                step,
                to_monitor=config.monitor_metric):

    ckpt_fold, ckpt_string = os.path.split(ckpt_save_path)
    if config.run_tensorboard:
        with valid_output_sequence_writer.as_default():
            tf.summary.scalar('ROUGE_f1', rouge_score, step=step)
            tf.summary.scalar('BERT_f1', bert_score, step=step)
    monitor_metrics = dict()
    monitor_metrics['BERT_f1'] = bert_score
    monitor_metrics['ROUGE_f1'] = rouge_score
    monitor_metrics['combined_metric'] = (monitor_metrics['BERT_f1'],
                                          monitor_metrics['ROUGE_f1'])
    # multiply with the weights
    monitor_metrics['combined_metric'] = round(
        tf.reduce_sum([(i * j) for i, j in zip(
            monitor_metrics['combined_metric'], config.combined_metric_weights)
                       ]).numpy(), 2)
    log.info(f"combined_metric {monitor_metrics['combined_metric']:4f}")
    if config.last_recorded_value < monitor_metrics[to_monitor]:
        # reset tolerance to zero if the monitor_metric decreases before the tolerance threshold
        config.tolerance = 0
        config.last_recorded_value = monitor_metrics[to_monitor]
        ckpt_files_tocopy = [files for files in os.listdir(os.path.split(ckpt_save_path)[0]) \
                             if ckpt_string in files]
        log.info(
            f'{to_monitor} is {monitor_metrics[to_monitor]:4f} so checkpoint files {ckpt_string} \
                 will be copied to best checkpoint directory')
        # copy the best checkpoints
        shutil.copy2(os.path.join(ckpt_fold, 'checkpoint'),
                     config.best_ckpt_path)
        for files in ckpt_files_tocopy:
            shutil.copy2(os.path.join(ckpt_fold, files), config.best_ckpt_path)
    else:
        config.tolerance += 1
    # stop if minimum training loss is reached
    if train_loss < config.min_train_loss:
        log.info(f'Stop training since minimum training loss reached')
        return False
    else:
        return True
    # Warn and early stop
    if config.tolerance > config.tolerance_threshold:
        log.warning('Tolerance exceeded')
        if config.early_stop:
            log.info(
                f'Early stopping since the {to_monitor} reached the tolerance threshold'
            )
            return False
        else:
            return True
    else:
        return True
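To make the metric weighting concrete, a small worked example; the scores and weights are hypothetical (in the snippet the weights come from config.combined_metric_weights):

import tensorflow as tf

bert_score, rouge_score = 0.85, 0.40  # hypothetical scores
combined_metric_weights = [0.8, 0.2]  # hypothetical weights

combined = round(
    tf.reduce_sum([m * w for m, w in zip((bert_score, rouge_score),
                                         combined_metric_weights)]).numpy(), 2)
print(combined)  # 0.8 * 0.85 + 0.2 * 0.40 = 0.76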
Example #14
def change_dataset_and_train(additional_tokens_per_batch, batch_size):

    memory_test_dataset = create_dataset(split='train',
                                         source_tokenizer=source_tokenizer,
                                         target_tokenizer=target_tokenizer,
                                         from_=90,
                                         to=100,
                                         shuffle=True,
                                         batch_size=batch_size)
    log.info(f'Training with tokens_per_batch set to {additional_tokens_per_batch} '
             f'and batch_size set to {batch_size}')
    training_loop(memory_test_dataset.take(1000), False)
    gpu_usage = check_gpu_usage()
    log.info(f'GPU memory utilization is {gpu_usage}')

    return gpu_usage
Example #15
def main():
    """ Check that the airports passed are valid iata codes """

    uri = f"{config['PATHS']['DATA']}airports.pickle"
    df = pd.read_pickle(uri)

    valid_codes = df["iata_code"].dropna().unique()

    airports = config["AIRPORTS"]["ORIGINS"].split(",")

    invalid_airports = [x for x in airports if x not in valid_codes]

    if invalid_airports:
        msg = f"Airports {invalid_airports} are not valid IATA codes"
        log.error(msg)
        raise ValueError(msg)

    log.info("All airports are valid IATA codes")
Example #16
File: bee.py  Project: cradess/HIVE
def start_bee():
    """The start_bee function starts all the tasks related to starting the bee.
    When this function is called the bee will start working."""
    log.debug("Bee has been started")
    while True:
        urls = get_urls()
        # update the url so different instances don't crawl the same url

        for url in urls:
            update_url(url)

        if len(urls) == 0:
            print("No URLs to be crawled, waiting for 60 seconds.")
            log.info('No URLs to be crawled, waiting for 60 seconds.')
            sleep(60)
            commit()
            continue

        connect_to_tor()

        for url in urls:
            try:
                content = get_content_from_url(url.url)
                print("{} is now being beeed".format(url.url))
                content_hashed = hash_content(content)

                content_cleaned = clean_html(content)

                check_blocked = check_blocked_keywords(content_cleaned)

                if check_blocked is None:
                    keywords = filter_keywords(content_cleaned)
                    save_content(url.id, content_cleaned, content,
                                 content_hashed, keywords)
                else:
                    print(
                        "URL: " + url.url +
                        "  has been blocked because it contains a blocked keyword "
                        + check_blocked)

            except (ValueError, NameError, TypeError) as error:
                log.error(str(error))
Example #17
def eval_step(input_ids, target_ids_, target_ids, draft_mask, refine_mask):

    (draft_predictions, draft_attention_weights, refine_predictions,
     refine_attention_weights) = Model(input_ids, target_ids_, False)
    draft_output_sequence_loss = loss_function(target_ids[:, 1:, :],
                                               draft_predictions, draft_mask)
    if config.use_refine_decoder:
        refine_output_sequence_loss = loss_function(target_ids[:, :-1, :],
                                                    refine_predictions,
                                                    refine_mask)
    else:
        refine_output_sequence_loss = 0
    regularization_loss = tf.add_n(Model.losses)
    loss = draft_output_sequence_loss + refine_output_sequence_loss + regularization_loss
    Model.summary(print_fn=log.info)
    if config.save_initial_weights:
        initial_weights = os.path.join(config.initial_weights,
                                       'initial_weights')
        Model.save_weights(initial_weights)
    return loss
Example #18
def save_evaluate_monitor(ck_pt_mgr,
                          val_dataset,
                          target_tokenizer,
                          predictions,
                          target_ids,
                          step,
                          start_time,
                          return_attention=False):

    ckpt_save_path = ck_pt_mgr.save()
    # print the detokenized training output of a single sample
    predicted = train_sanity_check(target_tokenizer, predictions, target_ids,
                                   log)
    # Run evaluation only if the predictions made by the teacher-forced output
    # are not empty and the train_loss is less than start_evaluate_when
    evaluate = bool(predicted) and train_loss.result() < config.start_evaluate_when
    if evaluate:
        (task_score, bert_score, draft_attention_weights,
         refine_attention_weights) = evaluate_validation_set(
             val_dataset, step, return_with_attention_weights=return_attention)
        early_stop_training = monitor_eval_metrics(ckpt_save_path, bert_score,
                                                   task_score,
                                                   train_loss.result(), step,
                                                   log, config)
    else:
        log.info(
            'Not running evaluation since the predictions are empty or the loss is not less than config.start_evaluate_when'
        )
        (task_score, bert_score) = (0, 0)
        early_stop_training = False
        draft_attention_weights = None
        refine_attention_weights = None
    training_results(step, train_loss.result(), train_accuracy.result(),
                     task_score, bert_score, (time.time() - start_time),
                     ckpt_save_path, log, config)
    train_loss.reset_states()
    train_accuracy.reset_states()

    return (early_stop_training, draft_attention_weights,
            refine_attention_weights)
Example #19
async def main(loop):
    log.debug('scout has been started')
    while True:
        urls = get_urls_from_database()
        # update urls immediately to avoid different instances crawling the same urls
        for url in urls:
            update_url(url)

        if len(urls) == 0:
            print("No URLs to be crawled, waiting for 60 seconds.")
            log.info('No URLs to be crawled, waiting for 60 seconds.')
            await asyncio.sleep(60)  # non-blocking sleep so the event loop stays responsive
            commit()
            continue

        results = await tor.get_content_from_urls(loop, urls)
        urls_from_content = get_urls_from_results(urls, results)

        for u in urls_from_content:
            if u is not None:
                save_url(u)
        print('Found ', len(urls_from_content), ' urls')
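For completeness, a minimal sketch of how a coroutine like this one is typically driven:

import asyncio

loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))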
Example #20
def training_loop(dataset, check_model_capacity, detokenize_samples=None):

    min_loss = 10000000
    if check_model_capacity:
        dataset = dataset.repeat(670)
    for (step, (input_ids, target_ids)) in tqdm(enumerate(dataset, 1), initial=1):
        start = time.time()
        grad_accum_flag = (step % config.gradient_accumulation_steps == 0) if config.accumulate_gradients else None
        predictions = train_step(input_ids, target_ids, grad_accum_flag)
        if grad_accum_flag is not None:
            if grad_accum_flag:
                if step % config.steps_to_print_training_info == 0:
                    predicted_ids = train_sanity_check(target_tokenizer, predictions, target_ids)
                    train_loss = batch_run_check(step, start)
        else:
            if step % config.steps_to_print_training_info == 0:
                train_loss = batch_run_check(step, start)
            if check_model_capacity:
                if min_loss > train_loss:
                    min_loss = train_loss
                else:
                    log.warning('Loss is not decreasing, watch out')
                    monitor_early_stop = monitor_run('not saving',
                                                     0,
                                                     0,
                                                     0.0,
                                                     1,
                                                     copy_best_ckpt=False)
                    
    if check_model_capacity:
        log.info(f'target_ids are {target_ids}')
        log.info(f'predicted ids are {predicted_ids}')
        if train_loss < config.min_train_loss:
            log.info('Minimum training loss reached')
        else:
            log.info("Loss did not reach the min_train_loss specified; try increasing "
                     "the model parameters or the number of train steps")
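A quick illustration of how grad_accum_flag behaves, assuming a hypothetical gradient_accumulation_steps of 4:

gradient_accumulation_steps = 4  # hypothetical value

for step in range(1, 9):
    flag = step % gradient_accumulation_steps == 0
    print(step, flag)
# Only steps 4 and 8 print True: the accumulated gradients are applied
# there, while every other step just accumulates.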
Example #21
def create_dataset(split,
                   source_tokenizer,
                   target_tokenizer,
                   from_,
                   to,
                   batch_size,
                   buffer_size=None,
                   use_tfds=True,
                   csv_path=None,
                   drop_remainder=False,
                   num_examples_to_select=config.samples_to_test):

    examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en',
                                   with_info=True,
                                   as_supervised=True)
    train_examples, val_examples = examples['train'], examples['validation']

    # pick the raw examples for the requested split
    if split == 'train':
        raw_dataset = train_examples
    elif split == 'validation':
        raw_dataset = val_examples

    tf_dataset = raw_dataset.map(tf_encode(source_tokenizer, target_tokenizer,
                                           config.input_seq_length,
                                           config.target_seq_length),
                                 num_parallel_calls=AUTOTUNE)
    tf_dataset = tf_dataset.filter(filter_max_length)
    tf_dataset = tf_dataset.take(num_examples_to_select)
    tf_dataset = tf_dataset.cache()
    if buffer_size:
        tf_dataset = tf_dataset.shuffle(buffer_size, seed=100)
    tf_dataset = tf_dataset.padded_batch(batch_size,
                                         padded_shapes=([-1], [-1]),
                                         drop_remainder=drop_remainder)
    tf_dataset = tf_dataset.prefetch(buffer_size=AUTOTUNE)
    log.info(f'{split} tf_dataset created')
    return tf_dataset
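A hedged usage sketch, assuming source_tokenizer and target_tokenizer already exist; the batch size is hypothetical (from_ and to are unused in this variant):

val_dataset = create_dataset(split='validation',
                             source_tokenizer=source_tokenizer,
                             target_tokenizer=target_tokenizer,
                             from_=90,
                             to=100,
                             batch_size=32)  # hypothetical batch size
for input_ids, target_ids in val_dataset.take(1):
    print(input_ids.shape, target_ids.shape)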
Example #22
def create_vocab(tokenizer_path, tok_type):

    try:
        tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file(config.tokenizer_path)
    except FileNotFoundError:
        log.warning(f'Vocab files not available in {config.tokenizer_path} so building it from the training set')
        if config.use_tfds:
            examples, metadata = tfds.load(config.tfds_name, with_info=True, as_supervised=True)
            train_examples = examples['train']
            if tok_type == 'source':
                tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
                    (ip_seq.numpy() for ip_seq, _ in train_examples), target_vocab_size=2**13)
            else:
                tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
                    (op_seq.numpy() for _, op_seq in train_examples), target_vocab_size=2**13)
        tokenizer.save_to_file(config.tokenizer_path)
    if tok_type == 'source':
        assert tokenizer.vocab_size + 2 == config.input_vocab_size, \
            f'{tok_type} vocab size in the configuration script should be {tokenizer.vocab_size + 2}'
    else:
        assert tokenizer.vocab_size + 2 == config.output_vocab_size, \
            f'{tok_type} vocab size in the configuration script should be {tokenizer.vocab_size + 2}'
    log.info(f'{tok_type} vocab file created and saved to {config.tokenizer_path}')
    return tokenizer
Example #23
def training_results(step, rouge_score, bert_score, timing_info,
                     ckpt_save_path):

    log.info(
        model_metrics.format(step, train_loss.result(),
                             train_accuracy.result(), rouge_score, bert_score))
    log.info(evaluation_step.format(step, timing_info))
    log.info(checkpoint_details.format(step, ckpt_save_path))
Example #24
def create_dataset(split,
                   source_tokenizer,
                   target_tokenizer,
                   from_,
                   to,
                   batch_size,
                   shuffle=None,
                   use_tfds=True,
                   drop_remainder=False,
                   num_examples_to_select=config.samples_to_test):

    raw_dataset, ds_info = tfds.load(config.tfds_name,
                                     with_info=True,
                                     as_supervised=True,
                                     data_dir=config.tfds_data_dir,
                                     builder_kwargs=config.tfds_data_version,
                                     split=tfds.core.ReadInstruction(
                                         split, from_=from_, to=to, unit='%'))

    tf_dataset = raw_dataset.map(tf_encode(source_tokenizer, target_tokenizer,
                                           config.input_seq_length,
                                           config.target_seq_length),
                                 num_parallel_calls=AUTOTUNE)
    tf_dataset = tf_dataset.filter(filter_max_length)
    tf_dataset = tf_dataset.take(num_examples_to_select)
    tf_dataset = tf_dataset.cache()
    if shuffle:
        buffer_size = (sum(
            [i.num_examples for i in list(ds_info.splits.values())]))
        tf_dataset = tf_dataset.shuffle(buffer_size, seed=100)
    tf_dataset = tf_dataset.padded_batch(batch_size,
                                         padded_shapes=([-1], [-1]),
                                         drop_remainder=drop_remainder)
    tf_dataset = tf_dataset.prefetch(buffer_size=AUTOTUNE)
    log.info(f'{split} tf_dataset created')

    return tf_dataset
Example #25
def main():
    """
        Get all flights of each pair and export them as a csv for each pair
    """

    path_raw = config["PATHS"]["DATA"] + f"flights/{date.today():%Y_%m_%d}/"

    # Create folder
    os.makedirs(path_raw, exist_ok=True)

    airports_pairs = get_pairs()
    total_pairs = len(airports_pairs)

    for i, (origin, dest) in enumerate(airports_pairs):

        log.info(
            f"Querying flights from '{origin}' to '{dest}' ({i + 1}/{total_pairs})"
        )
        df = query_pair(origin, dest)

        if df is not None:
            uri = f"{path_raw}{origin}_{dest}.csv"
            df.to_csv(uri, index=False)
            log.debug(f"Exporting '{uri}'")
Example #26
def batch_run_check(batch, start):
    if config.run_tensorboard:
        with train_output_sequence_writer.as_default():
            tf.summary.scalar('train_loss', train_loss.result(), step=batch)
            tf.summary.scalar('train_accuracy',
                              train_accuracy.result(),
                              step=batch)
    if config.display_model_summary:
        Model.summary(print_fn=log.info)
        log.info(batch_zero.format(time.time() - start))
        config.display_model_summary = False
    log.info(
        batch_run_details.format(train_loss.result(), train_accuracy.result()))
    return train_loss.result()
Example #27
def batch_run_check(batch, start_time, bert_f1_score):

    if config.run_tensorboard:
        with train_output_sequence_writer.as_default():
            tf.summary.scalar('train_loss', train_loss.result(), step=batch)
    if config.display_model_summary:
        Model.summary(print_fn=log.info)
        log.info(batch_zero.format(time.time() - start_time))
        config.display_model_summary = False
    log.info(
        batch_run_details.format(
            tf.debugging.assert_all_finite(train_loss.result(),
                                           message="NaN's or Inf's.",
                                           name='NAN_assertion'),
            bert_f1_score.numpy()))
Example #28
             top_k=config.top_k):

        batch_size = tf.shape(input_ids)[0]
        if training is not None:
            return self.fit(input_ids, target_ids, training, look_ahead_mask,
                            dec_padding_mask, batch_size)
        else:
            return self.predict(input_ids,
                                batch_size=batch_size,
                                draft_decoder_type=decoder_type,
                                beam_size=beam_size,
                                length_penalty=length_penalty,
                                temperature=temperature,
                                top_p=top_p,
                                top_k=top_k)


Model = Bertified_transformer(
    num_layers=config.num_layers,
    d_model=config.d_model,
    num_heads=config.num_heads,
    dff=config.dff,
    input_vocab_size=config.input_vocab_size,
    target_vocab_size=config.target_vocab_size,
    add_pointer_generator=config.add_pointer_generator)

if config.print_config:
    if config['add_bias']:
        config['add_bias'] = True
    log.info(f'Configuration used \n {config}')
Example #29
def create_dataset(split,
                   source_tokenizer,
                   target_tokenizer,
                   from_,
                   to,
                   batch_size,
                   shuffle=None,
                   drop_remainder=False,
                   num_examples_to_select=config.samples_to_train):

    if config.tfds_name == 'en_tam_parallel_text':
        en_tam_ds = defaultdict(list)
        record_count = 0
        #List of available datasets in the package
        dataset_names = [
            'GNOME_v1_en_to_ta', 'GNOME_v1_en_AU_to_ta',
            'GNOME_v1_en_CA_to_ta', 'GNOME_v1_en_GB_to_ta',
            'GNOME_v1_en_US_to_ta', 'KDE4_v2_en_to_ta', 'KDE4_v2_en_GB_to_ta',
            'Tatoeba_v20190709_en_to_ta', 'Ubuntu_v14.10_en_to_ta_LK',
            'Ubuntu_v14.10_en_GB_to_ta_LK', 'Ubuntu_v14.10_en_AU_to_ta_LK',
            'Ubuntu_v14.10_en_CA_to_ta_LK', 'Ubuntu_v14.10_en_US_to_ta_LK',
            'Ubuntu_v14.10_en_to_ta', 'Ubuntu_v14.10_en_GB_to_ta',
            'Ubuntu_v14.10_en_AU_to_ta', 'Ubuntu_v14.10_en_CA_to_ta',
            'Ubuntu_v14.10_en_NZ_to_ta', 'Ubuntu_v14.10_en_US_to_ta',
            'OpenSubtitles_v2018_en_to_ta', 'OpenSubtitles_v2016_en_to_ta',
            'en_ta', 'github_joshua_en_ta'
        ]
        for name in dataset_names:
            en_tam_ds[(name, 'metadata_' + name)] = tfds.load(
                f'{config.tfds_name}/' + name,
                with_info=True,
                as_supervised=True,
                data_dir=config.tfds_data_dir,
                builder_kwargs={'version': config.tfds_data_version},
            )
        for i, j in en_tam_ds.keys():
            record_count += (sum([
                i.num_examples
                for i in list(en_tam_ds[(i, j)][1].splits.values())
            ]))
        if not config.test_script:
            log.info(f'Total record count without filtering is {record_count}')
        #initialize the first dataset to the train_examples variable
        #Concatenate all the train datasets
        if split == 'train':
            raw_dataset = en_tam_ds[('GNOME_v1_en_to_ta',
                                     'metadata_GNOME_v1_en_to_ta')][0]['train']
            for typ in list(en_tam_ds.keys())[1:]:
                raw_dataset = raw_dataset.concatenate(
                    en_tam_ds[typ][0]['train'])
        #other splits include validation and test
        else:
            raw_dataset = en_tam_ds[('en_ta', 'metadata_en_ta')][0][split]
    else:
        raw_dataset, ds_info = tfds.load(
            config.tfds_name,
            with_info=True,
            as_supervised=True,
            data_dir=config.tfds_data_dir,
            builder_kwargs={'version': config.tfds_data_version},
            split=tfds.core.ReadInstruction(split,
                                            from_=from_,
                                            to=to,
                                            unit='%'))
        record_count = (sum(
            [i.num_examples for i in list(ds_info.splits.values())]))
    tf_dataset = raw_dataset.map(tf_encode(source_tokenizer, target_tokenizer),
                                 num_parallel_calls=parallel_calls)
    tf_dataset = tf_dataset.filter(filter_max_length)
    if config.model == 'bertified_transformer':
        tf_dataset = tf_dataset.map(tf_pad_encoded_ids)
    tf_dataset = tf_dataset.take(num_examples_to_select)
    tf_dataset = tf_dataset.cache()
    if shuffle:
        tf_dataset = tf_dataset.shuffle(record_count, seed=100)
    tf_dataset = tf_dataset.padded_batch(batch_size,
                                         padded_shapes=([-1], [-1]),
                                         drop_remainder=drop_remainder)
    tf_dataset = tf_dataset.prefetch(buffer_size=parallel_calls)
    log.info(f'{split} tf_dataset created')

    return tf_dataset
Example #30
                    (task_score, 
                    bert_score,
                    draft_attention_weights,
                    refine_attention_weights) = evaluate_validation_set(test_dataset,
                                                                       step,
                                                                       decoder_type,
                                                                       beam_size,
                                                                       length_penalty,
                                                                       temperature, 
                                                                       top_p,
                                                                       top_k)
                    combined_metric = (0.8 * bert_score) + (0.2 * task_score)
                    if max_combined_metric < combined_metric:
                        max_combined_metric = combined_metric
                        best_beam_size = beam_size
                        best_length_penalty = length_penalty
                        best_temperature = temperature 
                        best_top_p = top_p
                        best_top_k = top_k
                    log.info(infer_template.format(task_score, bert_score, combined_metric, beam_size, step))
log.info(best_combo.format(best_beam_size,
                           best_length_penalty,
                           best_temperature,
                           best_top_p,
                           best_top_k,
                           max_combined_metric))