示例#1
0
def query_pair(origin, destination, n_days=366):
    """
        Gather the flights between two airports into a single table.

        Walks `n_days` consecutive dates starting today and sends a query
        only for today itself and for every first-of-month date in that
        window; all other dates are skipped.

        Args:
            origin:         code for origin airport
            destination:    code for destination airport
            n_days:         number of consecutive days, counted from today,
                            that are scanned for query dates

        Returns:
            The concatenation of every parsed result with a fresh index, or
            None (after logging a warning) when no query returned quotes.
    """

    first_day = date.today()
    frames = []

    for offset in range(n_days):
        current_day = first_day + timedelta(offset)

        # Only today and the first day of each month are worth a request
        wanted = current_day == first_day or current_day.day == 1
        if not wanted:
            log.trace(f"Skiping day '{current_day}'")
            continue

        payload = query_flights(origin, destination, current_day).json()

        # Responses without quotes contribute nothing to the result
        if payload["Quotes"]:
            frames.append(parse_data(payload))

    if not frames:
        log.warning(f"No flights from '{origin}' to '{destination}'")
        return None

    return pd.concat(frames).reset_index(drop=True)
示例#2
0
def write_output_sequence(tar_real, predictions, step, write_output_seq):
    """
    Score a batch of predictions against its targets and optionally dump them.

    Every (target, prediction) pair is squeezed and detokenized with
    `target_tokenizer`; the resulting sentences are scored with ROUGE
    (mean of the rouge-1, rouge-2 and rouge-l f-scores, averaged over the
    pairs) and with BERT score (mean f1).

    Args:
        tar_real:          iterable of target sequences
        predictions:       iterable of predicted sequences, paired with `tar_real`
        step:              object exposing `.numpy()`; its value is appended to
                           `config.output_sequence_write_path` to name the output file
        write_output_seq:  when truthy, write one "reference<TAB>hypothesis"
                           line per pair to that file

    Returns:
        Tuple `(avg_rouge_f1, avg_bert_f1)`. Both are 0 when either score
        could not be computed.
    """
    ref_sents = []
    hyp_sents = []
    rouge_all = Rouge()
    for tar, ref_hyp in zip(tar_real, predictions):
        detokenized_refs, detokenized_hyp_sents = detokenize(
            target_tokenizer, tf.squeeze(tar), tf.squeeze(ref_hyp))
        ref_sents.append(detokenized_refs)
        hyp_sents.append(detokenized_hyp_sents)
    try:
        rouges = rouge_all.get_scores(ref_sents, hyp_sents)
        avg_rouge_f1 = np.mean([
            np.mean([
                rouge_scores['rouge-1']["f"], rouge_scores['rouge-2']["f"],
                rouge_scores['rouge-l']["f"]
            ]) for rouge_scores in rouges
        ])
        _, _, bert_f1 = b_score(ref_sents,
                                hyp_sents,
                                model_type=config.bert_score_model)
        avg_bert_f1 = np.mean(bert_f1.numpy())
    except Exception:
        # Scoring is best effort, but only ordinary errors are absorbed:
        # KeyboardInterrupt / SystemExit must still be able to stop the run.
        log.warning(
            'Some problem while calculating ROUGE so setting ROUGE score to zero'
        )
        avg_rouge_f1 = 0
        avg_bert_f1 = 0

    if write_output_seq:
        with tf.io.gfile.GFile(
                config.output_sequence_write_path + str(step.numpy()),
                'w') as f:
            for ref, hyp in zip(ref_sents, hyp_sents):
                f.write(ref + '\t' + hyp + '\n')
    return (avg_rouge_f1, avg_bert_f1)
示例#3
0
def monitor_run(ckpt_save_path,
                bert_score,
                rouge_score,
                train_loss,
                step,
                to_monitor=config.monitor_metric,
                copy_best_ckpt=True):
    """
    Record the evaluation metrics, keep the best checkpoint and decide
    whether training should continue.

    Args:
        ckpt_save_path:  path of the checkpoint that was just saved; its
                         directory is scanned for files whose name contains
                         the last path component
        bert_score:      BERT f1 of this evaluation
        rouge_score:     ROUGE f1 of this evaluation
        train_loss:      training loss, compared with `config.min_train_loss`
        step:            step used for the tensorboard summaries
        to_monitor:      key of the tracked metric: 'BERT_f1', 'ROUGE_f1'
                         or 'combined_metric'
        copy_best_ckpt:  when False, an improved metric still resets the
                         tolerance counter and is recorded, but no checkpoint
                         file is listed or copied

    Returns:
        False when training should stop (minimum training loss reached, or
        tolerance exceeded while `config.early_stop` is set), True otherwise.
    """

    ckpt_fold, ckpt_string = os.path.split(ckpt_save_path)
    if config.run_tensorboard:
        with valid_output_sequence_writer.as_default():
            tf.summary.scalar('ROUGE_f1', rouge_score, step=step)
            tf.summary.scalar('BERT_f1', bert_score, step=step)
    monitor_metrics = {'BERT_f1': bert_score, 'ROUGE_f1': rouge_score}
    # combined_metric is the weighted sum of (BERT_f1, ROUGE_f1), rounded to 2 decimals
    weighted_scores = [
        score * weight for score, weight in zip(
            (bert_score, rouge_score), config.combined_metric_weights)
    ]
    monitor_metrics['combined_metric'] = round(
        tf.reduce_sum(weighted_scores).numpy(), 2)
    log.info(f"combined_metric {monitor_metrics['combined_metric']:4f}")
    if config.last_recorded_value < monitor_metrics[to_monitor]:
        # the monitored metric improved: reset the tolerance counter and
        # remember the new best value
        config.tolerance = 0
        config.last_recorded_value = monitor_metrics[to_monitor]
        if copy_best_ckpt:
            ckpt_files_tocopy = [
                name for name in os.listdir(ckpt_fold) if ckpt_string in name
            ]
            log.info(
                f'{to_monitor} is {monitor_metrics[to_monitor]:4f} so checkpoint files {ckpt_string} \
                 will be copied to best checkpoint directory')
            # copy the best checkpoints
            shutil.copy2(os.path.join(ckpt_fold, 'checkpoint'),
                         config.best_ckpt_path)
            for name in ckpt_files_tocopy:
                shutil.copy2(os.path.join(ckpt_fold, name),
                             config.best_ckpt_path)
    else:
        config.tolerance += 1
    # stop if minimum training loss is reached
    if train_loss < config.min_train_loss:
        log.info('Stop training since minimum training loss reached')
        return False
    # Warn and early stop. This check used to sit behind an unconditional
    # `return True` and therefore never ran.
    if config.tolerance > config.tolerance_threshold:
        log.warning('Tolerance exceeded')
        if config.early_stop:
            log.info(
                f'Early stopping since the {to_monitor} reached the tolerance threshold'
            )
            return False
    return True
示例#4
0
 def evaluate_bert_score(self):
      """
      Return the mean BERT-score f1 of `self.hyp_sents` against `self.ref_sents`.

      The score is computed on the CPU with the model named by
      `config.bert_score_model`. Scoring is best effort: when it raises an
      ordinary exception a warning is logged and 0 is returned instead.
      """
      try:
          _, _, bert_f1 = b_score(self.ref_sents, self.hyp_sents,
                                  model_type=config.bert_score_model,
                                  device='cpu')
          avg_bert_f1 = np.mean(bert_f1.numpy())
      except Exception:
          # Not a bare `except:` so KeyboardInterrupt / SystemExit still propagate
          log.warning('Some problem while calculating BERT score so setting it to zero')
          avg_bert_f1 = 0

      return avg_bert_f1
示例#5
0
    def evaluate_rouge(self):
        """
        Return the average ROUGE f-score of the stored sentence pairs.

        `self.calculate_rouge.get_scores` is called with `self.ref_sents`
        and `self.hyp_sents`; for every returned entry the rouge-1, rouge-2
        and rouge-l f-scores are averaged, and the mean of those per-entry
        averages is returned. Scoring is best effort: when it raises an
        ordinary exception a warning is logged and 0 is returned instead.
        """
        try:
            all_rouge_scores = self.calculate_rouge.get_scores(self.ref_sents, self.hyp_sents)
            per_pair_f1 = [
                np.mean([
                    rouge_scores['rouge-1']["f"],
                    rouge_scores['rouge-2']["f"],
                    rouge_scores['rouge-l']["f"],
                ])
                for rouge_scores in all_rouge_scores
            ]
            avg_rouge_f1 = np.mean(per_pair_f1)
        except Exception:
            # Not a bare `except:` so KeyboardInterrupt / SystemExit still propagate
            log.warning('Some problem while calculating ROUGE so setting it to zero')
            avg_rouge_f1 = 0

        return avg_rouge_f1
示例#6
0
def check_ckpt(checkpoint_path):
    """
    Build a checkpoint manager for `checkpoint_path` and restore the latest
    checkpoint when one exists.

    The checkpoint tracks the module-level `Model` and `optimizer`, and the
    manager keeps at most 10 checkpoints. When nothing is found in
    `checkpoint_path` a warning is logged and nothing is restored.

    Returns:
        The `tf.train.CheckpointManager` that was created.
    """
    checkpoint = tf.train.Checkpoint(Model=Model, optimizer=optimizer)
    manager = tf.train.CheckpointManager(checkpoint,
                                         checkpoint_path,
                                         max_to_keep=10)

    # Nothing saved yet: keep the freshly initialised weights
    if not tf.train.latest_checkpoint(checkpoint_path):
        log.warning('No checkpoint found so using the initialized_weights')
        return manager

    checkpoint.restore(manager.latest_checkpoint)
    log.info(manager.latest_checkpoint + ' restored')
    return manager
示例#7
0
def query_flights(
    origin,
    destination,
    day,
    max_attempts=20,
    seconds_sleep=1,
    country="ES",
    currency="EUR",
    locale="en-US",
):
    """
        Request the flights for one route and day, retrying on rate limits

        Args:
            origin:         code for origin airport
            destination:    code for destination airport
            day:            day for the flights [date]
            max_attempts:   maximum number of requests to send
            seconds_sleep:  seconds to sleep after a successful response,
                            right before returning it
            country:        code for country (default: ES)
            currency:       code for currency (default: EUR)
            locale:         code for output info (default: en-US)

        Returns:
            The first response whose status code is 200.

        Raises:
            TimeoutError:   no attempt produced a 200 response
            (whatever `response.raise_for_status()` raises for a status
            other than 200 or 429)
    """

    day_label = f"{day:%Y-%m-%d}"
    url = f"{BASE_URL}{country}/{currency}/{locale}/{origin}/{destination}/{day_label}"

    for attempt in range(max_attempts):

        log.debug(
            f"Quering {origin}-{destination} for date '{day}' (attempt {attempt})"
        )

        reply = requests.get(url, headers=HEADERS)
        status = reply.status_code

        if status == 200:
            sleep(seconds_sleep)
            return reply

        # Anything that is not a rate limit is reported through the response itself
        if status != 429:
            reply.raise_for_status()
            continue

        # 'Too many requests': wait longer after every failed attempt
        log.warning(f"API limit reached at attempt {attempt + 1}")
        sleep(2 * attempt + 1)

    log.error(f"Number max of attempts reached ({max_attempts})")
    raise TimeoutError("TimeOut")
示例#8
0
def training_loop(dataset, check_model_capacity, detokenize_samples=None):
    """
    Run `train_step` over every batch of `dataset`, reporting periodically.

    Args:
        dataset:               iterable of (input_ids, target_ids) batches; when
                               `check_model_capacity` is set it is repeated 670 times
        check_model_capacity:  when truthy, track whether the loss keeps
                               decreasing and report at the end whether
                               `config.min_train_loss` was reached
        detokenize_samples:    not used by this function

    Notes:
        With `config.accumulate_gradients` enabled, `train_step` receives a flag
        that is True on every `config.gradient_accumulation_steps`-th step and
        reporting only happens on such steps; otherwise the flag is None.
    """

    min_loss = 10000000
    # Pre-bind the names read after (or late in) the loop: they are only
    # assigned on reporting steps, so reading them earlier used to raise
    # UnboundLocalError.
    train_loss = None
    predicted_ids = None
    target_ids = None
    if check_model_capacity:
        dataset = dataset.repeat(670)
    for (step, (input_ids, target_ids)) in tqdm(enumerate(dataset, 1), initial=1):
        start = time.time()
        if config.accumulate_gradients:
            grad_accum_flag = (step % config.gradient_accumulation_steps) == 0
        else:
            grad_accum_flag = None
        predictions = train_step(
                                  input_ids,
                                  target_ids,
                                  grad_accum_flag
                                  )
        if grad_accum_flag is not None:
            if grad_accum_flag:
                if step % config.steps_to_print_training_info == 0:
                    predicted_ids = train_sanity_check(target_tokenizer, predictions, target_ids)
                    train_loss = batch_run_check(
                                              step,
                                              start
                                              )
        else:
            if step % config.steps_to_print_training_info == 0:
                train_loss = batch_run_check(
                                          step,
                                          start
                                          )
            # Nothing to compare until the first loss has been reported
            if check_model_capacity and train_loss is not None:
                if min_loss > train_loss:
                    min_loss = train_loss
                else:
                    log.warning('Loss not decreasing watch out')
                    # NOTE(review): the return value (continue/stop) is ignored
                    # here, as it was before — confirm whether the loop should break.
                    monitor_run(
                                    'not saving',
                                    0,
                                    0,
                                    0.0,
                                    1,
                                    copy_best_ckpt=False
                                    )

    if check_model_capacity:
        log.info(f'target_ids are {target_ids}')
        log.info(f'predicted ids are {predicted_ids}')
        if train_loss is not None and train_loss < config.min_train_loss:
            log.info('Minimum training loss reached')
        else:
            log.info("Loss didn't reach upto the min_train_loss specified, try to increase\
                  the parameters of the model or number of train steps")
示例#9
0
    def evaluate_bleu_score(self, case_sensitive=False):
        """
        Return the BLEU score of `self.hyp_sents` against `self.ref_sents`.

        The sentences are written one per line to two temporary files, which
        are handed to `compute_bleu.bleu_wrapper` and removed afterwards.

        Args:
            case_sensitive:  forwarded to `compute_bleu.bleu_wrapper`

        Returns:
            The value returned by `compute_bleu.bleu_wrapper`, or 0 (after
            logging a warning) when it raises an ordinary exception.
        """
        import os  # local import: only needed for the temp-file cleanup

        # Only the paths are needed; close the handles right away so they do
        # not leak and the files can be reopened by name below.
        ref_file = tempfile.NamedTemporaryFile(delete=False)
        hyp_file = tempfile.NamedTemporaryFile(delete=False)
        ref_file.close()
        hyp_file.close()

        try:
            with tf.io.gfile.GFile(ref_file.name, 'w') as f_ref:
                with tf.io.gfile.GFile(hyp_file.name, 'w') as f_hyp:
                    for references, hypothesis_output in zip(self.ref_sents, self.hyp_sents):
                        f_hyp.write(hypothesis_output + '\n')
                        f_ref.write(references + '\n')
            try:
                bleu_score = compute_bleu.bleu_wrapper(ref_filename=ref_file.name,
                                                       hyp_filename=hyp_file.name,
                                                       case_sensitive=case_sensitive)
            except Exception:
                # Not a bare `except:` so KeyboardInterrupt / SystemExit still propagate
                log.warning('Some problem while calculating BLEU score so setting it to zero')
                bleu_score = 0
        finally:
            # delete=False means nobody else removes these files
            for path in (ref_file.name, hyp_file.name):
                try:
                    os.remove(path)
                except OSError:
                    pass

        return bleu_score
def create_vocab(tokenizer_path, tok_type):
    """
    Load the subword tokenizer stored at `tokenizer_path`, building it first
    when the file does not exist.

    When loading raises FileNotFoundError and `config.use_tfds` is set, the
    tokenizer is built from the 'train' split of `config.tfds_name` (from the
    input sequences when `tok_type` is 'source', otherwise from the output
    sequences) and saved to `tokenizer_path`.

    Args:
        tokenizer_path:  file the tokenizer is loaded from / saved to
        tok_type:        'source' selects the input side and
                         `config.input_vocab_size`; any other value selects the
                         output side and `config.output_vocab_size`

    Returns:
        The loaded or newly built tokenizer.

    Raises:
        FileNotFoundError:  the file is missing and `config.use_tfds` is not
                            set, so there is nothing to build the vocabulary from
        AssertionError:     `tokenizer.vocab_size + 2` differs from the
                            configured vocabulary size
    """
    try:
        tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file(tokenizer_path)
    except FileNotFoundError:
        log.warning(f'Vocab files not available in {tokenizer_path} so building it from the training set')
        if not config.use_tfds:
            # No corpus to build from: surface the original error instead of
            # failing later on an unbound `tokenizer`.
            raise
        examples, _ = tfds.load(config.tfds_name, with_info=True, as_supervised=True)
        train_examples = examples['train']
        if tok_type == 'source':
            corpus = (ip_seq.numpy() for ip_seq, _ in train_examples)
        else:
            corpus = (op_seq.numpy() for _, op_seq in train_examples)
        tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
            corpus, target_vocab_size=2**13)
        tokenizer.save_to_file(tokenizer_path)

    if tok_type == 'source':
        expected_vocab_size = config.input_vocab_size
    else:
        expected_vocab_size = config.output_vocab_size
    # NOTE(review): the +2 presumably accounts for two extra special tokens — confirm
    assert (tokenizer.vocab_size + 2 == expected_vocab_size), f'{tok_type}vocab size in configuration script should be {tokenizer.vocab_size+2}'
    log.info(f'{tok_type} vocab file created and saved to {tokenizer_path}')
    return tokenizer