def prepare_global_logging(serialization_dir: str, file_friendly_logging: bool) -> None:
    """
    This function configures 3 global logging attributes - streaming stdout and stderr
    to a file as well as the terminal, setting the formatting for the python logging
    library and setting the interval frequency for the Tqdm progress bar.

    Note that this function does not set the logging level, which is set in ``allennlp/run.py``.

    Parameters
    ----------
    serialization_dir : ``str``, required.
        The directory to stream logs to.
    file_friendly_logging : ``bool``, required.
        Whether logs should clean the output to prevent carriage returns
        (used to update progress bars on a single terminal line).
    """
    Tqdm.set_slower_interval(file_friendly_logging)
    std_out_file = os.path.join(serialization_dir, "stdout.log")
    sys.stdout = TeeLogger(std_out_file,  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)

    stdout_handler = logging.FileHandler(std_out_file)
    stdout_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(stdout_handler)
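# A minimal usage sketch for `prepare_global_logging` above, assuming `TeeLogger`
# and `Tqdm` are importable from the surrounding package. The temp dir stands in
# for a real serialization directory.
import logging
import tempfile

serialization_dir = tempfile.mkdtemp()
prepare_global_logging(serialization_dir, file_friendly_logging=True)
# The function deliberately does not set the logging level, so set it here.
logging.getLogger().setLevel(logging.INFO)
logging.getLogger(__name__).info("Lands in stdout.log via the added FileHandler.")
print("Anything written to sys.stdout is tee'd to the same file by TeeLogger.")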
def _validation_loss(self) -> Tuple[float, int]: """ Computes the validation loss. Returns it and the number of batches. """ logger.info("Validating") self._model.eval() val_generator = self._iterator(self._validation_data, num_epochs=1, cuda_device=self._iterator_device, for_training=False) num_validation_batches = self._iterator.get_num_batches(self._validation_data) val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches) batches_this_epoch = 0 val_loss = 0 for batch in val_generator_tqdm: batches_this_epoch += 1 loss = self._batch_loss(batch, for_training=False) val_loss += loss.data.cpu().numpy() # Update the description with the latest metrics val_metrics = self._get_metrics(val_loss, batches_this_epoch) description = self._description_from_metrics(val_metrics) val_generator_tqdm.set_description(description, refresh=False) return val_loss, batches_this_epoch
def from_instances(cls,
                   instances: Iterable['adi.Instance'],
                   min_count: Dict[str, int] = None,
                   max_vocab_size: Union[int, Dict[str, int]] = None,
                   non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
                   pretrained_files: Optional[Dict[str, str]] = None,
                   only_include_pretrained_words: bool = False,
                   tokens_to_add: Dict[str, List[str]] = None,
                   min_pretrained_embeddings: Dict[str, int] = None) -> 'Vocabulary':
    """
    Constructs a vocabulary given a collection of `Instances` and some parameters.
    We count all of the vocabulary items in the instances, then pass those counts
    and the other parameters to :func:`__init__`.  See that method for a description
    of what the other parameters do.
    """
    logger.info("Fitting token dictionary from dataset.")
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)

    return cls(counter=namespace_token_counts,
               min_count=min_count,
               max_vocab_size=max_vocab_size,
               non_padded_namespaces=non_padded_namespaces,
               pretrained_files=pretrained_files,
               only_include_pretrained_words=only_include_pretrained_words,
               tokens_to_add=tokens_to_add,
               min_pretrained_embeddings=min_pretrained_embeddings)
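# Hedged usage sketch for `Vocabulary.from_instances`. `MyDatasetReader` and the
# data path are placeholders; only the `min_count` / `max_vocab_size` semantics
# come from the method above.
reader = MyDatasetReader()  # hypothetical DatasetReader subclass
train_instances = reader.read("/path/to/train_data.txt")  # placeholder path
vocab = Vocabulary.from_instances(train_instances,
                                  min_count={"tokens": 3},           # drop rare tokens
                                  max_vocab_size={"tokens": 50000})  # cap the namespace
print("tokens namespace size:", vocab.get_vocab_size("tokens"))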
def extend_from_instances(self, params: Params, instances: Iterable['adi.Instance'] = ()) -> None: """ Extends an already generated vocabulary using a collection of instances. """ min_count = params.pop("min_count", None) max_vocab_size = pop_max_vocab_size(params) non_padded_namespaces = params.pop("non_padded_namespaces", DEFAULT_NON_PADDED_NAMESPACES) pretrained_files = params.pop("pretrained_files", {}) min_pretrained_embeddings = params.pop("min_pretrained_embeddings", None) only_include_pretrained_words = params.pop_bool("only_include_pretrained_words", False) tokens_to_add = params.pop("tokens_to_add", None) params.assert_empty("Vocabulary - from dataset") logger.info("Fitting token dictionary from dataset.") namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int)) for instance in Tqdm.tqdm(instances): instance.count_vocab_items(namespace_token_counts) self._extend(counter=namespace_token_counts, min_count=min_count, max_vocab_size=max_vocab_size, non_padded_namespaces=non_padded_namespaces, pretrained_files=pretrained_files, only_include_pretrained_words=only_include_pretrained_words, tokens_to_add=tokens_to_add, min_pretrained_embeddings=min_pretrained_embeddings)
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]: _warned_tqdm_ignores_underscores = False check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = data_iterator(instances, num_epochs=1, shuffle=False) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances)) for batch in generator_tqdm: batch = util.move_to_device(batch, cuda_device) model(**batch) metrics = model.get_metrics() if (not _warned_tqdm_ignores_underscores and any(metric_name.startswith("_") for metric_name in metrics)): logger.warning("Metrics with names beginning with \"_\" will " "not be logged to the tqdm progress bar.") _warned_tqdm_ignores_underscores = True description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items() if not name.startswith("_")]) + " ||" generator_tqdm.set_description(description, refresh=False) return model.get_metrics(reset=True)
def get_from_cache(url: str, cache_dir: str = None) -> str: """ Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file. """ if cache_dir is None: cache_dir = DATASET_CACHE os.makedirs(cache_dir, exist_ok=True) # make HEAD request to check ETag response = requests.head(url, allow_redirects=True) if response.status_code != 200: raise IOError("HEAD request failed for url {}".format(url)) # add ETag to filename if it exists etag = response.headers.get("ETag") filename = url_to_filename(url, etag) # get cache path to put the file cache_path = os.path.join(cache_dir, filename) if not os.path.exists(cache_path): # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. with tempfile.NamedTemporaryFile() as temp_file: logger.info("%s not found in cache, downloading to %s", url, temp_file.name) # GET file object req = requests.get(url, stream=True) content_length = req.headers.get('Content-Length') total = int(content_length) if content_length is not None else None progress = Tqdm.tqdm(unit="B", total=total) for chunk in req.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) temp_file.write(chunk) progress.close() # we are copying the file before closing it, so flush to avoid truncation temp_file.flush() # shutil.copyfileobj() starts at the current position, so go to the start temp_file.seek(0) logger.info("copying %s to cache at %s", temp_file.name, cache_path) with open(cache_path, 'wb') as cache_file: shutil.copyfileobj(temp_file, cache_file) logger.info("creating metadata file for %s", cache_path) meta = {'url': url, 'etag': etag} meta_path = cache_path + '.json' with open(meta_path, 'w') as meta_file: json.dump(meta, meta_file) logger.info("removing temp file %s", temp_file.name) return cache_path
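# `url_to_filename` is defined elsewhere in the package; below is a plausible
# sketch of the scheme it implements (hash of the URL plus hash of the ETag, so
# a changed resource gets a fresh cache entry). The details are assumptions,
# not the library's actual code.
import hashlib

def url_to_filename_sketch(url: str, etag: str = None) -> str:
    filename = hashlib.sha256(url.encode("utf-8")).hexdigest()
    if etag is not None:
        filename += "." + hashlib.sha256(etag.encode("utf-8")).hexdigest()
    return filename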
def http_get(url: str, temp_file: IO) -> None: req = requests.get(url, stream=True) content_length = req.headers.get('Content-Length') total = int(content_length) if content_length is not None else None progress = Tqdm.tqdm(unit="B", total=total) for chunk in req.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) temp_file.write(chunk) progress.close()
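# The same streaming-download pattern with plain `requests` and stock `tqdm`,
# outside the library's wrappers; the URL is a placeholder.
import tempfile
import requests
from tqdm import tqdm

url = "https://example.com/some-large-file"  # placeholder
with tempfile.NamedTemporaryFile() as temp_file:
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    with tqdm(unit="B", total=total) as progress:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                progress.update(len(chunk))
                temp_file.write(chunk)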
def embed_file(self,
               input_file: IO,
               output_file_path: str,
               output_format: str = "all",
               batch_size: int = DEFAULT_BATCH_SIZE) -> None:
    """
    Computes ELMo embeddings from an input_file where each line contains a sentence
    tokenized by whitespace. The ELMo embeddings are written out in HDF5 format,
    where each sentence is saved in a dataset.

    Parameters
    ----------
    input_file : ``IO``, required
        A file with one tokenized sentence per line.
    output_file_path : ``str``, required
        A path to the output hdf5 file.
    output_format : ``str``, optional, (default = "all")
        The embeddings to output.  Must be one of "all", "top", or "average".
    batch_size : ``int``, optional, (default = 64)
        The number of sentences to process in ELMo at one time.
    """
    assert output_format in ["all", "top", "average"]

    # Tokenizes the sentences.
    sentences = [line.strip() for line in input_file if line.strip()]
    split_sentences = [sentence.split() for sentence in sentences]
    # Uses the sentence as the key.
    embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size))

    logger.info("Processing sentences.")
    with h5py.File(output_file_path, 'w') as fout:
        for key, embeddings in Tqdm.tqdm(embedded_sentences):
            if key in fout.keys():
                logger.warning(f"Key already exists in {output_file_path}, skipping: {key}")
            else:
                if output_format == "all":
                    output = embeddings
                elif output_format == "top":
                    output = embeddings[2]
                elif output_format == "average":
                    output = numpy.average(embeddings, axis=0)

                fout.create_dataset(
                        key,
                        output.shape, dtype='float32',
                        data=output
                )
    input_file.close()
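# Reading the embeddings back out of the HDF5 file written above; a sketch
# assuming output_format="all", so each dataset is keyed by the sentence text
# and has shape (num_layers, num_tokens, embedding_dim). The path is a placeholder.
import h5py

with h5py.File("elmo_embeddings.hdf5", "r") as fin:
    for key in fin.keys():
        embeddings = fin[key][...]  # materialize the dataset as a numpy array
        print(key, embeddings.shape)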
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    # Moving this import to the top breaks everything (circular import, I guess)
    from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

    logger.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = line[:20] + '...' if len(line) > 20 else line
                logger.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
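# The parsing above assumes the usual text format for pretrained embeddings:
# one token per line, followed by its space-separated vector. A toy line
# (values made up) and the same first-space split:
line = "the 0.418 0.24968 -0.41242 0.1217"
token_end = line.find(' ')
token = line[:token_end]                               # "the"
vector = [float(x) for x in line[token_end:].split()]  # [0.418, 0.24968, ...]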
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]: model.eval() iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances)) for batch in generator_tqdm: model(**batch) metrics = model.get_metrics() description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||" generator_tqdm.set_description(description, refresh=False) return model.get_metrics(reset=True)
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = DATASET_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    # make HEAD request to check ETag
    response = requests.head(url)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    etag = response.headers.get("ETag")
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        fd, temp_filename = tempfile.mkstemp()
        os.close(fd)  # mkstemp returns an open file descriptor; close it, since we reopen the path below
        logger.info("%s not found in cache, downloading to %s", url, temp_filename)

        # GET file object
        req = requests.get(url, stream=True)
        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None
        progress = Tqdm.tqdm(unit="B", total=total)
        with open(temp_filename, 'wb') as temp_file:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)

        progress.close()

        logger.info("copying %s to cache at %s", temp_filename, cache_path)
        shutil.copyfile(temp_filename, cache_path)
        logger.info("removing temp file %s", temp_filename)
        os.remove(temp_filename)

    return cache_path
def main(serialization_directory, device): """ serialization_directory : str, required. The directory containing the serialized weights. device: int, default = -1 The device to run the evaluation on. """ config = Params.from_file(os.path.join(serialization_directory, "config.json")) dataset_reader = DatasetReader.from_params(config['dataset_reader']) evaluation_data_path = config['validation_data_path'] model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device) prediction_file_path = os.path.join(serialization_directory, "predictions.txt") gold_file_path = os.path.join(serialization_directory, "gold.txt") prediction_file = open(prediction_file_path, "w+") gold_file = open(gold_file_path, "w+") # Load the evaluation data and index it. print("Reading evaluation data from {}".format(evaluation_data_path)) instances = dataset_reader.read(evaluation_data_path) iterator = BasicIterator(batch_size=32) iterator.index_with(model.vocab) model_predictions = [] batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device, for_training=False) for batch in Tqdm.tqdm(batches): result = model(**batch) predictions = model.decode(result) model_predictions.extend(predictions["tags"]) for instance, prediction in zip(instances, model_predictions): fields = instance.fields try: # Most sentences have a verbal predicate, but not all. verb_index = fields["verb_indicator"].labels.index(1) except ValueError: verb_index = None gold_tags = fields["tags"].labels sentence = fields["tokens"].tokens write_to_conll_eval_file(prediction_file, gold_file, verb_index, sentence, prediction, gold_tags) prediction_file.close() gold_file.close()
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info("Creating dataset from all text in file: %s", file_path)
        # Slide a window of num_tokens over the text with stride num_tokens - 1;
        # see the worked example after this function.
        for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index : (index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        yield Instance({"input_tokens": input_field, "output_tokens": output_field})
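# Worked example of the striding above: with _tokens_per_instance = 3 we get
# num_tokens = 4 and stride num_tokens - 1 = 3, so consecutive windows overlap
# by exactly one token (each window's last target is the next window's first input):
#
#   tokenized_text: [a, b, c, d, e, f, g, ...]
#   index 0 -> window [a, b, c, d]: input_tokens [a, b, c], output_tokens [b, c, d]
#   index 3 -> window [d, e, f, g]: input_tokens [d, e, f], output_tokens [e, f, g]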
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    model.eval()

    iterator = data_iterator(instances,
                             num_epochs=1,
                             cuda_device=cuda_device,
                             for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(
                ["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics(reset=True)
def from_instances(
    cls,
    instances: Iterable["adi.Instance"],
    min_count: Dict[str, int] = None,
    max_vocab_size: Union[int, Dict[str, int]] = None,
    non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
    pretrained_files: Optional[Dict[str, str]] = None,
    only_include_pretrained_words: bool = False,
    tokens_to_add: Dict[str, List[str]] = None,
    min_pretrained_embeddings: Dict[str, int] = None,
    padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
    oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
) -> "Vocabulary":
    """
    Constructs a vocabulary given a collection of `Instances` and some parameters.
    We count all of the vocabulary items in the instances, then pass those counts
    and the other parameters to :func:`__init__`.  See that method for a description
    of what the other parameters do.
    """
    logger.info("Fitting token dictionary from dataset.")
    padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
    oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)

    return cls(
        counter=namespace_token_counts,
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        non_padded_namespaces=non_padded_namespaces,
        pretrained_files=pretrained_files,
        only_include_pretrained_words=only_include_pretrained_words,
        tokens_to_add=tokens_to_add,
        min_pretrained_embeddings=min_pretrained_embeddings,
        padding_token=padding_token,
        oov_token=oov_token,
    )
def _validation_loss(self) -> Tuple[float, int]: """ Computes the validation loss. Returns it and the number of batches. """ logger.info("Validating") self._model.eval() if self._validation_iterator is not None: val_iterator = self._validation_iterator else: val_iterator = self._iterator val_generator = val_iterator(self._validation_data, num_epochs=1, cuda_device=self._iterator_device) num_validation_batches = val_iterator.get_num_batches(self._validation_data) val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches) batches_this_epoch = 0 val_loss = 0 for batch in val_generator_tqdm: loss = self._batch_loss(batch, for_training=False) if loss is not None: # You shouldn't necessarily have to compute a loss for validation, so we allow for # `loss` to be None. We need to be careful, though - `batches_this_epoch` is # currently only used as the divisor for the loss function, so we can safely only # count those batches for which we actually have a loss. If this variable ever # gets used for something else, we might need to change things around a bit. batches_this_epoch += 1 val_loss += loss.detach().cpu().numpy() # Update the description with the latest metrics val_metrics = self._get_metrics(val_loss, batches_this_epoch) description = self._description_from_metrics(val_metrics) val_generator_tqdm.set_description(description, refresh=False) return val_loss, batches_this_epoch
def _gather_train_instances_and_compute_gradients(self) -> None: logger.info( "Gathering training instances and computing gradients. " "The result will be cached so this only needs to be done once.") self._train_instances = [] self.model.train() for instance in Tqdm.tqdm(self._train_loader.iter_instances(), desc="calculating training gradients"): batch = Batch([instance]) batch.index_instances(self.vocab) tensor_dict = move_to_device(batch.as_tensor_dict(), self.device) self.model.zero_grad() # Compute loss with respect to the test instance. output_dict = self.model(**tensor_dict) loss = output_dict["loss"] if self._used_params is None or self._used_param_names is None: self._used_params = [] self._used_param_names = [] # we only know what parameters in the models requires gradient after # we do the first .backward() and we store those used parameters loss.backward(retain_graph=True) for name, param in self.model.named_parameters(): if param.requires_grad and param.grad is not None: self._used_params.append(param) self._used_param_names.append(name) # Get gradients. grads = autograd.grad(loss, self._used_params) # Sanity check. assert len(grads) == len(self._used_params) self._train_instances.append( InstanceWithGrads(instance=instance, loss=loss.detach().item(), grads=grads))
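# Self-contained sketch of the `autograd.grad` call used above, on a toy model;
# it returns one gradient tensor per parameter, in the same order and shapes.
import torch
from torch import autograd

toy_model = torch.nn.Linear(4, 1)
toy_loss = toy_model(torch.randn(2, 4)).sum()
used_params = [p for p in toy_model.parameters() if p.requires_grad]
grads = autograd.grad(toy_loss, used_params)
assert all(g.shape == p.shape for g, p in zip(grads, used_params))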
def from_files_and_instances( cls, instances: Iterable["adi.Instance"], directory: str, padding_token: Optional[str] = DEFAULT_PADDING_TOKEN, oov_token: Optional[str] = DEFAULT_OOV_TOKEN, min_count: Dict[str, int] = None, max_vocab_size: Union[int, Dict[str, int]] = None, non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES, pretrained_files: Optional[Dict[str, str]] = None, only_include_pretrained_words: bool = False, tokens_to_add: Dict[str, List[str]] = None, min_pretrained_embeddings: Dict[str, int] = None, ) -> "Vocabulary": """ Extends an already generated vocabulary using a collection of instances. The `instances` parameter does not get an entry in a typical AllenNLP configuration file, but the other parameters do (if you want non-default parameters). See `__init__` for a description of what the other parameters mean. """ vocab = cls.from_files(directory, padding_token, oov_token) logger.info("Fitting token dictionary from dataset.") namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict( lambda: defaultdict(int)) for instance in Tqdm.tqdm(instances): instance.count_vocab_items(namespace_token_counts) vocab._extend( counter=namespace_token_counts, min_count=min_count, max_vocab_size=max_vocab_size, non_padded_namespaces=non_padded_namespaces, pretrained_files=pretrained_files, only_include_pretrained_words=only_include_pretrained_words, tokens_to_add=tokens_to_add, min_pretrained_embeddings=min_pretrained_embeddings, ) return vocab
def _read(self, file_path: str):
    self._debug_prints = 5
    cached_file_path = cached_path(file_path)

    if file_path.endswith('.gz'):
        data_file = gzip.open(cached_file_path, 'rb')
    else:
        data_file = open(cached_file_path, 'r')

    logger.info("Reading QA instances from jsonl dataset at: %s", file_path)
    item_jsons = []
    for line in data_file:
        item_jsons.append(json.loads(line.strip()))

    if self._sample != -1:
        item_jsons = random.sample(item_jsons, self._sample)
        logger.info("Sampling %d examples", self._sample)

    for item_json in Tqdm.tqdm(item_jsons, total=len(item_jsons)):
        self._debug_prints -= 1
        if self._debug_prints >= 0:
            logger.info("====================================")
            logger.info(f"Input json: {item_json}")
        item_id = item_json["id"]

        statement_text = item_json["phrase"]
        metadata = item_json.get("metadata", {})
        context = item_json.get("context")

        yield self.text_to_instance(item_id=item_id,
                                    question=statement_text,
                                    answer_id=item_json["answer"],
                                    context=context,
                                    org_metadata=metadata)
    data_file.close()
def _read(self, file_path: str) -> Iterable[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternating divider / sentence chunks.
        for is_divider, lines in Tqdm.tqdm(itertools.groupby(data_file, _is_divider)):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in zip(*fields)]
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens]
                sequence = TextField(tokens, self._token_indexers)

                instance_fields: Dict[str, Field] = {'tokens': sequence}

                # Add "feature labels" to instance
                if 'pos' in self.feature_labels:
                    instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
                if 'chunk' in self.feature_labels:
                    instance_fields['chunk_tags'] = SequenceLabelField(chunk_tags, sequence, "chunk_tags")
                if 'ner' in self.feature_labels:
                    instance_fields['ner_tags'] = SequenceLabelField(ner_tags, sequence, "ner_tags")

                # Add "tag label" to instance
                if self.tag_label == 'ner':
                    instance_fields['tags'] = SequenceLabelField(ner_tags, sequence)
                elif self.tag_label == 'pos':
                    instance_fields['tags'] = SequenceLabelField(pos_tags, sequence)
                elif self.tag_label == 'chunk':
                    instance_fields['tags'] = SequenceLabelField(chunk_tags, sequence)

                yield Instance(instance_fields)
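# Toy sketch of how `itertools.groupby(data_file, _is_divider)` above carves a
# CoNLL-style file into sentences; here `_is_divider` is assumed to flag blank
# lines (the real helper may also flag "-DOCSTART-" lines).
import itertools

lines = ["John NNP B-NP B-PER\n", "works VBZ B-VP O\n", "\n", "here RB B-ADVP O\n"]
for is_divider, chunk in itertools.groupby(lines, lambda line: line.strip() == ""):
    if not is_divider:
        print([line.strip().split() for line in chunk])
# [['John', 'NNP', 'B-NP', 'B-PER'], ['works', 'VBZ', 'B-VP', 'O']]
# [['here', 'RB', 'B-ADVP', 'O']]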
def evaluate(self, model: Model): model.eval() generator_tqdm = Tqdm.tqdm(self.dataloader, total=len(self.dataloader)) model_outputs = {} for batch in generator_tqdm: with torch.no_grad(): batch = util.move_to_device(batch, self.cuda_device) output_dict = model.predict(**batch) for key in output_dict: if key not in model_outputs: model_outputs[key] = output_dict[key] else: model_outputs[key] += output_dict[key] evaluation_results = self.corpus.evaluate(model_outputs, self.dataset) print(evaluation_results['logging']) model.train() return evaluation_results['score']
def embed_file(self, input_path: str, output_file_prefix: str, is_propbank: bool = False, batch_size: int = DEFAULT_BATCH_SIZE) -> None: def get_sentences(): if not is_propbank: return self.get_qasrl_sentences(input_path) else: return self.get_propbank_sentences(input_path) with open(output_file_prefix + "_ids.jsonl", "w") as f_ids: with open(output_file_prefix + "_emb.bin", "wb") as f_emb: for sentence_batch in lazy_groups_of( Tqdm.tqdm(self.get_qasrl_sentences(input_path)), batch_size): batch_sentences, batch_metas = map(list, zip(*sentence_batch)) for verb_id, emb in self.embed_batch( batch_sentences, batch_metas): f_ids.write(json.dumps(verb_id) + "\n") bs = emb.numpy().tobytes() f_emb.write(bs)
def _read(self, file_path: str): # if `file_path` is a URL, redirect to the cache file_path = cached_path(file_path) with open(file_path, "r") as text_file: instance_strings = text_file.readlines() if self._tokens_per_instance is not None: all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings]) tokenized_text = self._tokenizer.tokenize(all_text) num_tokens = self._tokens_per_instance + 1 tokenized_strings = [] logger.info("Creating dataset from all text in file: %s", file_path) for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)): tokenized_strings.append(tokenized_text[index:(index + num_tokens)]) else: tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings] for tokenized_string in tokenized_strings: input_field = TextField(tokenized_string[:-1], self._token_indexers) output_field = TextField(tokenized_string[1:], self._output_indexer) yield Instance({'input_tokens': input_field, 'output_tokens': output_field})
def _read(self, file_path): for filename in os.listdir(file_path): with open(os.path.join(file_path, filename), "r") as data_file: logger.info("Reading instances from lines in file at: %s", filename) filename_splitted = filename.split('_') task_name = filename_splitted[-3] domain_name = filename_splitted[-2] for line_num, line in enumerate(Tqdm.tqdm(data_file)): line = line.strip("\n") if not line: continue line_parts = line.split('\t') if len(line_parts) != 2: raise ConfigurationError( "Invalid line format: %s (line number %d)" % (line, line_num + 1)) source_sequence, target_sequence = line_parts yield self.text_to_instance(task_name, domain_name, source_sequence, target_sequence)
def _read(self, file_path: str): # if `file_path` is a URL, redirect to the cache file_path = cached_path(file_path) ontonotes_reader = Ontonotes() for sentences in Tqdm.tqdm( ontonotes_reader.dataset_document_iterator(file_path)): clusters: DefaultDict[int, List[Tuple[ int, int]]] = collections.defaultdict(list) total_tokens = 0 for sentence in sentences: for typed_span in sentence.coref_spans: # Coref annotations are on a _per sentence_ # basis, so we need to adjust them to be relative # to the length of the document. span_id, (start, end) = typed_span clusters[span_id].append( (start + total_tokens, end + total_tokens)) total_tokens += len(sentence.words) canonical_clusters = canonicalize_clusters(clusters) yield self.text_to_instance([s.words for s in sentences], canonical_clusters)
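# Worked example of the offset adjustment above: coref spans are annotated per
# sentence, so a span (1, 2) in the second sentence of a document whose first
# sentence has 5 words becomes (6, 7) in document-level indices.
#
#   sentence 0: 5 words      -> total_tokens = 5 after processing it
#   sentence 1: span (1, 2)  -> stored as (1 + 5, 2 + 5) = (6, 7)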
def from_instances(cls,
                   instances: Iterable['adi.Instance'],
                   min_count: Dict[str, int] = None,
                   max_vocab_size: Union[int, Dict[str, int]] = None,
                   non_padded_namespaces: Sequence[str] = DEFAULT_NON_PADDED_NAMESPACES,
                   pretrained_files: Optional[Dict[str, str]] = None,
                   only_include_pretrained_words: bool = False) -> 'Vocabulary':
    """
    Constructs a vocabulary given a collection of `Instances` and some parameters.
    We count all of the vocabulary items in the instances, then pass those counts
    and the other parameters to :func:`__init__`.  See that method for a description
    of what the other parameters do.
    """
    logger.info("Fitting token dictionary from dataset.")
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)

    # Use ``cls`` rather than naming ``Vocabulary`` directly, so subclasses
    # constructed through this classmethod get the right type.
    return cls(counter=namespace_token_counts,
               min_count=min_count,
               max_vocab_size=max_vocab_size,
               non_padded_namespaces=non_padded_namespaces,
               pretrained_files=pretrained_files,
               only_include_pretrained_words=only_include_pretrained_words)
def _read(self, file_path):
    for filename in os.listdir(file_path):
        filename_splitted = filename.split('_')
        task_name = filename_splitted[-3]
        domain_name = filename_splitted[-2]
        if task_name not in self._tasks or domain_name not in self._domains:
            continue
        with open(os.path.join(file_path, filename), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", filename)
            for line in Tqdm.tqdm(data_file):
                line = line.strip("\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens_and_tags = ([['<<' + task_name + '>>', 'O'],
                                    ['<<' + domain_name + '>>', 'O']]
                                   + tokens_and_tags)
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                task_field = LabelField(task_name, label_namespace="task_labels")
                sequence = TextField(tokens, self._token_indexers)
                sequence_tags = SequenceLabelField(tags, sequence, label_namespace='labels')
                yield Instance({'task_token': task_field,
                                'tokens': sequence,
                                'tags': sequence_tags})
def _train_epoch(self, epoch: int) -> Dict[str, float]:
    train_loss = 0.0
    self.model.train()
    num_gpus = len(self._cuda_devices)

    if getattr(self, "train_dataset", None) is None:
        self.train_dataset = DMDataSet(data=self.train_data[0],
                                       batch_size=self.batch_size,
                                       num_gpus=num_gpus,
                                       shuffle=True,
                                       distributed=True,
                                       data_slice=True)
    self.train_dataset.set_epoch(epoch)
    num_training_batches = math.ceil(len(self.train_dataset) / self.batch_size / num_gpus)
    self._last_log = time.time()

    batches_this_epoch = 0
    if self._batch_num_total is None:
        self._batch_num_total = 0

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(self.train_dataset, total=num_training_batches)
    for batch_group in train_generator_tqdm:
        batches_this_epoch += 1
        self._batch_num_total += 1
        batch_num_total = self._batch_num_total

        self.optimizer.zero_grad()
        loss = self.batch_loss(batch_group, for_training=True)
        if torch.isnan(loss):
            raise ValueError("nan loss encountered")
        loss.backward()
        train_loss += loss.item()

        if self._learning_rate_scheduler:
            self._learning_rate_scheduler.step_batch(batch_num_total)

        self.optimizer.step()

        metrics = training_util.get_metrics(self.get_model(), train_loss, batches_this_epoch)
        description = self.get_desc_from_metrics(metrics, epoch)
        train_generator_tqdm.set_description(description, refresh=False)

    metrics = training_util.get_metrics(self.get_model(), train_loss, batches_this_epoch, reset=True)
    return metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}") for gpu, memory in gpu_memory_mb().items(): logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._model.train() # Get tqdm for the training batches train_generator = self._iterator(self._train_data, num_epochs=1, cuda_device=self._iterator_device) num_training_batches = self._iterator.get_num_batches(self._train_data) train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 if self._histogram_interval is not None: histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") for batch in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self._log_histograms_this_batch = self._histogram_interval is not None and ( batch_num_total % self._histogram_interval == 0) self._optimizer.zero_grad() loss = self._batch_loss(batch, for_training=True) loss.backward() # Make sure Variable is on the cpu before converting to numpy. # .cpu() is a no-op if you aren't using GPUs. train_loss += loss.data.cpu().numpy() batch_grad_norm = self._rescale_gradients() # This does nothing if batch_num_total is None or you are using an # LRScheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._log_histograms_this_batch: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. param_updates = {name: param.detach().data.cpu().clone() for name, param in self._model.named_parameters()} self._optimizer.step() for name, param in self._model.named_parameters(): param_updates[name].sub_(param.detach().data.cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )) self._tensorboard.add_train_scalar("gradient_update/" + name, update_norm / (param_norm + 1e-7), batch_num_total) else: self._optimizer.step() # Update the description with the latest metrics metrics = self._get_metrics(train_loss, batches_this_epoch) description = self._description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if batch_num_total % self._summary_interval == 0: self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total) self._metrics_to_tensorboard(batch_num_total, {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._log_histograms_this_batch: self._histograms_to_tensorboard(batch_num_total, histogram_parameters) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval ): last_save_time = time.time() self._save_checkpoint( '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False ) return self._get_metrics(train_loss, batches_this_epoch, reset=True)
if single_file_path_cached.endswith('.gz'): f = gzip.open(single_file_path_cached, 'rb') else: f = open(single_file_path_cached) for example in f: context = json.loads(example) if 'header' in context: continue contexts.append(context) f.close() s = socket.socket() for i in range(600): # Try for 10 minutes try: s.connect(('127.0.0.1', args.port)) except socket.error as e: if e.errno != errno.ECONNREFUSED: # Something other than Connection refused means server is running break time.sleep(1) else: raise Exception('Could not connect to server') s.close() for context in Tqdm.tqdm(contexts, total=len(contexts)): pred = requests.post('http://127.0.0.1:%d' % args.port, json=context) all_predictions.update(pred.json()) with open(args.output_file, 'w') as f: json.dump(all_predictions, f)
def get_inverse_hvp_lissa(
    vs: Sequence[torch.Tensor],
    model: Model,
    used_params: Sequence[torch.Tensor],
    lissa_data_loader: DataLoader,
    damping: float,
    num_samples: int,
    scale: float,
) -> torch.Tensor:
    """
    This function approximates the product of the inverse of the Hessian and
    the vectors `vs` using LiSSA.

    Adapted from [github.com/kohpangwei/influence-release]
    (https://github.com/kohpangwei/influence-release/blob/0f656964867da6ddcca16c14b3e4f0eef38a7472/influence/genericNeuralNet.py#L475),
    the repo for [Koh, P.W., & Liang, P. (2017)](https://api.semanticscholar.org/CorpusID:13193974),
    and [github.com/xhan77/influence-function-analysis]
    (https://github.com/xhan77/influence-function-analysis/blob/78d5a967aba885f690d34e88d68da8678aee41f1/bert_util.py#L336),
    the repo for [Han, Xiaochuang et al. (2020)](https://api.semanticscholar.org/CorpusID:218628619).
    """
    inverse_hvps = [torch.tensor(0) for _ in vs]
    for _ in Tqdm.tqdm(range(num_samples), desc="LiSSA samples", total=num_samples):
        # See the explanation in the "Stochastic estimation" paragraph of
        # [https://arxiv.org/pdf/1703.04730.pdf].
        # initialize \tilde{H}^{−1}_0 v = v
        cur_estimates = vs
        recursion_iter = Tqdm.tqdm(lissa_data_loader, desc="LiSSA depth", total=len(lissa_data_loader))
        for j, training_batch in enumerate(recursion_iter):
            # TODO (epwalsh): should we make sure `model` is in "train" or "eval" mode here?
            model.zero_grad()
            train_output_dict = model(**training_batch)
            # Hessian of loss @ \tilde{H}^{−1}_{j - 1} v
            hvps = get_hvp(train_output_dict["loss"], used_params, cur_estimates)

            # This is the recursive step:
            # cur_estimate = \tilde{H}^{−1}_{j - 1} v
            # (i.e. Hessian-Vector Product estimate from last iteration)
            # Updating for \tilde{H}^{−1}_j v, the new current estimate becomes:
            # v + (I - (Hessian_at_x + damping * I)) * cur_estimate
            # = v + (1 - damping) * cur_estimate - Hessian_at_x * cur_estimate
            # We divide `hvp / scale` here (or, equivalently `Hessian_at_x / scale`)
            # so that we're effectively dividing the loss by `scale`.
            cur_estimates = [
                v + (1 - damping) * cur_estimate - hvp / scale
                for v, cur_estimate, hvp in zip(vs, cur_estimates, hvps)
            ]

            # Update the Tqdm progress bar with the current norm so the user can
            # see it converge.
            if (j % 50 == 0) or (j == len(lissa_data_loader) - 1):
                norm = np.linalg.norm(_flatten_tensors(cur_estimates).cpu().numpy())
                recursion_iter.set_description(desc=f"calculating inverse HVP, norm = {norm:.5f}")

        # Accumulating X_{[i,S_2]} (notation from the LiSSA paper (algo. 1)
        # [https://arxiv.org/pdf/1602.03943.pdf]).
        # Need to divide by `scale` again here because the `vs` represent gradients
        # that haven't been scaled yet.
        inverse_hvps = [
            inverse_hvp + cur_estimate / scale
            for inverse_hvp, cur_estimate in zip(inverse_hvps, cur_estimates)
        ]

    return_ihvp = _flatten_tensors(inverse_hvps)
    return_ihvp /= num_samples
    return return_ihvp
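# Sketch of the Hessian-vector product that `get_hvp` above is assumed to compute:
# H v = ∇_θ ((∇_θ L) · v), i.e. differentiate the gradient-vector dot product
# (the standard double-backward trick, so the full Hessian is never materialized).
import torch
from torch import autograd

def hvp_sketch(loss, params, vectors):
    grads = autograd.grad(loss, params, create_graph=True)
    dot = sum((g * v).sum() for g, v in zip(grads, vectors))
    return autograd.grad(dot, params)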
def evaluate( model: Model, data_loader: DataLoader, cuda_device: int, batch_weight_key: str, ) -> Dict[str, Any]: check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = iter(data_loader) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator, total=len(data_loader)) # Number of batches in instances. batch_count = 0 # Number of batches where the model produces a loss. loss_count = 0 # Cumulative weighted loss total_loss = 0.0 # Cumulative weight across all batches. total_weight = 0.0 for batch in generator_tqdm: batch_count += 1 batch = nn_util.move_to_device(batch, cuda_device) output_dict = model(**batch) loss = output_dict.get("loss") metrics = model.get_metrics() if loss is not None: loss_count += 1 if batch_weight_key: weight = output_dict[batch_weight_key].item() else: weight = 1.0 total_weight += weight total_loss += loss.item() * weight # Report the average loss so far. metrics["loss"] = total_loss / total_weight if not HasBeenWarned.tqdm_ignores_underscores and any( metric_name.startswith("_") for metric_name in metrics ): logger.warning( 'Metrics with names beginning with "_" will ' "not be logged to the tqdm progress bar." ) HasBeenWarned.tqdm_ignores_underscores = True description = ( ", ".join( [ "%s: %.2f" % (name, value) for name, value in metrics.items() if not name.startswith("_") ] ) + " ||" ) generator_tqdm.set_description(description, refresh=False) final_metrics = model.get_metrics(reset=True) if loss_count > 0: # Sanity check if loss_count != batch_count: raise RuntimeError( "The model you are trying to evaluate only sometimes " + "produced a loss!" ) final_metrics["loss"] = total_loss / total_weight return final_metrics
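# Worked example of the bookkeeping above: with per-batch losses [0.5, 1.0] and
# batch weights [2.0, 1.0] taken from `batch_weight_key`, the reported loss is
# (0.5 * 2.0 + 1.0 * 1.0) / (2.0 + 1.0) = 2.0 / 3.0 ≈ 0.667 - a weighted mean
# over batches, not a plain average.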
def embed_file(self,
               input_file,
               output_file_path,
               output_format=u"all",
               batch_size=DEFAULT_BATCH_SIZE,
               forget_sentences=False,
               use_sentence_keys=False):
    u"""
    Computes ELMo embeddings from an input_file where each line contains a sentence
    tokenized by whitespace. The ELMo embeddings are written out in HDF5 format,
    where each sentence embedding is saved in a dataset with the line number in
    the original file as the key.

    Parameters
    ----------
    input_file : ``IO``, required
        A file with one tokenized sentence per line.
    output_file_path : ``str``, required
        A path to the output hdf5 file.
    output_format : ``str``, optional, (default = "all")
        The embeddings to output.  Must be one of "all", "top", or "average".
    batch_size : ``int``, optional, (default = 64)
        The number of sentences to process in ELMo at one time.
    forget_sentences : ``bool``, optional, (default = False).
        If use_sentence_keys is False, whether or not to include a string
        serialized JSON dictionary that associates sentences with their
        line number (its HDF5 key). The mapping is placed in the
        "sentence_to_index" HDF5 key. This is useful if you want to use the
        embeddings without keeping the original file of sentences around.
    use_sentence_keys : ``bool``, optional, (default = False).
        Whether or not to use full sentences as keys. By default, the line
        numbers of the input file are used as ids, which is more robust.
    """
    assert output_format in [u"all", u"top", u"average"]

    # Tokenizes the sentences.
    sentences = [line.strip() for line in input_file]

    blank_lines = [i for (i, line) in enumerate(sentences) if line == u""]
    if blank_lines:
        raise ConfigurationError(u"Your input file contains empty lines at indexes "
                                 u"{}. Please remove them.".format(blank_lines))
    split_sentences = [sentence.split() for sentence in sentences]
    # Uses the sentence index as the key.

    if use_sentence_keys:
        logger.warning(u"Using sentences as keys can fail if sentences "
                       u"contain forward slashes or colons. Use with caution.")
        embedded_sentences = izip(sentences, self.embed_sentences(split_sentences, batch_size))
    else:
        embedded_sentences = ((unicode(i), x) for i, x in
                              enumerate(self.embed_sentences(split_sentences, batch_size)))

    sentence_to_index = {}
    logger.info(u"Processing sentences.")
    with h5py.File(output_file_path, u'w') as fout:
        for key, embeddings in Tqdm.tqdm(embedded_sentences):
            if use_sentence_keys and key in list(fout.keys()):
                raise ConfigurationError(u"Key already exists in {}. "
                                         u"To encode duplicate sentences, do not pass "
                                         u"the --use-sentence-keys flag.".format(output_file_path))

            if not forget_sentences and not use_sentence_keys:
                sentence = sentences[int(key)]
                sentence_to_index[sentence] = key

            if output_format == u"all":
                output = embeddings
            elif output_format == u"top":
                output = embeddings[-1]
            elif output_format == u"average":
                output = numpy.average(embeddings, axis=0)

            fout.create_dataset(unicode(key),
                                output.shape,
                                dtype=u'float32',
                                data=output)

        if not forget_sentences and not use_sentence_keys:
            sentence_index_dataset = fout.create_dataset(
                    u"sentence_to_index",
                    (1,),
                    dtype=h5py.special_dtype(vlen=unicode))
            sentence_index_dataset[0] = json.dumps(sentence_to_index)

    input_file.close()
def prepare_global_logging(serialization_dir: str,
                           file_friendly_logging: bool,
                           rank: int = 0,
                           world_size: int = 1) -> None:
    # If we don't have a terminal as stdout,
    # force tqdm to be nicer.
    if not sys.stdout.isatty():
        file_friendly_logging = True

    Tqdm.set_slower_interval(file_friendly_logging)

    # Handlers for stdout/err logging
    output_stream_log_handler = logging.StreamHandler(sys.stdout)
    error_stream_log_handler = logging.StreamHandler(sys.stderr)

    if world_size == 1:
        # This case is not distributed training and hence will stick to the older
        # log file names
        output_file_log_handler = logging.FileHandler(
            filename=os.path.join(serialization_dir, "stdout.log"))
        error_file_log_handler = logging.FileHandler(
            filename=os.path.join(serialization_dir, "stderr.log"))
    else:
        # Create log files with worker ids
        output_file_log_handler = logging.FileHandler(
            filename=os.path.join(serialization_dir, f"stdout_worker{rank}.log"))
        error_file_log_handler = logging.FileHandler(
            filename=os.path.join(serialization_dir, f"stderr_worker{rank}.log"))

        # This adds the worker's rank to messages being logged to files.
        # This will help when combining multiple worker log files using `less` command.
        worker_filter = WorkerLogFilter(rank)
        output_file_log_handler.addFilter(worker_filter)
        error_file_log_handler.addFilter(worker_filter)

    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")

    root_logger = logging.getLogger()

    # Remove the already set stream handlers in the root logger. Not doing this
    # will result in duplicate log messages printed in the console. Iterate over
    # a copy, since removing from the list we are iterating over would skip handlers.
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)

    # file handlers need to be handled for tqdm's \r char
    file_friendly_log_filter = FileFriendlyLogFilter()

    if os.environ.get("ALLENNLP_DEBUG"):
        LEVEL = logging.DEBUG
    else:
        LEVEL = logging.INFO

    if rank == 0:
        # stdout/stderr handlers are added only for the
        # master worker. This is to avoid cluttering the console
        # screen with too many log messages from all workers.
        output_stream_log_handler.setFormatter(formatter)
        error_stream_log_handler.setFormatter(formatter)

        output_stream_log_handler.setLevel(LEVEL)
        error_stream_log_handler.setLevel(logging.ERROR)

        if file_friendly_logging:
            output_stream_log_handler.addFilter(file_friendly_log_filter)
            error_stream_log_handler.addFilter(file_friendly_log_filter)

        root_logger.addHandler(output_stream_log_handler)
        root_logger.addHandler(error_stream_log_handler)

    output_file_log_handler.addFilter(file_friendly_log_filter)
    error_file_log_handler.addFilter(file_friendly_log_filter)

    output_file_log_handler.setFormatter(formatter)
    error_file_log_handler.setFormatter(formatter)

    output_file_log_handler.setLevel(LEVEL)
    error_file_log_handler.setLevel(logging.ERROR)

    root_logger.addHandler(output_file_log_handler)
    root_logger.addHandler(error_file_log_handler)

    root_logger.setLevel(LEVEL)
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int, batch_weight_key: str) -> Dict[str, Any]: _warned_tqdm_ignores_underscores = False check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = data_iterator(instances, num_epochs=1, shuffle=False) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances)) # Number of batches in instances. batch_count = 0 # Number of batches where the model produces a loss. loss_count = 0 # Cumulative weighted loss total_loss = 0.0 # Cumulative weight across all batches. total_weight = 0.0 for batch in generator_tqdm: batch_count += 1 batch = util.move_to_device(batch, cuda_device) output_dict = model(**batch) loss = output_dict.get("loss") metrics = model.get_metrics() if loss is not None: loss_count += 1 if batch_weight_key: weight = output_dict[batch_weight_key].item() else: weight = 1.0 total_weight += weight total_loss += loss.item() * weight # Report the average loss so far. metrics["loss"] = total_loss / total_weight if (not _warned_tqdm_ignores_underscores and any(metric_name.startswith("_") for metric_name in metrics)): logger.warning("Metrics with names beginning with \"_\" will " "not be logged to the tqdm progress bar.") _warned_tqdm_ignores_underscores = True description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items() if not name.startswith("_")]) + " ||" generator_tqdm.set_description(description, refresh=False) final_metrics = model.get_metrics(reset=True) if loss_count > 0: # Sanity check if loss_count != batch_count: raise RuntimeError("The model you are trying to evaluate only sometimes " + "produced a loss!") final_metrics["loss"] = total_loss / total_weight return final_metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}") for gpu, memory in gpu_memory_mb().items(): logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._model.train() # Get tqdm for the training batches train_generator = self._iterator(self._train_data, num_epochs=1, cuda_device=self._iterator_device) num_training_batches = self._iterator.get_num_batches(self._train_data) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 if self._histogram_interval is not None: histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) for batch in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self._log_histograms_this_batch = self._histogram_interval is not None and ( batch_num_total % self._histogram_interval == 0) self._optimizer.zero_grad() loss = self._batch_loss(batch, for_training=True) loss.backward() train_loss += loss.item() batch_grad_norm = self._rescale_gradients() # This does nothing if batch_num_total is None or you are using an # LRScheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._log_histograms_this_batch: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. param_updates = {name: param.detach().cpu().clone() for name, param in self._model.named_parameters()} self._optimizer.step() for name, param in self._model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar("gradient_update/" + name, update_norm / (param_norm + 1e-7), batch_num_total) else: self._optimizer.step() # Update the description with the latest metrics metrics = self._get_metrics(train_loss, batches_this_epoch) description = self._description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if batch_num_total % self._summary_interval == 0: self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total) self._metrics_to_tensorboard(batch_num_total, {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._log_histograms_this_batch: self._histograms_to_tensorboard(batch_num_total, histogram_parameters) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval ): last_save_time = time.time() self._save_checkpoint( '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False ) return self._get_metrics(train_loss, batches_this_epoch, reset=True)
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = data if data else config['validation_data_path']

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"), cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("Reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)

    with torch.autograd.no_grad():
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(model.vocab)

        model_predictions = []
        batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device)
        for batch in Tqdm.tqdm(batches):
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None
            gold_tags = fields["tags"].labels
            sentence = [x.text for x in fields["tokens"].tokens]

            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_tags)
        prediction_file.close()
        gold_file.close()
def embed_file(self, input_file: IO, output_file_path: str, output_format: str = "all", batch_size: int = DEFAULT_BATCH_SIZE, forget_sentences: bool = False, use_sentence_keys: bool = False) -> None: """ Computes ELMo embeddings from an input_file where each line contains a sentence tokenized by whitespace. The ELMo embeddings are written out in HDF5 format, where each sentence embedding is saved in a dataset with the line number in the original file as the key. Parameters ---------- input_file : ``IO``, required A file with one tokenized sentence per line. output_file_path : ``str``, required A path to the output hdf5 file. output_format : ``str``, optional, (default = "all") The embeddings to output. Must be one of "all", "top", or "average". batch_size : ``int``, optional, (default = 64) The number of sentences to process in ELMo at one time. forget_sentences : ``bool``, optional, (default = False). If use_sentence_keys is False, whether or not to include a string serialized JSON dictionary that associates sentences with their line number (its HDF5 key). The mapping is placed in the "sentence_to_index" HDF5 key. This is useful if you want to use the embeddings without keeping the original file of sentences around. use_sentence_keys : ``bool``, optional, (default = False). Whether or not to use full sentences as keys. By default, the line numbers of the input file are used as ids, which is more robust. """ assert output_format in ["all", "top", "average"] # Tokenizes the sentences. sentences = [line.strip() for line in input_file] blank_lines = [i for (i, line) in enumerate(sentences) if line == ""] if blank_lines: raise ConfigurationError(f"Your input file contains empty lines at indexes " f"{blank_lines}. Please remove them.") split_sentences = [sentence.split() for sentence in sentences] # Uses the sentence index as the key. if use_sentence_keys: logger.warning("Using sentences as keys can fail if sentences " "contain forward slashes or colons. Use with caution.") embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size)) else: embedded_sentences = ((str(i), x) for i, x in enumerate(self.embed_sentences(split_sentences, batch_size))) sentence_to_index = {} logger.info("Processing sentences.") with h5py.File(output_file_path, 'w') as fout: for key, embeddings in Tqdm.tqdm(embedded_sentences): if use_sentence_keys and key in fout.keys(): raise ConfigurationError(f"Key already exists in {output_file_path}. " f"To encode duplicate sentences, do not pass " f"the --use-sentence-keys flag.") if not forget_sentences and not use_sentence_keys: sentence = sentences[int(key)] sentence_to_index[sentence] = key if output_format == "all": output = embeddings elif output_format == "top": output = embeddings[-1] elif output_format == "average": output = numpy.average(embeddings, axis=0) fout.create_dataset( str(key), output.shape, dtype='float32', data=output ) if not forget_sentences and not use_sentence_keys: sentence_index_dataset = fout.create_dataset( "sentence_to_index", (1,), dtype=h5py.special_dtype(vlen=str)) sentence_index_dataset[0] = json.dumps(sentence_to_index) input_file.close()
def explore_embedding_space(embedding_fn: str,
                            out_fn: str,
                            num_samples: int = 1000) -> None:
    """
    Calculate the following statistics for each layer of the model:
    1. mean cosine similarity between a sentence and its words
    2. mean and standard deviation of word embedding norms
    3. mean cosine similarity between randomly sampled words
    4. mean variance explained by first principal component for a random
       sample of words

    num_samples sentences/words are used to estimate each of these metrics.
    We randomly sample words by first uniformly randomly sampling sentences
    and then uniformly randomly sampling a single word from each sampled
    sentence. This is because:

    - When we say we are interested in the similarity between random words,
      what we really mean is the similarity between random _word occurrences_
      (since each word has a unique vector based on its context).
    - By explicitly sampling from different contexts, we avoid running into
      cases where two words are similar due to sharing the same context.

    Create a dictionary mapping each layer to a dictionary of the statistics
    and write it to out_fn.
    """
    with h5py.File(embedding_fn, 'r') as f:
        num_layers = f["0"].shape[0]
        num_sentences = len(f)
        sentence_indices = random.sample(list(range(num_sentences)), num_samples)

        mean_cos_sim_between_sent_and_words = {f'layer_{layer}': -1 for layer in range(num_layers)}
        mean_cos_sim_across_words = {f'layer_{layer}': -1 for layer in range(num_layers)}
        word_norm_std = {f'layer_{layer}': -1 for layer in range(num_layers)}
        word_norm_mean = {f'layer_{layer}': -1 for layer in range(num_layers)}
        variance_explained_random = {f'layer_{layer}': -1 for layer in range(num_layers)}

        for layer in Tqdm.tqdm(range(num_layers)):
            word_vectors = []
            word_norms = []
            mean_cos_sims = []

            for sent_index in sentence_indices:
                # Average word vectors to get the sentence vector.
                sentence_vector = f[str(sent_index)][layer].mean(axis=0)
                num_words = f[str(sent_index)].shape[1]

                # Randomly add a single word vector (not all of them, because
                # that would bias the sample towards longer sentences).
                word_vectors.append(f[str(sent_index)][layer, random.choice(list(range(num_words)))])

                # What is the mean cosine similarity between the sentence and its words?
                mean_cos_sim = np.nanmean([1 - cosine(f[str(sent_index)][layer, i], sentence_vector)
                                           for i in range(num_words)
                                           if f[str(sent_index)][layer, i].shape != ()])
                mean_cos_sims.append(round(mean_cos_sim, 3))

                # What is the mean embedding norm across words?
                word_norms.extend([np.linalg.norm(f[str(sent_index)][layer, i]) for i in range(num_words)])

            mean_cos_sim_between_sent_and_words[f'layer_{layer}'] = round(float(np.mean(mean_cos_sims)), 3)
            mean_cos_sim_across_words[f'layer_{layer}'] = round(float(np.nanmean(
                [1 - cosine(random.choice(word_vectors), random.choice(word_vectors))
                 for _ in range(num_samples)])), 3)
            word_norm_std[f'layer_{layer}'] = round(float(np.std(word_norms)), 3)
            word_norm_mean[f'layer_{layer}'] = round(float(np.mean(word_norms)), 3)

            # How much of the variance in randomly chosen words can be
            # explained by their first principal component?
            pca = TruncatedSVD(n_components=100)
            pca.fit(np.array(word_vectors))
            variance_explained_random[f'layer_{layer}'] = min(1.0, round(float(pca.explained_variance_ratio_[0]), 3))

    with open(out_fn, 'w') as out_file:
        json.dump({
            'mean cosine similarity between sentence and words': mean_cos_sim_between_sent_and_words,
            'mean cosine similarity across words': mean_cos_sim_across_words,
            'word norm std': word_norm_std,
            'word norm mean': word_norm_mean,
            'variance explained for random words': variance_explained_random,
        }, out_file, indent=1)
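# For illustration, a tiny self-contained check of the similarity measure used
# above: ``1 - scipy.spatial.distance.cosine(u, v)`` is the cosine similarity
# u.v / (|u| |v|). The two vectors here are made-up examples.
import numpy as np
from scipy.spatial.distance import cosine

u = np.array([1.0, 0.0])
v = np.array([1.0, 1.0])
similarity = 1 - cosine(u, v)
assert np.isclose(similarity, np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))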
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) batch_group_generator = lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") cumulative_batch_group_size = 0 for batch_group in batch_group_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() for batch in batch_group: loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1)) param_norm = torch.norm(param.view(-1)).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard (only from the master) if self._tensorboard.should_log_this_batch() and self._master: self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: batch_group_size = sum( training_util.get_batch_size(batch) for batch in batch_group) cumulative_batch_group_size += batch_group_size if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_group_size / batches_this_epoch logger.info( f"current batch size: {batch_group_size} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", batch_group_size) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if (self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master): last_save_time = time.time() self._save_checkpoint("{0}.{1}".format( epoch, training_util.time_to_str(int(last_save_time)))) # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
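# For illustration, a minimal reimplementation of the ``lazy_groups_of`` helper
# used above for gradient accumulation (the real one lives in
# ``allennlp.common.util``). It chunks a batch generator into lists of
# ``group_size`` batches without materialising the whole stream, which is why
# each iteration of the training loop sees a "batch group" to accumulate over.
from itertools import islice
from typing import Iterator, List, TypeVar

A = TypeVar("A")

def lazy_groups_of_sketch(iterator: Iterator[A], group_size: int) -> Iterator[List[A]]:
    iterator = iter(iterator)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group

# With _num_gradient_accumulation_steps = 2:
# list(lazy_groups_of_sketch(iter(range(5)), 2)) == [[0, 1], [2, 3], [4]]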
def evaluate_perplexity(model: Model,
                        sampler: Model,
                        num_samples: int,
                        instances: Iterable[Instance],
                        data_iterator: DataIterator,
                        cuda_device: int) -> Dict[str, Any]:
    """
    Estimates perplexity with importance sampling: on each of ``num_samples``
    passes over the data we draw a sample from ``sampler``, score it under
    ``model``, and combine the per-pass summands with a logsumexp.
    """
    check_for_gpu(cuda_device)

    logger.info('Iterating over dataset')

    with torch.no_grad():
        summands = []
        penalized_summands = []

        for i in range(num_samples):
            iterator = data_iterator(instances, num_epochs=1, shuffle=False)
            generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

            model.eval()
            sampler.eval()

            summand = 0.0
            penalized_summand = 0.0
            denom = 0
            for batch, _ in generator_tqdm:

                batch = util.move_to_device(batch, cuda_device)

                # We need the sequence length to compute perplexity.
                n_tokens = util.get_text_field_mask(batch['source']).float().sum().item()
                denom += n_tokens

                # Draw a sample.
                sampler_output = sampler.sample(**batch)
                sample_logp = sampler_output['logp']
                sample = sampler_output['sample']

                # Evaluate the model on the sample.
                model_output = model(**sample)
                model_logp = model_output['logp']
                model_penalized_logp = model_output['penalized_logp']
                summand += (model_logp - sample_logp).item()
                penalized_summand += (model_penalized_logp - sample_logp).item()

            summands.append(summand)
            penalized_summands.append(penalized_summand)

            # Progressive estimate after (i + 1) samples.
            t = torch.tensor(summands)
            p = torch.tensor(penalized_summands)
            t_sum = torch.logsumexp(t, dim=0)
            p_sum = torch.logsumexp(p, dim=0)
            sum_logp = (t_sum - math.log(i + 1)).item()
            sum_logp_penalized = (p_sum - math.log(i + 1)).item()
            ppl = math.exp(-sum_logp / denom)
            upp = math.exp(-sum_logp_penalized / denom)

            print('PPL: %f' % ppl)
            print('UPP: %f' % upp)

    metrics = {'ppl': ppl, 'upp': upp}
    return metrics
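# For illustration, the arithmetic behind the estimator above: each summand is
# log p(x, z_i) - log q(z_i) for one sample z_i ~ q, and the importance-sampling
# estimate is log p(x) ~= logsumexp(summands) - log(num_samples), with
# perplexity = exp(-log p(x) / num_tokens). The numbers below are made up.
import math

import torch

summands = torch.tensor([-10.2, -9.8, -10.0])  # three hypothetical samples
num_tokens = 5.0
log_p = (torch.logsumexp(summands, dim=0) - math.log(len(summands))).item()
ppl = math.exp(-log_p / num_tokens)  # ~7.4 for these numbers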
def evaluate( model: Model, data_loader: DataLoader, cuda_device: int = -1, batch_weight_key: str = None, ) -> Dict[str, Any]: """ # Parameters model : `Model` The model to evaluate data_loader : `DataLoader` The `DataLoader` that will iterate over the evaluation data (data loaders already contain their data). cuda_device : `int`, optional (default=`-1`) The cuda device to use for this evaluation. The model is assumed to already be using this device; this parameter is only used for moving the input data to the correct device. batch_weight_key : `str`, optional (default=`None`) If given, this is a key in the output dictionary for each batch that specifies how to weight the loss for that batch. If this is not given, we use a weight of 1 for every batch. """ check_for_gpu(cuda_device) with torch.no_grad(): model.eval() iterator = iter(data_loader) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator) # Number of batches in instances. batch_count = 0 # Number of batches where the model produces a loss. loss_count = 0 # Cumulative weighted loss total_loss = 0.0 # Cumulative weight across all batches. total_weight = 0.0 for batch in generator_tqdm: batch_count += 1 batch = nn_util.move_to_device(batch, cuda_device) output_dict = model(**batch) loss = output_dict.get("loss") metrics = model.get_metrics() if loss is not None: loss_count += 1 if batch_weight_key: weight = output_dict[batch_weight_key].item() else: weight = 1.0 total_weight += weight total_loss += loss.item() * weight # Report the average loss so far. metrics["loss"] = total_loss / total_weight if not HasBeenWarned.tqdm_ignores_underscores and any( metric_name.startswith("_") for metric_name in metrics ): logger.warning( 'Metrics with names beginning with "_" will ' "not be logged to the tqdm progress bar." ) HasBeenWarned.tqdm_ignores_underscores = True description = ( ", ".join( [ "%s: %.2f" % (name, value) for name, value in metrics.items() if not name.startswith("_") ] ) + " ||" ) generator_tqdm.set_description(description, refresh=False) final_metrics = model.get_metrics(reset=True) if loss_count > 0: # Sanity check if loss_count != batch_count: raise RuntimeError( "The model you are trying to evaluate only sometimes " + "produced a loss!" ) final_metrics["loss"] = total_loss / total_weight return final_metrics
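# For illustration, how ``batch_weight_key`` changes the loss reported above:
# with made-up per-batch losses and weights, the final "loss" is the weighted
# average over batches, not the plain mean.
losses = [0.5, 1.0]
weights = [3.0, 1.0]   # e.g. the values found under output_dict[batch_weight_key]
weighted_loss = sum(l * w for l, w in zip(losses, weights)) / sum(weights)
assert abs(weighted_loss - 0.625) < 1e-9   # the plain mean would be 0.75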
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        # Per-example probabilities and ids, collected so they can be
        # returned alongside the metrics.
        total_probs, all_example_ids = [], []
        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if 'probs' in output_dict:
                total_probs.extend(output_dict['probs'])
                all_example_ids.extend([
                    batch['metadata'][batch_index]['example_ids']
                    for batch_index in range(len(batch['metadata']))
                ])

            if (not HasBeenWarned.tqdm_ignores_underscores and
                    any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value)
                                     for name, value in metrics.items()
                                     if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        # Every batch's probabilities have already been accumulated in the
        # loop above, so they are attached to the returned metrics as-is.
        if total_probs:
            final_metrics["probs"] = total_probs
            final_metrics["example_ids"] = all_example_ids

        return final_metrics
def main(serialization_directory: str, device: int, data: str, prefix: str, domain: str = None): """ serialization_directory : str, required. The directory containing the serialized weights. device: int, default = -1 The device to run the evaluation on. data: str, default = None The data to evaluate on. By default, we use the validation data from the original experiment. prefix: str, default="" The prefix to prepend to the generated gold and prediction files, to distinguish different models/data. domain: str, optional (default = None) If passed, filters the ontonotes evaluation/test dataset to only contain the specified domain. This overwrites the domain in the config file from the model, to allow evaluation on domains other than the one the model was trained on. """ config = Params.from_file( os.path.join(serialization_directory, "config.json")) if domain is not None: # Hack to allow evaluation on different domains than the # model was trained on. config["dataset_reader"]["domain_identifier"] = domain prefix = f"{domain}_{prefix}" else: config["dataset_reader"].pop("domain_identifier", None) dataset_reader = DatasetReader.from_params(config["dataset_reader"]) evaluation_data_path = data if data else config["validation_data_path"] archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"), cuda_device=device) model = archive.model model.eval() prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt") gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt") prediction_file = open(prediction_file_path, "w+") gold_file = open(gold_file_path, "w+") # Load the evaluation data and index it. print("reading evaluation data from {}".format(evaluation_data_path)) dataset = list(dataset_reader.read(evaluation_data_path)) with torch.autograd.no_grad(): loader = SimpleDataLoader(dataset, 32) model_predictions: List[List[str]] = [] for batch in Tqdm.tqdm(loader): batch = move_to_device(batch, device) result = model(**batch) predictions = model.decode(result) model_predictions.extend(predictions["tags"]) for instance, prediction in zip(dataset, model_predictions): fields = instance.fields verb_index = fields["metadata"]["verb_index"] gold_tags = fields["metadata"]["gold_tags"] sentence = fields["metadata"]["words"] write_to_conll_eval_file(prediction_file, gold_file, verb_index, sentence, prediction, gold_tags) prediction_file.close() gold_file.close()
dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
evaluation_data_path = args.input_file

logger.info("Reading evaluation data from %s", evaluation_data_path)
instances = dataset_reader.read(evaluation_data_path)

config['iterator']['type'] = 'basic'
del config['iterator']['sorting_keys']
data_iterator = DataIterator.from_params(config.pop("iterator"))
data_iterator.index_with(model.vocab)
cuda_device = args.cuda_device

# Evaluation.
model.eval()

iterator = data_iterator(instances,
                         num_epochs=1,
                         shuffle=False,
                         cuda_device=cuda_device,
                         for_training=False)
logger.info("Iterating over dataset")
generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

label_probs = []
for batch in generator_tqdm:
    # Run the model once per batch and collect the ending probabilities.
    label_probs.append(model(**batch)['label_probs'].data.cpu().numpy())

label_probs = np.concatenate(label_probs)
my_preds = pd.DataFrame(label_probs, columns=['ending0', 'ending1', 'ending2', 'ending3'])
my_preds['pred'] = label_probs.argmax(1)
my_preds.to_csv(args.output_file)
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self.model.train() num_gpus = len(self._cuda_devices) # Get tqdm for the training batches raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) train_generator = lazy_groups_of(raw_train_generator, num_gpus) num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus) self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches) cumulative_batch_size = 0 for batch_group in train_generator_tqdm: self.model.train() batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() loss = self.batch_loss(batch_group, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch(): # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = {name: param.detach().cpu().clone() for name, param in self.model.named_parameters()} self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1, )) param_norm = torch.norm(param.view(-1, )).cpu() self._tensorboard.add_train_scalar("gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch) description = training_util.description_from_metrics(metrics) train_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard if self._tensorboard.should_log_this_batch(): self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch(): self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group]) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_size/batches_this_epoch logger.info(f"current batch size: {cur_batch} mean batch size: {average}") self._tensorboard.add_train_scalar("current_batch_size", cur_batch) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if self._model_save_interval is not None and ( time.time() - last_save_time > self._model_save_interval ): last_save_time = time.time() self._save_checkpoint( '{0}.{1}'.format(epoch, training_util.time_to_str(int(last_save_time))) ) if self._early_stopping_by_batch and self._batch_num_total % 10 == 0: if self._validation_data is not None: with torch.no_grad(): # We have a validation set, so compute all the metrics on it. val_loss, num_batches = self._validation_loss() val_metrics = training_util.get_metrics(self.model, val_loss, num_batches, reset=True) # Check validation metric for early stopping this_epoch_val_metric = val_metrics[self._validation_metric] self._metric_tracker.add_metric(this_epoch_val_metric) if self._metric_tracker.is_best_so_far(): metrics['best_batch'] = self._batch_num_total for key, value in val_metrics.items(): metrics["best_validation_" + key] = value self._metric_tracker.best_epoch_metrics = val_metrics self._save_checkpoint(self._batch_num_total) if self.callbacks is not None: for callback in self.callbacks: callback.on_batch_end(self._batch_num_total) metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True) metrics['cpu_memory_MB'] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics['gpu_'+str(gpu_num)+'_memory_MB'] = memory return metrics
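# For illustration, the update-magnitude statistic logged in the histogram
# branch above, computed for a single toy tensor: snapshot the parameter,
# step the optimizer, then log ||update|| / (||param|| + 1e-7).
import torch

param_before_step = torch.tensor([1.0, 2.0, 2.0])   # snapshot taken before optimizer.step()
param_after_step = torch.tensor([0.9, 2.1, 2.0])    # hypothetical value after the step
update_norm = torch.norm(param_before_step - param_after_step)
param_norm = torch.norm(param_after_step)
relative_update = update_norm / (param_norm + 1e-7)  # ~0.047 here; values near 0 or very large signal LR problems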
def semi_train_epoch(self, epoch: int) -> Dict[str, float]:
    """
    Trains one epoch on a mix of labelled and unlabelled batches and
    returns metrics.
    """
    logger.info("Epoch %d/%d", epoch, self.trainer._num_epochs - 1)
    peak_cpu_usage = peak_memory_mb()
    logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
    gpu_usage = []
    for gpu, memory in gpu_memory_mb().items():
        gpu_usage.append((gpu, memory))
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    train_loss = 0.0
    # Set the model to "train" mode.
    self.trainer.model.train()

    num_gpus = len(self.trainer._cuda_devices)

    self.trainer._last_log = time.time()
    last_save_time = time.time()

    batches_this_epoch = 0
    if self.trainer._batch_num_total is None:
        self.trainer._batch_num_total = 0

    histogram_parameters = set(
        self.trainer.model.get_parameters_for_histogram_tensorboard_logging())

    # Generator over labelled and unlabelled batches, mixed according to
    # ``self.which_mixer``.
    mixed_generator, num_training_batches = get_mixer(
        self.trainer.iterator, self.trainer.train_data,
        self.trainer.iterator, self.unlabelled_dataset,
        num_gpus, self.labelled_id, self.which_mixer,
        self.min_pct_of_unlabelled)

    # Generator used for the lambda (constraint multiplier) updates.
    mixed_generator_for_lambda, _ = get_mixer(
        self.trainer.iterator, self.trainer.train_data,
        self.trainer.iterator, self.unlabelled_dataset,
        num_gpus, self.labelled_id, 'cm', 1.0)

    logger.info("Training")
    train_generator_tqdm = Tqdm.tqdm(mixed_generator, total=num_training_batches)

    cumulative_batch_size = 0
    unlabelled_loss = 0
    unlabelled_batches_this_epoch = 0
    batches_since_last_step = 0
    agg_loss = 0.0
    flag = False
    batch_grad_norm = None
    for batch_group, group_id in train_generator_tqdm:
        # Skip unlabelled batches until the supervised warmup has finished.
        if self.total_supervised_iters < self.dd_semi_warmup_iters and group_id != self.labelled_id:
            continue
        output_dict = self.batch_loss(batch_group,
                                      for_training=True,
                                      eval_metric=(group_id == self.labelled_id))
        penalties = defaultdict(float)
        if self.constraints_model is not None:
            penalties = self.constraints_model(output_dict['task1_tag_logits'],
                                               output_dict['task2_tag_logits'],
                                               output_dict['mask'])

        loss = 0.0
        if 'loss' in output_dict:
            loss = output_dict['loss']
            train_loss += loss.item()
        loss += output_dict.get('regularization_penalty', 0.0)

        loss += self.constraints_wt * penalties['loss']
        unlabelled_loss += penalties['loss'].item() if torch.is_tensor(penalties['loss']) else penalties['loss']

        # Accumulate the loss and only backprop every ``backprop_after_xbatches``
        # batches; ``flag`` records that an accumulated loss is still pending.
        agg_loss += loss
        batches_since_last_step += 1
        if batches_since_last_step == self.backprop_after_xbatches:
            batch_grad_norm = self.step(agg_loss)
            batches_since_last_step = 0
            agg_loss = 0.0
            flag = False
        else:
            flag = True

        if group_id != self.labelled_id:
            unlabelled_batches_this_epoch += 1
        else:
            self.total_supervised_iters += 1.0
            batches_this_epoch += 1
            self.trainer._batch_num_total += 1
            batch_num_total = self.trainer._batch_num_total

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self.trainer._learning_rate_scheduler:
                self.trainer._learning_rate_scheduler.step_batch(batch_num_total)

            if self.trainer._tensorboard.should_log_histograms_this_batch():
                # Get the magnitude of parameter updates for logging. We need a
                # copy of current parameters to compute the magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self.trainer.model.named_parameters()}
                for name, param in self.trainer.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self.trainer._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))

            # Update moving averages
            if self.trainer._moving_average is not None:
                self.trainer._moving_average.apply(batch_num_total)

            metrics = training_util.get_metrics(self.trainer.model, train_loss, batches_this_epoch)
            metrics["uloss"] = float(unlabelled_loss /
                                     (batches_this_epoch + unlabelled_batches_this_epoch))
            # Update the description with the latest metrics
            description = training_util.description_from_metrics(metrics)
            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self.trainer._tensorboard.should_log_this_batch() and batch_grad_norm is not None:
                self.trainer._tensorboard.log_parameter_and_gradient_statistics(
                    self.trainer.model, batch_grad_norm)
                self.trainer._tensorboard.log_learning_rates(
                    self.trainer.model, self.trainer.optimizer)

                self.trainer._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
                self.trainer._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self.trainer._tensorboard.should_log_histograms_this_batch():
                self.trainer._tensorboard.log_histograms(self.trainer.model, histogram_parameters)

            if self.trainer._log_batch_size_period:
                cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self.trainer._log_batch_size_period == 0:
                    average = cumulative_batch_size / batches_this_epoch
                    logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                    self.trainer._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                    self.trainer._tensorboard.add_train_scalar("mean_batch_size", average)

            # Save model if needed.
            if self.trainer._model_save_interval is not None and (
                    time.time() - last_save_time > self.trainer._model_save_interval):
                last_save_time = time.time()
                self.trainer._save_checkpoint('{0}.{1}'.format(
                    epoch, training_util.time_to_str(int(last_save_time))))

            # Lambda update: once warmed up, refresh the constraint multipliers
            # every ``dd_update_freq`` supervised iterations.
            if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (
                    self.total_supervised_iters >= self.dd_warmup_iters) and (
                    self.total_supervised_iters - self.last_lambda_update >= self.dd_update_freq):
                for batch_group, group_id in mixed_generator_for_lambda:
                    self.lambda_update(batch_group)
                    self.last_lambda_update = self.total_supervised_iters
                    break

                self.count_lambda_updates += 1
                if (self.dd_increase_freq_after is not None) and (
                        self.count_lambda_updates % self.dd_increase_freq_after == 0):
                    self.dd_update_freq += self.dd_increase_freq_by

    # Flush any accumulated loss that has not been stepped yet.
    if flag:
        batch_grad_norm = self.step(agg_loss)
        batches_since_last_step = 0
        agg_loss = 0.0
        flag = False

    # Final lambda update for the epoch, under the same schedule as above.
    if (self.constraints_model is not None) and (self.dd_optimizer is not None) and (
            self.total_supervised_iters >= self.dd_warmup_iters) and (
            self.total_supervised_iters - self.last_lambda_update >= self.dd_update_freq):
        for batch_group, group_id in mixed_generator_for_lambda:
            self.lambda_update(batch_group)
            self.last_lambda_update = self.total_supervised_iters
            break

        self.count_lambda_updates += 1
        if (self.dd_increase_freq_after is not None) and (
                self.count_lambda_updates % self.dd_increase_freq_after == 0):
            self.dd_update_freq += self.dd_increase_freq_by

    metrics = training_util.get_metrics(self.trainer.model, train_loss,
                                        batches_this_epoch, reset=True)
    metrics['cpu_memory_MB'] = peak_cpu_usage
    metrics['lb'] = batches_this_epoch
    metrics['ub'] = unlabelled_batches_this_epoch
    metrics["uloss"] = float(unlabelled_loss /
                             (batches_this_epoch + unlabelled_batches_this_epoch))
    if self.constraints_model is not None:
        lambda_stats_dict = self.constraints_model.lambda_stats()
        metrics.update(lambda_stats_dict)
    for (gpu_num, memory) in gpu_usage:
        metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
    return metrics