def get_pytext_home():
    """Return the PyText home directory.

    The ``PYTEXT_HOME`` environment variable wins when it is set. Otherwise
    the directory is derived from the location of this file.

    Returns:
        The resolved home directory path as a string.
    """
    # Candidate roots in priority order: two levels up from this file (the
    # "internal" layout in the original naming), then three levels up (the
    # "oss" layout).
    candidate_homes = (
        os.path.realpath(os.path.join(__file__, "../../")),
        os.path.realpath(os.path.join(__file__, "../../../")),
    )

    # A "tests" folder is used as the anchor: it is expected to always live
    # directly under PYTEXT_HOME.
    for candidate in candidate_homes:
        if PathManager.exists(os.path.join(candidate, "tests")):
            fallback_home = candidate
            break
    else:
        # When PyText is used as a module and packed into a single file X,
        # __file__ is the path of X rather than of this module; in that case
        # the home is the folder that contains X.
        fallback_home = os.path.dirname(__file__)

    return os.environ.get("PYTEXT_HOME", fallback_home)
def __init__(
    self,
    pre_train_model_path,
    analyzed_sparsity,
    max_analysis_batches,
    max_skipped_weight,
    pre_analysis_path,
    sparsity,
):
    """Validate the constructor arguments and store them on the instance.

    Args:
        pre_train_model_path: path of the pre-trained model; must exist
            according to ``PathManager.exists``.
        analyzed_sparsity: value in [0, 1].
        max_analysis_batches: stored as-is.
        max_skipped_weight: stored as-is.
        pre_analysis_path: stored as-is.
        sparsity: pruning sparsity, value in [0, 1].

    Raises:
        AssertionError: if the model path does not exist or either sparsity
            value is outside [0, 1].
    """
    # --- pre-trained model ---
    assert PathManager.exists(
        pre_train_model_path
    ), "The pre-trained model must be exist"
    self.pre_train_model_path = pre_train_model_path
    self.param_dict = None

    # --- analysis settings ---
    assert (
        0.0 <= analyzed_sparsity <= 1.0
    ), "Analyzed sparsity need to be in the range of [0, 1]"
    self.analyzed_sparsity = analyzed_sparsity
    self.pre_analysis_path = pre_analysis_path
    self.require_mask_parameters = []
    self.max_skipped_weight = max_skipped_weight
    self.max_analysis_batches = max_analysis_batches

    # --- pruning settings ---
    assert (
        0.0 <= sparsity <= 1.0
    ), "Pruning sparsity need to be in the range of [0, 1]"
    self.sparsity = sparsity

    # Nothing has been computed yet: no masks, default analysis state.
    self._masks = None
    self.analysis_state = State.OTHERS
def process_squad_json(self, fname):
    """Yield the pieces produced by ``_split_document`` for a SQuAD-style file.

    The file is loaded as one JSON document laid out as
    ``data -> paragraphs -> qas``. Nothing is yielded when ``fname`` is
    empty/None or does not exist (a message is printed in the latter case).

    Args:
        fname: path of the JSON file to read.

    Yields:
        Whatever ``_split_document`` yields for each question.
    """
    if not fname:
        return
    if not PathManager.exists(fname):
        print(f"{fname} does not exist. Not unflattening.")
        return

    with PathManager.open(fname) as infile:
        squad = json.load(infile)

    # Running counter passed to _split_document; advances once per question.
    question_index = 0
    for article in squad["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                answerable = not qa.get("is_impossible", False)
                # Questions flagged impossible carry their candidate answers
                # under "plausible_answers" instead of "answers".
                if answerable:
                    answers = qa["answers"]
                else:
                    answers = qa["plausible_answers"]
                question_text = qa["question"]
                texts = [answer["text"] for answer in answers]
                starts = [int(answer["answer_start"]) for answer in answers]
                yield from _split_document(
                    question_index,
                    context,
                    question_text,
                    texts,
                    starts,
                    answerable,
                    self.ignore_impossible,
                    self.max_character_length,
                    self.min_overlap,
                )
                question_index += 1
def get_absolute_path(file_path: str) -> str:
    """Resolve ``file_path`` against ``PYTEXT_HOME`` when that is possible.

    Args:
        file_path: an absolute path, or a path relative to ``PYTEXT_HOME``.

    Returns:
        ``file_path`` unchanged when it is already absolute, or when the
        path resolved under ``PYTEXT_HOME`` does not exist; otherwise the
        resolved real path.
    """
    if not os.path.isabs(file_path):
        resolved = os.path.realpath(os.path.join(PYTEXT_HOME, file_path))
        if PathManager.exists(resolved):
            return resolved
    # Already absolute, or nothing found under PYTEXT_HOME: hand it back as-is.
    return file_path
def process_file(self, fname, is_train):
    """Yield one example dict per row of a JSON file.

    The file is loaded as a single JSON document and iterated row by row.
    Nothing is yielded when ``fname`` is empty/None or does not exist (a
    message is printed in both cases).

    Args:
        fname: path of the JSON file to read.
        is_train: when true the negative contexts are shuffled; otherwise
            the first ``self.num_negative_ctxs`` are kept in file order so
            that evaluation and test data stay deterministic.

    Yields:
        dict with keys ``question``, ``positive_ctx``, ``negative_ctxs``,
        ``label`` and ``num_negative_ctx``.
    """
    if not fname:
        print("File path is either empty or None. Not unflattening.")
        return
    if not PathManager.exists(fname):
        print(f"{fname} does not exist. Not unflattening.")
        return

    with PathManager.open(fname) as infile:
        dump = json.load(infile)

    # Code pointer: https://fburl.com/yv8osgvo
    for row in dump:
        question = row["question"]
        positive_ctx = combine_title_text(row["positive_ctxs"][0], self.use_title)
        negative_ctxs = [
            combine_title_text(ctx, self.use_title) for ctx in row["negative_ctxs"]
        ]
        if is_train:
            # random.shuffle works in place and returns None, so it must be
            # called as a statement and the list itself yielded afterwards.
            random.shuffle(negative_ctxs)
        else:
            # Non-training runs: keep a fixed prefix, no shuffling.
            negative_ctxs = negative_ctxs[: self.num_negative_ctxs]

        num_negative_ctx = min(self.num_negative_ctxs, len(negative_ctxs))
        yield {
            "question": question,
            "positive_ctx": positive_ctx,
            "negative_ctxs": negative_ctxs,
            "label": "1",  # Make LabelTensorizer.initialize() happy.
            "num_negative_ctx": num_negative_ctx,
        }
def process_file(self, fname, is_train):
    """Yield one example dict per row returned by ``self.read_file``.

    Nothing is yielded when ``fname`` is empty/None or does not exist (a
    message is printed in both cases).

    Args:
        fname: path handed to ``self.read_file``.
        is_train: selects the training behaviour (shuffled negatives, one
            randomly picked question when the row holds a list of them) or
            the deterministic behaviour (first ``self.num_negative_ctxs``
            negatives, unshuffled).

    Yields:
        dict with keys ``question``, ``positive_ctx``, ``negative_ctxs``,
        ``label`` and ``num_negative_ctx``.
    """
    if not fname:
        print("File path is either empty or None. Not unflattening.")
        return
    if not PathManager.exists(fname):
        print(f"{fname} does not exist. Not unflattening.")
        return

    for row in self.read_file(fname):
        question = row["question"]
        positive_ctx = combine_title_text_id(row["positive_ctxs"][0], self.use_title)
        negative_ctxs = [
            combine_title_text_id(ctx, self.use_title) for ctx in row["negative_ctxs"]
        ]
        if not negative_ctxs:
            # No hard negatives: fall back to the distant ones when the row
            # has any, since at least one negative is better for training.
            if row.get("distant_negatives"):
                negative_ctxs = [
                    combine_title_text_id(ctx, self.use_title)
                    for ctx in row["distant_negatives"]
                ]

        if not is_train:
            # Evaluation and test runs stay deterministic: fixed prefix of
            # the negatives, no shuffling.
            negative_ctxs = negative_ctxs[: self.num_negative_ctxs]
        else:
            random.shuffle(negative_ctxs)
            if isinstance(question, list):
                # A list of questions holds paraphrases. One is sampled at
                # random per pass, so with enough epochs all of them should
                # get tried.
                question = question[random.randint(0, len(question) - 1)]

        example = {
            "question": question,
            "positive_ctx": positive_ctx,
            "negative_ctxs": negative_ctxs,
            "label": "1",  # Make LabelTensorizer.initialize() happy.
            "num_negative_ctx": min(self.num_negative_ctxs, len(negative_ctxs)),
        }
        yield example
def process_file(self, fname, is_train):
    """Yield one example dict per line of a JSON-lines file.

    Each line of the file is parsed as its own JSON object. Nothing is
    yielded when ``fname`` is empty/None or does not exist (a message is
    printed in both cases).

    Args:
        fname: path of the file to read.
        is_train: when true the negative contexts are shuffled; otherwise
            the first ``self.num_negative_ctxs`` are kept unshuffled.

    Yields:
        dict with keys ``question``, ``positive_ctx``, ``negative_ctxs``,
        ``label`` and ``num_negative_ctx``.
    """
    if not fname:
        print(f"File path is either empty or None. Not unflattening.")
        return
    if not PathManager.exists(fname):
        print(f"{fname} does not exist. Not unflattening.")
        return

    with PathManager.open(fname) as infile:
        # Code pointer: https://fburl.com/yv8osgvo
        # Rows are decoded lazily, one line at a time, while the file is open.
        for row in map(json.loads, infile):
            question = row["question"]
            positive_ctx = combine_title_text(
                row["positive_ctxs"][0], self.use_title
            )

            negative_ctxs = [
                combine_title_text(ctx, self.use_title)
                for ctx in row["negative_ctxs"]
            ]
            if not negative_ctxs:
                # Without hard negatives, use the distant ones if the row has
                # any: at least one negative is better for training.
                if row.get("distant_negatives"):
                    negative_ctxs = [
                        combine_title_text(ctx, self.use_title)
                        for ctx in row["distant_negatives"]
                    ]

            if not is_train:
                # Deterministic evaluation/test sets: fixed prefix, no shuffle.
                negative_ctxs = negative_ctxs[: self.num_negative_ctxs]
            else:
                random.shuffle(negative_ctxs)

            example = {
                "question": question,
                "positive_ctx": positive_ctx,
                "negative_ctxs": negative_ctxs,
                "label": "1",  # Make LabelTensorizer.initialize() happy.
                "num_negative_ctx": min(
                    self.num_negative_ctxs, len(negative_ctxs)
                ),
            }
            yield example
def __init__(
    self,
    pre_train_model_path,
    analyzed_sparsity,
    max_analysis_batches,
    max_skipped_weight,
):
    """Validate the constructor arguments and store them on the instance.

    Args:
        pre_train_model_path: path of the pre-trained model; must exist
            according to ``PathManager.exists``.
        analyzed_sparsity: value in [0, 1].
        max_analysis_batches: stored as-is.
        max_skipped_weight: stored as-is.

    Raises:
        AssertionError: if the model path does not exist or
            ``analyzed_sparsity`` is outside [0, 1].
    """
    missing_model_msg = "The pre-trained model must be exist"
    bad_sparsity_msg = "Analyzed sparsity need to be in the range of [0, 1]"

    assert PathManager.exists(pre_train_model_path), missing_model_msg
    self.pre_train_model_path = pre_train_model_path

    assert 0.0 <= analyzed_sparsity <= 1.0, bad_sparsity_msg
    self.analyzed_sparsity = analyzed_sparsity

    # Remaining settings need no validation.
    self.require_mask_parameters = []
    self.max_skipped_weight = max_skipped_weight
    self.max_analysis_batches = max_analysis_batches
def __init__(
    self,
    pre_train_model_path,
    analyzed_sparsity,
    max_analysis_batches,
    max_skipped_weight,
    pre_analysis_path,
    sparsity,
    iterative_pruning,
    pruning_iterations,
    start_sparsity_ratio,
):
    """Validate the constructor arguments and store them on the instance.

    Args:
        pre_train_model_path: path of the pre-trained model; must exist
            according to ``PathManager.exists``.
        analyzed_sparsity: value in [0, 1].
        max_analysis_batches: stored as-is.
        max_skipped_weight: stored as-is.
        pre_analysis_path: stored as-is.
        sparsity: pruning sparsity, value in [0, 1].
        iterative_pruning: when truthy, the iterative-pruning members are
            initialised as well.
        pruning_iterations: must be greater than 1 when
            ``iterative_pruning`` is truthy; otherwise unused here.
        start_sparsity_ratio: multiplied by ``sparsity`` to obtain the
            starting sparsity of iterative pruning.

    Raises:
        AssertionError: if the model path does not exist, a sparsity value
            is outside [0, 1], or iterative pruning is requested with fewer
            than two iterations.
    """
    # --- pre-trained model ---
    assert PathManager.exists(
        pre_train_model_path
    ), "The pre-trained model must be exist"
    self.pre_train_model_path = pre_train_model_path
    self.param_dict = None

    # --- analysis settings ---
    assert (
        0.0 <= analyzed_sparsity <= 1.0
    ), "Analyzed sparsity need to be in the range of [0, 1]"
    self.analyzed_sparsity = analyzed_sparsity
    self.pre_analysis_path = pre_analysis_path
    self.require_mask_parameters = []
    self.max_skipped_weight = max_skipped_weight
    self.max_analysis_batches = max_analysis_batches

    # --- pruning settings ---
    assert (
        0.0 <= sparsity <= 1.0
    ), "Pruning sparsity need to be in the range of [0, 1]"
    self.sparsity = sparsity
    self._masks = None
    self.analysis_state = State.OTHERS

    # --- iterative pruning (members exist only when it is enabled) ---
    self.iterative_pruning = iterative_pruning
    if not self.iterative_pruning:
        return

    assert (
        pruning_iterations > 1
    ), "iterative pruning should contains at least two pruning iterations"
    self.pruning_iterations = pruning_iterations
    # Sparsity goes from a fraction of the target up to the target itself.
    self.start_sparsity = start_sparsity_ratio * sparsity
    self.end_sparsity = self.sparsity
    # Both start at zero here; nothing in this constructor computes them.
    self.epochs_per_iter = 0
    self.sparsity_increment = 0.0
def get_latest_checkpoint_path(dir_path: Optional[str] = None) -> Optional[str]:
    """
    Get the latest checkpoint path

    Args:
        dir_path: the dir to scan for existing checkpoint files. Default: if
            None, the latest checkpoint path saved in memory will be returned

    Returns:
        The checkpoint path. When ``dir_path`` is given, this is
        ``{dir_path}/checkpoint-<N>`` for the largest index ``N`` found, or
        None when the directory does not exist or holds no entry of the
        form ``checkpoint-<int>``.
    """
    if not dir_path:
        return _CHECKPOINT_MANAGER.get_latest_checkpoint_path()

    if not PathManager.exists(dir_path):
        return None

    checkpoint_indices = []
    for file_path in PathManager.ls(dir_path):
        if not file_path.startswith("checkpoint"):
            continue
        try:
            checkpoint_indices.append(int(file_path.split("-")[1]))
        except (IndexError, ValueError):
            # Starts with "checkpoint" but has no integer index after the
            # first "-" (e.g. "checkpoint" or "checkpoint-tmp"). Skip it so
            # one stray entry does not abort the whole scan.
            continue

    if not checkpoint_indices:
        return None

    latest_checkpoint_path = f"{dir_path}/checkpoint-{max(checkpoint_indices)}"
    logger.info(f"find the latest checkpoint: {latest_checkpoint_path}")
    return latest_checkpoint_path
def save(
    self,
    config: PyTextConfig,
    model: Model,
    meta: Optional[CommonMetadata],
    tensorizers: Dict[str, Tensorizer],
    training_state: Optional[TrainingState] = None,
    identifier: Optional[str] = None,
) -> str:
    """
    save a checkpoint to given path, config, model and training_state
    together represent the checkpoint. When identifier is None, this
    function is used to save post-training snapshot

    Args:
        config: supplies ``save_snapshot_path`` and is passed to
            ``generate_checkpoint_path`` and ``save_checkpoint``.
        model: passed through to ``save_checkpoint``.
        meta: passed through to ``save_checkpoint``.
        tensorizers: passed through to ``save_checkpoint``.
        training_state: passed through to ``save_checkpoint``.
        identifier: when non-empty, a during-training checkpoint is written
            to the path generated from it; otherwise the post-training
            snapshot is written to ``config.save_snapshot_path``.

    Returns:
        The path the checkpoint or snapshot was written to.
    """
    if identifier:
        # saving during-training checkpoints
        saved_path = self.generate_checkpoint_path(config, identifier)
        print("Saving checkpoint to ", saved_path)
    else:
        # saving post-training snapshot if no identifier given
        saved_path = config.save_snapshot_path
        print(f"Saving pytorch model to: {saved_path}")

    saved_folder = os.path.dirname(saved_path)
    # dirname is "" for a bare file name; there is no folder to create then,
    # and asking for "" to be created would fail.
    if saved_folder and not PathManager.exists(saved_folder):
        PathManager.mkdirs(saved_folder)
        print(f"created {saved_folder}")

    with PathManager.open(saved_path, "wb") as checkpoint_f:
        save_checkpoint(
            checkpoint_f, config, model, meta, tensorizers, training_state
        )
        # Remember what was written: checkpoints accumulate in a list, the
        # snapshot path is a single slot.
        if identifier:
            self._saved_paths.append(saved_path)
        else:
            self._post_training_snapshot_path = saved_path
    return saved_path
def save(
    config: PyTextConfig,
    model: Model,
    meta: Optional[CommonMetadata],
    tensorizers: Dict[str, Tensorizer],
    training_state: Optional[TrainingState] = None,
    identifier: Optional[str] = None,
) -> str:
    """
    Save all stateful information of a training task to a specified file-like
    object, will save the original config, model state, metadata,
    training state if training is not completed

    Args:
        config (PytextConfig): contains all raw parameter/hyper-parameters
            for training task
        model (Model): actual model in training
        meta: stored in the saved state under ``DATA_STATE``
        tensorizers: stored in the saved state under ``TENSORIZERS``
        training_state (TrainingState): stateful information during training
        identifier (str): used to identify a checkpoint within a training job,
            used as a suffix for save path

    Returns:
        saved_path (str): the path that was written. If identifier is not
        specified (None or empty), this is config.save_snapshot_path, to be
        consistent with the post-training snapshot; if specified, it is the
        checkpoint path generated from the identifier.
    """
    # One flag drives both the path choice and the manager call below, so an
    # empty identifier is treated as "snapshot" in both places.
    is_checkpoint = bool(identifier)
    if is_checkpoint:
        # saving during-training checkpoints
        saved_path = generate_checkpoint_path(config, identifier)
    else:
        # saving post-training snapshot if no identifier given
        saved_path = config.save_snapshot_path
        print(f"Saving pytorch model to: {saved_path}")

    saved_folder = os.path.dirname(saved_path)
    # dirname is "" for a bare file name; there is no folder to create then.
    if saved_folder and not PathManager.exists(saved_folder):
        PathManager.mkdirs(saved_folder)
        print(f"created {saved_folder}")

    # Currently torch.save() has error pickling certain models when not saving
    # by model.state_dict(), thus currently overriding the model in
    # training_state with None, and put back saving
    # https://github.com/pytorch/pytorch/issues/15116
    model_in_training_state = None
    if training_state:
        model_in_training_state, training_state.model = training_state.model, None
    try:
        state = {
            DATA_STATE: meta,
            CONFIG_JSON: config_to_json(PyTextConfig, config),
            MODEL_STATE: model.state_dict(),
            SERIALIZE_VERSION_KEY: LATEST_SERIALIZE_VERSION,
            TENSORIZERS: tensorizers,
            TRAINING_STATE: training_state,
        }
        if is_checkpoint:
            _CHECKPOINT_MANAGER.save_checkpoint(state, saved_path)
        else:
            _CHECKPOINT_MANAGER.save_snapshot(state, saved_path)
    finally:
        # Always hand the model back to the training state, even if saving
        # raised.
        if training_state:
            training_state.model = model_in_training_state
    return saved_path