AutoConfig, AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, HfArgumentParser, Trainer, TrainingArguments, set_seed, ) from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.utils import check_min_version sys.path.append('.') sys.path.append('src/.') from src.utils import load_models, configuration config_repository = configuration.Config() # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.6.0") logger = logging.getLogger(__name__) MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) @dataclass class ModelArguments: """ Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. """
import sys
from joblib import Parallel, delayed
import os
import copy
import pandas as pd
import numpy as np
import time
import Levenshtein
sys.path.append("/usr/local/lib/python3.8/site-packages") #for pywrapfst
sys.path.append('/usr/local/lib/python3.6/site-packages')
import pywrapfst
from src.utils import configuration
config = configuration.Config()


def vectorized_compute_all_likelihoods_for_w_over_paths(d_fsa, w_fsas, ws):
    '''
    Evaluate get_likelihood_for_fsas_over_paths once per entry of `ws`.

    Each call receives the same `d_fsa` and `w_fsas` plus one element `w`
    of `ws`. The results are returned as a list in the same order as `ws`
    (per the original description: the total path weights from this d_fsa
    to each word in ws).
    '''
    def likelihood_for(w):
        # d_fsa and w_fsas are held fixed; only the word varies per call
        return get_likelihood_for_fsas_over_paths(d_fsa, w_fsas, w)

    return list(map(likelihood_for, ws))


def compute_all_likelihoods_for_w_over_paths_one(list_of_tuples):
    '''wrapper to compute likelihoods for a list of n (d, d_fsa, w_fsa, w, cache_path) tuples. Check if d.npy is in cache_path first'''
    # if cache_path + d exists, read that in and return that
    distances = []
    for x in list_of_tuples:
        distances_cache_path = os.path.join(x[4], x[3] + '.npy')
def get_directory(spec_dict):
    '''
    Return the output directory for the experiment phase described by `spec_dict`.

    Single specification for all directory structures in sampling, data
    extraction, training, fitting, evaluation, and analysis.

    Paths are built as
        <config.exp_dir>/<task_phase>/n=<n_samples>/<identifier>

    where <identifier> depends on the phase:
        "sample":        <training_split>_<training_dataset>
        "extract_data":  <training_split>_<training_dataset>_<tags>
        "train":         <training_split>_<training_dataset>_<tags>x<model_type>
        "fit", "eval":   <training_split>_<training_dataset>_<tags>x<model_type>
                         followed by _<test_split>_<test_dataset>_<context_width>

    <tags> is "with_tags" when spec_dict['use_tags'] is truthy, otherwise
    "no_tags".

    `task_name` is not currently used, but it could be useful (eg non_child
    vs. child). non_child vs. child is implicit in the training data, etc.

    Args:
        spec_dict: mapping that is read for the keys 'task_phase',
            'n_samples', 'training_split', 'training_dataset', 'use_tags',
            'model_type', 'test_split', 'test_dataset' and 'context_width'
            (which of them are read depends on the phase).

    Returns:
        The result of join(config.exp_dir, <task_phase>, 'n=<n_samples>',
        <identifier>).

    Raises:
        ValueError: if the phase gets past validate_phase but matches none
            of the branches below.
        The validate_* / confirm_* helpers are defined elsewhere and
        presumably raise on invalid input -- confirm against their
        definitions.
    '''
    config = configuration.Config()

    validate_spec_dict(spec_dict, config.spec_dict_params)
    validate_phase(spec_dict['task_phase'], config.task_phases)
    validate_training_params(spec_dict)

    phase = spec_dict['task_phase']
    # Keys that only describe a test set; the phases before 'fit' require
    # them to be None.
    test_only_keys = ['context_width', 'test_dataset', 'test_split']

    if phase == 'sample':
        confirm_values_are_none(spec_dict, test_only_keys)
        confirm_values_are_not_none(
            spec_dict, ['training_split', 'training_dataset', 'n_samples'])
        identifier = (
            spec_dict['training_split'] + '_' + spec_dict['training_dataset'])

    elif phase == 'extract_data':
        confirm_values_are_none(spec_dict, test_only_keys)
        confirm_values_are_not_none(
            spec_dict,
            ['training_split', 'training_dataset', 'use_tags', 'n_samples'])
        tags_str = 'with_tags' if spec_dict['use_tags'] else 'no_tags'
        identifier = (
            spec_dict['training_split'] + '_' + spec_dict['training_dataset']
            + '_' + tags_str)

    elif phase == 'train':
        confirm_values_are_none(spec_dict, test_only_keys)
        # NOTE(review): 'n_samples' is not checked here although it is used
        # in the path; a None value yields an 'n=None' component -- confirm
        # whether that is intended.
        confirm_values_are_not_none(spec_dict, [
            'task_phase', 'model_type', 'training_split', 'training_dataset',
            'use_tags'
        ])
        tags_str = 'with_tags' if spec_dict['use_tags'] else 'no_tags'
        identifier = (
            spec_dict['training_split'] + '_' + spec_dict['training_dataset']
            + '_' + tags_str + 'x' + spec_dict['model_type'])

    elif phase in ('fit', 'eval'):
        # NOTE(review): 'model_type' is used below but is not part of this
        # not-None check -- confirm that validate_training_params covers it.
        confirm_values_are_not_none(spec_dict, [
            'training_split', 'training_dataset', 'use_tags', 'n_samples',
            'test_split', 'test_dataset', 'context_width'
        ])
        validate_test_params(spec_dict)
        tags_str = 'with_tags' if spec_dict['use_tags'] else 'no_tags'
        identifier = (
            spec_dict['training_split'] + '_' + spec_dict['training_dataset']
            + '_' + tags_str + 'x' + spec_dict['model_type']
            + '_' + spec_dict['test_split'] + '_' + spec_dict['test_dataset']
            + '_' + str(spec_dict['context_width']))

    else:
        # NOTE(review): the earlier documentation grouped "analyze" with
        # "fit" and "eval", but no branch handles it, so it ends up here --
        # confirm whether that is intended.
        # str() keeps this a ValueError even when config.task_phases is a
        # list/tuple; plain concatenation would raise TypeError instead.
        raise ValueError(
            'Task phase not recognized. Must be one of '
            + str(config.task_phases))

    n_str = 'n=' + str(spec_dict['n_samples'])
    return join(config.exp_dir, phase, n_str, identifier)