Пример #1
0
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version

# Extend the import search path with the current directory and `src/`
# (presumably so the project-local `src.utils` import below resolves when
# the script is launched from the repository root -- confirm).
sys.path.append('.')
sys.path.append('src/.')
from src.utils import load_models, configuration
# Project configuration object, instantiated once at import time.
config_repository = configuration.Config()

# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.6.0")

logger = logging.getLogger(__name__)
# Keys of the masked-LM model mapping, and the `model_type` attribute of each
# of those keys collected into a tuple.
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """
Пример #2
0
import sys
from joblib import Parallel, delayed
import os
import copy
import pandas as pd
import numpy as np
import time
import Levenshtein

# Hard-coded site-packages directories are appended to the import search path
# before importing `pywrapfst` (per the inline note, this is where that
# package is expected to live). NOTE(review): these paths are
# machine-specific; confirm they match the deployment environment.
sys.path.append("/usr/local/lib/python3.8/site-packages")  #for pywrapfst
sys.path.append('/usr/local/lib/python3.6/site-packages')
import pywrapfst

from src.utils import configuration
# Module-level project configuration, instantiated once at import time.
config = configuration.Config()


def vectorized_compute_all_likelihoods_for_w_over_paths(d_fsa, w_fsas, ws):
    '''Collect one likelihood per word in ws for the given d_fsa.

    Each entry is whatever get_likelihood_for_fsas_over_paths returns when
    called with d_fsa, the full w_fsas argument (forwarded unchanged), and a
    single word from ws. Results are returned as a plain list, in the same
    order as ws.
    '''
    likelihoods = []
    for word in ws:
        likelihood = get_likelihood_for_fsas_over_paths(d_fsa, w_fsas, word)
        likelihoods.append(likelihood)
    return likelihoods


def compute_all_likelihoods_for_w_over_paths_one(list_of_tuples):
    '''wrapper to compute likelihoods for a list of n (d, d_fsa, w_fsa, w, cache_path) tuples. Check if d.npy is in cache_path first'''

    # if cache_path + d exists, read that in and return that
    distances = []

    for x in list_of_tuples:
        distances_cache_path = os.path.join(x[4], x[3] + '.npy')
Пример #3
0
def get_directory(spec_dict):
    '''
    Build the output directory path for one phase of the pipeline.

    Single specification for all directory structures in sampling, data
    extraction, training, fitting, and evaluation.

    Paths follow
    <config.exp_dir>/<phase>/n=<n_samples>/<identifier>

    <phase>: spec_dict['task_phase']; handled values are
        "sample", "extract_data", "train", "fit", "eval"
    <n_samples>: spec_dict['n_samples'], rendered as "n=<value>"
    <identifier>, by phase (<tags> is "with_tags" if spec_dict['use_tags']
    is truthy, otherwise "no_tags"):
        "sample":        <training_split>_<training_dataset>
        "extract_data":  <training_split>_<training_dataset>_<tags>
        "train":         <training_split>_<training_dataset>_<tags>x<model_type>
        "fit" / "eval":  <training_split>_<training_dataset>_<tags>x<model_type>
                         _<test_split>_<test_dataset>_<context_width>

    For "sample", "extract_data", and "train" the test-set fields
    (context_width, test_dataset, test_split) must be None; for "fit" and
    "eval" they must be set. Every field that is interpolated into the path
    is confirmed to be non-None first, so a missing value is reported by
    confirm_values_are_not_none rather than surfacing as an "n=None"
    directory or a TypeError during string concatenation.

    Any other phase (including "analyze", which has no branch here) raises
    ValueError if the validators let it through.

    `task_name` is not currently used, but it could be useful (eg non_child vs. child).
    non_child vs. child is implicit in the training data, etc.

    Args:
        spec_dict: mapping with the keys read above ('task_phase',
            'n_samples', 'training_split', 'training_dataset', 'use_tags',
            'model_type', 'test_split', 'test_dataset', 'context_width').

    Returns:
        The directory path produced by join(...).

    Raises:
        ValueError: if spec_dict['task_phase'] is not one of the handled
            phases. The validate_* / confirm_* helpers are defined elsewhere
            and may raise their own exceptions.
    '''

    config = configuration.Config()

    validate_spec_dict(spec_dict, config.spec_dict_params)
    validate_phase(spec_dict['task_phase'], config.task_phases)
    validate_training_params(spec_dict)

    phase = spec_dict['task_phase']
    # Fields that describe the test set; only meaningful for fit / eval.
    test_fields = ['context_width', 'test_dataset', 'test_split']

    if phase == 'sample':

        confirm_values_are_none(spec_dict, test_fields)
        confirm_values_are_not_none(
            spec_dict, ['training_split', 'training_dataset', 'n_samples'])

        identifier = (spec_dict['training_split'] + '_' +
                      spec_dict['training_dataset'])

    elif phase == 'extract_data':

        confirm_values_are_none(spec_dict, test_fields)
        confirm_values_are_not_none(
            spec_dict,
            ['training_split', 'training_dataset', 'use_tags', 'n_samples'])

        tags_str = 'with_tags' if spec_dict['use_tags'] else 'no_tags'
        identifier = (spec_dict['training_split'] + '_' +
                      spec_dict['training_dataset'] + '_' + tags_str)

    elif phase == 'train':

        confirm_values_are_none(spec_dict, test_fields)
        # 'n_samples' is part of the path below, so it is checked here just
        # as in every other branch.
        confirm_values_are_not_none(spec_dict, [
            'task_phase', 'model_type', 'training_split', 'training_dataset',
            'use_tags', 'n_samples'
        ])

        tags_str = 'with_tags' if spec_dict['use_tags'] else 'no_tags'
        identifier = (spec_dict['training_split'] + '_' +
                      spec_dict['training_dataset'] + '_' + tags_str + 'x' +
                      spec_dict['model_type'])

    elif phase in ('fit', 'eval'):

        # 'model_type' is concatenated into the identifier, so it must be
        # set; checking it here gives a clear error instead of a TypeError.
        confirm_values_are_not_none(spec_dict, [
            'training_split', 'training_dataset', 'use_tags', 'n_samples',
            'test_split', 'test_dataset', 'context_width', 'model_type'
        ])
        validate_test_params(spec_dict)

        tags_str = 'with_tags' if spec_dict['use_tags'] else 'no_tags'
        identifier = (spec_dict['training_split'] + '_' +
                      spec_dict['training_dataset'] + '_' + tags_str + 'x' +
                      spec_dict['model_type'] + '_' +
                      spec_dict['test_split'] + '_' +
                      spec_dict['test_dataset'] + '_' +
                      str(spec_dict['context_width']))

    else:
        # Format rather than concatenate: task_phases may be a list/tuple,
        # and `str + list` would raise TypeError instead of this ValueError.
        raise ValueError(
            f'Task phase not recognized. Must be one of {config.task_phases}')

    n_str = 'n=' + str(spec_dict['n_samples'])

    return join(config.exp_dir, phase, n_str, identifier)