def get_args():
    # Models
    argparser.add_argument('--checkpoints-path', type=str, required=True)
    # Results
    argparser.add_argument('--results-path', type=str, default='results/')
    return argparser.parse_args()
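# Several of the get_args() variants in this section reference a bare
# `argparser` that is never constructed locally, so it is presumably a shared
# object imported at module level. The sketch below is a minimal, hypothetical
# reconstruction of such a module (the class name `SharedArgparser` and the
# csv_folder handling are assumptions, not the project's actual API); it also
# mimics the `argparser.parse_args(csv_folder=...)` calls seen in the run
# scripts further down.
import argparse
import os


class SharedArgparser:
    """Thin wrapper so every script can register flags on one parser."""

    def __init__(self):
        self._parser = argparse.ArgumentParser()
        # Flags that appear to be shared by all scripts in this section.
        self._parser.add_argument('--data', type=str, default='northeuralex')
        self._parser.add_argument('--results-folder', type=str,
                                  default='results/')

    def add_argument(self, *args, **kwargs):
        self._parser.add_argument(*args, **kwargs)

    def parse_args(self, csv_folder=None):
        args = self._parser.parse_args()
        if csv_folder is not None:
            # Each script writes its CSVs into its own results subfolder.
            args.rfolder = os.path.join(args.results_folder, csv_folder)
        return args


argparser = SharedArgparser()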
def get_args():
    argparser = get_argparser()
    add_all_defaults(argparser)
    args = parse_args(argparser)
    args.wait_iterations = args.wait_epochs * args.eval_batches
    return args
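# The second family of scripts builds its parser through helpers such as
# get_argparser(), add_all_defaults(), add_data_args() and parse_args(), none
# of which are shown in this section. A minimal sketch of what they might look
# like, given how they are used above (every name and default here is an
# assumption):
import argparse
import random

import numpy as np
import torch


def get_argparser():
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--seed', type=int, default=7)
    return argparser


def add_all_defaults(argparser):
    # Defaults needed by the args.wait_epochs * args.eval_batches computation.
    argparser.add_argument('--eval-batches', type=int, default=20)
    argparser.add_argument('--wait-epochs', type=int, default=5)


def add_data_args(argparser):
    argparser.add_argument('--data-folder', type=str, default='data/')


def parse_args(argparser):
    args = argparser.parse_args()
    # Seeding every RNG in one place keeps runs reproducible across scripts.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    return args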
def get_args():
    argparser = get_argparser()
    # Models
    argparser.add_argument('--eval-path', type=str, required=True)
    # Save
    argparser.add_argument('--results-file', type=str, required=True)
    add_data_args(argparser)
    return parse_args(argparser)
def get_args():
    argparser = get_argparser()
    # Adaptor
    argparser.add_argument('--two-stage-state-folder', type=str, required=True)
    # Save
    argparser.add_argument('--results-file', type=str, required=True)
    add_data_args(argparser)
    add_generator_args(argparser)
    args = parse_args(argparser)
    return args
def get_args():
    # Data
    argparser.add_argument('--batch-size', type=int, default=512)
    # Models
    argparser.add_argument('--checkpoints-path', type=str, required=True)
    argparser.add_argument('--model-type', type=str, required=True)

    args = argparser.parse_args()
    args.reverse = args.model_type in constants.REVERSE_MODELS
    args.model_path = os.path.join(args.checkpoints_path, args.model_type)
    return args
def get_args():
    # Models
    argparser.add_argument('--checkpoints-path', type=str, required=True)
    # Other
    argparser.add_argument('--analyse', type=str, default='none',
                           choices=['none', 'vowels', 'consonants'])
    argparser.add_argument('--n-permutations', type=int, default=100000)

    args = argparser.parse_args()
    args.analyse = None if args.analyse == 'none' else args.analyse
    return args
def get_args():
    argparser = get_argparser()
    argparser.add_argument('--max-train-tokens', type=int, required=True)
    argparser.add_argument('--data-language-dir', type=str, required=True)
    argparser.add_argument('--checkpoint-language-dir', type=str, required=True)
    argparser.add_argument('--alpha', type=str, required=True)
    argparser.add_argument('--beta', type=str, required=True)
    argparser.add_argument('--results-folder', type=str, required=True)
    args = parse_args(argparser)
    return args
def get_args(): argparser.add_argument( "--src-file", type=str, help="The file from which to read data") argparser.add_argument( "--n-folds", type=int, default=10, help="Number of folds to split data") argparser.add_argument( "--max-words", type=int, default=10000, help="Number of types to use") return argparser.parse_args()
def get_args():
    argparser = get_argparser()
    # Save
    argparser.add_argument('--results-file', type=str, required=True)
    # Adaptor
    argparser.add_argument('--no-iterations', type=int, default=10)
    argparser.add_argument('--beta-limit', type=int)
    argparser.add_argument('--adaptor-iterations', type=int, default=6)
    argparser.add_argument('--two-stage-state-folder', type=str, required=True)
    add_all_defaults(argparser)

    args = parse_args(argparser)
    args.wait_iterations = args.wait_epochs * args.eval_batches
    return args
def get_args():
    # Data
    argparser.add_argument('--batch-size', type=int, default=1024)
    argparser.add_argument('--train-folds', type=int, default=8)
    # Model
    argparser.add_argument('--model-type', type=str, required=True)
    # Save
    argparser.add_argument('--checkpoints-path', type=str)

    args = argparser.parse_args()
    args.reverse = (args.model_type in constants.REVERSE_MODELS)
    args.model_path = os.path.join(args.checkpoints_path, args.model_type)
    return args
def get_args():
    # Models
    # argparser.add_argument('--checkpoints-path', type=str, required=True)
    argparser.add_argument('--model-type', type=str, required=True)
    # Other
    argparser.add_argument('--n-permutations', type=int, default=100000)
    # argparser.add_argument('--analyse', type=str, default='none',
    #                        choices=['none', 'vowels', 'consonants'])

    args = argparser.parse_args()
    args.keep_eos = args.model_type in ['norm', 'rev']
    # args.reverse = (args.model_type in constants.REVERSE_MODELS)
    # args.analyse = None if args.analyse == 'none' else args.analyse
    return args
def get_args():
    argparser = get_argparser()
    # Save
    argparser.add_argument('--adaptor-results-file', type=str, required=True)
    # Adaptor
    argparser.add_argument('--alpha', type=float, required=True)
    argparser.add_argument('--beta', type=float, required=True)
    argparser.add_argument('--adaptor-iterations', type=int, default=6)
    argparser.add_argument('--two-stage-state-folder', type=str, required=True)
    argparser.add_argument('--load-adaptor-init-state', default=False,
                           action='store_true')
    add_all_defaults(argparser)

    args = parse_args(argparser)
    args.wait_iterations = args.wait_epochs * args.eval_batches
    return args
def get_args(): argparser = get_argparser() argparser.add_argument( "--wikipedia-tokenized-file", type=str, help="The file in which wikipedia tokenized results should be") argparser.add_argument("--language", type=str, help="The language the data is in") argparser.add_argument("--n-folds", type=int, default=10, help="Number of folds to split data") argparser.add_argument("--max-sentences", type=int, default=1000000, help="Maximum number of sentences used") add_data_args(argparser) return parse_args(argparser)
def get_args():
    # Data
    argparser.add_argument('--batch-size', type=int, default=32)
    # Model
    argparser.add_argument('--nlayers', type=int, default=2)
    argparser.add_argument('--embedding-size', type=int, default=64)
    argparser.add_argument('--hidden-size', type=int, default=256)
    argparser.add_argument('--dropout', type=float, default=.33)
    argparser.add_argument('--model-type', type=str, required=True)
    # Optimization
    argparser.add_argument('--eval-batches', type=int, default=20)
    argparser.add_argument('--wait-epochs', type=int, default=5)
    # Save
    argparser.add_argument('--checkpoints-path', type=str)

    args = argparser.parse_args()
    args.wait_iterations = args.wait_epochs * args.eval_batches
    args.reverse = (args.model_type in constants.REVERSE_MODELS)
    args.model_path = os.path.join(args.checkpoints_path, args.model_type)
    return args
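# Self-contained illustration of the derived-field pattern used by the
# training get_args() above: parse the flags, then attach computed attributes
# to the Namespace before returning it. Only the flag names mirror the script;
# everything else here is illustrative.
import argparse
import os

_parser = argparse.ArgumentParser()
_parser.add_argument('--model-type', type=str, default='norm')
_parser.add_argument('--checkpoints-path', type=str, default='checkpoints/')
_parser.add_argument('--eval-batches', type=int, default=20)
_parser.add_argument('--wait-epochs', type=int, default=5)
_args = _parser.parse_args([])

_args.wait_iterations = _args.wait_epochs * _args.eval_batches
_args.model_path = os.path.join(_args.checkpoints_path, _args.model_type)
assert _args.wait_iterations == 100
assert _args.model_path == 'checkpoints/norm'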
            args, artificial=artificial)
        xp, yp = bayesian_optimisation(n_iters, sample_loss, bounds,
                                       n_pre_samples=n_pre_samples)
        opt_results += [
            get_optimal_loss(lang, args.is_devoicing, artificial, token_map,
                             concept_ids, ipa_to_concepts, xp, yp, args)
        ]

        write_csv(results, '%s/artificial__%s__baysian-results.csv'
                  % (args.rfolder, args.model))
        write_csv(opt_results, '%s/artificial__%s__opt-results.csv'
                  % (args.rfolder, args.model))

    write_csv(results, '%s/artificial__%s__baysian-results-final.csv'
              % (args.rfolder, args.model))


if __name__ == '__main__':
    args = argparser.parse_args(csv_folder='artificial/%s/bayes-opt')
    assert args.data == 'northeuralex', \
        'this script should only be run with northeuralex data'
    fill_artificial_args(args)
    optimize_languages(args)
def run_languages(args):
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    if args.opt:
        test_results, test_loss, \
            test_acc, best_epoch, val_loss, val_acc = run_opt_language(
                languages, token_map, concept_ids, args)
    else:
        test_results, test_loss, \
            test_acc, best_epoch, val_loss, val_acc = run_language(
                languages, token_map, concept_ids, args)

    results = [['lang', 'test_loss', 'test_acc', 'best_epoch',
                'val_loss', 'val_acc']]
    results += [['full', test_loss, test_acc, best_epoch, val_loss, val_acc]]
    for lang, result in test_results.items():
        results += [[lang] + list(result)]

    write_csv(results, '%s/%s__shared-results.csv' % (args.rfolder, args.model))
    write_csv(results,
              '%s/%s__shared-results-final.csv' % (args.rfolder, args.model))


if __name__ == '__main__':
    args = argparser.parse_args(csv_folder='normal')
    assert args.data == 'northeuralex', \
        'this script should only be run with northeuralex data'
    run_languages(args)
def load_info(args):
    with open('%s/preprocess/info.pckl' % args.ffolder, 'rb') as f:
        info = pickle.load(f)

    languages = info['languages']
    token_map = info['token_map']
    data_split = info['data_split']
    concept_ids = info['concepts_ids']
    return languages, token_map, data_split, concept_ids


def main(args):
    df = read_src_data(args.ffolder)
    languages = get_languages(df)
    train_df, val_df, test_df, data_split = separate_train(df)
    token_map = get_tokens(df)
    concepts_ids, IPA_to_concept = get_concept_ids(df)
    languages_df = separate_per_language(train_df, val_df, test_df, languages)
    process_languages(languages_df, token_map, args)
    save_info(args.ffolder, languages, token_map, data_split,
              concepts_ids, IPA_to_concept)


if __name__ == '__main__':
    args = parser.parse_args()
    assert args.data == 'northeuralex', \
        'this script should only be run with northeuralex data'
    main(args)
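# load_info() above is the read side of a pickle round-trip; the save_info()
# called from main() is presumably its write-side counterpart. A hypothetical
# sketch that matches the keys load_info() reads back:
import pickle


def save_info(ffolder, languages, token_map, data_split, concepts_ids,
              IPA_to_concept):
    info = {
        'languages': languages,
        'token_map': token_map,
        'data_split': data_split,
        'concepts_ids': concepts_ids,
        'IPA_to_concept': IPA_to_concept,
    }
    with open('%s/preprocess/info.pckl' % ffolder, 'wb') as f:
        pickle.dump(info, f)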
def get_args():
    # Other
    argparser.add_argument('--n-permutations', type=int, default=100000)
    argparser.add_argument('--results-path', type=str, default='results/')
    return argparser.parse_args()
def get_args():
    return argparser.parse_args()
    # Search bounds for the four tuned hyperparameters, in the order logged
    # below: embedding_size, hidden_size, nlayers, dropout.
    bounds = np.array([[4, 256], [32, 256], [1, 2.95], [0.0, 0.5]])
    n_pre_samples = 5
    sample_loss = sample_loss_getter(languages, token_map, concept_ids, args)
    xp, yp = bayesian_optimisation(n_iters, sample_loss, bounds,
                                   n_pre_samples=n_pre_samples)
    opt_results, test_results = get_optimal_loss(languages, token_map, xp, yp,
                                                 concept_ids, args)

    log_results = [[
        'lang', 'test_loss', 'test_acc', 'best_epoch', 'val_loss', 'val_acc',
        'embedding_size', 'hidden_size', 'nlayers', 'dropout'
    ]]
    log_results += [opt_results]
    log_results += [[]]
    for lang, result in test_results.items():
        log_results += [[lang] + list(result)]

    write_csv(log_results,
              '%s/%s__bayesian-shared-results.csv' % (args.rfolder, args.model))


if __name__ == '__main__':
    args = argparser.parse_args(csv_folder='bayes-opt')
    assert args.data == 'northeuralex', \
        'this script should only be run with northeuralex data'
    optimize_languages(args)
    return full_avg_len, avg_len, avg_test_loss, avg_val_loss


def run_languages(args):
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids, _ = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    max_order = 3
    results = [['lang', 'full_avg_len', 'avg_len', 'test_loss', 'val_loss'] +
               ['param_%d' % i for i in range(max_order)]]
    for i, lang in enumerate(languages):
        print()
        print('%d Language %s' % (i, lang))
        full_avg_len, avg_len, test_loss, val_loss = \
            run_language_cv(lang, token_map, concept_ids, args,
                            max_order=max_order)
        results += [[lang, full_avg_len, avg_len,
                     test_loss, val_loss]]  # + opt_params.tolist()]

        write_csv(results, '%s/ngram.csv' % (args.rfolder))

    write_csv(results, '%s/ngram-final.csv' % (args.rfolder))


if __name__ == '__main__':
    args = argparser.parse_args(csv_folder='cv')
    assert args.data == 'northeuralex', \
        'this script should only be run with northeuralex data'
    run_languages(args)
        'lang', 'avg_len', 'test_shannon', 'test_loss', 'test_acc',
        'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, best_epoch, val_loss, val_acc = \
                run_artificial_language(
                    lang, args.is_devoicing, token_map, concept_ids,
                    ipa_to_concepts, args, artificial=artificial)
            results += [[
                '%s %s' % (lang, 'art' if artificial else 'norm'), avg_len,
                shannon, test_shannon, test_loss, test_acc, best_epoch,
                val_loss, val_acc
            ]]

        write_csv(results,
                  '%s/artificial__%s__results.csv' % (args.rfolder, args.model))

    write_csv(results,
              '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))


if __name__ == '__main__':
    args = argparser.parse_args(csv_folder='artificial/%s/normal')
    assert args.data == 'northeuralex', \
        'this script should only be run with northeuralex data'
    fill_artificial_args(args)
    run_languages(args)
def get_args():
    # Models
    argparser.add_argument('--checkpoints-path', type=str, required=True)
    return argparser.parse_args()
                                             symbols)
        lang_data += [[
            lang, len(symbols), vowel, consonant, tone, unrecognized, avg_len
        ]]

    columns = [
        'lang', 'inventory', 'vowel', 'consonant', 'tone', 'unrecognized',
        'avg_len'
    ]
    df_info = pd.DataFrame(lang_data, columns=columns)
    rfolder = args.rfolder[:-len('orig')]
    df_info.to_csv('%s/lang_inventory.csv' % (rfolder))


def main(args):
    df = read_src_data(args.ffolder)
    languages = get_languages(df)
    train_df, val_df, test_df, _ = separate_train(df)
    languages_df = separate_per_language(train_df, val_df, test_df, languages)
    get_lang_ipa_info(df, languages_df, args, field='IPA')


if __name__ == '__main__':
    args = argparser.parse_args(csv_folder='inventory')
    assert args.data == 'northeuralex', \
        'this script should only be run with northeuralex data'
    main(args)
def get_args():
    # Other
    argparser.add_argument('--n-permutations', type=int, default=100000)
    return argparser.parse_args()
        'lang', 'artificial', 'full_avg_len', 'avg_len', 'test_loss',
        'val_loss'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print(i, end=' ')
            full_avg_len, avg_len, test_loss, val_loss = \
                run_artificial_language_cv(lang, token_map, args,
                                           artificial=artificial,
                                           max_order=max_order)
            results += [[
                lang, artificial, full_avg_len, avg_len, test_loss, val_loss
            ]]

        write_csv(results,
                  '%s/artificial__%s__results.csv' % (args.rfolder, args.model))

    write_csv(results,
              '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))


if __name__ == '__main__':
    args = argparser.parse_args(csv_folder='artificial/%s/cv')
    assert args.data == 'northeuralex', \
        'this script should only be run with northeuralex data'
    fill_artificial_args(args)
    run_languages_cv(args)
def get_args():
    argparser = get_argparser()
    argparser.add_argument('--data-file', type=str, required=True)
    args = parse_args(argparser)
    return args