Example #1
def run_languages(args):
    languages, token_map, data_split, concept_ids, ipa_to_concept = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [[
        'lang', 'avg_len', 'shannon', 'test_shannon', 'test_loss', 'test_acc',
        'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        print()
        print('Lang:', i, end=' ')

        if args.opt:
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, val_loss, val_acc = run_opt_language_cv(lang, token_map, concept_ids, ipa_to_concept, args)
        else:
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, val_loss, val_acc = run_language_cv(lang, token_map, concept_ids, ipa_to_concept, args)
        results += [[
            lang, avg_len, shannon, test_shannon, test_loss, test_acc,
            val_loss, val_acc
        ]]

        # Checkpoint partial results after each language.
        write_csv(results, '%s/%s__results.csv' % (args.rfolder, args.model))
    write_csv(results, '%s/%s__results-final.csv' % (args.rfolder, args.model))
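
All of these snippets lean on shared helpers such as read_info and write_csv whose definitions are not shown. The sketch below is a plausible reconstruction inferred purely from how they are called; the pickle path and file layout are assumptions, not the project's actual code.

import csv
import pickle


def read_info(path='datasets/info.pckl'):
    # Hypothetical loader: callers unpack a 5-tuple of
    # (languages, token_map, data_split, concept_ids, ipa_to_concept).
    with open(path, 'rb') as f:
        return pickle.load(f)


def write_csv(rows, fname):
    # Dump a list of rows (each itself a list of values) to a CSV file.
    with open(fname, 'w', newline='') as f:
        csv.writer(f).writerows(rows)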
Example #2
def run_languages_cv(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, _, _ = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    max_order = 3  # maximum n-gram order passed to run_artificial_language_cv
    results = [[
        'lang', 'artificial', 'full_avg_len', 'avg_len', 'test_loss',
        'val_loss'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print(i, end=' ')
            full_avg_len, avg_len, test_loss, val_loss = run_artificial_language_cv(
                lang,
                token_map,
                args,
                artificial=artificial,
                max_order=max_order)
            results += [[
                lang, artificial, full_avg_len, avg_len, test_loss, val_loss
            ]]

            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))
    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))
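
The args object threaded through every runner looks like an argparse namespace. To call one of these functions directly, a hand-built namespace carrying just the attributes the code reads is enough; the attribute names come from the examples, but the values below are purely illustrative.

from argparse import Namespace

args = Namespace(
    model='lstm',        # used in the output file names
    rfolder='results',   # folder the CSV checkpoints are written to
    is_devoicing=False,  # forwarded to get_languages()
    opt=False,           # Example #1 only: pick run_opt_language_cv or not
)
run_languages_cv(args)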
Example #3
def run_languages(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()

    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [[
        'lang', 'avg_len', 'shannon', 'test_shannon', 'test_loss', 'test_acc',
        'best_epoch', 'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, best_epoch, val_loss, val_acc = run_artificial_language(
                    lang, args.is_devoicing, token_map, concept_ids, ipa_to_concepts, args, artificial=artificial)
            results += [[
                '%s %s' % (lang, 'art' if artificial else 'norm'), avg_len,
                shannon, test_shannon, test_loss, test_acc, best_epoch,
                val_loss, val_acc
            ]]

            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))
    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))
Example #4
def run_languages_cv(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [[
        'lang', 'artificial', 'avg_len', 'shannon', 'test_shannon',
        'test_loss', 'test_acc', 'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))
            embedding_size, hidden_size, nlayers, dropout = \
                opt_params.get_artificial_opt_params(args.model, lang, artificial, args.artificial_type, args.data)
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, val_loss, val_acc = run_artificial_language_cv(
                    lang, args.is_devoicing, token_map, concept_ids, ipa_to_concepts, args,
                    artificial=artificial, embedding_size=embedding_size,
                    hidden_size=hidden_size, nlayers=nlayers, dropout=dropout)
            results += [[
                lang, artificial, avg_len, shannon, test_shannon, test_loss,
                test_acc, val_loss, val_acc
            ]]

            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))
    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))
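
opt_params.get_artificial_opt_params returns the tuned (embedding_size, hidden_size, nlayers, dropout) for a model/language pair, presumably produced by the Bayesian search in the next example. A minimal sketch of such a lookup, with an invented table and fallback default, could look like this:

# Hypothetical table keyed on (model, lang, artificial); the real module
# presumably reads the opt-results CSVs written by the optimisation run.
_OPT_PARAMS = {
    ('lstm', 'deu', True): (64, 128, 2, 0.3),
}
_DEFAULT = (32, 64, 1, 0.2)


def get_artificial_opt_params(model, lang, artificial, artificial_type, data):
    # artificial_type and data are accepted for signature parity only.
    return _OPT_PARAMS.get((model, lang, artificial), _DEFAULT)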
Example #5
import numpy as np


def optimize_languages(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Model %s' % args.model)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    n_iters = 45
    bounds = np.array([[4, 256], [32, 256], [1, 2.95], [0.0, 0.5]])
    n_pre_samples = 5

    opt_results = [[
        'lang', 'artificial', 'avg_len', 'shannon', 'test_shannon',
        'test_loss', 'test_acc', 'best_epoch', 'val_loss', 'val_acc',
        'embedding_size', 'hidden_size', 'nlayers', 'dropout'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))

            sample_loss = sample_loss_getter(lang,
                                             args.is_devoicing,
                                             token_map,
                                             ipa_to_concepts,
                                             args,
                                             artificial=artificial)
            xp, yp = bayesian_optimisation(n_iters,
                                           sample_loss,
                                           bounds,
                                           n_pre_samples=n_pre_samples)

            opt_results += [
                get_optimal_loss(lang, args.is_devoicing, artificial,
                                 token_map, concept_ids, ipa_to_concepts, xp,
                                 yp, args)
            ]

            # Checkpoint the best configurations found so far.
            write_csv(
                opt_results, '%s/artificial__%s__opt-results.csv' %
                (args.rfolder, args.model))

    write_csv(
        opt_results, '%s/artificial__%s__opt-results-final.csv' %
        (args.rfolder, args.model))
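
The bounds array lines up with the four hyperparameters unpacked elsewhere: embedding size in [4, 256], hidden size in [32, 256], number of layers in [1, 2.95], and dropout in [0.0, 0.5]. Bayesian optimisation samples real-valued points, so sample_loss_getter presumably rounds the first three to integers before training. A hedged sketch of that wrapper, with train_and_eval standing in for whatever training routine the project really uses:

def sample_loss_getter(lang, is_devoicing, token_map, ipa_to_concepts, args,
                       artificial=True):
    # Returns the objective the optimiser minimises: a function from a
    # continuous point inside `bounds` to a validation loss.
    def sample_loss(x):
        embedding_size = int(round(x[0]))
        hidden_size = int(round(x[1]))
        nlayers = int(round(x[2]))  # [1, 2.95] effectively yields 1-3 layers
        dropout = float(x[3])
        return train_and_eval(lang, is_devoicing, token_map, ipa_to_concepts,
                              args, artificial, embedding_size, hidden_size,
                              nlayers, dropout)

    return sample_loss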
Example #6
def get_data_loaders(ffolder,
                     lang,
                     is_devoicing,
                     token_map,
                     args,
                     artificial=True):
    _, _, data_split, _, _ = read_info()
    return _get_data_loaders(data_split,
                             ffolder,
                             lang,
                             is_devoicing,
                             token_map,
                             args,
                             artificial=artificial)
Example #7
def run_languages(args):
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids, _ = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [['lang', 'full_avg_len', 'avg_len', 'test_loss', 'val_loss']]
    for i, lang in enumerate(languages):
        print()
        print('%d Language %s' % (i, lang))
        full_avg_len, avg_len, test_loss, val_loss = run_language_cv(
            lang, token_map, concept_ids, args)
        results += [[lang, full_avg_len, avg_len, test_loss, val_loss]]

        write_csv(results, '%s/unigram.csv' % (args.rfolder))

    write_csv(results, '%s/unigram-final.csv' % (args.rfolder))
Example #8
def run_languages(args):
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids, _ = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    max_order = 3  # maximum n-gram order fit by run_language_cv
    results = [['lang', 'full_avg_len', 'avg_len', 'test_loss', 'val_loss']]
    for i, lang in enumerate(languages):
        print()
        print('%d Language %s' % (i, lang))
        full_avg_len, avg_len, test_loss, val_loss = \
            run_language_cv(lang, token_map, concept_ids, args, max_order=max_order)
        results += [[lang, full_avg_len, avg_len, test_loss, val_loss]]

        write_csv(results, '%s/ngram.csv' % (args.rfolder))

    write_csv(results, '%s/ngram-final.csv' % (args.rfolder))
Example #9
def get_data(lang, token_map, args, artificial=True):
    _, _, data_split, _, _ = read_info()
    return _get_data(data_split, lang, token_map, args, artificial=artificial)
Example #10
def get_data_split_cv(fold, nfolds, verbose=True):
    _, _, data_split, _, _ = read_info()
    concepts = [y for x in data_split for y in x]

    return _get_data_split_cv(fold, nfolds, concepts, verbose=verbose)
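
_get_data_split_cv itself is not shown, but from the call it receives a flat list of concepts plus a fold index and fold count. A minimal sketch under the assumption that the folds partition the concepts and the selected fold becomes the held-out set:

def _get_data_split_cv(fold, nfolds, concepts, verbose=True):
    # Assumption: every nfolds-th concept (offset by `fold`) is held out.
    test = [c for i, c in enumerate(concepts) if i % nfolds == fold]
    train = [c for i, c in enumerate(concepts) if i % nfolds != fold]
    if verbose:
        print('Fold %d/%d: train %d, test %d' %
              (fold, nfolds, len(train), len(test)))
    return train, test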