Example #1
def run_languages_cv(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, _, _ = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    max_order = 3
    results = [[
        'lang', 'artificial', 'full_avg_len', 'avg_len', 'test_loss',
        'val_loss'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print(i, end=' ')
            full_avg_len, avg_len, test_loss, val_loss = run_artificial_language_cv(
                lang,
                token_map,
                args,
                artificial=artificial,
                max_order=max_order)
            results += [[
                lang, artificial, full_avg_len, avg_len, test_loss, val_loss
            ]]

            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))
    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))
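All of these snippets assume a write_csv helper that snapshots the accumulated rows to disk after every language, so partial results survive an interrupted run. A minimal sketch of such a helper (hypothetical; the project's own implementation is not shown in these examples):

import csv


def write_csv(results, file_name):
    # Overwrite the target file with the current list of rows.
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(results)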
Example #2
def run_languages(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()

    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [[
        'lang', 'avg_len', 'shannon', 'test_shannon', 'test_loss', 'test_acc',
        'best_epoch', 'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, best_epoch, val_loss, val_acc = run_artificial_language(
                    lang, args.is_devoicing, token_map, concept_ids, ipa_to_concepts, args, artificial=artificial)
            results += [[
                '%s %s' % (lang, 'art' if artificial else 'norm'), avg_len,
                shannon, test_shannon, test_loss, test_acc, best_epoch,
                val_loss, val_acc
            ]]

            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))
    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))
Example #3
def run_languages(args):
    languages, token_map, data_split, concept_ids, ipa_to_concept = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [[
        'lang', 'avg_len', 'shannon', 'test_shannon', 'test_loss', 'test_acc',
        'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        print()
        print('Lang:', i, end=' ')

        if args.opt:
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, val_loss, val_acc = run_opt_language_cv(lang, token_map, concept_ids, ipa_to_concept, args)
        else:
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, val_loss, val_acc = run_language_cv(lang, token_map, concept_ids, ipa_to_concept, args)
        results += [[
            lang, avg_len, shannon, test_shannon, test_loss, test_acc,
            val_loss, val_acc
        ]]

        write_csv(results, '%s/%s__results.csv' % (args.rfolder, args.model))
    write_csv(results, '%s/%s__results-final.csv' % (args.rfolder, args.model))
Example #4
def run_language_cv(lang, token_map, concept_ids, args):
    global full_results, fold
    nfolds = 10
    avg_test_loss, avg_val_loss = 0, 0

    train_loader, val_loader, test_loader = get_data(lang)
    full_avg_len, avg_len, _, _ = _run_language(lang, train_loader, val_loader,
                                                test_loader, token_map, args)

    for fold in range(nfolds):
        print()
        print('Fold:', fold, end=' ')
        train_loader, val_loader, test_loader = get_data_cv(
            args.ffolder, fold, nfolds, lang, token_map, concept_ids)
        _, _, test_loss, val_loss = _run_language(lang, train_loader,
                                                  val_loader, test_loader,
                                                  token_map, args)

        full_results += [[
            lang, fold, full_avg_len, avg_len, test_loss, val_loss
        ]]  # + opt_params.tolist()]

        avg_test_loss += test_loss / nfolds
        avg_val_loss += val_loss / nfolds

        write_csv(full_results,
                  '%s/%s__full-results.csv' % (args.rfolder, args.model))

    return full_avg_len, avg_len, avg_test_loss, avg_val_loss
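The fold loop above assumes get_data_cv deterministically assigns every item to exactly one of the nfolds held-out splits, indexed by fold. A minimal sketch of that style of partition (hypothetical helper, not the project's implementation):

def split_fold(items, fold, nfolds):
    # Items whose index is congruent to `fold` modulo `nfolds` are held
    # out; the remainder is available for training and validation.
    heldout = [x for i, x in enumerate(items) if i % nfolds == fold]
    rest = [x for i, x in enumerate(items) if i % nfolds != fold]
    return rest, heldout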
Example #5
def run_languages_cv(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [[
        'lang', 'artificial', 'avg_len', 'shannon', 'test_shannon',
        'test_loss', 'test_acc', 'val_loss', 'val_acc'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))
            embedding_size, hidden_size, nlayers, dropout = \
                opt_params.get_artificial_opt_params(args.model, lang, artificial, args.artificial_type, args.data)
            avg_len, shannon, test_shannon, test_loss, \
                test_acc, val_loss, val_acc = run_artificial_language_cv(
                    lang, args.is_devoicing, token_map, concept_ids, ipa_to_concepts, args,
                    artificial=artificial, embedding_size=embedding_size,
                    hidden_size=hidden_size, nlayers=nlayers, dropout=dropout)
            results += [[
                lang, artificial, avg_len, shannon, test_shannon, test_loss,
                test_acc, val_loss, val_acc
            ]]

            write_csv(
                results,
                '%s/artificial__%s__results.csv' % (args.rfolder, args.model))
    write_csv(
        results,
        '%s/artificial__%s__results-final.csv' % (args.rfolder, args.model))
Example #6
def run_artificial_language_cv(lang,
                               token_map,
                               args,
                               artificial=True,
                               max_order=3):
    global full_results, fold
    nfolds = 10
    avg_test_loss, avg_val_loss = 0, 0

    train_loader, val_loader, test_loader = get_data(lang,
                                                     token_map,
                                                     args,
                                                     artificial=artificial)
    full_avg_len, avg_len, _, _, _, xp, yp = _run_language_bayesian(
        lang,
        train_loader,
        val_loader,
        test_loader,
        token_map,
        args,
        max_order=max_order)

    for fold in range(nfolds):
        print()
        print('Fold:', fold, end=' ')
        train_loader, val_loader, test_loader = get_data_cv(
            fold, nfolds, lang, token_map, args, artificial=artificial)

        full_avg_len_tmp, avg_len_tmp, test_loss, val_loss, opt_params = _run_language_opt(
            lang,
            train_loader,
            val_loader,
            test_loader,
            token_map,
            xp,
            yp,
            args,
            max_order=max_order)

        full_results += [[
            lang, artificial, fold, full_avg_len_tmp, avg_len_tmp, test_loss,
            val_loss
        ]]  # + opt_params.tolist()]

        avg_test_loss += test_loss / nfolds
        avg_val_loss += val_loss / nfolds

        write_csv(
            full_results,
            '%s/artificial__%s__full-results.csv' % (args.rfolder, args.model))

    return full_avg_len, avg_len, avg_test_loss, avg_val_loss
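Note the two-stage pattern in this example: _run_language_bayesian runs once on the canonical split, and the sampled points (xp, yp) are then passed to _run_language_opt inside every fold instead of re-optimising from scratch. Assuming yp records the loss at each sampled configuration in xp, recovering the best previously sampled point could look like this (hypothetical sketch):

import numpy as np


def best_sampled_params(xp, yp):
    # Return the sampled configuration whose recorded loss is lowest.
    return xp[int(np.argmin(yp))]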
Example #7
def run_languages(args):
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids, _ = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    results = [['lang', 'full_avg_len', 'avg_len', 'test_loss', 'val_loss']]
    for i, lang in enumerate(languages):
        print()
        print('%d Language %s' % (i, lang))
        full_avg_len, avg_len, test_loss, val_loss = run_language_cv(
            lang, token_map, concept_ids, args)
        results += [[lang, full_avg_len, avg_len, test_loss, val_loss]]

        write_csv(results, '%s/unigram.csv' % (args.rfolder))

    write_csv(results, '%s/unigram-final.csv' % (args.rfolder))
Example #8
def run_artificial_language_cv(lang,
                               is_devoicing,
                               token_map,
                               concept_ids,
                               ipa_to_concepts,
                               args,
                               artificial=True,
                               embedding_size=None,
                               hidden_size=256,
                               nlayers=1,
                               dropout=0.2):
    global full_results
    nfolds = 10
    avg_shannon, avg_test_shannon, avg_test_loss, avg_test_acc, avg_val_loss, avg_val_acc = 0, 0, 0, 0, 0, 0
    for fold in range(nfolds):
        print()
        print(fold, end=' ')
        print('Best hyperparams emb-hs: %d, hs: %d, nlayers: %d, drop: %.4f' %
              (embedding_size, hidden_size, nlayers, dropout))
        train_loader, val_loader, test_loader = \
            get_data_loaders_cv(args.ffolder, fold, nfolds, lang, is_devoicing, token_map, args, artificial=artificial)
        avg_len, shannon, test_shannon, test_loss, \
            test_acc, best_epoch, val_loss, val_acc = _run_language(
                '%s %s' % (lang, 'art' if artificial else 'norm'),
                train_loader, val_loader, test_loader, token_map,
                ipa_to_concepts, args, embedding_size=embedding_size,
                hidden_size=hidden_size, nlayers=nlayers, dropout=dropout)

        full_results += [[
            lang, artificial, fold, avg_len, test_shannon, test_loss, test_acc,
            val_loss, val_acc, best_epoch
        ]]

        avg_shannon += shannon / nfolds
        avg_test_shannon += test_shannon / nfolds
        avg_test_loss += test_loss / nfolds
        avg_test_acc += test_acc / nfolds
        avg_val_loss += val_loss / nfolds
        avg_val_acc += val_acc / nfolds

        write_csv(
            full_results,
            '%s/artificial__%s__full-results.csv' % (args.rfolder, args.model))

    return avg_len, avg_shannon, avg_test_shannon, avg_test_loss, avg_test_acc, avg_val_loss, avg_val_acc
Example #9
def run_languages(args):
    print('------------------- Start -------------------')
    languages, token_map, data_split, concept_ids, _ = read_info()
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    max_order = 3
    results = [['lang', 'full_avg_len', 'avg_len', 'test_loss', 'val_loss']]
    for i, lang in enumerate(languages):
        print()
        print('%d Language %s' % (i, lang))
        full_avg_len, avg_len, test_loss, val_loss = \
            run_language_cv(lang, token_map, concept_ids, args, max_order=max_order)
        results += [[lang, full_avg_len, avg_len, test_loss,
                     val_loss]]  # + opt_params.tolist()]

        write_csv(results, '%s/ngram.csv' % (args.rfolder))

    write_csv(results, '%s/ngram-final.csv' % (args.rfolder))
Example #10
def run_language_cv(lang,
                    token_map,
                    concept_ids,
                    ipa_to_concept,
                    args,
                    embedding_size=None,
                    hidden_size=256,
                    nlayers=1,
                    dropout=0.2):
    global full_results, fold
    nfolds = 10
    avg_shannon, avg_test_shannon, avg_test_loss, avg_test_acc, avg_val_loss, avg_val_acc = 0, 0, 0, 0, 0, 0
    for fold in range(nfolds):
        print()
        print('Fold:', fold, end=' ')

        train_loader, val_loader, test_loader = get_data_loaders_cv(
            args.ffolder, fold, nfolds, lang, token_map, concept_ids)
        avg_len, shannon, test_shannon, test_loss, \
            test_acc, best_epoch, val_loss, val_acc = _run_language(
                lang, train_loader, val_loader, test_loader, token_map, ipa_to_concept,
                args, embedding_size=embedding_size, hidden_size=hidden_size,
                nlayers=nlayers, dropout=dropout, per_word=True)

        full_results += [[
            lang, fold, avg_len, test_shannon, test_loss, test_acc, val_loss,
            val_acc, best_epoch
        ]]

        avg_shannon += shannon / nfolds
        avg_test_shannon += test_shannon / nfolds
        avg_test_loss += test_loss / nfolds
        avg_test_acc += test_acc / nfolds
        avg_val_loss += val_loss / nfolds
        avg_val_acc += val_acc / nfolds

        write_csv(full_results,
                  '%s/%s__full-results.csv' % (args.rfolder, args.model))

    return avg_len, avg_shannon, avg_test_shannon, avg_test_loss, avg_test_acc, avg_val_loss, avg_val_acc
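The avg_* accumulators in these cross-validation loops compute a mean incrementally: adding x / nfolds on each of the nfolds iterations is the same as summing first and dividing once, so no final division is needed. A quick self-contained check of the identity:

nfolds = 10
losses = [0.5 * (fold + 1) for fold in range(nfolds)]  # dummy per-fold losses
avg_loss = 0.0
for loss in losses:
    avg_loss += loss / nfolds  # the accumulation pattern used above
assert abs(avg_loss - sum(losses) / nfolds) < 1e-12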
Example #11
import numpy as np


def optimize_languages(args):
    print('------------------- Start -------------------')
    _, token_map, data_split, concept_ids, ipa_to_concepts = read_info()
    languages = get_languages(is_devoicing=args.is_devoicing)
    token_map = add_new_symbols_to_vocab(token_map)
    print('Model %s' % args.model)
    print('Train %d, Val %d, Test %d' %
          (len(data_split[0]), len(data_split[1]), len(data_split[2])))

    n_iters = 45
    bounds = np.array([[4, 256], [32, 256], [1, 2.95], [0.0, 0.5]])
    n_pre_samples = 5

    opt_results = [[
        'lang', 'artificial', 'avg_len', 'shannon', 'test_shannon',
        'test_loss', 'test_acc', 'best_epoch', 'val_loss', 'val_acc',
        'embedding_size', 'hidden_size', 'nlayers', 'dropout'
    ]]
    for i, lang in enumerate(languages):
        for artificial in [True, False]:
            print()
            print('%d. %s %s' %
                  (i, lang, 'artificial' if artificial else 'default'))

            sample_loss = sample_loss_getter(lang,
                                             args.is_devoicing,
                                             token_map,
                                             ipa_to_concepts,
                                             args,
                                             artificial=artificial)
            xp, yp = bayesian_optimisation(n_iters,
                                           sample_loss,
                                           bounds,
                                           n_pre_samples=n_pre_samples)

            opt_results += [
                get_optimal_loss(lang, args.is_devoicing, artificial,
                                 token_map, concept_ids, ipa_to_concepts, xp,
                                 yp, args)
            ]

            # NOTE: `results` here is assumed to be a module-level list that
            # the sample_loss closure appends each evaluated configuration to.
            write_csv(
                results, '%s/artificial__%s__bayesian-results.csv' %
                (args.rfolder, args.model))
            write_csv(
                opt_results, '%s/artificial__%s__opt-results.csv' %
                (args.rfolder, args.model))

    write_csv(
        results, '%s/artificial__%s__bayesian-results-final.csv' %
        (args.rfolder, args.model))
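The bounds array lines up with the hyperparameter columns of opt_results: one row per dimension, for embedding_size, hidden_size, nlayers, and dropout. Because Bayesian optimisation samples continuous points, the integer-valued hyperparameters presumably get truncated before a model is instantiated; a sketch of that decoding, under that assumption:

def decode_params(x):
    # Hypothetical decoding of one sampled point into model
    # hyperparameters; coordinates are truncated where the architecture
    # needs integers.
    embedding_size = int(x[0])  # bounds [4, 256]
    hidden_size = int(x[1])     # bounds [32, 256]
    nlayers = int(x[2])         # bounds [1, 2.95], i.e. 1 or 2 layers
    dropout = float(x[3])       # bounds [0.0, 0.5]
    return embedding_size, hidden_size, nlayers, dropout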