Example #1
# Imports assumed by this excerpt; `utils` and the helpers `translate_romanized`
# and `test` are project-local and not shown here.
import argparse
import codecs

import numpy as np
import lasagne

import utils


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hdim', default=512, type=int)
    parser.add_argument('--seq_len', default=40, type=int)
    parser.add_argument('--model', default=None)
    parser.add_argument('--depth', default=1, type=int)
    parser.add_argument('--translit_path', default=None)
    parser.add_argument('--language', default=None)

    args = parser.parse_args()

    print("Loading Files")
    (char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans,
     trans_vocab_size) = utils.load_vocabulary(language=args.language)
    (test_text, trans, long_letter_reverse_mapping) = utils.load_language_data(
        language=args.language, is_train=False)
    print("Building network ...")
    (output_layer,
     predict) = utils.define_model(args.hdim,
                                   args.depth,
                                   trans_vocab_size=trans_vocab_size,
                                   vocab_size=vocab_size,
                                   is_train=False)

    if args.model:
        f = np.load(args.model)
        param_values = [np.float32(f[i]) for i in range(len(f))]
        lasagne.layers.set_all_param_values(output_layer, param_values)
    print("Testing ...")

    if args.translit_path:
        with codecs.open(args.translit_path, 'r', encoding='utf-8') as fin:
            data = fin.read()
        translate_romanized(predict, data, args.seq_len, trans,
                            trans_vocab_size, trans_to_index, index_to_char,
                            long_letter_reverse_mapping)

    else:
        test(predict, test_text, args.language, args.model, args.seq_len,
             long_letter_reverse_mapping, trans, trans_to_index, char_to_index,
             index_to_trans, index_to_char)
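The training scripts below save parameters with np.save(lasagne.layers.get_all_param_values(...)), and this test script restores them with set_all_param_values. A minimal round-trip sketch of that parameter file, not from the original project; the toy network and the file name are illustrative assumptions:

import numpy as np
import lasagne
from lasagne.layers import InputLayer, DenseLayer

# toy network standing in for the output_layer built by utils.define_model
network = DenseLayer(InputLayer((None, 10)), num_units=5)

params = lasagne.layers.get_all_param_values(network)       # list of per-layer ndarrays
np.save('toy_model.npy', np.asarray(params, dtype=object))  # essentially what the training scripts write

f = np.load('toy_model.npy', allow_pickle=True)             # allow_pickle is required on newer NumPy
lasagne.layers.set_all_param_values(network, [np.float32(p) for p in f])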
Example #2
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--hdim', default=512, type=int)
    parser.add_argument('--grad_clip', default=100, type=int)
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--batch_size', default=50, type=int)
    parser.add_argument('--num_epochs', default=10, type=int)
    parser.add_argument('--seq_len', default=60, type=int)
    parser.add_argument('--depth', default=1, type=int)
    parser.add_argument('--model', default=None)
    parser.add_argument('--model_name_prefix', default='model')
    parser.add_argument('--language', default='hy-AM')
    parser.add_argument('--start_from', default=0, type=float)
    args = parser.parse_args()

    print("Loading Files")

    (char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans,
     trans_vocab_size) = utils.load_vocabulary(language=args.language)
    (train_text, val_text,
     trans) = utils.load_language_data(language=args.language)
    data_size = len(train_text)

    print("Building Network ...")

    (output_layer, train, cost) = utils.define_model(args.hdim,
                                                     args.depth,
                                                     args.lr,
                                                     args.grad_clip,
                                                     trans_vocab_size,
                                                     vocab_size,
                                                     is_train=True)

    if args.model:
        f = np.load('languages/' + args.language + '/models/' + args.model)
        param_values = [np.float32(f[i]) for i in range(len(f))]
        lasagne.layers.set_all_param_values(output_layer, param_values)

    print("Training ...")
    p = int(len(train_text) * args.start_from) + 1
    step_cnt = 0
    avg_cost = 0
    it = 0
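    # PRINT_FREQ is not defined in this excerpt; presumably a module-level
    # constant giving the number of minibatch steps per logging/validation round.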
    while it < args.num_epochs:
        avg_cost = 0
        date_at_beginning = datetime.now()
        non_native_skipped = 0
        for _ in range(PRINT_FREQ):
            # gen_data returns (inputs, targets, new position in the text,
            # a flag telling whether the text wrapped around, i.e. one pass
            # over the training text finished, and the number of non-native
            # sequences that were skipped)
            x, y, p, turned, non_native_sequences = utils.gen_data(
                p, args.seq_len, args.batch_size, train_text, trans,
                trans_to_index, char_to_index)
            if turned:
                it += 1
            avg_cost += train(x, np.reshape(y, (-1, vocab_size)))
            non_native_skipped += non_native_sequences
        date_after = datetime.now()
        print("Epoch {} average loss = {} Time {} sec. Nonnatives skipped {}".
              format(1.0 * it + 1.0 * p / data_size, avg_cost / PRINT_FREQ,
                     (date_after - date_at_beginning).total_seconds(),
                     non_native_skipped))

        step_cnt += 1
        if True:  # originally gated on "step_cnt * args.batch_size > 100000"
            print('computing validation loss...')
            val_turned = False
            val_p = 0
            val_steps = 0.
            val_cost = 0.
            while not val_turned:
                x, y, val_p, val_turned, non_native = utils.gen_data(
                    val_p, args.seq_len, args.batch_size, val_text, trans,
                    trans_to_index, char_to_index)
                val_steps += 1
                val_cost += cost(x, np.reshape(y, (-1, vocab_size)))
            print('validation loss is ' + str(val_cost / val_steps))
            # np.save appends '.npy' to names not already ending in '.npy',
            # so this file is actually written as '<...>.npz.npy'
            file_name = ('languages/' + args.language + '/models/' +
                         args.model_name_prefix + '.hdim' + str(args.hdim) +
                         '.depth' + str(args.depth) + '.seq_len' +
                         str(args.seq_len) + '.bs' + str(args.batch_size) +
                         '.epoch' + str(1.0 * it + 1.0 * p / data_size) +
                         '.loss' + str(avg_cost / PRINT_FREQ) + '.npz')
            print("saving to -> " + file_name)
            np.save(file_name,
                    lasagne.layers.get_all_param_values(output_layer))
            step_cnt = 0
Example #3
def main():
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--hdim', default=512, type=int)
    parser.add_argument('--grad_clip', default=100, type=int)
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--batch_size', default=50, type=int)
    parser.add_argument('--num_epochs', default=50, type=int)
    parser.add_argument('--seq_len', default=60, type=int)
    parser.add_argument('--depth', default=1, type=int)
    parser.add_argument('--model', default=None)
    parser.add_argument('--model_name_prefix', default='model')
    parser.add_argument('--language', default='hy-AM')
    parser.add_argument('--start_from', default=0, type=float)
    args = parser.parse_args()
   
    print("Loading Files")
    
    (char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans,
     trans_vocab_size) = utils.load_vocabulary(language=args.language)
    (train_text, val_text,
     trans) = utils.load_language_data(language=args.language)
    data_size = len(train_text)
    
    print("Building Network ...")
   
    (output_layer, train, cost) = utils.define_model(args.hdim, args.depth,
                                                     args.lr, args.grad_clip,
                                                     trans_vocab_size,
                                                     vocab_size, is_train=True)
    
    if args.model:
        f = np.load('languages/' + args.language + '/models/' + args.model)
        param_values = [np.float32(f[i]) for i in range(len(f))]
        lasagne.layers.set_all_param_values(output_layer, param_values)
    
    print("Training ...")
    step_cnt = 0
    date_at_beginning = datetime.now()
    last_time = date_at_beginning
    for epoch in range(args.num_epochs):
        # shuffle the order of sentences each epoch; the text is split on the
        # Armenian full stop '։' and re-joined after shuffling
        train_text = train_text.split(u'։')
        random.shuffle(train_text)
        train_text = u'։'.join(train_text)
        avg_cost = 0.0
        count = 0
        num_of_samples = 0
        num_of_chars = 0
        for (x, y) in utils.data_generator(train_text, args.seq_len,
                                           args.batch_size, trans,
                                           trans_to_index, char_to_index,
                                           is_train=True):
            sample_cost = train(x, np.reshape(y, (-1, vocab_size)))
            sample_cost = float(sample_cost)
            count += 1
            num_of_samples += x.shape[0]
            num_of_chars += x.shape[0] * x.shape[1]
            
            time_now = datetime.now()
            if (time_now - last_time).total_seconds() > 60 * 1:  # once a minute
                print('Computing validation loss...')
                val_cost = 0.0
                val_count = 0.0
                for ((x_val, y_val, indices, delimiters),
                     non_valids_list) in utils.data_generator(
                         val_text, args.seq_len, args.batch_size, trans,
                         trans_to_index, char_to_index, is_train=False):
                    val_cost += x_val.shape[0] * cost(
                        x_val, np.reshape(y_val, (-1, vocab_size)))
                    val_count += x_val.shape[0]
                print('Validation loss is {}'.format(val_cost / val_count))
                
                # np.save appends '.npy' here, since the name has no extension
                file_name = ('languages/{}/models/{}.hdim{}.depth{}.seq_len{}'
                             '.bs{}.time{:4f}.epoch{}.loss{:.4f}').format(
                                 args.language, args.model_name_prefix,
                                 args.hdim, args.depth, args.seq_len,
                                 args.batch_size,
                                 (time_now - date_at_beginning).total_seconds() / 60,
                                 epoch, val_cost / val_count)
                print("saving to -> " + file_name)
                np.save(file_name,
                        lasagne.layers.get_all_param_values(output_layer))
                last_time = datetime.now()
            
            print("On step #{} loss is {:.4f}, samples passed {}, chars_passed {}, {:.4f}% of an epoch {} time passed {:4f}"\
                  .format(count, sample_cost, num_of_samples, num_of_chars, 100.0*num_of_chars/len(train_text), epoch, (time_now - date_at_beginning).total_seconds()/60.0))
                  
            avg_cost += sample_cost
Example #4
# dev dataset (the Flickr8k dev split is used here as the held-out test set)

# load test set
filename = 'dataset/Flickr_8k.devImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))
# prepare sequences
X1test, X2test, ytest = create_sequences(tokenizer, max_length,
                                         test_descriptions, test_features)

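# NOTE: X1train, X2train and ytrain (and tokenizer, max_length, vocab_size) are
# used below but are not built in this excerpt. A sketch of the presumably
# omitted training-split preparation, assuming the same helper functions and
# the standard Flickr_8k.trainImages.txt split file:
train = load_set('dataset/Flickr_8k.trainImages.txt')
train_descriptions = load_clean_descriptions('descriptions.txt', train)
train_features = load_photo_features('features.pkl', train)
X1train, X2train, ytrain = create_sequences(tokenizer, max_length,
                                            train_descriptions, train_features)
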
# fit model

model = define_model(vocab_size, max_length)

filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_loss',
                             verbose=1,
                             save_best_only=False,
                             mode='min')

model.fit([X1train, X2train],
          ytrain,
          epochs=50,
          verbose=1,
          callbacks=[checkpoint],
          validation_data=([X1test, X2test], ytest))
Example #5
def problem_solving(nb, problem, problem_languages, args, time_start):

    if problem_languages[nb] == "pl":
        pass  # no-op: apparently left over from a loop version that skipped Polish problems

    print(problem)
    local_path = get_problem_truth(args.c, problem)
    print(local_path)
    problem_collection, number_of_texts = tagging_problem(
        local_path, problem_languages[nb])

    print('tagged')

    authors = make_authors_list(problem_collection)
    print('authors defined')

    freq1 = args.freq1
    freq2 = args.freq2

    training_set_size, test_set_size = set_sizes(problem_collection)

    random.seed(time.time())

    trunc_words1, trunc_words2 = create_char_ngrams_stat(
        problem_collection, freq2, problem_languages[nb])

    problem_collection = filter_problem_corpus(problem_collection,
                                               trunc_words1, trunc_words2,
                                               problem_languages[nb])

    problem_collection, nb_categories = create_ngrams_and_splitgrams(
        problem_collection)

    words_encoder, words_num = stats_for_ngrams_and_skipgrams(
        problem_collection, nb_categories, freq1)

    # NOTE: `frequency` is not defined anywhere in this excerpt; in the variant
    # shown in Example #6 it is a separate n-gram frequency cutoff set alongside
    # freq1/freq2.
    freq_feature, words_num = vectorise_problem_corpus(problem_collection,
                                                       words_encoder,
                                                       words_num, frequency,
                                                       number_of_texts)

    freq_feature_form_norm, network_sizes = compute_mean_and_std(
        freq_feature, problem_collection, words_num)

    model_test = define_model(network_sizes, len(authors), len(words_encoder))
    optimiser_test = define_optimiser(model_test)
    bceloss = torch.nn.NLLLoss()  # despite the name, this is a negative log-likelihood loss
    if use_cuda:
        bceloss = bceloss.cuda()

    mseloss = torch.nn.MSELoss()
    if use_cuda:
        mseloss = mseloss.cuda()

    model = training(model_test, training_set_size, problem_collection,
                     authors, bceloss, optimiser_test, freq_feature_form_norm)

    print('after training')

    result = testing(problem_collection, model, authors,
                     freq_feature_form_norm)

    print('after testing')

    with open(os.path.join(args.o, 'answers-{}.json'.format(problem)),
              'w') as outfile:
        json.dump(result, outfile)

    time_now = time.time()

    timing = time_now - time_start
    print(as_minutes(timing))

    print('sdadkashdksadfksahfksafhksadhf')  # leftover debug marker
    return
Example #6
def problem_solving(nb, problem, problem_languages, args, time_start):

    if True:  # apparently a leftover wrapper from an outer loop over the problems

        if problem_languages[nb] == "pl":
            pass  # no-op: left over from a loop version that skipped Polish problems

        print(problem)
        local_path = get_problem_truth(args.c, problem)
        print(local_path)
        problem_collection, number_of_texts = tagging_problem(
            local_path, problem_languages[nb])

        print('tagged')

        authors = make_authors_list(problem_collection)
        print('authors defined')

        results = []

        if True:  # apparently left over from a "for x in range(1):" hyper-parameter loop
            frequency = 8000  #random.choice([500,600,800])
            freq1 = 400  #random.choice([100,150,200,250,300,350])
            freq2 = 1000  #random.choice([100,150,200,250,300,350])

            training_set_size, test_set_size = set_sizes(problem_collection)

            random.seed(time.time())

            print(frequency, freq1, freq2)
            trunc_words1, trunc_words2 = create_char_ngrams_stat(
                problem_collection, freq1, freq2, problem_languages[nb])

            problem_collection = filter_problem_corpus(problem_collection,
                                                       trunc_words1,
                                                       trunc_words2,
                                                       problem_languages[nb])

            problem_collection, nb_categories = create_ngrams_and_splitgrams(
                problem_collection)

            words_encoder, words_num = stats_for_ngrams_and_skipgrams(
                problem_collection, nb_categories, frequency)

            freq_feature, words_num = vectorise_problem_corpus(
                problem_collection, words_encoder, words_num, frequency,
                number_of_texts)

            freq_feature_form_norm, pca, network_sizes = compute_mean_and_std(
                freq_feature, problem_collection, words_num)

            model_test = define_model(network_sizes, len(authors),
                                      freq_feature_form_norm,
                                      len(words_encoder))

            optimiser_test = define_optimiser(model_test)
            bceloss = torch.nn.NLLLoss()
            if use_cuda:
                bceloss = bceloss.cuda()

            mseloss = torch.nn.MSELoss()
            if use_cuda:
                mseloss = mseloss.cuda()

            model = training([None, model_test], training_set_size,
                             problem_collection, authors, bceloss, mseloss,
                             (None, optimiser_test), freq_feature_form_norm,
                             None)

            print('after training')

            result = testing(problem_collection, model, authors,
                             freq_feature_form_norm, None)

            print('after testing')

            with open(os.path.join(args.o, 'answers-{}.json'.format(problem)),
                      'w') as outfile:
                json.dump(result, outfile)

            # free the large intermediates before the gc.collect() below
            del model_test, optimiser_test, bceloss, mseloss, outfile
            del freq_feature_form_norm, pca, network_sizes, result, freq_feature, words_num
            del trunc_words1, trunc_words2, nb_categories, words_encoder, training_set_size, test_set_size
            del problem_collection, model
            time_now = time.time()

            timing = time_now - time_start
            print(as_minutes(timing))

        del number_of_texts, authors
        gc.collect()

    print('sdadkashdksadfksahfksafhksadhf')  # leftover debug marker
    return