def predict(string, vocabulary=VOCABULARY, dataset=DATASET, restore=RESTORE):
    """Run the trained LSTM on `string` and return per-character results."""
    # print('BEGIN' + string + 'END')
    if vocabulary is None:
        # No vocabulary given: rebuild it from the training corpus.
        if dataset is not None:
            with open(dataset, 'r') as f:
                text = f.read()
            vocabulary = create_vocabulary(text)
    else:
        # `vocabulary` is a path to a saved vocabulary file.
        vocabulary = load_vocabulary(vocabulary)
    vocabulary_size = len(vocabulary)
    # print('(typos.predict)vocabulary_size:', vocabulary_size)
    # print('(typos.predict)vocabulary:\n', vocabulary)

    env = Environment(Lstm, LstmBatchGenerator, vocabulary=vocabulary)
    valid_add_feed = [
        # {'placeholder': 'sampling_prob', 'value': 1.},
        {'placeholder': 'dropout', 'value': 1.},  # no dropout at inference
    ]
    env.build(batch_size=64,
              num_layers=2,
              num_nodes=[1300, 1300],
              num_output_layers=2,
              num_output_nodes=[2048],
              vocabulary_size=vocabulary_size,
              embedding_size=512,
              num_unrollings=100,
              init_parameter=3.,
              regime='inference',
              num_gpus=1)
    _, example_res = env.test(restore_path=restore,
                              additions_to_feed_dict=valid_add_feed,
                              validation_dataset_texts=[string],
                              printed_result_types=[],
                              example_length=len(string),
                              vocabulary=vocabulary,
                              print_results=False,
                              verbose=False)
    # Drop the first character: it has no preceding context.
    return (example_res[0]['input'][1:],
            example_res[0]['output'][1:],
            example_res[0]['prob_vecs'][1:])
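# A hedged usage sketch for predict(): it assumes the module-level
# VOCABULARY, DATASET, and RESTORE defaults point at a saved vocabulary, a
# training corpus, and a checkpoint, respectively. The test string is
# illustrative.
if __name__ == '__main__':
    inp, out, prob_vecs = predict('Helo wrold')
    # `inp`/`out` are the input and predicted character sequences (the first
    # character is dropped above because it has no left context); each entry
    # of `prob_vecs` is a probability distribution over the vocabulary.
    for i, (c_in, c_out) in enumerate(zip(inp, out)):
        if c_in != c_out:
            print('position %d: read %r, model prefers %r' % (i, c_in, c_out))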
from some_useful_functions import create_vocabulary as _create_vocabulary


def create_vocabulary(texts):
    """Concatenate `texts` and build one shared vocabulary from the result."""
    # Delegate to the imported helper (aliased so this wrapper does not
    # recurse into itself).
    return _create_vocabulary(''.join(texts))
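# Example (paths taken from the snippets below): build a single vocabulary
# shared across several corpora, so train and valid data use the same
# character ids.
texts = []
for path in ['datasets/ted.txt', 'datasets/scipop_v3.0/scipop_train.txt']:
    with open(path, 'r', encoding='utf-8') as f:
        texts.append(f.read())
joint_vocabulary = create_vocabulary(texts)
print(len(joint_vocabulary), 'characters in the joint vocabulary')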
from some_useful_functions import create_vocabulary, get_positions_in_vocabulary

with open('datasets/ted.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Hold out a validation slice starting at `offset`; the rest is training data.
offset = 10000
valid_size = 1000
valid_text = text[offset:offset + valid_size]
train_text = text[offset + valid_size:]
train_size = len(train_text)

vocabulary = create_vocabulary(text)
vocabulary_size = len(vocabulary)

env = Environment(Lstm, LstmBatchGenerator)
cpiv = get_positions_in_vocabulary(vocabulary)

evaluation = dict(
    save_path='residuals_no_authors_no_sampling/parameter_tuning/just_lstm_go',
    result_types=['perplexity', 'loss', 'bpc', 'accuracy'],
    datasets={'train': None,
              'default_1': [valid_text, 'default_1']},
    batch_gen_class=LstmBatchGenerator,
    batch_kwargs={'vocabulary': vocabulary},
    batch_size=1,
)
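# A minimal sketch of what the two helpers are assumed to do here: a
# character-level vocabulary is a sorted list of the unique characters, and
# `get_positions_in_vocabulary` maps each character to its index. The real
# implementations in some_useful_functions may differ in detail.
def _create_vocabulary_sketch(text):
    # Unique characters, in a stable (sorted) order.
    return sorted(set(text))


def _get_positions_in_vocabulary_sketch(vocabulary):
    # char -> integer id, used to encode characters as indices.
    return {char: i for i, char in enumerate(vocabulary)}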
import re

from environment import Environment
# from gru_par import Gru, BatchGenerator
from lstm_par import Lstm, LstmBatchGenerator
from some_useful_functions import create_vocabulary, get_positions_in_vocabulary

with open('datasets/scipop_v3.0/scipop_train.txt', 'r', encoding='utf-8') as f:
    train_text = re.sub('<[^>]*>', '', f.read())  # strip markup tags

with open('datasets/scipop_v3.0/scipop_valid.txt', 'r', encoding='utf-8') as f:
    # Use only the first 10 lines of the validation file.
    valid_text = re.sub('<[^>]*>', '', ''.join(f.readlines()[:10]))

vocabulary = create_vocabulary(train_text + valid_text)
vocabulary_size = len(vocabulary)

env = Environment(Lstm, LstmBatchGenerator, vocabulary=vocabulary)
# env = Environment(Gru, BatchGenerator)
cpiv = get_positions_in_vocabulary(vocabulary)

connection_interval = 8
connection_visibility = 5
subsequence_length_in_intervals = 10

add_feed = [
    {'placeholder': 'dropout', 'value': 0.9}
    # ,
]
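# `add_feed` injects a dropout keep probability of 0.9 during training; at
# evaluation time the same placeholder is fed 1.0 (dropout disabled), exactly
# as in the valid_add_feed list inside predict() above:
valid_add_feed = [
    # {'placeholder': 'sampling_prob', 'value': 1.},
    {'placeholder': 'dropout', 'value': 1.},
]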