def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):
    """Set up a character-level batch generator over *text*.

    The text is divided into `batch_size` equal segments; one cursor is
    placed at the start of each segment so that successive batches read
    all segments in parallel, `num_unrollings` steps at a time.

    NOTE(review): `vocabulary=None` would crash on `len(...)` below —
    callers are presumably required to pass a vocabulary; confirm.
    """
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    self.vocabulary = vocabulary
    self._vocabulary_size = len(self.vocabulary)
    # Char -> index lookup built from the vocabulary.
    self.characters_positions_in_vocabulary = get_positions_in_vocabulary(
        self.vocabulary)
    # One cursor per batch row, spaced one segment apart.
    stride = self._text_size // batch_size
    self._cursor = [row * stride for row in range(batch_size)]
    self._last_batch = self._start_batch()
def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):
    """Set up a batch generator that iterates over pairs built from *text*.

    Pairs come from `self.make_pairs`; each pair is mapped to vocabulary
    ids up front, and `batch_size` cursors are spread evenly across the
    pair sequence.

    NOTE(review): `vocabulary=None` would crash on `len(...)` below —
    callers are presumably required to pass a vocabulary; confirm.
    """
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    self.vocabulary = vocabulary
    self._vocabulary_size = len(self.vocabulary)
    # Char -> index lookup built from the vocabulary.
    self.character_positions_in_vocabulary = get_positions_in_vocabulary(
        self.vocabulary)
    # Pre-split the text into pairs and pre-encode them as id arrays.
    self._pairs = self.make_pairs(self._text, None)
    self._number_of_pairs = len(self._pairs)
    self._ids = self._create_id_array(
        self._pairs, self.character_positions_in_vocabulary)
    # One cursor per batch row, spaced one segment of pairs apart.
    stride = self._number_of_pairs // batch_size
    self._cursor = [row * stride for row in range(batch_size)]
    self._last_batch = self._start_batch()
def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):
    """Set up a batch generator using several vocabularies at once.

    *vocabulary* is a list of vocabularies (the second one is treated as
    punctuation marks when splitting the text into pairs). A char -> index
    lookup is built per vocabulary, and `batch_size` cursors are spread
    evenly across the resulting pair sequence.
    """
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    self.vocabularies = vocabulary
    self._vocabulary_sizes = [len(voc) for voc in self.vocabularies]
    # One char -> index lookup per vocabulary.
    self.character_positions_in_vocabulary = [
        get_positions_in_vocabulary(voc) for voc in self.vocabularies
    ]
    # The punctuation vocabulary drives how the text is split into pairs.
    self._pairs = self.make_pairs(
        self._text, {'punctuation_marks': self.vocabularies[1]})
    self._number_of_pairs = len(self._pairs)
    # One cursor per batch row, spaced one segment of pairs apart.
    stride = self._number_of_pairs // batch_size
    self._cursor = [row * stride for row in range(batch_size)]
    self._last_batch = self._start_batch()
def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):
    """Set up a dialogue batch generator.

    `process_input_text` pre-parses *text* into the character stream plus
    per-character end-of-dialogue, speaker and bot-answer flag streams,
    along with the number of distinct speakers. Cursors are spread evenly
    over the processed text, one per batch row.
    """
    # Unpack the processed text and its parallel flag streams.
    (self._text,
     self._eod_flags,
     self._speaker_flags,
     self._bot_answer_flags,
     self._number_of_speakers) = process_input_text(text)
    self._text_size = len(self._text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    self._vocabulary = vocabulary
    self._vocabulary_size = len(self._vocabulary)
    # Char -> index lookup built from the vocabulary.
    self._character_positions_in_vocabulary = get_positions_in_vocabulary(
        self._vocabulary)
    # One cursor per batch row, spaced one segment apart.
    stride = self._text_size // batch_size
    self._cursor = [row * stride for row in range(batch_size)]
    self._last_inputs, _ = self._start_batch()
    print('self._number_of_speakers:', self._number_of_speakers)
def __init__(self, text, batch_size, num_unrollings=1, vocabulary=None):
    """Set up a two-speaker dialogue batch generator.

    `process_input_text` pre-parses *text* into the character stream plus
    per-character speaker and bot-speaks flag streams. The number of
    speakers is fixed at 2, and a pass counter is kept so speaker flags
    can be swapped once the whole training set has been consumed.
    """
    # Unpack the processed text and its parallel flag streams.
    (self._text,
     self._speaker_flags,
     self._bot_speaks_flags) = process_input_text(text)
    self._text_size = len(self._text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    self._vocabulary = vocabulary
    self._vocabulary_size = len(self._vocabulary)
    self._number_of_speakers = 2
    # Char -> index lookup built from the vocabulary.
    self._character_positions_in_vocabulary = get_positions_in_vocabulary(
        self._vocabulary)
    # One cursor per batch row, spaced one segment apart.
    stride = self._text_size // batch_size
    self._cursor = [row * stride for row in range(batch_size)]
    # Counts full passes so flags can be swapped when the whole train
    # dataset has been processed.
    self._counter = 0
    self._last_inputs, _ = self._start_batch()
f.close() # different offset = 10000 valid_size = 1000 valid_text = text[offset:offset + valid_size] train_text = text[offset + valid_size:] train_size = len(train_text) # In[5]: vocabulary = create_vocabulary(text) vocabulary_size = len(vocabulary) env = Environment(Lstm, LstmBatchGenerator) cpiv = get_positions_in_vocabulary(vocabulary) evaluation = dict( save_path='residuals_no_authors_no_sampling/parameter_tuning/just_lstm_go', result_types=['perplexity', 'loss', 'bpc', 'accuracy'], datasets={ 'train': None, 'default_1': [valid_text, 'default_1'] }, batch_gen_class=LstmBatchGenerator, batch_kwargs={'vocabulary': vocabulary}, batch_size=1, additional_feed_dict=[{ 'placeholder': 'dropout', 'value': 1. }])
with open('datasets/all_scipop_word_voc.txt', 'w') as f: for w_idx, w in enumerate(word_voc): f.write(w) if w_idx < len(word_voc) - 1: f.write('\t') with open('datasets/all_scipop_punc_voc.txt', 'w') as f: for p_idx, p in enumerate(punc_voc): f.write(p) if p_idx < len(punc_voc) - 1: f.write('\t') # print('word_voc:', word_voc) print('punc_voc:', punc_voc) word_cpiv = get_positions_in_vocabulary(word_voc) punc_cpiv = get_positions_in_vocabulary(punc_voc) # env = Environment(Lstm, BatchGenerator, vocabulary=vocabulary) env = Environment(Lstm, BatchGenerator, vocabulary=[word_voc, punc_voc]) add_feed = [{'placeholder': 'dropout', 'value': 0.8}] valid_add_feed = [{'placeholder': 'dropout', 'value': 1.}] print('reached build') env.build(batch_size=1, embeddings_in_batch=False, num_layers=2, num_nodes=[1300, 1300], num_output_layers=2, num_output_nodes=[2048], vocabulary_size=vocabulary_sizes[0],