def syntactic_training_data(rng=data_rng, num_to_run=None, output='syntactic', print_freq=print_freq):
    training_block = ngram_reader.training_block(rng.random_sample())
    block_size = training_block.shape[0]
    for count in xrange(num_to_run or block_size):
        if count % print_freq == 0:
            sys.stdout.write('\r%s: ngram %d of %d' % (output, count, num_to_run or block_size))
            sys.stdout.flush()
        # sample an n-gram row in proportion to its frequency (the last column
        # of the block holds cumulative frequencies), then build the matched
        # correct / corrupted pair for contrastive training
        train_index = sample_cumulative_discrete_distribution(training_block[:, -1], rng=rng)
        correct_symbols, error_symbols, ngram_frequency = ngram_reader.contrastive_symbols_from_row(training_block[train_index], rng=rng)
        yield list(correct_symbols) + list(error_symbols)
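# sample_cumulative_discrete_distribution is used throughout this code but is
# not defined in this excerpt. Judging from its call sites (a 1-D array of
# cumulative frequencies in, an integer index out), a minimal sketch might
# look like the following; the implementation details here are an assumption,
# not the original:
import numpy as np

def sample_cumulative_discrete_distribution(cumulative_frequencies, rng=np.random):
    # draw uniformly in [0, total frequency) and binary-search for the first
    # index whose cumulative frequency exceeds the draw, so that index i is
    # chosen with probability proportional to its own (non-cumulative) count
    total = cumulative_frequencies[-1]
    draw = rng.uniform(0, total)
    return int(np.searchsorted(cumulative_frequencies, draw, side='right'))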
def add_noise_to_symbols(self, symbols, column_index=None, rng=None, max_tries=5):
    if rng is None:
        rng = np.random
    seq_length = symbols.shape[0]
    if column_index is None:
        # corrupt the middle word of the window by default
        column_index = seq_length / 2
    tries = 0
    replacement_word = symbols[column_index]
    # rejection-sample a replacement from the unigram distribution until it
    # differs from the original word and is not the padding index 0, giving
    # up after max_tries attempts
    while tries < max_tries:
        tries += 1
        replacement_word = sample_cumulative_discrete_distribution(self.cumulative_word_frequencies, rng=rng)
        if replacement_word != 0 and replacement_word != symbols[column_index]:
            break
    assert replacement_word < self.vocab_size
    noisy = symbols.copy()
    noisy[column_index] = replacement_word
    return noisy
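# Illustrative call: add_noise_to_symbols reads self.cumulative_word_frequencies,
# so it appears to be a method of the same reader object as
# contrastive_symbols_from_row above. A usage sketch (the window contents here
# are hypothetical):
window = np.array([12, 45, 7, 301, 9])   # a 5-gram as word indices
noisy_window = ngram_reader.add_noise_to_symbols(window, rng=data_rng)
# noisy_window matches window everywhere except the center position, where the
# word has been resampled from the unigram distribution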
last_time = time.clock()
model.increase_k()
stats_for_k = {}

if not args['dont_run_syntactic']:
    # syntactic update step
    augmented_costs = []
    costs = []
    for block_num in xrange(args['syntactic_blocks_to_run']):
        training_block = ngram_reader.training_block(data_rng.random_sample())
        block_size = training_block.shape[0]
        for count in xrange(block_size):
            if count % print_freq == 0:
                sys.stdout.write('\rk %i b%i: ngram %d of %d' % (model.k, block_num, count, block_size))
                sys.stdout.flush()
            train_index = sample_cumulative_discrete_distribution(training_block[:, -1], rng=data_rng)
            correct_symbols, error_symbols, ngram_frequency = ngram_reader.contrastive_symbols_from_row(training_block[train_index], rng=data_rng)
            augmented_cost, cost = model.update_w(*(list(correct_symbols) + list(error_symbols)))
            if not np.isfinite(cost):
                # dump the model and drop into a shell for post-mortem debugging
                print 'single nan detected'
                save_model('nan_dump.pkl.gz')
                import IPython
                IPython.embed()
            augmented_costs.append(augmented_cost)
            costs.append(cost)
        if args['syntactic_blocks_to_run'] > 1:
            print
            print '%i intermediate mean %f' % (block_num, np.mean(costs[-block_size:]))
            print
    if not np.isfinite(np.mean(costs)):
        # mirror the single-cost case: dump the model for inspection
        save_model('nan_dump.pkl.gz')
        import IPython
        IPython.embed()
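# The contrastive update above (model.update_w on a correct n-gram paired with
# its corrupted version) fits the Collobert & Weston style pairwise ranking
# objective. model.update_w itself is not shown in this excerpt; as a point of
# reference, the hinge cost such an update typically descends is sketched
# below (an assumption about the model, not its actual code):
def ranking_hinge_cost(score_correct, score_error, margin=1.0):
    # zero once the correct n-gram outscores the corrupted one by `margin`
    return max(0.0, margin - score_correct + score_error)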
print 'validation:'
print 'syntactic mean score \t%f' % syn_validation_mean
print 'syntactic mean weighted score \t%f' % syn_validation_weighted_mean
# print 'time since block init: %f' % (time.clock() - last_time)

# semantic update step
if not args['dont_run_semantic']:
    this_count = 0
    augmented_costs = []
    costs = []
    for block_num in xrange(args['semantic_blocks_to_run']):
        for i in xrange(args['semantic_block_size']):
            # rejection-sample a first word that appears both in the n-gram
            # vocabulary and in the similarity matrix
            train_i = -1
            while train_i not in indices_in_intersection:
                train_i = sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
            for j in xrange(args['k_nearest']):
                # sample a partner word the same way
                train_j = -1
                while train_j not in indices_in_intersection:
                    train_j = sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
                if word_similarity.word_pairwise_sims[train_i, train_j] == -np.inf:
                    # no similarity defined for this pair; skip it
                    continue
                sim = word_similarity.word_pairwise_sims[train_i, train_j]
                augmented_cost, cost = model.update_v(train_i, train_j, sim)
                augmented_costs.append(augmented_cost)
                costs.append(cost)
            if i % print_freq == 0:
                sys.stdout.write('\r k %i: pair : %d / %d' % (model.k, i, args['semantic_block_size']))
                sys.stdout.flush()
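# model.update_v(train_i, train_j, sim) is likewise not shown. From the loop
# above it plausibly moves the two word embeddings so that their model
# similarity tracks the target similarity `sim`; a minimal numpy sketch of one
# such squared-error gradient step (names and form are assumptions):
def semantic_update_sketch(embeddings, i, j, target_sim, learning_rate=0.01):
    # cost: squared gap between the embedding dot product and the target
    diff = embeddings[i].dot(embeddings[j]) - target_sim
    cost = 0.5 * diff ** 2
    # compute both gradients before applying either update
    grad_i = diff * embeddings[j]
    grad_j = diff * embeddings[i]
    embeddings[i] -= learning_rate * grad_i
    embeddings[j] -= learning_rate * grad_j
    return cost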