def main(argv=None):
    """Preprocess a text corpus and write the result to ``args.outfile``.

    Each input item is optionally digit-replaced, stripped of HTML, split
    into sentences and tokenized, depending on the parsed command line
    options. Every resulting sentence is written on its own line.

    :param argv: command line arguments without the program name; defaults
        to ``sys.argv[1:]`` when ``None``.
    """
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: %s', args)
    log.info('preprocessing data')

    if args.amazon is True:
        line_iterator = \
            AmazonProductReviewCorpusReader(args.infile).review_generator()
    else:
        line_iterator = file_line_generator(args.infile)

    # Only bound when a splitter was requested; it is only used in that case.
    if args.sentence_splitter:
        sent_splitter = nltk.data.load(args.sentence_splitter)

    with utf8_file_open(args.outfile, 'w') as outfile:
        for (i, line) in enumerate(line_iterator):
            log_iterations(log, i, 100000)

            if args.replace_digits:
                line = re.sub(r'\d', args.replace_digits, line,
                              count=0, flags=REGEX_FLAGS)

            # NOTE(review): nltk.clean_html is reported to be unavailable in
            # NLTK 3.x - confirm against the installed NLTK version.
            if args.strip_html:
                line = nltk.clean_html(line)

            if args.sentence_splitter:
                sentences = sent_splitter.tokenize(line)
            else:
                sentences = [line]

            if args.tokenize:
                # tokenize() yields the tokens of one sentence; they are
                # written separated by single spaces.
                sentences = [u' '.join(tokenize(sentence))
                             for sentence in sentences]

            # Unicode literals keep everything written to the UTF-8 stream
            # in one text type.
            outfile.write(u'\n'.join(sentences))
            outfile.write(u'\n')

    log.info('finished')
def main(argv=None):
    """Run the corpus preprocessing pipeline selected on the command line.

    Reads items from ``args.infile``, applies the optional digit
    replacement, HTML stripping, sentence splitting and tokenization steps,
    and writes one sentence per line to ``args.outfile``.

    :param argv: argument list without the program name; ``sys.argv[1:]``
        is used when it is ``None``.
    """
    if argv is None:
        argv = sys.argv[1:]
    args = parser.parse_args(argv)

    log.info('start parameters: ' + str(args))
    log.info('preprocessing data')

    if args.amazon is True:
        reader = AmazonProductReviewCorpusReader(args.infile)
        line_iterator = reader.review_generator()
    else:
        line_iterator = file_line_generator(args.infile)

    # The splitter is loaded only on request and only used in that case.
    if args.sentence_splitter:
        sent_splitter = nltk.data.load(args.sentence_splitter)

    with utf8_file_open(args.outfile, 'w') as outfile:
        for index, text in enumerate(line_iterator):
            log_iterations(log, index, 100000)

            if args.replace_digits:
                text = re.sub(r'\d', args.replace_digits, text, 0, REGEX_FLAGS)
            if args.strip_html:
                text = nltk.clean_html(text)

            sentences = (sent_splitter.tokenize(text)
                         if args.sentence_splitter else [text])

            if args.tokenize:
                token_lists = [tokenize(sentence) for sentence in sentences]
                rendered = u'\n'.join(
                    [u' '.join(tokens) for tokens in token_lists])
            else:
                rendered = u'\n'.join(sentences)

            outfile.write(rendered)
            outfile.write(u'\n')

    log.info('finished')
def main(argv=None):
    """Write the statistics from ``calc_matrix_statistics`` to a text file.

    The output file starts with the header ``mean max min std_dev``; each
    tuple yielded for ``args.infile`` becomes one line of four floats.
    See the argument parser description for the command line options.

    :param argv: argument list without the program name; ``sys.argv[1:]``
        is used when it is ``None``.
    """
    args = parser.parse_args(sys.argv[1:] if argv is None else argv)
    log.info('start parameters: ' + str(args))

    row_format = u'%f %f %f %f\n'
    with utf8_file_open(args.outfile, 'w') as outfile:
        outfile.write(u'mean max min std_dev\n')
        row_index = 0
        for stats in calc_matrix_statistics(args.infile):
            log_iterations(log, row_index, 10000)
            outfile.write(row_format % stats)
            row_index += 1

    log.info('finished')
def main(argv=None):
    """Dump per-item matrix statistics as a whitespace separated table.

    A header line ``mean max min std_dev`` is written first, followed by
    one formatted line for every tuple produced by
    ``calc_matrix_statistics(args.infile)``. See the argument parser
    description for the available options.

    :param argv: command line arguments without the program name; defaults
        to ``sys.argv[1:]`` when ``None``.
    """
    if argv is None:
        argv = sys.argv[1:]
    options = parser.parse_args(argv)
    log.info('start parameters: ' + str(options))

    with utf8_file_open(options.outfile, 'w') as outfile:
        emit = outfile.write
        emit(u'mean max min std_dev\n')
        statistics = calc_matrix_statistics(options.infile)
        for position, values in enumerate(statistics):
            log_iterations(log, position, 10000)
            emit(u'%f %f %f %f\n' % values)

    log.info('finished')
def main(argv=None):
    """Write every combination of a line of file1 with a line of file2.

    ``args.file2`` is loaded into memory completely; ``args.file1`` is
    streamed. For each pair the output line is
    ``line1 + args.separator + line2``. See the argument parser description
    for the available options.

    :param argv: command line arguments without the program name; defaults
        to ``sys.argv[1:]`` when ``None``.
    """
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: %s', args)

    log.info('loading data')
    # Materialised because it is iterated once per line of file1.
    file2_content = list(file_line_generator(args.file2))

    log.info('combining files')
    with utf8_file_open(args.out_file, 'w') as outfile:
        for c, line1 in enumerate(file_line_generator(args.file1)):
            log_iterations(log, c, 1000)
            # Loop invariant: build the left part once per line of file1.
            prefix = line1 + args.separator
            for line2 in file2_content:
                # Unicode newline keeps the written text in one string type.
                outfile.write(prefix + line2 + u'\n')

    log.info('finished')
def main(argv=None):
    """Combine two files into their line-wise cross product.

    All lines of ``args.file2`` are read up front; each line of
    ``args.file1`` is then paired with every one of them and written to
    ``args.out_file`` as ``line1 + args.separator + line2``. See the
    argument parser description for the available options.

    :param argv: argument list without the program name; ``sys.argv[1:]``
        is used when it is ``None``.
    """
    args = parser.parse_args(sys.argv[1:] if argv is None else argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    right_lines = list(file_line_generator(args.file2))

    log.info('combining files')
    with utf8_file_open(args.out_file, 'w') as outfile:
        left_index = 0
        for left_line in file_line_generator(args.file1):
            log_iterations(log, left_index, 1000)
            for right_line in right_lines:
                combined = left_line + args.separator + right_line + u'\n'
                outfile.write(combined)
            left_index += 1

    log.info('finished')
def run(self):
    """Predict on ``self.predict_file`` and write results to ``self.result_file``.

    For every batch yielded by ``self.next_batch`` the outputs enabled on
    the instance (``store_rank``, ``information``, ``store_argmax``,
    ``store_softmax``, ``perplexity``, ``predictions``) are produced and a
    newline is written after each batch. When ``self.perplexity`` is set,
    the perplexity over all counted examples is logged at the end.
    """
    vocab = dict(self.vocab)
    # First component of every entry returned by sort_dict_by_label; used
    # below to turn a vocabulary index back into a word.
    vocab_entries = [entry[0] for entry in sort_dict_by_label(vocab)]

    log_probabs = 0.
    num_ppl_examples = 0
    num_examples = 0

    with utf8_file_open(self.result_file, 'w') as outfile:
        for batch, _ in self.next_batch(self.predict_file):
            log_iterations(log, num_examples, 10000)
            num_examples += len(batch)

            if self.perplexity:
                batch = zip(*batch)
                # Pass only the context, not the target word
                predictions = self.predictor_method(batch[0])
            else:
                # The result must be kept: every output below reads
                # ``predictions``.
                predictions = self.predictor_method(batch)

            if self.store_softmax or self.store_rank or self.store_argmax \
                    or self.information or self.perplexity:
                sm, probabs, cur_log_probabs, cur_num_ppl_examples = \
                    self._calc_probabilities_from_similarity(
                        batch[1], predictions[1])
                num_ppl_examples += cur_num_ppl_examples

            if self.store_rank or self.information:
                # rankdata sorts ascending, i.e., distances, but we have
                # similarities, hence, 1-sm
                ranks = rankdata(1 - sm, method='min').astype(int)

                if self.store_rank:
                    outfile.write(ndarray_to_string(ranks))

                if self.information:
                    hard_idx = vocab[u'hard']
                    # This column was always printed empty (the computed
                    # rank summary was overwritten before use); the output
                    # format is kept unchanged.
                    sorted_unique_ranks = ''
                    top_ten_entries = ' '.join(
                        [vocab_entries[i] for i in np.argsort(1 - sm)[:10]])
                    print('#%d\t%s\t%s' % (ranks[hard_idx],
                                           sorted_unique_ranks,
                                           top_ten_entries))

            if self.store_argmax:
                maximum = np.argmax(sm)
                outfile.write(vocab_entries[maximum])

            if self.store_softmax:
                if self.normalize_with_root:
                    sm = np.sqrt(sm)
                    sm = sm / np.linalg.norm(sm, 2, axis=-1)
                outfile.write(ndarray_to_string(sm))

            if self.perplexity:
                if self.save_word:
                    indices_in_predict_vocab = [
                        self.vocab_mapping[target] for target in batch[1]]
                    indices_in_original_vocab = [
                        self.vocab_mapping_list[i]
                        for i in indices_in_predict_vocab]
                    # Snapshot keys and values once per batch instead of
                    # rebuilding both lists for every single word.
                    vocab_words = list(self.vocab.keys())
                    vocab_indices = list(self.vocab.values())
                    words = [vocab_words[vocab_indices.index(i)]
                             for i in indices_in_original_vocab]
                    outfile.write(u'\n'.join(
                        "%s %s" % (x, y)
                        for x, y in zip(map(unicode, probabs), words)))
                else:
                    outfile.write(u'\n'.join(map(unicode, probabs)))

                # An identity test against np.nan does not detect a computed
                # NaN, which would poison the running sum; test the value.
                if not np.isnan(cur_log_probabs):
                    log_probabs += cur_log_probabs

            if self.predictions:
                outfile.write(ndarray_to_string(predictions[0][0]))

            outfile.write(u'\n')

    if self.perplexity:
        if num_ppl_examples:
            ppl = np.exp(-1. / (num_ppl_examples) * log_probabs)
            log.info('Perplexity on %d examples is %f', num_ppl_examples, ppl)
        else:
            # Avoid a division by zero when nothing was counted.
            log.warning('Perplexity is undefined: no examples were counted')
def run(self):
    """Run the training loop until ``self.early_exit`` asks to stop.

    Each iteration fetches the next batch, calls ``self.model.trainer`` on
    the prepared arguments and updates the running totals and the learning
    rate. When ``self.validation_ready()`` is true, the training statistics
    gathered since the previous validation are logged and
    ``self.validate()`` is called. ``self.exit_train()`` is called after
    the loop ends.
    """
    self.before_run_begins()
    self.start_time = time()

    # Statistics collected since the last validation.
    examples_in_period = 0
    cost_in_period = 0.0
    trainer_seconds = 0.0
    period_start = time()

    batches = self._do_skip_examples()
    while True:
        log_iterations(log, self.train_total_batches, 10000)
        batch, epoch_finished = next(batches)
        arguments = self.prepare_arguments(batch)

        # Time only the call into the trainer function.
        call_start = time()
        output = self.model.trainer(*arguments)
        trainer_seconds += time() - call_start

        cost = output[0]
        batch_size = len(batch)
        self.model.total_examples += batch_size
        batch_cost = float(cost)
        self.model.total_costs += batch_cost
        self.train_total_batches += 1
        cost_in_period += batch_cost
        examples_in_period += batch_size
        self.model.update_learning_rate(self.remaining())

        if epoch_finished:
            # Start over on the training file for the next epoch.
            batches = self.next_batch(self.train_file)
            self.epoch_finished()

        if self.dump_ready(epoch_finished):
            self.dump(self.model)

        early_stopping = False
        if self.validation_ready():
            # report training error
            elapsed = float(time() - period_start)
            avg_cost = cost_in_period / float(examples_in_period)
            log.info('Average loss on %d example of the training set is %f',
                     examples_in_period, avg_cost)
            log.info('Speed of training is %f example/s',
                     examples_in_period / elapsed)
            log.info('Percentage of time spent by theano processing is %f',
                     trainer_seconds / elapsed)
            log.info('Processed %d so far.', self.model.total_examples)

            examples_in_period = 0
            cost_in_period = 0.0
            early_stopping = self.validate()
            period_start = time()
            trainer_seconds = 0.0

        if self.early_exit(early_stopping):
            break

    self.exit_train()
def run(self):
    """Predict on ``self.predict_file`` and write results to ``self.result_file``.

    For every batch yielded by ``self.next_batch`` the outputs enabled on
    the instance (``store_rank``, ``information``, ``store_argmax``,
    ``store_softmax``, ``perplexity``, ``predictions``) are produced and a
    newline is written after each batch. When ``self.perplexity`` is set,
    the perplexity over all counted examples is logged at the end.
    """
    vocab = dict(self.vocab)
    # First component of every entry returned by sort_dict_by_label; used
    # below to turn a vocabulary index back into a word.
    vocab_entries = [entry[0] for entry in sort_dict_by_label(vocab)]

    log_probabs = 0.
    num_ppl_examples = 0
    num_examples = 0

    with utf8_file_open(self.result_file, 'w') as outfile:
        for batch, _ in self.next_batch(self.predict_file):
            log_iterations(log, num_examples, 10000)
            num_examples += len(batch)

            if self.perplexity:
                batch = zip(*batch)
                # Pass only the context, not the target word
                predictions = self.predictor_method(batch[0])
            else:
                # The result must be kept: every output below reads
                # ``predictions``.
                predictions = self.predictor_method(batch)

            if self.store_softmax or self.store_rank or self.store_argmax \
                    or self.information or self.perplexity:
                sm, probabs, cur_log_probabs, cur_num_ppl_examples = \
                    self._calc_probabilities_from_similarity(
                        batch[1], predictions[1])
                num_ppl_examples += cur_num_ppl_examples

            if self.store_rank or self.information:
                # rankdata sorts ascending, i.e., distances, but we have
                # similarities, hence, 1-sm
                ranks = rankdata(1 - sm, method='min').astype(int)

                if self.store_rank:
                    outfile.write(ndarray_to_string(ranks))

                if self.information:
                    hard_idx = vocab[u'hard']
                    # This column was always printed empty (the computed
                    # rank summary was overwritten before use); the output
                    # format is kept unchanged.
                    sorted_unique_ranks = ''
                    top_ten_entries = ' '.join(
                        [vocab_entries[i] for i in np.argsort(1 - sm)[:10]])
                    print('#%d\t%s\t%s' % (ranks[hard_idx],
                                           sorted_unique_ranks,
                                           top_ten_entries))

            if self.store_argmax:
                maximum = np.argmax(sm)
                outfile.write(vocab_entries[maximum])

            if self.store_softmax:
                if self.normalize_with_root:
                    sm = np.sqrt(sm)
                    sm = sm / np.linalg.norm(sm, 2, axis=-1)
                outfile.write(ndarray_to_string(sm))

            if self.perplexity:
                if self.save_word:
                    indices_in_predict_vocab = [
                        self.vocab_mapping[target] for target in batch[1]]
                    indices_in_original_vocab = [
                        self.vocab_mapping_list[i]
                        for i in indices_in_predict_vocab]
                    # Snapshot keys and values once per batch instead of
                    # rebuilding both lists for every single word.
                    vocab_words = list(self.vocab.keys())
                    vocab_indices = list(self.vocab.values())
                    words = [vocab_words[vocab_indices.index(i)]
                             for i in indices_in_original_vocab]
                    outfile.write(u'\n'.join(
                        "%s %s" % (x, y)
                        for x, y in zip(map(unicode, probabs), words)))
                else:
                    outfile.write(u'\n'.join(map(unicode, probabs)))

                # An identity test against np.nan does not detect a computed
                # NaN, which would poison the running sum; test the value.
                if not np.isnan(cur_log_probabs):
                    log_probabs += cur_log_probabs

            if self.predictions:
                outfile.write(ndarray_to_string(predictions[0][0]))

            outfile.write(u'\n')

    if self.perplexity:
        if num_ppl_examples:
            ppl = np.exp(-1. / (num_ppl_examples) * log_probabs)
            log.info('Perplexity on %d examples is %f', num_ppl_examples, ppl)
        else:
            # Avoid a division by zero when nothing was counted.
            log.warning('Perplexity is undefined: no examples were counted')
def run(self):
    """Drive training batch by batch until ``self.early_exit`` returns true.

    Every iteration calls ``self.model.trainer`` on the prepared batch,
    accumulates cost and example counts, and updates the learning rate.
    Whenever ``self.validation_ready()`` holds, the statistics collected
    since the previous validation are logged, the counters are reset and
    ``self.validate()`` is run. ``self.exit_train()`` is called once the
    loop is left.
    """
    self.before_run_begins()
    self.start_time = time()

    seen_examples, summed_cost = 0, 0.0
    seconds_in_trainer = 0.0
    window_started = time()

    generator = self._do_skip_examples()
    while True:
        log_iterations(log, self.train_total_batches, 10000)
        (batch, epoch_finished) = next(generator)
        trainer_args = self.prepare_arguments(batch)

        before_call = time()
        result = self.model.trainer(*trainer_args)
        seconds_in_trainer += time() - before_call

        # The first trainer output is the cost of this batch.
        cost = result[0]
        n_examples = len(batch)
        self.model.total_examples += n_examples
        cost_value = float(cost)
        self.model.total_costs += cost_value
        self.train_total_batches += 1
        summed_cost += cost_value
        seen_examples += n_examples
        self.model.update_learning_rate(self.remaining())

        if epoch_finished:
            generator = self.next_batch(self.train_file)
            self.epoch_finished()

        if self.dump_ready(epoch_finished):
            self.dump(self.model)

        if self.validation_ready():
            # report training error
            window_seconds = float(time() - window_started)
            mean_cost = summed_cost / float(seen_examples)
            log.info('Average loss on %d example of the training set is %f',
                     seen_examples, mean_cost)
            log.info('Speed of training is %f example/s',
                     seen_examples / window_seconds)
            log.info('Percentage of time spent by theano processing is %f',
                     seconds_in_trainer / window_seconds)
            log.info('Processed %d so far.', self.model.total_examples)

            seen_examples, summed_cost = 0, 0.0
            early_stopping = self.validate()
            window_started = time()
            seconds_in_trainer = 0.0
        else:
            early_stopping = False

        if self.early_exit(early_stopping):
            break

    self.exit_train()