logging.info("INITIALIZING TRAINING STATE") logging.info(myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals]))) #validate(0) diagnostics.diagnostics(cnt, m) # diagnostics.visualizedebug(cnt, m, rundir) while 1: logging.info("STARTING EPOCH #%d" % epoch) for ebatch in get_train_minibatch: cnt += len(ebatch) # print [wordmap.str(id) for id in e] m.train(ebatch) #validate(cnt) if cnt % (int(1000./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: logging.info("Finished training step %d (epoch %d)" % (cnt, epoch)) # print ("Finished training step %d (epoch %d)" % (cnt, epoch)) if cnt % (int(100000./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: diagnostics.diagnostics(cnt, m) if os.path.exists(os.path.join(rundir, "BAD")): logging.info("Detected file: %s\nSTOPPING" % os.path.join(rundir, "BAD")) sys.stderr.write("Detected file: %s\nSTOPPING\n" % os.path.join(rundir, "BAD")) sys.exit(0) if cnt % (int(HYPERPARAMETERS["VALIDATE_EVERY"]*1./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: state.save(m, cnt, epoch, get_train_minibatch, rundir, newkeystr) diagnostics.visualizedebug(cnt, m, rundir, newkeystr) # validate(cnt) get_train_minibatch = examples.TrainingMinibatchStream() epoch += 1
# Accumulate representations: for each token, add the context vector of every
# neighboring token allowed by each context type.
for tokens in trainingsentences():
    for i in range(len(tokens)):
        for j, context in enumerate(HYPERPARAMETERS["CONTEXT_TYPES"]):
            for k in context:
                tokidx = i + k
                if tokidx < 0 or tokidx >= len(tokens):
                    continue
                random_representations[tokens[i]] += context_vectors[j][tokens[tokidx]]
    cnt += 1
    if cnt % 10000 == 0:
        diagnostics.diagnostics(cnt, random_representations)

logging.info("DONE. Dividing embeddings by their standard deviation...")
random_representations = random_representations * (1. / numpy.std(random_representations))
diagnostics.diagnostics(cnt, random_representations)
diagnostics.visualizedebug(cnt, random_representations, rundir, newkeystr)

# Write the representations to disk, one word per line, and optionally create a
# verbosely named symlink to the output file.
outfile = os.path.join(rundir, "random_representations")
if newkeystr != "":
    verboseoutfile = os.path.join(rundir, "random_representations%s" % newkeystr)
    logging.info("Writing representations to %s, and creating link %s" % (outfile, verboseoutfile))
    os.system("ln -s random_representations %s " % (verboseoutfile))
else:
    logging.info("Writing representations to %s, not creating any link because of default settings" % outfile)
o = open(outfile, "wt")
from vocabulary import wordmap
for i in range(wordmap.len):
    o.write(wordmap.str(i) + " ")
    for v in random_representations[i]:
        o.write(repr(v) + " ")
    o.write("\n")
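# A minimal sketch, under assumptions, of how random_representations and context_vectors
# could be initialized for the accumulation pass above: one zero accumulator row per
# vocabulary word, plus one fixed random vector table per context type.  The names
# vocab_size, embedding_size, and the Gaussian initialization are assumptions, not the
# repository's actual setup.
import numpy

def initialize_random_indexing(vocab_size, embedding_size, ncontext_types, seed=0):
    rng = numpy.random.RandomState(seed)
    # Accumulators start at zero; they are summed over the corpus and rescaled afterwards.
    random_representations = numpy.zeros((vocab_size, embedding_size))
    # Fixed random vectors that neighboring tokens contribute to a word's representation.
    context_vectors = [rng.normal(size=(vocab_size, embedding_size))
                       for _ in range(ncontext_types)]
    return random_representations, context_vectors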
logging.info(myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals])))

#validate(0)
diagnostics.diagnostics(cnt, m)
#diagnostics.visualizedebug(cnt, m, rundir)
while True:
    logging.info("STARTING EPOCH #%d" % epoch)
    for ebatch in get_train_minibatch:
        cnt += len(ebatch)
#        print [wordmap.str(id) for id in e]

        # Corrupt the minibatch into noise sequences, then train on the original examples
        # together with the noise sequences and their weights.
        noise_sequences, weights = corrupt.corrupt_examples(m, ebatch)
        m.train(ebatch, noise_sequences, weights)
        #validate(cnt)

        # Log progress roughly every 1000 examples, rounded down to a multiple of the minibatch size.
        if cnt % (int(1000. / HYPERPARAMETERS["MINIBATCH SIZE"]) * HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
            logging.info("Finished training step %d (epoch %d)" % (cnt, epoch))
#            print ("Finished training step %d (epoch %d)" % (cnt, epoch))

        # Run diagnostics roughly every 100000 examples; stop cleanly if a file named BAD
        # has been placed in the run directory.
        if cnt % (int(100000. / HYPERPARAMETERS["MINIBATCH SIZE"]) * HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
            diagnostics.diagnostics(cnt, m)
            if os.path.exists(os.path.join(rundir, "BAD")):
                logging.info("Detected file: %s\nSTOPPING" % os.path.join(rundir, "BAD"))
                sys.stderr.write("Detected file: %s\nSTOPPING\n" % os.path.join(rundir, "BAD"))
                sys.exit(0)

        # Checkpoint the model and dump debugging visualizations roughly every VALIDATE_EVERY examples.
        if cnt % (int(HYPERPARAMETERS["VALIDATE_EVERY"] * 1. / HYPERPARAMETERS["MINIBATCH SIZE"]) * HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
            state.save(m, cnt, epoch, get_train_minibatch, rundir, newkeystr)
            diagnostics.visualizedebug(cnt, m, rundir, newkeystr)
#            validate(cnt)

    # Start a fresh pass over the training data for the next epoch.
    get_train_minibatch = examples.TrainingMinibatchStream()
    epoch += 1
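# A minimal sketch, under assumptions, of what corrupt.corrupt_examples could do: replace
# the target position of each training window with a randomly sampled vocabulary word, so
# the model can be trained to prefer observed windows over corrupted ones.  The model
# attribute m.vocab_size, the choice of the last position as the target, and the uniform
# weights are assumptions for illustration, not the repository's actual code.
import random

def corrupt_examples(m, ebatch, nsamples=1):
    noise_sequences = []
    weights = []
    for window in ebatch:
        noise = list(window)
        # Replace the target position with a uniformly sampled noise word.
        noise[-1] = random.randint(0, m.vocab_size - 1)
        noise_sequences.append(noise)
        # With uniform sampling, each noise example carries equal weight.
        weights.append(1.0 / nsamples)
    return noise_sequences, weights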