_indexed_weights = None

def indexed_weights():
    """
    Return one weight per word id, used for sampling training noise:
    uniform weights, or smoothed unigram counts.
    """
    global _indexed_weights
    if _indexed_weights is not None:
        return _indexed_weights
    print >> sys.stderr, len(wordmap.map), "=?=", HYPERPARAMETERS["VOCABULARY_SIZE"]
    assert len(wordmap.map) == HYPERPARAMETERS["VOCABULARY_SIZE"]
    if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0:
        # Uniform noise: every word receives weight 1.
        _indexed_weights = [1 for id in range(wordmap.len)]
    elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1:
        # Unigram noise: weight each word by its smoothed corpus count.
        from common.json import load
        from common.file import myopen
        ngrams_file = HYPERPARAMETERS["NGRAMS"][(HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"], HYPERPARAMETERS["VOCABULARY_SIZE"])]
        print >> sys.stderr, "Reading ngrams from", ngrams_file, "..."
        from collections import defaultdict
        ngramcnt = defaultdict(int)
        for (ngram, cnt) in load(myopen(ngrams_file)):
            assert len(ngram) == 1
            ngramcnt[ngram[0]] = cnt + HYPERPARAMETERS["TRAINING_NOISE_SMOOTHING_ADDITION"]
        _indexed_weights = [ngramcnt[wordmap.str(id)] for id in range(len(wordmap.map))]
        _indexed_weights = build(_indexed_weights)  # build() comes from the surrounding module
    else:
        assert 0
    return _indexed_weights
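# A minimal sketch (hypothetical, not part of the original module) of how
# per-word weights like these can drive noise sampling during training:
# draw a word id with probability proportional to its weight. `weights`
# stands in for the list indexed_weights() computes before build() is applied.
import numpy

def sample_noise_word(weights, rng=numpy.random):
    # Normalize the weights into a categorical distribution over word ids,
    # then draw a single id from it.
    p = numpy.asarray(weights, dtype=float)
    p = p / p.sum()
    return rng.multinomial(1, p).argmax()

# Example: ids 0-2 with weights 1, 2, 7 -- id 2 is drawn ~70% of the time.
# sample_noise_word([1, 2, 7])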
def visualize(cnt, embeddings, rundir, idxs, str):
    """
    Visualize a set of examples using t-SNE.
    """
    from vocabulary import wordmap
    PERPLEXITY = 30
    x = embeddings[idxs]
    print x.shape
    titles = [wordmap.str(id) for id in idxs]
    import os.path
    filename = os.path.join(rundir, "embeddings-%s-%d.png" % (str, cnt))
    try:
        from textSNE.calc_tsne import tsne
#        from textSNE.tsne import tsne
        out = tsne(x, perplexity=PERPLEXITY)
        from textSNE.render import render
        render([(title, point[0], point[1]) for title, point in zip(titles, out)], filename)
    except IOError:
        logging.info("ERROR visualizing %s. Continuing..." % filename)
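# A hypothetical usage sketch (not in the original file): render a random
# embedding matrix for the first 100 word ids. It assumes vocabulary.read()
# has populated wordmap with at least 100 entries and that the textSNE
# package is importable.
import numpy
fake_embeddings = numpy.random.randn(100, 50)
visualize(0, fake_embeddings, ".", range(100), "demo")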
for ebatch in get_train_minibatch:
    cnt += len(ebatch)
#    for e in ebatch:
#        print [wordmap.str(id) for id in e]
#        print e
    m.train(ebatch)
    #validate(cnt)
    # Each reporting interval is rounded down to a multiple of the minibatch
    # size, since cnt only advances in minibatch-sized steps.
    if cnt % (int(1000. / HYPERPARAMETERS["MINIBATCH SIZE"]) * HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
        logging.info("Finished training step %d (epoch %d)" % (cnt, epoch))
#        print ("Finished training step %d (epoch %d)" % (cnt, epoch))
    if cnt % (int(100000. / HYPERPARAMETERS["MINIBATCH SIZE"]) * HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
        # Touching a file named BAD in the run directory aborts training.
        if os.path.exists(os.path.join(rundir, "BAD")):
            logging.info("Detected file: %s\nSTOPPING" % os.path.join(rundir, "BAD"))
            sys.stderr.write("Detected file: %s\nSTOPPING\n" % os.path.join(rundir, "BAD"))
            sys.exit(0)
    if cnt % (int(HYPERPARAMETERS["VALIDATE_EVERY"] * 1. / HYPERPARAMETERS["MINIBATCH SIZE"]) * HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
        state.save(m, cnt, epoch, get_train_minibatch, rundir, newkeystr)
        validate(cnt)
get_train_minibatch = examples.TrainingMinibatchStream()
epoch += 1

# Output the embeddings, one word per line: the word, then its vector
# components, all tab-separated.
outfile = open(HYPERPARAMETERS["EMBEDDING_FILE"], 'w')
from vocabulary import wordmap
for i in range(m.parameters.vocab_size):
    outfile.write(wordmap.str(i) + '\t')
    for v in m.parameters.embeddings[i]:
        outfile.write(str(v) + '\t')
    outfile.write('\n')
outfile.flush()
outfile.close()
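# A companion sketch (hypothetical, not in the original script): read the
# tab-separated embedding file written above back into a dict mapping
# word -> list of floats. Each line ends with a trailing tab before the
# newline, hence the rstrip.
def load_embeddings(path):
    embeddings = {}
    for line in open(path):
        fields = line.rstrip('\t\n').split('\t')
        embeddings[fields[0]] = [float(v) for v in fields[1:]]
    return embeddings

# embeddings = load_embeddings(HYPERPARAMETERS["EMBEDDING_FILE"])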
#!/usr/bin/env python """ Dump n-gram counts over entire training data as YAML. """ import sys from common.stats import stats from hyperparameters import HYPERPARAMETERS from collections import defaultdict cnt = defaultdict(int) if __name__ == "__main__": import vocabulary print >> sys.stderr, "Reading vocab" vocabulary.read() from vocabulary import wordmap import train for (i, e) in enumerate(train.get_train_example()): cnt[tuple([wordmap.str(t) for t in e])] += 1 if i % 10000 == 0: print >> sys.stderr, "Read %d examples" % i print >> sys.stderr, stats() if i > 100000000: break cnt = [(t, cnt[t]) for t in cnt] import common.json common.json.dump(cnt, sys.stdout)
for i in range(len(tokens)):
    for j, context in enumerate(HYPERPARAMETERS["CONTEXT_TYPES"]):
        for k in context:
            tokidx = i + k
            if tokidx < 0 or tokidx >= len(tokens):
                continue
            # Accumulate the fixed random context vector of each in-range
            # neighbor into this token's representation.
            random_representations[tokens[i]] += context_vectors[j][tokens[tokidx]]
    cnt += 1
    if cnt % 10000 == 0:
        diagnostics.diagnostics(cnt, random_representations)

logging.info("DONE. Dividing embeddings by their standard deviation...")
random_representations = random_representations * (1. / numpy.std(random_representations))
diagnostics.diagnostics(cnt, random_representations)
diagnostics.visualizedebug(cnt, random_representations, rundir, newkeystr)

outfile = os.path.join(rundir, "random_representations")
if newkeystr != "":
    verboseoutfile = os.path.join(rundir, "random_representations%s" % newkeystr)
    logging.info("Writing representations to %s, and creating link %s" % (outfile, verboseoutfile))
    os.system("ln -s random_representations %s" % verboseoutfile)
else:
    logging.info("Writing representations to %s, not creating any link because of default settings" % outfile)

o = open(outfile, "wt")
from vocabulary import wordmap
for i in range(wordmap.len):
    o.write(wordmap.str(i) + " ")
    for v in random_representations[i]:
        o.write(repr(v) + " ")
    o.write("\n")
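# A companion sketch (not in the original): parse the space-separated
# representations file written above back into (word, vector) pairs.
import numpy

def read_representations(path):
    for line in open(path):
        fields = line.split()
        yield fields[0], numpy.array([float(v) for v in fields[1:]])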
#!/usr/bin/env python
"""
Dump the embeddings of a pickled model to stdout, one word per line.
"""

from optparse import OptionParser
parser = OptionParser()
parser.add_option("-m", "--modelfile", dest="modelfile")
(options, args) = parser.parse_args()
assert options.modelfile is not None

import cPickle
m = cPickle.load(open(options.modelfile))
#print m.parameters.embeddings.shape
from vocabulary import wordmap
for i in range(m.parameters.vocab_size):
    print wordmap.str(i),
    for v in m.parameters.embeddings[i]:
        print v,
    print
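# Example invocation (the script and model filenames are hypothetical):
#   ./dump-embeddings.py -m model.pkl > embeddings.txt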
HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: logging.info("Finished training step %d (epoch %d)" % (cnt, epoch)) # print ("Finished training step %d (epoch %d)" % (cnt, epoch)) if cnt % (int(100000. / HYPERPARAMETERS["MINIBATCH SIZE"]) * HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: if os.path.exists(os.path.join(rundir, "BAD")): logging.info("Detected file: %s\nSTOPPING" % os.path.join(rundir, "BAD")) sys.stderr.write("Detected file: %s\nSTOPPING\n" % os.path.join(rundir, "BAD")) sys.exit(0) if cnt % (int(HYPERPARAMETERS["VALIDATE_EVERY"] * 1. / HYPERPARAMETERS["MINIBATCH SIZE"]) * HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: state.save(m, cnt, epoch, get_train_minibatch, rundir, newkeystr) validate(cnt) get_train_minibatch = examples.TrainingMinibatchStream() epoch += 1 #output the embedding outfile = open(HYPERPARAMETERS["EMBEDDING_FILE"], 'w') from vocabulary import wordmap for i in range(m.parameters.vocab_size): outfile.write(wordmap.str(i) + '\t') for v in m.parameters.embeddings[i]: outfile.write(str(v) + '\t') outfile.write('\n') outfile.flush() outfile.close()