def train(args):
    """Train a Caspar parser model.

    Expects `args` to provide train_corpus, output_folder, dev_corpus and
    train_shuffle_seed (and, per the code below, commons and word_embeddings
    plus whatever Hyperparams consumes -- TODO confirm full arg contract).
    Side effects: may write a shuffled copy of the training corpus, a commons
    store, and model checkpoints under args.output_folder.
    """
    check_present(
        args, ["train_corpus", "output_folder", "dev_corpus", "train_shuffle_seed"])

    train_corpus_path = args.train_corpus
    if args.train_shuffle_seed > 0:
        # Deterministically shuffle the training corpus into a new record
        # file so that training order is reproducible for a given seed.
        reader = sling.RecordReader(args.train_corpus)
        items = [(key, value) for key, value in reader]
        reader.close()
        r = random.Random(args.train_shuffle_seed)
        r.shuffle(items)
        train_corpus_path = os.path.join(args.output_folder, "train_shuffled.rec")
        writer = sling.RecordWriter(train_corpus_path)
        for key, value in items:
            writer.write(key, value)
        writer.close()
        print("Wrote shuffled train corpus to %s using seed %d" %
              (train_corpus_path, args.train_shuffle_seed))

    # Setting an explicit seed for the sake of determinism.
    torch.manual_seed(1)

    # Make commons store if needed.
    if args.commons == '' or not os.path.exists(args.commons):
        if args.commons == '':
            fname = os.path.join(args.output_folder, "commons")
            print("Will create a commons store at", fname)
            args.commons = fname
        else:
            print("No commons found at", args.commons, ", creating it...")
        _, symbols = commons_builder.build(
            [train_corpus_path, args.dev_corpus], args.commons)
        print("Commons created at", args.commons, "with", len(symbols),
              "symbols besides the usual ones.")

    # Make the training spec.
    spec = Spec()
    spec.build(args.commons, train_corpus_path)

    # Initialize the model with the spec and any word embeddings.
    caspar = Caspar(spec)
    embeddings_file = args.word_embeddings
    if embeddings_file == '':
        embeddings_file = None
    caspar.initialize(embeddings_file)

    # Scratch directory used by the dev-set evaluator.
    tmp_folder = os.path.join(args.output_folder, "tmp")
    if not os.path.exists(tmp_folder):
        os.makedirs(tmp_folder)
    evaluator = partial(dev_accuracy, args.dev_corpus, tmp_folder)

    output_file_prefix = os.path.join(args.output_folder, "caspar")
    hyperparams = Hyperparams(args)
    print("Using hyperparameters:", hyperparams)

    trainer = Trainer(caspar, hyperparams, evaluator, output_file_prefix)
    # Renamed from `train` to avoid shadowing this function's own name.
    train_data = Corpora(train_corpus_path, spec.commons, gold=True)
    trainer.train(train_data)
def build_bk_wrapper(params):
    """Wrapper for calculating power/bispectrum.

    Packs a multi-argument call into a single `params` list so it can be
    dispatched through a single-argument map (e.g. multiprocessing).
    params[0] is the catalog-correction spec; params[1], if present, is a
    dict of extra keyword arguments forwarded to Spec.

    Returns None; the computed spectrum is written to disk by build().
    """
    cat_corr = params[0]
    kwargs = {}
    # Bug fix: the kwargs dict lives at index 1, so it is present whenever
    # len(params) > 1.  The original `len(params) > 2` check silently
    # ignored kwargs when exactly two elements were passed.
    if len(params) > 1:
        kwargs = params[1]
    spectrum = Spec('bk', cat_corr, **kwargs)
    # Single-argument print with parentheses behaves identically under
    # Python 2 (parens are just grouping) and Python 3.
    print(spectrum.file())
    spectrum.build()
    return None
def train(args): check_present(args, ["train_corpus", "output_folder", "dev_corpus"]) # Setting an explicit seed for the sake of determinism. torch.manual_seed(1) # Make commons store if needed. if args.commons == '' or not os.path.exists(args.commons): if args.commons == '': fname = os.path.join(args.output_folder, "commons") print "Will create a commons store at", fname args.commons = fname else: print "No commons found at", args.commons, ", creating it..." _, symbols = commons_builder.build( [args.train_corpus, args.dev_corpus], args.commons) print "Commons created at", args.commons, "with", len(symbols), \ "symbols besides the usual ones." # Make the training spec. spec = Spec() spec.build(args.commons, args.train_corpus) # Initialize the model with the spec and any word embeddings. caspar = Caspar(spec) embeddings_file = args.word_embeddings if embeddings_file == '': embeddings_file = None caspar.initialize(embeddings_file) tmp_folder = os.path.join(args.output_folder, "tmp") if not os.path.exists(tmp_folder): os.makedirs(tmp_folder) evaluator = partial(dev_accuracy, args.commons, args.dev_corpus, tmp_folder) output_file_prefix = os.path.join(args.output_folder, "caspar") hyperparams = Hyperparams(args) print "Using hyperparameters:", hyperparams trainer = Trainer(caspar, hyperparams, evaluator, output_file_prefix) train = Corpora(args.train_corpus, spec.commons, gold=True) trainer.train(train)