json.dump(args, f) pprint(args) # N_relationships = len(relationships.relationships) replacement_column_index = args['sequence_length'] / 2 rng = np.random.RandomState(args['random_seed']) data_rng = np.random.RandomState(args['random_seed']) validation_rng = np.random.RandomState(args['random_seed'] + 1) random.seed(args['random_seed']) # set up syntactic ngram_reader = NgramReader(args['ngram_filename'], vocab_size=args['ngram_vocab_size'], train_proportion=args['train_proportion'], test_proportion=args['test_proportion']) testing_block = ngram_reader.testing_block() print 'corpus contains %i ngrams' % (ngram_reader.number_of_ngrams) # set up semantic # num_semantic_training = int(relationships.N * 0.98) # semantic_training = relationships.data[:num_semantic_training] # semantic_testing = relationships.data[num_semantic_training:] relationship_path = join(base_dir, 'relationships.pkl.gz') vocabulary_path = join(base_dir, 'vocabulary.pkl.gz') try: with gzip.open(relationship_path) as f: relationships = cPickle.load(f) print 'loaded relationships from %s' % relationship_path except: # relationships = Relationships()
# NOTE(review): this chunk was collapsed onto a single physical line; it has
# been reformatted here. The first statement is the tail of an `if` branch
# whose header lies outside this chunk (presumably the branch taken when a
# saved model was successfully loaded) -- TODO confirm against the full file.
    args['base_dir'] = base_dir
else:
    model_loaded = False

# dump the params
with open(os.path.join(args['base_dir'], 'params.json'), 'w') as f:
    json.dump(args, f)
pprint(args)

# Index of the middle token of each n-gram window -- the token that gets
# replaced when building corrupted examples (Python 2 integer division).
replacement_column_index = args['sequence_length'] / 2

# Set up the n-gram corpus reader, its held-out test block, and the word
# vocabulary used by the semantic module below.
ngram_reader = NgramReader(args['ngram_filename'], vocab_size=args['vocab_size'], train_proportion=args['train_proportion'], test_proportion=args['test_proportion'])
testing_block = ngram_reader.testing_block()
vocabulary = ngram_reader.word_array
print 'corpus contains %i ngrams' % (ngram_reader.number_of_ngrams)

# Seed every RNG from the configured seed so runs are reproducible; the
# validation stream gets an offset seed so it differs from training.
rng = np.random.RandomState(args['random_seed'])
data_rng = np.random.RandomState(args['random_seed'])
validation_rng = np.random.RandomState(args['random_seed'] + 1)
random.seed(args['random_seed'])

if not args['dont_run_semantic']:
    # Load precomputed word-similarity data (optionally memory-mapped) for
    # the semantic side of training. The body of this `if` continues beyond
    # this chunk.
    print 'loading semantic similarities'
    word_similarity = semantic_module.WordSimilarity(vocabulary, args['word_similarity_file'], memmap_filename=args['word_similarity_memmap'])
    print 'computing terms with semantic distance'