'bag_of_words_generator') logger.info('Program start, bag of words generator experiment id = %s', experiment_id) logger.info(config) vocabulary = db.load_vocabulary(args.vocabulary_experiment) logger.info('Vocabulary loaded') logger.info('Vocabulary length = %s', len(vocabulary)) table_name = get_table_name(args, experiment_id) # Get the corpus and prepare the bag of words generator. db = DatabaseManager() subject_ids, corpus, chart_dates = db.get_corpus( toy_set=args.toy_set, top100_labels=args.top100_labels, validation_set=args.validation_set, test_set=args.test_set) bag_of_words_generator = BagOfWordsGenerator(logger, vocabulary, subject_ids, corpus, chart_dates) if args.for_rnn: bag_of_words_vectors_rnn = bag_of_words_generator.build_bag_of_words_vectors_rnn( ) logger.info('Bag of words vectors for RNN created') db.insert_bag_of_words_vectors_rnn(bag_of_words_vectors_rnn, table_name) logger.info('Bag of words vectors for RNN inserted in table %s', table_name) else:
nargs='?', const=700, help='how many rows to fetch from the corpus table') parser.add_argument('--top100_labels', action='store_true', default=False) args = parser.parse_args() db = DatabaseManager() start = datetime.datetime.now() time_str = start.strftime("%m%d_%H%M%S") config = vars(args) experiment_id = db.vocabulary_experiment_create(config, start) log_filename = '{}_vocabulary_generator.log'.format(experiment_id) db.vocabulary_experiment_insert_log_file(experiment_id, log_filename) logger = logging_utils.build_logger(log_filename).getLogger( 'vocabulary_generator') logger.info('Program start, vocabulary experiment id = %s', experiment_id) logger.info(config) _, corpus, _ = db.get_corpus(toy_set=args.toy_set, top100_labels=args.top100_labels) vocabulary_generator = VocabularyGenerator(corpus, logger) vocabulary = vocabulary_generator.build_vocabulary() end = datetime.datetime.now() db.vocabulary_experiment_insert_vocabulary(experiment_id, end, vocabulary) logger.info('Vocabulary inserted into database')