def init_from_scratch(args, train_exs, dev_exs):
    """New model, new data, new dictionary."""
    # Create a feature dict out of the annotations in the data
    logger.info('-' * 100)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    # Build a dictionary from the data questions + documents (train/dev splits)
    logger.info('-' * 100)
    logger.info('Build word dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(word_dict))

    # Build a char dictionary from the data questions + documents (train/dev splits)
    logger.info('-' * 100)
    logger.info('Build char dictionary')
    char_dict = utils.build_char_dict(args, train_exs + dev_exs)
    logger.info('Num chars = %d' % len(char_dict))

    # Initialize model
    model = DocReader(config.get_model_args(args), word_dict, char_dict,
                      feature_dict)

    # Load pretrained embeddings for words in dictionary
    if args.embedding_file:
        model.load_embeddings(word_dict.tokens(), args.embedding_file)
    if args.char_embedding_file:
        model.load_char_embeddings(char_dict.tokens(), args.char_embedding_file)

    return model
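
# Hedged usage sketch (illustrative, not the repo's actual entry point):
# bootstrapping a fresh DocReader with init_from_scratch above. The
# init_optimizer() and cuda() calls follow DrQA's DocReader API; since this
# is a modified DrQA, treat both as assumptions.
def _example_bootstrap(args, train_exs, dev_exs):
    model = init_from_scratch(args, train_exs, dev_exs)
    model.init_optimizer()  # set up the optimizer over trainable parameters
    if args.cuda:
        model.cuda()  # move model weights to GPU when requested
    return model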
def prepare_dataloader(word_dict=None, feature_dict=None):
    """Create data loaders for train and dev."""
    # Load examples
    logger.info('-' * 100)
    logger.info('Loading Datasets...')
    toyfile = 'toy-' if conf['debug'] else ''
    datafile = os.path.join(
        conf['data-dir'], 'bioasq_processed',
        '{}examples-y{}-train.txt'.format(toyfile, conf['year']))
    train_ex = utils.load_data(datafile)
    logger.info('{} train examples loaded'.format(len(train_ex)))
    datafile = os.path.join(
        conf['data-dir'], 'bioasq_processed',
        '{}examples-y{}-test.txt'.format(toyfile, conf['year']))
    test_ex = utils.load_data(datafile)
    logger.info('{} test examples loaded'.format(len(test_ex)))

    # Prepare feature_dict and word_dict
    if feature_dict is None:
        if len(conf['features']) > 0:
            logger.info('Building feature dictionary...')
            feature_dict = utils.build_feature_dict(train_ex)
            if conf['idf-file'] is not None and 'idf' not in feature_dict:
                feature_dict['idf'] = len(feature_dict)
            logger.info('Num features = {}'.format(len(feature_dict)))
            logger.info(feature_dict)
    if word_dict is None:
        logger.info('Build word dictionary...')
        word_dict = utils.build_word_dict(train_ex + test_ex)
        logger.info('Num words = %d' % len(word_dict))
        conf['vocab-size'] = len(word_dict)

    # Prepare DataLoaders
    logger.info('-' * 100)
    logger.info('Creating DataLoaders')
    train_dataset = utils.QaProxDataset(conf, train_ex, word_dict,
                                        feature_dict, conf['idf-file'])
    train_loader_ = DataLoader(
        train_dataset,
        batch_size=conf['batch-size'],
        sampler=sampler.RandomSampler(train_dataset),
        collate_fn=utils.batchify,
        num_workers=conf['num-workers'],
        pin_memory=conf['cuda'])
    dev_dataset = utils.QaProxDataset(conf, test_ex, word_dict,
                                      feature_dict, conf['idf-file'])
    dev_loader_ = DataLoader(
        dev_dataset,
        batch_size=conf['batch-size'],
        sampler=sampler.RandomSampler(dev_dataset),
        collate_fn=utils.batchify,
        num_workers=conf['num-workers'],
        pin_memory=conf['cuda'])

    return train_loader_, dev_loader_, word_dict, feature_dict
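
# Hedged usage sketch (illustrative, not the repo's training script): consuming
# the loaders returned by prepare_dataloader. The per-batch layout is whatever
# utils.batchify produces, so the inner loops leave batches opaque; the
# 'num-epochs' conf key is an assumption, not a key confirmed by this excerpt.
def _example_epoch_loop():
    train_loader, dev_loader, word_dict, feature_dict = prepare_dataloader()
    for epoch in range(conf['num-epochs']):
        for batch in train_loader:
            # forward/backward pass over one batchify'd training batch
            pass
        for batch in dev_loader:
            # evaluation pass over one dev batch
            pass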
def init_from_scratch(args, train_exs, dev_exs):
    """New model, new data, new dictionary."""
    # Create a feature dict out of the annotations in the data
    logger.info('-' * 100)
    logger.info('Generate features')
    feature_dict = utils.build_feature_dict(args, train_exs)
    logger.info('Num features = %d' % len(feature_dict))
    logger.info(feature_dict)

    # Build a dictionary from the data questions + documents (train/dev splits)
    logger.info('-' * 100)
    logger.info('Build dictionary')
    word_dict = utils.build_word_dict(args, train_exs + dev_exs)
    logger.info('Num words = %d' % len(word_dict))

    # Initialize model
    model = ParagraphRanker(config.get_model_args(args), word_dict,
                            feature_dict)

    # Load pretrained embeddings for words in dictionary
    if args.embedding_file and not args.no_embed:
        model.load_embeddings(word_dict.tokens(), args.embedding_file,
                              args.fasttext)

    return model
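
# Hedged sketch (assumed, not this repo's actual config module) of the argparse
# flags the embedding branch above depends on: --no-embed disables pretrained
# vectors entirely, and --fasttext tells load_embeddings to parse the embedding
# file as fastText format. Names are inferred from the call sites only.
def add_ranker_embedding_args(parser):
    parser.add_argument('--embedding-file', type=str, default=None,
                        help='Path to pretrained word embeddings')
    parser.add_argument('--no-embed', action='store_true',
                        help='Do not load pretrained embeddings')
    parser.add_argument('--fasttext', action='store_true',
                        help='Read the embedding file as fastText format')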
# load data
with open(args.train_file, 'r') as f:
    train_exs = json.load(f)
    # train_exs = train_exs[:100]
with open(args.dev_file, 'r') as f:
    dev_exs = json.load(f)
    # dev_exs = dev_exs[:100]
with open(args.test_file, 'r') as f:
    test_exs = json.load(f)
    # test_exs = test_exs[:100]

# build dicts
feature_dict = build_feature_dict(args, train_exs)
# e.g. feature_dict['in_question'] = 0, ['in_question_uncased'] = 1,
#      ['in_question_lemma'] = 2, ['pos=NN'] = 3, ['pos=IN'] = 4,
#      ['pos=DT'] = 5, ...
word_dict = build_word_dict(args, train_exs, dev_exs + test_exs)
logger.info('Num words = %d' % len(word_dict))

# --------------------------------------------------------------------------
logger.info('-' * 100)
logger.info('Make data loaders')

# each example is vectorized individually by the dataset
train_dataset = ReaderDataset(train_exs, args, word_dict, feature_dict,
                              if_train=True)

# sampling strategy (see the sampler sketch below)
if args.sort_by_len:
    ...
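
# Hedged sketch of how the sort_by_len branch above typically continues in
# DrQA-style code: a length-sorted batch sampler groups similarly sized
# examples so batches waste less padding. This SortedBatchSampler is a minimal
# reimplementation assumed from that pattern, not this repo's actual class;
# it expects `lengths` to be a list of per-example token counts, and
# `train_dataset.lengths()` follows DrQA's ReaderDataset API (also assumed).
import numpy as np
from torch.utils.data.sampler import RandomSampler, Sampler


class SortedBatchSampler(Sampler):
    def __init__(self, lengths, batch_size, shuffle=True):
        self.lengths = lengths
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self):
        # group indices of similar length into contiguous batches
        order = np.argsort(self.lengths)
        batches = [order[i:i + self.batch_size]
                   for i in range(0, len(order), self.batch_size)]
        if self.shuffle:
            # shuffle the batch order while keeping each batch length-homogeneous
            np.random.shuffle(batches)
        return iter(int(i) for batch in batches for i in batch)

    def __len__(self):
        return len(self.lengths)


def make_train_sampler(args, train_dataset):
    if args.sort_by_len:
        return SortedBatchSampler(train_dataset.lengths(),
                                  args.batch_size, shuffle=True)
    return RandomSampler(train_dataset)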