logger.info('Sampling training data...') # FIXME: pick_positive_name ignores those whose gold standard length is not one (multiple or nil) # positives = [sp_training.pick_positive_name(config,corpus_train,concept,i) for i in range(len(corpus_train.names))] # positives = [*zip(positives,corpus_train.names)] # positives_dev_truncated = [sp_training.pick_positive_name(config,corpus_dev_truncated,concept,i) for i in range(len(corpus_dev_truncated.names))] # positives_dev_truncated = [*zip(positives_dev_truncated,corpus_dev_truncated.names)] # with open('gitig_positive_indices.pickle','wb') as f: # pickle.dump([positives,positives_dev_truncated],f) from sample import prepare_positives, examples positives_training, positives_dev, positives_dev_truncated = pickle.load( open(os.path.join(directory, 'gitig_positive_indices.pickle'), 'rb')) positives_training = prepare_positives(positives_training, nltk.word_tokenize, vocabulary) positives_dev = prepare_positives(positives_dev, nltk.word_tokenize, vocabulary) del positives_dev_truncated # sampling def examples(config, concept, positives, vocab, neg_count=config.getint('sample', 'neg_count')): """ Builds positive and negative examples. """ while True:
logger.info('Using truncated development corpus for evaluation.')

# corpus_dev = sample.NewDataSet('dev corpus')
# Load pre-built evaluation data (cap: 50 distractors, 5 positives per mention,
# judging by the file name — TODO confirm). Context managers close the pickle
# files promptly instead of leaking the handles.
with open(os.path.join(directory, 'gitig_real_val_data_truncated_d50_p5.pickle'), 'rb') as f:
    real_val_data, concept_order, corpus_dev = pickle.load(f)
real_val_data.y = np.array(real_val_data.y)

# Rebuild the concept object in the same order the validation data was built with.
concept = concept_obj(config, dictionary, order=concept_order)

from sample import prepare_positives, examples

with open(os.path.join(directory, 'gitig_positive_indices.pickle'), 'rb') as f:
    positives_training, positives_dev, positives_dev_truncated = pickle.load(f)
# positives_dev = prepare_positives(positives_dev,nltk.word_tokenize,vocabulary)
positives_dev_truncated = prepare_positives(positives_dev_truncated, nltk.word_tokenize, vocabulary)
# Only the truncated dev positives are needed in this evaluation branch.
del positives_dev, positives_training

# corpus
# corpus_train = sample.NewDataSet('training corpus')
# corpus_train.objects = load.load(os.path.normpath(config['corpus']['training_file']),'NCBI')
corpus_dev = sample.NewDataSet('dev corpus')
corpus_dev.objects = load.load(config['corpus']['development_file'], 'NCBI')

# Initialize per-corpus accumulators (loop kept for symmetry with the
# commented-out training corpus above).
for corpus in [corpus_dev]:
    corpus.ids = []  # list of all ids (gold standard for each mention)
    corpus.names = []  # list of all names
    corpus.all = []  # list of tuples (mention_text,gold,context,(start,end,docid))