def eval_parser(path, batch_size):
    """
    Yield parsed sentences from the training file in lists of ``batch_size``.

    param path: path to the training XML file
    param batch_size: number of sentences per yielded batch
    return: generator of lists of parsed sentences

    Bug fix: the original dropped the trailing partial batch when the
    sentence count was not a multiple of ``batch_size``, silently skipping
    sentences during evaluation; the remainder is now flushed as a final,
    smaller batch (matching the behaviour of the ``get`` generators).
    """
    data_flow = parsers.TrainingParser(path)
    sentence_batch = []
    for sentence in data_flow.parse():
        sentence_batch.append(sentence)
        # batch is full: hand it out and start a fresh one
        if len(sentence_batch) == batch_size:
            yield sentence_batch
            sentence_batch = []
    # flush the final, possibly smaller, batch
    if sentence_batch:
        yield sentence_batch
def __len__(training_file_path, batch_size):
    """
    Return how many *complete* batches the training file provides.

    param training_file_path: path to the training XML file
    param batch_size: number of sentences per batch
    return: integer count of full batches (any remainder is ignored)
    """
    total_sentences = parsers.TrainingParser(training_file_path).count()
    return total_sentences // batch_size
def get(batch_size, training_file_path, antivocab, output_vocab, PADDING_SIZE = 50, gold_file_path = None):
    """
    Batch processing generator; loops over the corpus forever (one pass per
    epoch) so it can feed a Keras-style ``fit_generator``.

    In training mode (``gold_file_path`` given) each step yields
    ``(sentences, labels)`` with labels expanded to shape (..., 1) and the
    pair shuffled; in evaluation mode it yields ``(sentences, candidates)``.

    param batch_size: number of sentences per yielded batch
    param training_file_path: path to the training XML file
    param antivocab: vocabulary of words to exclude
    param output_vocab: output (sense) vocabulary
    param PADDING_SIZE: max sequence length passed to apply_padding
    param gold_file_path: gold-labels file; presence selects training mode
    return: generator object

    Bug fixes vs. the original:
    - no NameError on an empty corpus (``batch_count`` was read after the
      loop even when the loop never ran);
    - no empty final batch yielded when the corpus size is an exact
      multiple of ``batch_size`` (the old ``if batch_count>0`` guard was
      true whenever any sentence had been seen).
    """

    def _new_batch():
        # fresh accumulator; 'labels' only exists in training mode
        batch = {"sentences": [], "candidates": []}
        if gold_file_path:
            batch["labels"] = []
        return batch

    def _pad(batch):
        # pad every field in place to PADDING_SIZE (pad value 1)
        for key in batch:
            batch[key] = apply_padding(batch, key, maxlen=PADDING_SIZE, value=1)
        return batch

    while True:  # one iteration == one epoch over the corpus
        batch = _new_batch()
        training_data_flow = parsers.TrainingParser(training_file_path)
        if gold_file_path:
            gold_data_flow = parsers.GoldParser(gold_file_path)

        for sentence in training_data_flow.parse():
            # training mode: gold labels come from a parallel parser,
            # one parse() call per sentence
            if gold_file_path:
                labels = gold_data_flow.parse()
                output = prepare_sentence(sentence, antivocab, output_vocab, labels)
                batch['labels'].append(output['labels'])
            # evaluation mode: no labels available
            else:
                output = prepare_sentence(sentence, antivocab, output_vocab)
            batch['sentences'].append(output['sentence'])
            batch['candidates'].append(output['candidates'])

            if len(batch['sentences']) == batch_size:
                _pad(batch)
                if gold_file_path:
                    x = batch['sentences']
                    y = np.expand_dims(batch['labels'], axis=-1)
                    x, y = shuffle(x, y)
                    yield x, y
                else:
                    yield batch['sentences'], batch['candidates']
                batch = _new_batch()

        # flush the trailing partial batch, if any
        if batch['sentences']:
            _pad(batch)
            if gold_file_path:
                x = batch['sentences']
                y = np.expand_dims(batch['labels'], axis=-1)
                x, y = shuffle(x, y)
                yield x, y
            else:
                yield batch['sentences'], batch['candidates']
def get(batch_size, resources_path, training_file_path, \
        antivocab, output_vocab, output_vocab2, output_vocab3, \
        PADDING_SIZE = 50, gold_file_path = None):
    """
    Multi-task batch processing generator; loops over the corpus forever
    (one pass per epoch) so it can feed a Keras-style ``fit_generator``.

    In training mode (``gold_file_path`` given) each step yields
    ``(sentences, [labels, wndomain_labels, lex_labels])`` with each label
    array expanded to shape (..., 1); in evaluation mode it yields
    ``(sentences, candidates, candidates_wndomain, candidates_lex)``.

    param batch_size: number of sentences per yielded batch
    param resources_path: directory containing ``mapping.csv``
    param training_file_path: path to the training XML file
    param antivocab: vocabulary of words to exclude
    param output_vocab: senses vocabulary
    param output_vocab2: wndomain vocabulary
    param output_vocab3: lex vocabulary
    param PADDING_SIZE: max sequence length passed to apply_padding
    param gold_file_path: gold-labels file; presence selects training mode
    return: generator object

    Bug fixes / improvements vs. the original:
    - no NameError on an empty corpus (``batch_count`` was read after the
      loop even when the loop never ran);
    - no empty final batch yielded when the corpus size is an exact
      multiple of ``batch_size``;
    - ``mapping.csv`` is read once instead of on every epoch.

    NOTE(review): as in the original, training mode never fills the
    ``candidates*`` lists (those appends were commented out upstream);
    they are kept as empty keys and padded like everything else.
    """

    def _new_batch():
        # fresh accumulator; label keys only exist in training mode
        batch = {"sentences": [], "candidates": [],
                 "candidates_wndomain": [], "candidates_lex": []}
        if gold_file_path:
            batch["labels"] = []
            batch["wndomain_labels"] = []
            batch["lex_labels"] = []
        return batch

    def _pad(batch):
        # pad every field in place to PADDING_SIZE (pad value 1)
        for key in batch:
            batch[key] = apply_padding(batch, key, maxlen=PADDING_SIZE, value=1)
        return batch

    # sense -> domain/lex mapping, only needed in training mode;
    # hoisted out of the epoch loop so the CSV is parsed once
    mapping_file = None
    if gold_file_path:
        mapping_file = pd.read_csv(os.path.join(resources_path, "mapping.csv"))

    while True:  # one iteration == one epoch over the corpus
        batch = _new_batch()
        training_data_flow = parsers.TrainingParser(training_file_path)
        if gold_file_path:
            gold_data_flow = parsers.GoldParser(gold_file_path)

        for sentence in training_data_flow.parse():
            # training mode: gold labels come from a parallel parser,
            # one parse() call per sentence
            if gold_file_path:
                labels = gold_data_flow.parse()
                output = prepare_sentence(sentence, antivocab, output_vocab,
                                          output_vocab2, output_vocab3,
                                          mapping_file, labels)
                batch['sentences'].append(output['sentence'])
                batch['labels'].append(output['labels'])
                batch['wndomain_labels'].append(output['wndomain_labels'])
                batch['lex_labels'].append(output['lex_labels'])
            # evaluation mode: collect candidates for each task instead
            else:
                output = prepare_sentence(sentence, antivocab, output_vocab,
                                          output_vocab2, output_vocab3)
                batch['sentences'].append(output['sentence'])
                batch['candidates'].append(output['candidates'])
                batch['candidates_wndomain'].append(output['candidates_wndomain'])
                batch['candidates_lex'].append(output['candidates_lex'])

            if len(batch['sentences']) == batch_size:
                _pad(batch)
                if gold_file_path:
                    x = batch['sentences']
                    y = [np.expand_dims(batch['labels'], axis=-1),
                         np.expand_dims(batch['wndomain_labels'], axis=-1),
                         np.expand_dims(batch['lex_labels'], axis=-1)]
                    yield x, y
                else:
                    yield batch['sentences'], batch['candidates'], \
                          batch['candidates_wndomain'], batch['candidates_lex']
                batch = _new_batch()

        # flush the trailing partial batch, if any
        if batch['sentences']:
            _pad(batch)
            if gold_file_path:
                x = batch['sentences']
                y = [np.expand_dims(batch['labels'], axis=-1),
                     np.expand_dims(batch['wndomain_labels'], axis=-1),
                     np.expand_dims(batch['lex_labels'], axis=-1)]
                yield x, y
            else:
                yield batch['sentences'], batch['candidates'], \
                      batch['candidates_wndomain'], batch['candidates_lex']
from tqdm import tqdm, tnrange
import json
import pandas as pd
import os
import utils
import parsers

# In[ ]:

# # Parse training XML file

# In[5]:

# Build the input/POS/left-out vocabularies from the SemCor training corpus.
Training = parsers.TrainingParser(
    '../resources/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.data.xml'
)
Training.create_vocab(
    input_vocab_path="../resources/semcor.input.vocab.json",
    pos_vocab_path="../resources/semcor.pos.vocab.json",
    left_out_vocab_path="../resources/semcor.leftout.vocab.json",
    subsampling_rate=1e-4,
    min_count=5)

# # converting eval datasets

# In[6]:

# Collect the evaluation dataset directory names; presumably the
# SensEval/SemEval sets, all of which start with "se" — TODO confirm.
dir_ = "../resources/WSD_Evaluation_Framework/Evaluation_Datasets"
eval_datasets = [i for i in os.listdir(dir_) if i.startswith("se")]