Example #1
def eval_parser(path, batch_size):
    """Yield parsed sentences from `path` in lists of `batch_size`."""
    data_flow = parsers.TrainingParser(path)
    sentence_batch = []
    for batch_count, sentence in enumerate(data_flow.parse(), start=1):
        sentence_batch.append(sentence)

        if batch_count % batch_size == 0:
            yield sentence_batch
            sentence_batch = []

    # flush the final partial batch, if any
    if sentence_batch:
        yield sentence_batch
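
# A minimal usage sketch for eval_parser; the XML path is a hypothetical
# placeholder, not a file from this project:
#
#     for sentence_batch in eval_parser("data/eval.data.xml", batch_size=32):
#         ...  # each item is a list of parsed sentences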
def __len__(training_file_path, batch_size):
    """Return the number of full batches in the training file (floor division drops any final partial batch)."""
    return parsers.TrainingParser(training_file_path).count() // batch_size
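
# apply_padding and prepare_sentence are project helpers that are not shown
# in this excerpt. Purely as an illustration, here is a minimal sketch of
# what apply_padding might look like, reconstructed from how it is called
# below (keyword arguments maxlen/value, padding index 1) and assuming each
# batch[key] entry is a flat sequence of integer ids. It is an assumption,
# not the project's actual implementation.
from tensorflow.keras.preprocessing.sequence import pad_sequences

def apply_padding(batch, key, maxlen, value):
    # pad or truncate every sequence stored under batch[key] to a fixed
    # length, filling the gaps with `value`
    return pad_sequences(batch[key], maxlen=maxlen,
                         padding='post', truncating='post', value=value)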
def get(batch_size, training_file_path, antivocab, output_vocab, PADDING_SIZE=50, gold_file_path=None):
    """
    Batch-processing generator. In training mode (enabled by specifying
    gold_file_path) it yields (sentences, labels); in evaluation mode it
    yields (sentences, candidates).

    param batch_size: number of sentences per yielded batch
    param training_file_path: path to the training XML file
    param antivocab: vocabulary of words to exclude from the input
    param output_vocab: output (sense) vocabulary
    param PADDING_SIZE: fixed length sequences are padded/truncated to
    param gold_file_path: path to the gold labels file; enables training mode
    return: generator object
    """
    while True:
        batch = {"sentences": [], "candidates": []}

        training_data_flow = parsers.TrainingParser(training_file_path)
        if gold_file_path:
            gold_data_flow = parsers.GoldParser(gold_file_path)
            batch["labels"] = []

        for batch_count, sentence in enumerate(training_data_flow.parse(), start=1):
            
            # training mode
            if gold_file_path:
                labels = gold_data_flow.parse()
                output = prepare_sentence(sentence, antivocab, output_vocab, labels)

                batch['sentences'].append(output['sentence'])
                batch['candidates'].append(output['candidates'])
                batch['labels'].append(output['labels'])

            # evaluation mode
            else:
                output = prepare_sentence(sentence, antivocab, output_vocab)

                batch['sentences'].append(output['sentence'])
                batch['candidates'].append(output['candidates'])
   
            if batch_count % batch_size == 0:
                for key in batch:
                    batch[key] = apply_padding(batch, key, maxlen=PADDING_SIZE, value=1)


                if gold_file_path:
                    x, y = batch['sentences'], np.expand_dims(batch['labels'], axis=-1)
                    x, y = shuffle(x, y)
                    yield x, y
                else:
                    yield batch['sentences'], batch['candidates']
                    
                batch = {"sentences" : [], "candidates" : []}
                if gold_file_path:
                    batch.update({"labels" : []})

        # after the loop, flush the final partial batch, if any
        if batch['sentences']:
            for key in batch:
                batch[key] = apply_padding(batch, key, maxlen=PADDING_SIZE, value=1)

            if gold_file_path:
                x, y = batch['sentences'], np.expand_dims(batch['labels'], axis=-1)
                x, y = shuffle(x, y)
                yield x, y
            else:
                yield batch['sentences'], batch['candidates']
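
# A sketch of how this generator and __len__ might be wired into Keras
# training. `model` is a hypothetical compiled model and the file paths are
# placeholders; fit_generator is the Keras 2 API whose (x, y) contract the
# training-mode yields match:
#
#     train_gen = get(32, "train.data.xml", antivocab, output_vocab,
#                     gold_file_path="train.gold.key.txt")
#     model.fit_generator(train_gen,
#                         steps_per_epoch=__len__("train.data.xml", 32),
#                         epochs=10)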

# Multi-task variant of get(): additionally handles WordNet-domain and
# lexicographer (lex) labels alongside the sense labels.
def get(batch_size, resources_path, training_file_path,
        antivocab, output_vocab, output_vocab2, output_vocab3,
        PADDING_SIZE=50, gold_file_path=None):
    """
    Batch procesing generator, yields a dict of sentences, candidates and labels if in training mode (determined if gold_file_path is specified)

    param batch_size:
    param training_file_path:
    param antivocab:
    param output_vocab: senses
    aram output_vocab2: wndomain
    aram output_vocab3: lex
    param gold_file_path:
    return: generator object
    """

    while True:
        batch = {"sentences": [], "candidates": [], "candidates_wndomain": [], "candidates_lex": []}

        training_data_flow = parsers.TrainingParser(training_file_path)
        if gold_file_path:
            gold_data_flow = parsers.GoldParser(gold_file_path)
            batch["labels"] = []
            batch["wndomain_labels"] = []
            batch["lex_labels"] = []
            mapping_file = pd.read_csv(os.path.join(resources_path, "mapping.csv"))

        for batch_count, sentence in enumerate(training_data_flow.parse(), start=1):

            # training mode
            if gold_file_path:
                labels = gold_data_flow.parse()
                output = prepare_sentence(sentence, antivocab, output_vocab, output_vocab2, output_vocab3, mapping_file, labels)
                batch['sentences'].append(output['sentence'])

                #batch['candidates'].append(output['candidates'])
                #batch['candidates_wndomain'].append(output['candidates_wndomain'])
                #batch['candidates_lex'].append(output['candidates_lex'])

                batch['labels'].append(output['labels'])
                batch['wndomain_labels'].append(output['wndomain_labels'])
                batch['lex_labels'].append(output['lex_labels'])

            # evaluation mode
            else:
                output = prepare_sentence(sentence, antivocab, output_vocab, output_vocab2, output_vocab3)

                batch['sentences'].append(output['sentence'])
                batch['candidates'].append(output['candidates'])
                batch['candidates_wndomain'].append(output['candidates_wndomain'])
                batch['candidates_lex'].append(output['candidates_lex'])

            if batch_count % batch_size == 0:
                for key in batch:
                    batch[key] = apply_padding(batch, key, maxlen=PADDING_SIZE, value=1)


                if gold_file_path:
                    x = batch['sentences']
                    y = [np.expand_dims(batch['labels'], axis=-1),
                         np.expand_dims(batch['wndomain_labels'], axis=-1),
                         np.expand_dims(batch['lex_labels'], axis=-1)]
                    #x, y = shuffle(x, y)
                    yield x, y
                else:
                    yield batch['sentences'], batch['candidates'], batch['candidates_wndomain'], batch['candidates_lex']

                batch = {"sentences" : [], "candidates" : [], "candidates_wndomain": [],  "candidates_lex": []}
                if gold_file_path:
                    batch.update({"labels" : []})
                    batch.update({"wndomain_labels" : []})
                    batch.update({"lex_labels" : []})

        # after the loop, flush the final partial batch, if any
        if batch['sentences']:
            for key in batch:
                batch[key] = apply_padding(batch, key, maxlen=PADDING_SIZE, value=1)

            if gold_file_path:
                x = batch['sentences']
                y = [np.expand_dims(batch['labels'], axis=-1),
                     np.expand_dims(batch['wndomain_labels'], axis=-1),
                     np.expand_dims(batch['lex_labels'], axis=-1)]
                #x, y = shuffle(x, y)
                yield x, y
            else:
                yield batch['sentences'], batch['candidates'], batch['candidates_wndomain'], batch['candidates_lex']
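
# In training mode this variant yields x together with a three-element label
# list, matching a Keras model with three output heads (senses, WordNet
# domains, lexicographer ids). A hypothetical sketch; `multitask_model` and
# the paths are placeholders:
#
#     gen = get(32, "../resources", "train.data.xml",
#               antivocab, output_vocab, output_vocab2, output_vocab3,
#               gold_file_path="train.gold.key.txt")
#     multitask_model.fit_generator(gen, steps_per_epoch=100, epochs=10)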
Example #5
from tqdm import tqdm, tnrange
import json
import pandas as pd
import os

import utils
import parsers

# In[ ]:

# # Parse training XML file

# In[5]:

Training = parsers.TrainingParser(
    '../resources/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.data.xml'
)

Training.create_vocab(
    input_vocab_path="../resources/semcor.input.vocab.json",
    pos_vocab_path="../resources/semcor.pos.vocab.json",
    left_out_vocab_path="../resources/semcor.leftout.vocab.json",
    subsampling_rate=1e-4,
    min_count=5)
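
# In[ ]:

# The vocabularies are written to JSON, so they can be reloaded later, e.g.
# to build the vocab arguments of the batch generators. A sketch; which file
# feeds which argument is an assumption:
with open("../resources/semcor.input.vocab.json") as f:
    input_vocab = json.load(f)
with open("../resources/semcor.leftout.vocab.json") as f:
    antivocab = json.load(f)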

# # Converting eval datasets

# In[6]:

dir_ = "../resources/WSD_Evaluation_Framework/Evaluation_Datasets"
# pick up the semeval*/senseval* dataset folders
eval_datasets = [i for i in os.listdir(dir_) if i.startswith("se")]
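
# In[ ]:

# A sketch of enumerating the collected dataset folders; in the WSD
# Evaluation Framework each folder contains a <name>.data.xml file named
# after the folder (the filename pattern is an assumption here, and the
# actual conversion code is not shown in this excerpt):
for name in eval_datasets:
    xml_path = os.path.join(dir_, name, name + ".data.xml")
    print(xml_path)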