Example #1
    def load_dataset(self, dataset_directory, tokenizer_type):
        input_lines = utils.read_lines('%s/%s' %
                                       (dataset_directory, 'inputs.txt'))
        label_lines = utils.read_lines('%s/%s' %
                                       (dataset_directory, 'labels.txt'))

        input_tokens = list(
            map(
                lambda l: tokenizer.tokenize_sentence(
                    l,
                    vocab=self.vocab,
                    tokenizer_type=tokenizer_type,
                    add_start_end=False), input_lines))
        label_tokens = list(
            map(
                lambda l: tokenizer.tokenize_sentence(
                    l,
                    vocab=self.label_vocab,
                    tokenizer_type=tokenizer_type,
                    add_start_end=False), label_lines))

        dataset = {}
        dataset['input_tokens'] = input_tokens
        dataset['label_tokens'] = label_tokens
        dataset['input_max_length'] = np.max(
            [len(tokens) for tokens in input_tokens])
        dataset['inputs'] = input_lines
        dataset['labels'] = label_lines
        dataset['size'] = len(input_lines)
        return dataset

    def read_raw_lines(self, base_path):
        inputs_path = '%s/inputs.txt' % base_path
        outputs_path = '%s/outputs.txt' % base_path

        inputs = utils.read_lines(inputs_path)
        outputs = utils.read_lines(outputs_path)

        dataset = {}
        dataset['inputs'] = inputs
        dataset['outputs'] = outputs
        return dataset
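
All of these snippets rely on the project's utils.read_lines helper, whose implementation is not shown here. A minimal sketch consistent with how it is used above (returning a list of newline-stripped strings) might look like the following; this is an assumption, not the project's actual code.

# Sketch only: a read_lines-style helper as these examples appear to expect it.
def read_lines(path):
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]
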
def save_results(dev_path, shared_path, gen_questions_path,
                 gen_answers_start_path, gen_answers_end_path, gen_idxs_path,
                 gen_ids_path, save_path):
    print("Loading dev json: %s and shared: %s" % (dev_path, shared_path))
    with open(dev_path) as dev_file, open(shared_path) as shared_file:
        dev_json = json.load(dev_file)
        shared_json = json.load(shared_file)
    print("Done loading dev json and shared")
    questions = utils.read_lines(gen_questions_path)
    answer_starts = utils.read_lines(gen_answers_start_path)
    answer_ends = utils.read_lines(gen_answers_end_path)
    idxs = utils.read_lines(gen_idxs_path)
    ids = utils.read_lines(gen_ids_path)

    dataset = defaultdict(list)

    for i in range(len(questions)):
        cur_q = questions[i].split(" ")
        if invalid_question(cur_q):
            continue
        cur_q = dedup(cur_q)
        cur_ans_start = int(answer_starts[i])
        cur_ans_end = int(answer_ends[i])
        idx = int(idxs[i])
        cur_id = int(ids[i])  # renamed from "id" to avoid shadowing the builtin; not used further in this snippet
        cur_par = shared_json['x'][idx][0][0]
        cy_0 = 0
        cy_1 = len(cur_par[cur_ans_end - 1])
        cy = [[cy_0, cy_1]]

        answerss = [cur_par[cur_ans_start:cur_ans_end]]
        cur_q_char = list(map(lambda token: token.split(), cur_q))

        dataset['idxs'].append(idx)
        dataset['ids'].append(len(dataset['ids']))
        dataset['cy'].append(cy)
        dataset['answerss'].append(answerss)
        dataset['span_answerss'].append(answerss)
        dataset['*x'].append([idx, 0])
        dataset['*cx'].append([idx, 0])
        dataset['*p'].append([idx, 0])

        dataset['y'].append([[[0, cur_ans_start], [0, cur_ans_end]]])
        dataset['q'].append(cur_q)
        dataset['cq'].append(cur_q_char)

    print("Saving to path %s" % save_path)
    utils.save_json(dataset, save_path)
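
utils.save_json is likewise external to the snippet above. Assuming it is a thin wrapper around json.dump, an equivalent sketch would be:

# Sketch only (assumption): convert the defaultdict built above to a plain
# dict and serialize it to the given path.
import json

def save_json(data, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(dict(data), f)
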
Example #4
    def init_from_path(self, path):
        """
        Read lines (one vocab token per line) into counters
        """
        self.path = path
        lines = utils.read_lines(path)
        self.init_from_array(lines)

    def load_dataset(self, dataset_directory, tokenizer_type):
        input_lines = utils.read_lines('%s/%s' % (dataset_directory, 'inputs.txt'))
        output_lines = utils.read_lines('%s/%s' % (dataset_directory, 'outputs.txt'))
        indices_lines = utils.read_lines('%s/%s' % (dataset_directory, 'indices.txt'))

        context_tokens = list(map(lambda l: tokenizer.tokenize_sentence(l, 
            vocab=self.vocab,
            tokenizer_type=self.context_tokenizer_type,  # contexts use self.context_tokenizer_type rather than the tokenizer_type argument
            add_start_end=True), input_lines))
        
        output_tokens = list(map(lambda l: tokenizer.tokenize_sentence(l,
            vocab=self.vocab,
            tokenizer_type=tokenizer_type,
            add_start_end=True), output_lines))
        input_tokens = [tokens[:-1] for tokens in output_tokens]
        desired_input_tokens = [tokens[1:] for tokens in output_tokens]

        indices_vals = [int(l) for l in indices_lines]

        answer_starts_path = '%s/%s' % (dataset_directory, 'answer_starts.txt')
        answer_ends_path = '%s/%s' % (dataset_directory, 'answer_ends.txt')

        dataset = {}
        if utils.check_file(answer_starts_path):
            answer_starts = utils.read_lines(answer_starts_path)
            answer_starts = [int(l) for l in answer_starts]
            dataset['answer_starts'] = answer_starts

        if utils.check_file(answer_ends_path):
            answer_ends = utils.read_lines(answer_ends_path)
            answer_ends = [int(l) for l in answer_ends]
            dataset['answer_ends'] = answer_ends

        print("Example context length: %s" % len(context_tokens[2]))
        dataset['indices'] = indices_vals
        dataset['input_tokens'] = input_tokens
        dataset['desired_input_tokens'] = desired_input_tokens
        dataset['context_tokens'] = context_tokens
        dataset['contexts'] = input_lines
        dataset['inputs'] = output_lines
        dataset['desired_inputs'] = output_lines
        dataset['size'] = len(output_lines)
        return dataset
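
utils.check_file is used above to make the answer-span files optional. Assuming it is a plain existence check, a minimal stand-in is:

# Sketch only (assumption): report whether a regular file exists at path,
# so optional files such as answer_starts.txt can be skipped when absent.
import os

def check_file(path):
    return os.path.isfile(path)
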
Example #6
    def load_dataset(self, dataset_directory, tokenizer_type):
        input_lines = utils.read_lines('%s/%s' % (dataset_directory, 'inputs.txt'))
        output_lines = utils.read_lines('%s/%s' % (dataset_directory, 'outputs.txt'))
        indices_lines = utils.read_lines('%s/%s' % (dataset_directory, 'indices.txt'))
        
        context_tokens = list(map(lambda l: tokenizer.tokenize_sentence(l, 
            vocab=self.vocab,
            tokenizer_type=constants.TOKENIZER_SPECIAL_DELIMITER,
            add_start_end=True), input_lines))
        output_tokens = list(map(lambda l: tokenizer.tokenize_sentence(l,
            vocab=self.vocab,
            tokenizer_type=tokenizer_type,
            add_start_end=True), output_lines))
        input_tokens = [tokens[:-1] for tokens in output_tokens]
        desired_input_tokens = [tokens[1:] for tokens in output_tokens]

        indices_vals = [int(l) for l in indices_lines]

        answer_starts_path = '%s/%s' % (dataset_directory, 'answer_starts.txt')
        answer_ends_path = '%s/%s' % (dataset_directory, 'answer_ends.txt')

        dataset = {}
        if utils.check_file(answer_starts_path):
            answer_starts = utils.read_lines(answer_starts_path)
            answer_starts = [int(l) + 1 for l in answer_starts]  # shifted by 1 for the prepended <START> token
            dataset['answer_starts'] = answer_starts
        
        if utils.check_file(answer_ends_path):
            answer_ends = utils.read_lines(answer_ends_path)
            answer_ends = [int(l) + 1 for l in answer_ends]  # also shifted by 1, because the prepended <START> token moves every index
            dataset['answer_ends'] = answer_ends
      
        dataset['indices'] = indices_vals
        dataset['input_tokens'] = input_tokens
        dataset['desired_input_tokens'] = desired_input_tokens
        dataset['context_tokens'] = context_tokens
        dataset['contexts'] = input_lines
        dataset['inputs'] = output_lines
        dataset['desired_inputs'] = output_lines
        dataset['size'] = len(output_lines)
        return dataset
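
The "+ 1" offsets above exist because add_start_end=True prepends a start token, which shifts every original token index by one. A deliberately simplified, hypothetical tokenize_sentence (ignoring the vocabulary lookup the real tokenizer.tokenize_sentence presumably performs) makes that shift explicit:

# Illustration only: why token-level answer spans move by one position
# when a start token is prepended during tokenization.
def tokenize_sentence(sentence, add_start_end=False):
    tokens = sentence.split()
    if add_start_end:
        # The prepended start token pushes each original token index up by 1,
        # so answer_starts/answer_ends must be shifted by 1 as well.
        tokens = ['<START>'] + tokens + ['<END>']
    return tokens
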
Example #7
def func(l):  # def line inferred; the original snippet starts mid-function
    items = l.strip().split(" ")
    return items


answer_starts_path = 'datasets/newsqa/train/answer_starts.txt'
answer_ends_path = 'datasets/newsqa/train/answer_ends.txt'
input_path = 'datasets/newsqa/train/inputs.txt'
output_path = 'datasets/newsqa/train/outputs.txt'
generated_path = 'logs/newsqa_saved_data/dummy5_train_predictions_epoch_6.txt'
indices_path = 'datasets/newsqa/train/indices.txt'

inputs = utils.read_lines_with_func(func, input_path)
outputs = utils.read_tabbed_lines(output_path)
generated = utils.read_lines_with_func(gen_func, generated_path)
answer_starts = [int(l) for l in utils.read_lines(answer_starts_path)]
answer_ends = [int(l) for l in utils.read_lines(answer_ends_path)]
indices = [int(l) for l in utils.read_lines(indices_path)]

answers = []
truncated_contexts = []
questions = []
generated_questions = []

num_overlap = []
num_items = len(generated)

question_counter = 0
generated_question_counter = 0
filtered_words = ["a", "the", "who", "what", "when", "where", "why", "it"]
for i in range(num_items):