Example No. 1
def process_dev_json_to_files():
    # FLAGS.dev_path example: data/squad/dev-v1.1.json
    download_prefix = os.path.dirname(os.path.abspath(
        FLAGS.dev_path))  # data/squad/
    dev_filename = os.path.basename(FLAGS.dev_path)  # "dev-v1.1.json"
    # FLAGS.data_dir: relative path where the preprocessed data is saved

    print("Downloading datasets into {}".format(download_prefix))
    print("Preprocessing datasets into {}".format(FLAGS.data_dir))

    if not os.path.exists(download_prefix):
        os.makedirs(download_prefix)
    if not os.path.exists(FLAGS.data_dir):
        os.makedirs(FLAGS.data_dir)

    maybe_download(squad_base_url, dev_filename, download_prefix, None)
    # Read data from dev json file
    dev_data = data_from_json(os.path.join(download_prefix, dev_filename))
    # write data out to FLAGS.data_dir location
    dev_num_questions, dev_num_answers = read_write_dataset(
        dev_data, 'dev', FLAGS.data_dir)

    dev_path = os.path.join(FLAGS.data_dir, "dev")
    # convert the text files to token id files
    x_dev_dis_path = dev_path + ".ids.context"
    y_dev_ids_path = dev_path + ".ids.question"
    qa_data.data_to_token_ids(dev_path + ".context", x_dev_dis_path,
                              FLAGS.vocab_path)
    qa_data.data_to_token_ids(dev_path + ".question", y_dev_ids_path,
                              FLAGS.vocab_path)
Example No. 2
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data, context_text = read_dataset(
        dev_data, 'dev', vocab)
    return context_data, question_data, question_uuid_data, context_text
Example No. 3
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    return context_data, question_data, question_uuid_data
Example No. 4
def prepare_dev(prefix, dev_filename, vocab, download=False):
    # Don't check file size, since we could be using other datasets
    if download:
        print("Downloading {}".format(dev_filename))
        dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
        dev_data = data_from_json(os.path.join(prefix, dev_filename))
    else:
        # load an already-downloaded JSON file directly from the given path
        dev_data = data_from_json(dev_filename)
    context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)

    return context_data, question_data, question_uuid_data
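A hypothetical call of this variant with download=False (the paths and the pre-built vocab are assumptions); in that branch dev_filename is treated as the full path to an already-downloaded SQuAD-format JSON file:

# load a local SQuAD-format file without re-downloading it
context_data, question_data, question_uuid_data = prepare_dev(
    prefix="data/squad",
    dev_filename="data/squad/dev-v1.1.json",
    vocab=vocab,
    download=False)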
Example No. 5
def prepare_dev(prefix, dev_filename, vocab):
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(
        dev_data, 'dev', vocab)

    def normalize(dat):
        # Python 2 idiom: map returns lists here; on Python 3 wrap the
        # maps in list(...) as done in Example No. 7 below
        return map(lambda tok: map(int, tok.split()), dat)

    context_data = normalize(context_data)
    question_data = normalize(question_data)

    return context_data, question_data, question_uuid_data
Example No. 6
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))

    # remove answer
    #    if FLAGS.eval_on_train:
    #        context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data, s_labels, e_labels, true_answers = read_dataset(dev_data, 'train', vocab)
    #        return context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data, s_labels, e_labels, true_answers

    context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data = read_dataset(
        dev_data, 'dev', vocab)

    return context_tokens_data, context_data, question_tokens_data, question_data, question_uuid_data
Example No. 7
def prepare_dev(prefix, dev_filename, vocab):
    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)

    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    context_data, question_data, question_uuid_data = read_dataset(
        dev_data, 'dev', vocab)

    def normalize(dat):
        return list(map(lambda tok: list(map(int, tok.split())), dat))

    context_data = normalize(context_data)
    question_data = normalize(question_data)

    return context_data, question_data, question_uuid_data
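For illustration, normalize turns each space-separated id string produced by read_dataset into a list of ints; a quick check with made-up ids:

# hypothetical input: two contexts already mapped to vocabulary ids
sample = ["12 7 4051 3", "88 2 19"]
assert list(map(lambda tok: list(map(int, tok.split())), sample)) == \
    [[12, 7, 4051, 3], [88, 2, 19]]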
Example No. 8
def expand_vocab(prefix, dev_filename, vocab, embd, raw_glove,
                 raw_glove_vocab):

    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    #context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)
    dataset = dev_data
    context_data = []
    query_data = []
    question_uuid_data = []
    tier = 'dev'
    new_vocab = {}
    found = 0
    notfound = 0

    for articles_id in tqdm(range(len(dataset['data'])),
                            desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                #context_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in context_tokens]
                #qustion_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in question_tokens]
                #print(context_ids)
                # count words covered by the existing vocab vs. new words
                for w in context_tokens:
                    if w not in vocab:
                        new_vocab[w] = new_vocab.get(w, 0) + 1
                        notfound += 1
                    else:
                        found += 1

                for w in question_tokens:
                    if w not in vocab:
                        new_vocab[w] = new_vocab.get(w, 0) + 1
                        notfound += 1
                    else:
                        found += 1

    print('found/not found: {}/{}, {}% not found'.format(
        found, notfound, 100 * notfound / float(found + notfound)))
    print('New vocabulary:', len(new_vocab))

    vocab_list = list(vocab.items())
    vn = len(vocab_list)
    # dict.keys() is not indexable in Python 3, so enumerate the dict instead
    for i, w in enumerate(new_vocab):
        vocab_list.append((w, vn + i))

    vocab = dict(vocab_list)
    rev_vocab = dict([(x, y) for (y, x) in vocab_list])
    #context_data.append(' '.join(context_ids))
    #query_data.append(' '.join(qustion_ids))
    #question_uuid_data.append(question_uuid)
    #return context_data, question_data, question_uuid_data
    _, dim = embd.shape
    new_glove = np.random.randn(len(vocab), dim)
    new_glove[:vn, :] = embd

    found = 0
    for i in range(vn, vn + (len(new_vocab))):
        word = vocab_list[i][0]
        if word in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word]
            new_glove[i, :] = raw_glove[idx, :]
        # fall back to other casings only when the exact form is missing,
        # so each word is counted and assigned at most once
        elif word.capitalize() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.capitalize()]
            new_glove[i, :] = raw_glove[idx, :]
        elif word.upper() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.upper()]
            new_glove[i, :] = raw_glove[idx, :]
    #from IPython import embed; embed()
    print("{} unseen words found embeddings".format(found))

    return vocab, rev_vocab, new_glove
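A hedged usage sketch of expand_vocab: the driver below, including the file names and the np.savez_compressed call, is an assumption rather than part of the original repo; it only shows how the expanded vocab and GloVe matrix might be persisted:

import numpy as np

# vocab, embd, raw_glove and raw_glove_vocab are assumed to be loaded already
vocab, rev_vocab, new_glove = expand_vocab(
    prefix="data/squad",
    dev_filename="dev-v1.1.json",
    vocab=vocab,
    embd=embd,
    raw_glove=raw_glove,
    raw_glove_vocab=raw_glove_vocab)

# persist the enlarged embedding matrix for later training / evaluation runs
np.savez_compressed("data/squad/glove.expanded.npz", glove=new_glove)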