Example No. 1
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
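    # NarrativeQA (wikipedia summaries) variant: reads summaries.csv and qaps.csv,
    # tokenizes each summary and its questions/answers, accumulates word/char
    # counters, and saves the data together with word2vec dicts via save().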
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    #if not args.split:
    #    sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "third_party",
                                          "wikipedia")
    source_summaries = pd.read_csv(source_path + '/summaries.csv')
    source_qas = pd.read_csv(args.source_dir + '/qaps.csv')

    summaries = []
    summaries_char_list = []
    ques_answers = []
    questions = []
    questions_char_list = []
    document_ids = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    summary_index = -1
    for index_summ, row in tqdm(source_summaries.iterrows()):
        if data_type == row['set']:
            summary_tokenized_paras = []
            summary_char_para = []
            summary_tokenized = list(
                map(word_tokenize, sent_tokenize(row['summary_tokenized'])))
            summary_tokenized = [
                process_tokens(tokens) for tokens in summary_tokenized
            ]
            char_list = [[list(word) for word in sent]
                         for sent in summary_tokenized]
            #print ("summ",summary_tokenized)
            #print (char_list)
            summary_tokenized_paras.append(
                summary_tokenized)  # TODO:each summary has only one paragraph
            summaries.append(summary_tokenized_paras)
            summary_char_para.append(
                char_list)  # TODO:each summary has only one paragraph
            summaries_char_list.append(summary_char_para)
            # because train/test/valid are all in one file, index_summ cannot be used
            summary_index = summary_index + 1

            qas = source_qas[source_qas['document_id'].isin(
                [row['document_id']])]

            for sent in summary_tokenized:
                for word in sent:
                    word_counter[word] += len(qas)
                    lower_word_counter[word.lower()] += len(qas)
                    for char in word:
                        char_counter[char] += len(qas)

            for index, qa in qas.iterrows():
                # if a question spans multiple sentences, that case is not handled
                # (most likely not required)
                question_tokenized = word_tokenize(qa['question'])
                question_tokenized = process_tokens(question_tokenized)
                #print (question_tokenized)
                question_char_list = [
                    list(word) for word in question_tokenized
                ]

                answer1_tokenized = list(
                    map(word_tokenize, sent_tokenize(qa['answer1'])))
                answer1_tokenized = [
                    process_tokens(tokens) for tokens in answer1_tokenized
                ]
                #print(answer1_tokenized)

                answer2_tokenized = list(
                    map(word_tokenize, sent_tokenize(qa['answer2'])))
                answer2_tokenized = [
                    process_tokens(tokens) for tokens in answer2_tokenized
                ]
                #print(answer2_tokenized)

                ques_answers.append([answer1_tokenized, answer2_tokenized])
                #print(ques_answers)

                questions.append(question_tokenized)
                questions_char_list.append(question_char_list)
                document_ids.append([summary_index, row['document_id']])

                for sent in question_tokenized:
                    for word in sent:
                        word_counter[word] += 1
                        lower_word_counter[word.lower()] += 1
                        for char in word:
                            char_counter[char] += 1

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    data = {
        'q': questions,
        'cq': questions_char_list,
        '*x': document_ids,
        'answerss': ques_answers,
        '*cx': document_ids
    }
    shared = {
        'x': summaries,
        'cx': summaries_char_list,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
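Each of these variants is driven by an `args` namespace built elsewhere in the repository. A minimal driver sketch, assuming hypothetical argparse flag names that simply mirror the attributes the functions actually read (`tokenizer`, `url`, `port`, `source_dir`, `split`, `debug`; the real scripts define more, e.g. GloVe paths used by `get_word2vec`), might look like this:

import argparse

# assumes one of the prepro_each variants above is defined or imported in scope

def build_args():
    # Hypothetical flags for illustration only; not the repositories' actual CLI.
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer", default="PTB", choices=["PTB", "Stanford"])
    parser.add_argument("--url", default="localhost")    # CoreNLP server (Stanford tokenizer only)
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--source_dir", default="data/squad")
    parser.add_argument("--split", action="store_true")  # sentence-split paragraphs
    parser.add_argument("--debug", action="store_true")
    return parser.parse_args()

if __name__ == "__main__":
    args = build_args()
    for split in ("train", "dev"):
        prepro_each(args, split, out_name=split)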
Example No. 2
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
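    # SQuAD-style variant: loads {data_type}-v1.1.json, tokenizes every paragraph,
    # and also keeps the raw contexts, article titles, and one sample question per
    # paragraph; the usual per-answer span extraction is left commented out below.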
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    contextss = []
    context_questions = []
    titles = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        pp = []
        p.append(pp)
        xp, cxp, contexts, c_questions = [], [], [], []
        x.append(xp)
        cx.append(cxp)
        contextss.append(contexts)
        context_questions.append(c_questions)
        title = "[" + str(ai).zfill(2) + "] " + article['title'].replace(
            '_', ' ')
        titles.append(title)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``",
                                      '" ')  #Sentences of priginal Paragraph
            contexts.append(context)
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            if ai == 0: c_questions.append(para['qas'][3]['question'])
            else: c_questions.append(para['qas'][0]['question'])
            """
            for qa in para['qas']:
                # get words
                c_questions.append(qa['question'])
                break
                qi = word_tokenize(qa['question']) # qa['question'] : original question
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
            """
            if args.debug:
                break
    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'contextss': contextss,
        'context_questions': context_questions,
        'titles': titles,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }
    print("saving ...")
    save(args, data, shared, out_name)
Example No. 3
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
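    # Variant that additionally runs a Stanford constituency parser over every
    # sentence and question (sents_parser/sent_parser) to collect syntactic tag
    # sequences; paragraphs or questions that fail to parse are skipped and counted.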
    parser = StanfordParser(model_path=os.getenv("StanfordParser_model_path"))
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    # if not args.split:
    #     sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(
        args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    q_syn_seq = []
    na = []
    cy = []
    x, cx = [], []
    syn_seq = []
    rsyn_seq = []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    syn_counter = Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    abandon = 0
    for ai, article in enumerate(tqdm(source_data['data'][221:])):
        xp, cxp = [], []
        syn_seqp = []
        pp = []
        x.append(xp)
        cx.append(cxp)
        syn_seq.append(syn_seqp)
        p.append(pp)
        p_i = -1
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ').replace("``", '" ').replace(
                'e.g.', 'e-g,')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            try:
                syn_seqpp = sents_parser(xi, parser)
                p_i += 1
            except:
                abandon += 1
                continue
            for sent in syn_seqpp:
                for word in sent:
                    for syn in word:
                        syn_counter[syn] += 1

            syn_seqp.append(syn_seqpp)
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai + 221, p_i]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == p_i
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                qi = process_tokens(qi)
                try:
                    q_syn_seqq = sent_parser(qi, parser)
                except:
                    continue
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start,
                                             answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                q_syn_seq.append(q_syn_seqq)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                rsyn_seq.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
        print('abandoned {} paragraphs so far'.format(abandon))
        if args.debug:
            break
    for sent in q_syn_seq:
        for word in sent:
            for syn in word:
                syn_counter[syn] += 1
    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        '*syn_seq': rsyn_seq,
        'cy': cy,
        'q_syn_seq': q_syn_seq,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx,
        'na': na
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'syn_seq': syn_seq,
        'syn_counter': syn_counter,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }
    print("saving ...")
    save(args, data, shared, out_name)
Example No. 4
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
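    # Debugging/statistics variant for {data_type}.seq.json, where each answer carries
    # per-token 'answer_location' offsets: it checks whether answers survive an
    # ~800-word truncation of the context and dumps the disagreements to debugcnt.json
    # instead of calling save().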
    totalnum0 = 0
    falsenum0 = 0
    falsenum1 = 0
    truenum0 = 0
    truenum1 = 0
    outlist = []
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.seq.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            ctx = para['context']
            if (len(ctx.split()) <= 800):
                cut = -1
            else:
                cut = sum(map(len, ctx.split()[:800])) + 800
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                totalnum0 += 1
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                flag = False
                Flag = True
                if (cut > -1):
                    for ans in qa['answers']:
                        if (max(ans['answer_location'])) < cut:
                            Flag = False
                            break
                else:
                    Flag = False
                if (Flag):
                    falsenum1 += 1

                for answer in qa['answers']:
                    flag1 = True
                    answer_text = answer['text']
                    answers.append(answer_text)
                    ansi = word_tokenize(answer_text)
                    answer_location = answer['answer_location']
                    yii = []
                    for ans_idx, answer_start in enumerate(answer_location):
                        answer_stop = answer_start + len(ansi[ans_idx])
                        yi0, _ = get_word_span(context, xi, answer_start,
                                               answer_stop)
                        if (yi0[1] >= 800):
                            flag1 = False
                        assert len(xi[yi0[0]]) > yi0[1]
                        w0 = xi[yi0[0]][yi0[1]]
                        assert ansi[ans_idx] == w0, (ansi[ans_idx], w0)
                        yii.append(yi0)

                    if (flag1):
                        flag = True

                    yi.append(yii)
                if (flag):
                    truenum0 += 1

                if (flag == Flag):
                    print(ctx, qa, yi, cut)
                    outlist.append([ctx, qa])

                    # answer_start = answer['answer_start']
                    # answer_stop = answer_start + len(answer_text)
                    # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # # yi0 = answer['answer_word_start'] or [0, 0]
                    # # yi1 = answer['answer_word_stop'] or [0, 1]
                    # assert len(xi[yi0[0]]) > yi0[1]
                    # assert len(xi[yi1[0]]) >= yi1[1]
                    # w0 = xi[yi0[0]][yi0[1]]
                    # w1 = xi[yi1[0]][yi1[1]-1]
                    # i0 = get_word_idx(context, xi, yi0)
                    # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    # cyi0 = answer_start - i0
                    # cyi1 = answer_stop - i1 - 1
                    # # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    # assert answer_text[-1] == w1[cyi1]
                    # assert cyi0 < 32, (answer_text, w0)
                    # assert cyi1 < 32, (answer_text, w1)
                    # yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    print(truenum0, totalnum0, float(truenum0) / totalnum0)
    print(falsenum1, totalnum0, 1 - float(falsenum1) / totalnum0)
    with open('debugcnt.json', 'w') as f:
        json.dump(outlist, f)
Example No. 5
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
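    # Standard BiDAF-style SQuAD preprocessing: extracts word-level answer spans (y)
    # and character offsets within the boundary words (cy), flags unanswerable
    # questions via na, and saves data/shared dicts with counters and word2vec maps.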
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r', encoding="utf-8"))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    #add
		    
                    #print("i0 :",i0, "i1 :",i1, "cyi0 :", cyi0, "w0 :",w0 )
                    #print("xi :", xi)
                    #print( "yi0",yi0, "(yi1[0], yi1[1]-1) :",(yi1[0], yi1[1]-1) )
                    #print("answer_text",answer_text)
                    #print("cyi1:",cyi1)
                    #print("answer_text[0] :",answer_text[0])
                    #print("answer_text[-1] :",answer_text[-1])
                    #print("w0 :",w0)
                    #print("w1 :",w1)
                    #so far

                    #print(":):):)")
                    #print("answer_text:",answer_text,"\nstart:", w0[cyi0:],"\nend:", w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx, 'na': na}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
Example No. 6
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
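    # NarrativeQA variant with precomputed ROUGE-based answer spans: reads processed
    # summaries and span CSVs from hard-coded local paths, builds seq2seq-style answers
    # bracketed by '--SOS--' and '</s>', and reports BLEU-1/BLEU-4 of the predicted
    # spans against the reference answers plus summary length statistics.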
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "third_party",
                                          "wikipedia")
    #source_summaries =  pd.read_csv(source_path + '/summaries.csv')
    source_summaries = pd.read_csv(
        '/Users/dhruv100691/Documents/cs546/CS-546--Narrative-QA/bi-att-flow-dev/processed_summaries_new_method.csv'
    )
    #source_qas = pd.read_csv(args.source_dir + '/qaps.csv')
    source_qas = pd.read_csv(
        '/Users/dhruv100691/Documents/cs546/CS-546--Narrative-QA/bi-att-flow-dev/processed_answer_spans_rogue_new_method.csv'
    )
    #could not find spans for some answers, so dropping those qa pairs
    source_qas['start_index'] = source_qas['start_index'].str.replace(
        '(', '').str.replace(')', '')
    source_qas['end_index'] = source_qas['end_index'].str.replace(
        '(', '').str.replace(')', '')
    source_qas.dropna(subset=['start_index', 'end_index'], inplace=True)

    summaries = []
    summaries_char_list = []
    ques_answers = []
    questions = []
    questions_char_list = []
    ques_answer_lengths = []
    ques_answer_spans = []
    document_ids = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    summary_index = -1
    len_summ = 0
    num_summ = 0
    avg_len_sent = 0
    num_words = 0
    bleu_scores = []
    bleu_4_scores = []
    for index_summ, row in tqdm(source_summaries.iterrows(), total=1572):
        if data_type == row['set']:
            len_sent = 0
            spans = []
            references = []
            summary_tokenized_paras = []
            summary_char_para = []
            #row['processed_summary'] = row['processed_summary'].replace(".",". ")
            summary_tokenized = list(
                map(word_tokenize, sent_tokenize(row['processed_summary'])))
            #summary_tokenized = [process_tokens(tokens) for tokens in summary_tokenized]
            char_list = [[list(word) for word in sent]
                         for sent in summary_tokenized]
            summary_tokenized_paras.append(summary_tokenized)
            #summaries.append(list(break_summary_to_paras(summary_tokenized)))
            summaries.append(summary_tokenized_paras)
            num_summ += 1
            summary_char_para.append(
                char_list)  # TODO:each summary has only one paragraph
            summaries_char_list.append(summary_char_para)
            # because train/test/valid are all in one file, index_summ cannot be used
            summary_index = summary_index + 1
            len_summ += len(summary_tokenized)

            qas = source_qas[source_qas['document_id'].isin(
                [row['document_id']])]
            qas = modify_answer_spans(qas, row['processed_summary'])

            for sent in summary_tokenized:
                len_sent += len(sent)
                num_words += len(sent)
                for word in sent:
                    word_counter[word] += len(qas)
                    lower_word_counter[word.lower()] += len(qas)
                    for char in word:
                        char_counter[char] += len(qas)
            avg_len_sent += (len_sent / float(len(summary_tokenized)))
            for index, qa in qas.iterrows():
                # if a question spans multiple sentences, that case is not handled
                # (most likely not required)
                question_tokenized = word_tokenize(qa['processed_question'])
                #question_tokenized = process_tokens(question_tokenized)
                #print (question_tokenized)
                question_char_list = [
                    list(word) for word in question_tokenized
                ]

                answer1_tokenized = list(
                    map(word_tokenize,
                        sent_tokenize(qa['processed_answer'].replace(
                            ".", ""))))  ##TODO
                #answer1_tokenized = [process_tokens(tokens) for tokens in answer1_tokenized]
                answer1_eos = answer1_tokenized[-1] + ['</s>']  # appending end-of-sequence token
                answer1_sos = ['--SOS--'] + answer1_tokenized[0]
                target_length = len(answer1_eos)

                answer1_span_start_idx = qa['start_index']
                answer1_span_end_idx = qa['end_index']

                #answer2_tokenized = list(map(word_tokenize, sent_tokenize(qa['answer2'])))
                #answer2_tokenized = [process_tokens(tokens) for tokens in answer2_tokenized]
                #answer2_eos = answer2_tokenized[len(answer2_tokenized) - 1] + ['</s>']  # appending end token
                #answer2_sos = ['--SOS--'] + answer2_tokenized[0]
                #print(answer2_tokenized)

                predicted_rouge_span = summary_tokenized[0][
                    answer1_span_start_idx[1]:answer1_span_end_idx[1] + 1]
                references.append([list(map(str.lower, answer1_tokenized[0]))])
                spans.append(list(map(str.lower, predicted_rouge_span)))

                ques_answers.append([answer1_sos, answer1_eos])
                ques_answer_spans.append(
                    [answer1_span_start_idx, answer1_span_end_idx])
                ques_answer_lengths.append(target_length)

                questions.append(question_tokenized)
                questions_char_list.append(question_char_list)
                document_ids.append([summary_index, row['document_id']])

                for sent in question_tokenized:
                    for word in sent:
                        word_counter[word] += 1
                        lower_word_counter[word.lower()] += 1
                        for char in word:
                            char_counter[char] += 1

            bleu_scores.append(
                corpus_bleu(references, spans, weights=(1, 0, 0, 0)))
            bleu_4_scores.append(corpus_bleu(references, spans))
    print("Average score bleu_1 for", data_type,
          sum(bleu_scores) / len(bleu_scores))
    print("Average score bleu_4 for", data_type,
          sum(bleu_4_scores) / len(bleu_4_scores))

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    data = {
        'q': questions,
        'cq': questions_char_list,
        '*x': document_ids,
        'answerss': ques_answers,
        '*cx': document_ids,
        'ans_len': ques_answer_lengths,
        'spans': ques_answer_spans
    }
    shared = {
        'x': summaries,
        'cx': summaries_char_list,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)

    print("{} statistics".format(data_type))
    print(" Number of summaries :", num_summ)
    print(" Average summary length : ", len_summ / float(num_summ))
    print(" Average sentence lengths :", avg_len_sent / float(num_summ))
    print(" Average number of words :", num_words / float(num_summ))
Example No. 7
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
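    # Like the {data_type}.seq.json variant above, but without the truncation
    # bookkeeping: it records one (sentence, word) position per answer token from
    # 'answer_location' and saves the result in the usual data/shared format.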
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.seq.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    ansi = word_tokenize(answer_text)
                    answer_location = answer['answer_location']
                    yii = []
                    for ans_idx, answer_start in enumerate(answer_location):
                        answer_stop = answer_start + len(ansi[ans_idx])
                        yi0, _ = get_word_span(context, xi, answer_start,
                                               answer_stop)
                        assert len(xi[yi0[0]]) > yi0[1]
                        w0 = xi[yi0[0]][yi0[1]]
                        assert ansi[ans_idx] == w0, (ansi[ans_idx], w0)
                        yii.append(yi0)

                    yi.append(yii)

                    # answer_start = answer['answer_start']
                    # answer_stop = answer_start + len(answer_text)
                    # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # # yi0 = answer['answer_word_start'] or [0, 0]
                    # # yi1 = answer['answer_word_stop'] or [0, 1]
                    # assert len(xi[yi0[0]]) > yi0[1]
                    # assert len(xi[yi1[0]]) >= yi1[1]
                    # w0 = xi[yi0[0]][yi0[1]]
                    # w1 = xi[yi1[0]][yi1[1]-1]
                    # i0 = get_word_idx(context, xi, yi0)
                    # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    # cyi0 = answer_start - i0
                    # cyi1 = answer_stop - i1 - 1
                    # # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    # assert answer_text[-1] == w1[cyi1]
                    # assert cyi0 < 32, (answer_text, w0)
                    # assert cyi1 < 32, (answer_text, w1)
                    # yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
Example No. 8
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None, save_json=True, pre_answer_doc_id=None):
  """
  data:
  q/cq: query
  y: answer
  rx/rcx: index pairs: (article_no, paragraph_no)
  cy:
  idxs:
  ids:
  answerss:
  na:
  
  shared:
  x/cx: tokenized paragraphs (words and chars)
  p: untokenized paragraphs
  """
  if args.tokenizer == "PTB":
    import nltk
    sent_tokenize = nltk.sent_tokenize
    def word_tokenize(tokens):
      return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
  elif args.tokenizer == 'Stanford':
    from my.corenlp_interface import CoreNLPInterface
    interface = CoreNLPInterface(args.url, args.port)
    sent_tokenize = interface.split_doc
    word_tokenize = interface.split_sent
  else:
    raise Exception()

  if args.medhop:
    from qangaroo.utils import process_tokens_medhop as process_tokens
  else:
    from qangaroo.utils import process_tokens

  if not args.split:
    sent_tokenize = lambda para: [para]

  source_path = in_path or os.path.join(args.source_dir, "{}.json".format(data_type))
  source_data = json.load(open(source_path, 'r'))

  q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
  na = []  # no answer
  cy = []
  x, cx = [], []
  x2 = []
  answers = []
  p, p2 = [], []
  q2, cq2 = [], []
  cand_span, ccand_span, cand_span_y = [], [], []
  cand_word, cand_word_y, cand_word_found, real_cand_word_found = [], [], [], []
  all_cand_spans, A1s, A2s, all_cand_doc_ids, all_cand_ids, all_cand_num_spans_found, real_cand_count = [], [], [], [], [], [], [] # To store all candidate spans, adjacency matrices, candidate's doc ids, candidate's ids
  answer_doc_ids, answer_ids_in_doc = [], []
  topk_2layer_tfidf_docs = []
  first_doc_ids = []
  word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
  start_ai = int(round(len(source_data) * start_ratio))
  stop_ai = int(round(len(source_data) * stop_ratio))
  mis_cand = 0
  found_answer_in_first_n = 0

  tfidf = TfidfVectorizer(strip_accents='unicode')
  bi = 0
  if args.randomize_examples:
    random.shuffle(source_data)
  for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])):
    candidates = article['candidates']
    query_sub = ' '.join(article['query'].split()[1:])
    query = article['query'].replace('_', ' ')  # get rid of '_' in, e.g., 'record_label' 
    supports = article['supports']
    answer = article['answer']

    if args.truncate_at > 0:
      for si, support in enumerate(supports):
        support_split = support.split(' ')[:args.truncate_at]
        if support_split[-1] != '.':
          support_split += '.'
        supports[si] = ' '.join(support_split)

    if args.randomize_docs:
      random.shuffle(supports)

    if args.filter_by_annotations is not None:
      annotations = article['annotations']
      not_follow = 0
      likely = 0
      follow = 0
      multiple = 0
      single = 0
      follow_and_multiple = 0
      for anno in annotations:
        if anno[0] == 'follows' and anno[1] == 'multiple':
          follow_and_multiple += 1
        if anno[0] == 'follows':
          follow += 1
        if anno[0] == 'not_follow':
          not_follow += 1
        if anno[0] == 'likely':
          likely += 1
        if anno[1] == 'multiple':
          multiple += 1
        if anno[1] == 'single':
          single += 1

      if args.filter_by_annotations == 'follow' and follow < 2:
        continue
      elif args.filter_by_annotations == 'multiple' and (follow < 2 or multiple < 2):
        continue
      elif args.filter_by_annotations == 'single' and (follow < 2 or single < 2):
        continue
    
    xp, cxp = [], []
    xp2 = []
    pp, pp2 = [], []

    x.append(xp)
    cx.append(cxp)
    p.append(pp)
    x2.append(xp2)
    p2.append(pp2)

    para_features = tfidf.fit_transform(supports)
    q_features = tfidf.transform([query_sub])
    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    sorted_ix = np.lexsort((supports, dists))
    first_doc_ids.append(np.asscalar(sorted_ix[0]))
    assert first_doc_ids[-1] < len(supports), (first_doc_ids[-1], len(supports))
    
    if args.rank_by_tfidf and save_json:
      first_doc_ids[-1] = 0
      para_features = tfidf.fit_transform(supports)
      q_features = tfidf.transform([query_sub])
      dists = pairwise_distances(q_features, para_features, "cosine").ravel()

      if pre_answer_doc_id is not None:
        dists[pre_answer_doc_id[bi]] = 0
      
      sorted_ix = np.lexsort((supports, dists))
      sorted_supports = [supports[idx] for idx in sorted_ix]
      
      if args.tfidf_layer == 1:
        if args.back_tfidf:
          para_features = tfidf.fit_transform(sorted_supports[2:])
          q_features = tfidf.transform([sorted_supports[1] + ' ' + sorted_supports[0]])
          dists = pairwise_distances(q_features, para_features, "cosine").ravel()
          sorted_ix = np.lexsort((sorted_supports[2:], dists))
          supports = [sorted_supports[idx + 2] for idx in sorted_ix]
          
          assert len(sorted_supports) == len(supports) + 2
          supports.insert(0, sorted_supports[1])
          supports.insert(2, sorted_supports[0])
        else:
          supports = sorted_supports
      elif args.tfidf_layer == 2:
        if args.mode == 'double':
          para_features = tfidf.fit_transform(sorted_supports[2:])
          q_features = tfidf.transform([sorted_supports[1]])
          dists = pairwise_distances(q_features, para_features, "cosine").ravel()
          sorted_ix = np.lexsort((sorted_supports[2:], dists))
          supports = [sorted_supports[idx + 2] for idx in sorted_ix]
          
          assert len(sorted_supports) == len(supports) + 2
          supports.insert(0, sorted_supports[1])
          supports.insert(2, sorted_supports[0])
        else:
          para_features = tfidf.fit_transform(sorted_supports[1:])
          q_features = tfidf.transform([sorted_supports[0]])
          dists = pairwise_distances(q_features, para_features, "cosine").ravel()
          sorted_ix = np.lexsort((sorted_supports[1:], dists))
          supports = [sorted_supports[idx + 1] for idx in sorted_ix]
          
          assert len(sorted_supports) == len(supports) + 1
          supports.insert(0, sorted_supports[0])
      else:
        raise NotImplementedError

      if args.keep_topk_docs_only > 0:
        supports = supports[:args.keep_topk_docs_only]
    else:
      sorted_supports = [supports[idx] for idx in sorted_ix]

      para_features = tfidf.fit_transform(supports)
      q_features = tfidf.transform([sorted_supports[0]])
      dists = pairwise_distances(q_features, para_features, "cosine").ravel()
      dists[sorted_ix[0]] = 1e30
      sorted_ix = np.lexsort((supports, dists))
      topk_2layer_tfidf_docs.append([])
      for kk in range(min(7, len(sorted_ix))):
        topk_2layer_tfidf_docs[-1].append(np.asscalar(sorted_ix[kk]))
      
    context = ''
    if args.split_supports is True:
      xi, cxi = [[]], [[]]
      xi_len = []
      for pi, _context in enumerate(supports):
        _context += ' '
        _context = _context.replace("''", '" ')
        _context = _context.replace("``", '" ')
        _context = _context.replace('  ', ' ').replace(' ', ' ')
        context += _context
        _xi = list(map(word_tokenize, sent_tokenize(_context)))
        _xi = [process_tokens(tokens) for tokens in _xi]  # xi = [["blahblah"]]
        _cxi = [[list(xijk) for xijk in xij] for xij in _xi]

        xi[0] += _xi[0]

        xi_len.append(len(_xi[0]))
        
        xp.append(_xi[0])
        cxp.append(_cxi[0])
        pp.append(_context)

      xp2.append(xi[0])
      pp2.append(context)
      assert sum(map(len,xp)) == np.array(xp2).shape[-1], (sum(map(len,xp)), np.array(xp2).shape[-1])
      
    else:
      for pi, _context in enumerate(supports):
        _context += ' '
        _context = _context.replace("''", '" ')
        _context = _context.replace("``", '" ')
        _context = _context.replace('  ', ' ').replace(' ', ' ')
        context += _context

      xi = list(map(word_tokenize, sent_tokenize(context)))
      xi = [process_tokens(tokens) for tokens in xi]  # xi = [["blahblah"]]
      cxi = [[list(xijk) for xijk in xij] for xij in xi]
      xp.append(xi[0])
      cxp.append(cxi[0])
      pp.append(context)
    

    # Only "+= 1" because every sets of support_docs corresponds to only 1 question.
    # In SQuAD, every paragraph can have multiple (len(para['qas'])) questions.
    for xij in xi:  # for sentence in context
      for xijk in xij:  # for word in sentence
        # if xijk == '.':
        #   print(xijk)
        word_counter[xijk] += 1
        lower_word_counter[xijk.lower()] += 1
        for xijkl in xijk:
          char_counter[xijkl] += 1


    # query
    # get words
    qi = word_tokenize(query)
    qi = process_tokens(qi)
    cqi = [list(qij) for qij in qi]

    q2i = word_tokenize(query_sub)
    q2i = process_tokens(q2i)
    cq2i = [list(q2ij) for q2ij in q2i]
    
    # answer
    yi = []
    cyi = []

    candi, ccandi, candi_y = [], [], []
    candi_word_y = []
    candi_word = candidates
    
    cand_span.append(candi)
    ccand_span.append(ccandi)
    cand_span_y.append(candi_y)
    cand_word.append(candi_word)
    cand_word_y.append(candi_word_y)
    answer_text = answer

    tokenized_context = ' '.join(xp2[-1])
    if args.find_candidates:
      assert answer in candidates, (answer, candidates)
      candi_word_y.append(candidates.index(answer))
      candidates_spans, not_found, candidates_found, real_candidates_found = compute_candidate_spans(tokenized_context, candidates)
      cand_word_found.append(candidates_found)
      real_cand_word_found.append(real_candidates_found)
      mis_cand += (not_found > 0)
      for (start, stop) in candidates_spans:
        yi0, yi1 = get_word_span(tokenized_context, xi, start, stop)
        
        assert len(xi[yi0[0]]) > yi0[1]
        assert len(xi[yi1[0]]) >= yi1[1]
        w0 = xi[yi0[0]][yi0[1]]
        w1 = xi[yi1[0]][yi1[1]-1]
        i0 = get_word_idx(tokenized_context, xi, yi0)
        i1 = get_word_idx(tokenized_context, xi, (yi1[0], yi1[1]-1))
        cyi0 = start - i0
        cyi1 = stop - i1 - 1
        candi.append([yi0, yi1])
        ccandi.append([cyi0, cyi1])

      
    if answer == '':
      raise Exception("Answer is empty.")
    else:   
      answer_start, answer_stop = compute_answer_span(tokenized_context, answer) # Find first matching span
      if answer_start is None:
        yi.append([(0, 0), (0, 1)])
        cyi.append([0, 1])
        na.append(True)
        answer_doc_ids.append([0])
        answer_ids_in_doc.append([0])
      
      else:
        if args.find_candidates:  # If we found the answer span, then we must have found the same span in candidates
          assert (answer_start, answer_stop) in \
          (candidates_spans), (answer, candidates, answer_start, answer_stop, candidates_spans)
          ans_idx = candidates_spans.index((answer_start, answer_stop))
          candi_y.append(ans_idx)
        na.append(False)
        yi0, yi1 = get_word_span(tokenized_context, xi, answer_start, answer_stop)
        answer_doc_id, answer_id_in_doc = find_doc_with_answer(yi0[1], xi_len)

        if pre_answer_doc_id is not None:
          assert answer_doc_id < 3, (answer_doc_id)
        answer_doc_ids.append([answer_doc_id])
        answer_ids_in_doc.append([answer_id_in_doc])

        answer_spans = []
        answer_spans.append((answer_start, answer_stop))
        next_answer_start = answer_start
        next_answer_stop = answer_stop
        next_context = tokenized_context[answer_stop:]
        while True:
          next_answer_start, next_answer_stop = compute_answer_span(next_context, answer)
          next_context = next_context[next_answer_stop:]
          if next_answer_start is not None:
            answer_spans.append((next_answer_start + answer_spans[-1][1], next_answer_stop + answer_spans[-1][1]))
          else:
            break
          next_yi0, next_yi1 = get_word_span(tokenized_context, xi, next_answer_start + answer_spans[-2][1], next_answer_stop + answer_spans[-2][1])

          next_answer_doc_id, next_answer_id_in_doc = find_doc_with_answer(next_yi0[1], xi_len)
          
          answer_doc_ids[-1].append(next_answer_doc_id)
          answer_ids_in_doc[-1].append(next_answer_id_in_doc)

        assert len(xi[yi0[0]]) > yi0[1]
        assert len(xi[yi1[0]]) >= yi1[1]
        w0 = xi[yi0[0]][yi0[1]]
        w1 = xi[yi1[0]][yi1[1]-1]
        i0 = get_word_idx(tokenized_context, xi, yi0)
        i1 = get_word_idx(tokenized_context, xi, (yi1[0], yi1[1]-1))
        cyi0 = answer_start - i0
        cyi1 = answer_stop - i1 - 1
        if args.medhop:
          assert answer_text[0] == w0[cyi0], (answer_text[0], w0[cyi0].lower(), answer_text, w0, cyi0)
        else:
          assert answer_text[0] == w0[cyi0].lower(), (answer_text[0], w0[cyi0].lower(), answer_text, w0, cyi0)
        assert answer_text[-1] == w1[cyi1].lower()
        assert cyi0 < 32, (answer_text, w0)
        assert cyi1 < 32, (answer_text, w1)
        
        yi.append([yi0, yi1])
        cyi.append([cyi0, cyi1])

    for qij in qi:
      word_counter[qij] += 1
      lower_word_counter[qij.lower()] += 1
      for qijk in qij:
        char_counter[qijk] += 1

    q.append(qi)
    q2.append(q2i)
    cq.append(cqi)
    cq2.append(cq2i)
    y.append(yi)
    cy.append(cyi)
    ids.append(article['id'])
    answers.append(answer)
    bi += 1

    
  assert len(q) == len(na), (len(q), len(na))
  assert len(q) == len(y), (len(q), len(y))
  assert len(q) == len(x), (len(q), len(x))
  assert len(q) == len(first_doc_ids), (len(q), len(first_doc_ids))
  assert len(q) == len(answer_doc_ids), (len(q), len(answer_doc_ids))
  # Get embedding map according to word_counter.
  word2vec_dict = get_word2vec(args, word_counter)
  lower_word2vec_dict = get_word2vec(args, lower_word_counter)

  # add context here
  """
  q/cq: query
  y: answer
  rx/rcx: index pairs: (article_no, paragraph_no)
  cy:
  idxs:
  ids:
  answerss:
  na:
  """

  if args.split_supports:
    if args.find_candidates:
      data = {'q': q, 'cq': cq, 'y': y, 'cy': cy, 'ids': ids, 'answers': answers, 'na': na, 'x': x, 'cx': cx, 'p': p, 'x2': x2, 'p2': p2, 'q2': q2, 'cq2': cq2, \
              'cand_span': cand_span, 'ccand_span': ccand_span, 'cand_span_y': cand_span_y, 'cand_word': cand_word, 'cand_word_y': cand_word_y, \
              'cand_word_found': cand_word_found, 'real_cand_word_found': real_cand_word_found, 'answer_doc_ids': answer_doc_ids, 'answer_ids_in_doc': answer_ids_in_doc, 'first_doc_ids': first_doc_ids}
      if args.rank_by_tfidf is False:
        assert len(topk_2layer_tfidf_docs) > 0
        data.update({'topk_2layer_tfidf_docs': topk_2layer_tfidf_docs})
    else:
      data = {'q': q, 'cq': cq, 'y': y, 'cy': cy, 'ids': ids, 'answers': answers, \
              'na': na, 'x': x, 'cx': cx, 'p': p, 'x2': x2, 'p2': p2, 'answer_doc_ids': answer_doc_ids, \
              'answer_ids_in_doc': answer_ids_in_doc, 'first_doc_ids': first_doc_ids}
  else:
    data = {'q': q, 'cq': cq, 'y': y, 'cy': cy, 'ids': ids, 'answers': answers, \
            'na': na, 'x': x, 'cx': cx, 'p': p}
  """
  x/cx: tokenized paragraphs (words and chars)
  p: untokenized paragraphs
  """
  shared = {'word_counter': word_counter, 'char_counter': char_counter, \
    'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, \
    'lower_word2vec': lower_word2vec_dict}

  print("saving ...")
  print("no answer: %d" %sum(na))
  print("missing candidates: %d" %mis_cand)
  if save_json:
    save(args, data, shared, out_name)
  else:
    prepro_each(args, data_type, out_name=out_name, pre_answer_doc_id=answer_doc_ids)
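
The loop above depends on a compute_answer_span helper that is not part of this snippet. A minimal sketch of a helper with the same contract is given below; it assumes the function returns character offsets (start, stop) of the first match, or (None, None) when the answer does not occur, and that matching is case-insensitive (the real implementation may differ).

# Hypothetical sketch of the helper assumed by the example above (not the original).
def compute_answer_span(context, answer):
    # case-insensitive first match; the asserts above compare the answer against
    # lower-cased context words, which suggests this kind of normalization
    start = context.lower().find(answer.lower())
    if start == -1:
        return None, None
    return start, start + len(answer)

The while-loop then re-applies the helper to the remaining suffix of the context to collect every further occurrence of the answer.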
Exemplo n.º 9
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    """


    :param args:            arguments
    :param data_type:       train, dev or all
    :param start_ratio:     default is 0.0
    :param stop_ratio:      default is 1.0
    :param out_name:        train, dev or test
    :param in_path:         default is None, not sure about what is this
    :return:
    """

    # 1. tokenize and sent tokenize

    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            """
            firstly word_tokenize the tokens and replace some
            chars, and return a list
            :param tokens:
            :return:
            """
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]  # treat the whole paragraph as a single "sentence"

    # 2. load data from disk
    source_path = in_path or os.path.join(args.source_dir,
                                          "{}-v{}.json".format(data_type, args.version))
    source_data = json.load(open(file=source_path, mode='r'))

    # 3. initiate some counter and some lists
    q, cq, rx, rcx = [], [], [], []
    # question words, question chars, and (article, paragraph) reference indices (rx/rcx)
    y, cy, ids, idxs = [], [], [], []
    x, cx = [], []
    answerss, p = [], []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_at_index = int(round(len(source_data['data']) * start_ratio))
    stop_at_index = int(round(len(source_data['data']) * stop_ratio))

    # 4. iterate the dataset
    max_ques_size = 0
    max_context_size = 0
    max_word_size = 0

    for article_index, article in enumerate(tqdm(source_data['data'][start_at_index:stop_at_index])):
        xp, cxp, pp = [], [], []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)

        for paragraph_index, paragraph in enumerate(article['paragraphs']):
            context = paragraph['context']
            context = context.replace("''", '" ')
            # note the trailing space: the replacement keeps the context length unchanged
            context = context.replace("``", '" ')

            # context is a str here
            list_of_wordlist = list(map(word_tokenize, sent_tokenize(context)))
            # sent_tokenize returns a list of sentences (often just one here);
            # map then applies word_tokenize to each sentence, giving a list of
            # word lists: [[words of sentence 1], [words of sentence 2], ...]

            list_of_wordlist = [process_tokens(tokens) for tokens in list_of_wordlist]
            # list_of_wordlist is 2-D: sentences x words
            for wordlist in list_of_wordlist:
                max_context_size = max(max_context_size, len(wordlist))

            list_of_charlist = [[list(word) for word in wordlist] for wordlist in list_of_wordlist]
            # list_of_charlist is 3-D: sentence, word, char

            xp.append(list_of_wordlist)
            # 3d, paragraph, sentence, words
            cxp.append(list_of_charlist)
            # 4d, paragraph, sentence, words, chars
            pp.append(context)
            # 2d, paragraph, context

            ## update counters
            num_qas = len(paragraph['qas'])
            for wordlist in list_of_wordlist:
                for word in wordlist:
                    word_counter[word] += num_qas
                    lower_word_counter[word.lower()] += num_qas
                    for char in word:
                        char_counter[char] += num_qas

            rxi = [article_index, paragraph_index]
            assert len(x) - 1 == article_index
            # x stores xp; xp is 3-D: paragraph, sentence, words
            assert len(x[article_index]) - 1 == paragraph_index

            for question in paragraph['qas']:
                question_wordslist = word_tokenize(question['question'])
                max_ques_size = max(max_ques_size, len(question_wordslist))
                # it's a list of words
                question_charslist = [list(word) for word in question_wordslist]
                # it's a list of charlist
                yi = []
                cyi = []
                answers = []  # the content of each answers

                for answer in question['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start_index = answer['answer_start']
                    answer_end_index = answer_start_index + len(answer_text)
                    yi0, yi1 = get_word_span(context,
                                             list_of_wordlist,  # 2-d: sentences, words
                                             answer_start_index,
                                             answer_end_index)
                    # e.g. yi0 == (0, 108): sentence index 0, start word index 108
                    # e.g. yi1 == (0, 111): sentence index 0, exclusive end word index 111

                    assert len(list_of_wordlist[yi0[0]]) > yi0[1]
                    # the sentence must contain more than yi0[1] words
                    assert len(list_of_wordlist[yi1[0]]) >= yi1[1]
                    # the sentence must contain at least yi1[1] words

                    w0 = list_of_wordlist[yi0[0]][yi0[1]]  # the start words of the answer
                    w1 = list_of_wordlist[yi1[0]][yi1[1] - 1]  # the last word of the answer

                    i0 = get_word_idx(context, list_of_wordlist, yi0)
                    i1 = get_word_idx(context, list_of_wordlist, (yi1[0], yi1[1] - 1))
                    # i0: char index in the context where the answer's first word starts (e.g. 515)
                    # i1: char index where the answer's last word starts, e.g. for
                    # 'Saint Bernadette Soubirous', i1 is the index of the 'S' in 'Soubirous'
                    cyi0 = answer_start_index - i0
                    # offset of the answer's first char within its first word (usually 0)
                    cyi1 = answer_end_index - i1 - 1
                    # offset of the answer's last char within its last word (length of the last word - 1 when the answer ends on a word boundary)

                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    # sanity check: the answer's first/last chars match the corresponding chars of the boundary words
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    #
                    yi.append([yi0, yi1])  # index of words
                    cyi.append([cyi0, cyi1])
                    # char offsets of the answer's first/last chars within the boundary words

                # update counters
                for word in question_wordslist:
                    word_counter[word] += 1
                    lower_word_counter[word.lower()] += 1
                    for char in word:
                        char_counter[char] += 1

                q.append(question_wordslist)  # 2-d list of wordlist for each question
                cq.append(question_charslist)  # 3-d, question-word-char
                y.append(yi)  # question-startendpair
                cy.append(cyi)  # question-startend char pair
                rx.append(rxi)  # list of article_id-paragraph_id pair
                rcx.append(rxi)
                ids.append(question['id'])  # ids for each question
                idxs.append(len(idxs))  # index for each question
                answerss.append(answers)  # list of answer in string

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    for word in word_counter:
        max_word_size = max(max_word_size, len(word))


    # add context here
    data = {
        'q': q,  # list of word lists, one per question, e.g. [['who', 'are', 'you'], ...]
        'cq': cq,
        # [<class 'list'>: [['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ['V', 'i', 'r', 'g', 'i', 'n'], ['M', 'a', 'r', 'y'], ['a', 'l', 'l', 'e', 'g', 'e', 'd', 'l', 'y'], ['a', 'p', 'p', 'e', 'a', 'r'], ['i', 'n'], ['1', '8', '5', '8'], ['i', 'n'], ['L', 'o', 'u', 'r', 'd', 'e', 's'], ['F', 'r', 'a', 'n', 'c', 'e'], ['?']] , ...]
        'y': y,  # list of <class 'list'>: [[(0, 108), (0, 111)]]
        '*x': rx,  # list of [article_index, paragraph_index] pairs, e.g. [0, 21]
        '*cx': rcx,  # same reference pairs as rx, used for the character view
        'cy': cy,  # char offsets of the answer within its boundary words
        'idxs': idxs,  # running index of each question
        'ids': ids,  # the id of each question (a uuid-like string)
        'answerss': answerss,  # the answer strings for each question
        '*p': rx  # reference into shared['p'] (same pairs as rx)
    }
    shared = {
        'x': x,  # words of each paragraph
        'cx': cx,  # characters of each paragraph
        'p': p,  # the content of each paragraph
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
Exemplo n.º 10
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    # source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_path = in_path or os.path.join(args.source_dir, "{}.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    # start_ai = int(round(len(source_data['data']) * start_ratio))
    # stop_ai = int(round(len(source_data['data']) * stop_ratio))
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))

    # for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
    for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        # print(article.keys()) # paragraph, title
        # raise
        # print(article) # {'question', 'answer', 'context', 'answer_list'}
        # raise
        # for pi, para in enumerate(article['paragraphs']):
        for pi, para in enumerate([article]):
            # print(para.keys()) # qas, context
            # raise
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # print(xi) # [['archi', ',', 'the', 'school']]
            # raise
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            # print(len(para['qas'])) # 5
            # print(para['qas']) # [{'answers': [{'text', 'answer_start'}], 'id', 'question'}]
            # raise
            for xij in xi:
                for xijk in xij:
                    # word_counter[xijk] += len(para['qas'])
                    # lower_word_counter[xijk.lower()] += len(para['qas'])
                    word_counter[xijk] += 1
                    lower_word_counter[xijk.lower()] += 1
                    for xijkl in xijk:
                        # char_counter[xijkl] += len(para['qas'])
                        char_counter[xijkl] += 1

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            # for qa in para['qas']:
            for qa in [article]:
                # get words
                # qi = word_tokenize(qa['question'])
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                # for answer in qa['answers']:
                try:
                    answer = qa['answer']
                    answer_text = qa['answer_list'][answer-1]
                except KeyError:
                    answer_text = ' '.join(qa['answer_list'])
                for _ in [answer_text]:
                    # answer_text = answer['text']
                    answers.append(answer_text)
                    # answer_start = answer['answer_start']
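                    # this data has no gold 'answer_start'; fall back to the first
                    # exact match of the answer text in the context, or to the whole
                    # context when the answer text is not found verbatim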
                    try:
                        answer_start = context.index(answer_text)
                        answer_stop = answer_start + len(answer_text)
                    except ValueError:
                        answer_start = 0
                        answer_stop = len(context)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    # context: str
                    # xi: [[word, word, word, ...]]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, '|', w0[cyi0:], '|', w1[:cyi1+1])
                    # raise
                    #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    #assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                # ids.append(qa['id'])
                ids.append(qa['question'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
Exemplo n.º 11
0
import json
import sys

from tqdm import tqdm

from my.corenlp_interface import CoreNLPInterface

in_path = sys.argv[1]
out_path = sys.argv[2]
url = sys.argv[3]
port = int(sys.argv[4])
data = json.load(open(in_path, 'r'))

h = CoreNLPInterface(url, port)


def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub)  # use start += 1 to find overlapping matches
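
# Example: list(find_all("abcabc", "abc")) -> [0, 3]; with "start += 1" instead of
# "start += len(sub)" it would also report overlapping matches,
# e.g. list(find_all("aaa", "aa")) -> [0, 1].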


def to_hex(s):
    return " ".join(map(hex, map(ord, s)))


def handle_nobreak(cand, text):
    if cand == text:
Exemplo n.º 12
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    # source_path = in_path or os.path.join(args.source_dir, "{}.seq.json".format(data_type))
    # source_data = json.load(open(source_path, 'r'))

    q, cq = [], []
    sents, csents = [], []
    rsents, rcsents = [], []
    sentslen = []
    scores = []
    labels = []
    ids = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    outfile = open('noise.txt', 'w')
    enum = 0
    total = 0
    overlap = 0
    if(args.mode=="squad"):
        source_path = os.path.join(args.source_dir, "{}.json".format(data_type))
        source_data = json.load(open(source_path, 'r'))
        start_ai = int(round(len(source_data['data']) * start_ratio))
        stop_ai = int(round(len(source_data['data']) * stop_ratio))
        for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
            for pi, para in enumerate(article['paragraphs']):
                # wordss
                context = para['context']
                context = context.replace("''", '" ')
                context = context.replace("``", '" ')
                xi = list(map(word_tokenize, sent_tokenize(context)))
                xi = [process_tokens(tokens) for tokens in xi]  # process tokens
                xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
                # context in sent-level
                contexti = sent_tokenize(context)
                context_sent_len = []
                len_cur = 0
                for cidx, c in enumerate(contexti):
                    len_cur += len(c) + 1
                    context_sent_len.append(len_cur)
                #assert len_cur-1 == len(context), (len_cur, len(context))

                # sentences in word-level
                sentsi = xi
                
                # sentences in char-level
                csentsi = [[list(xijk) for xijk in xij] for xij in xi]
                if args.debug:
                    print(sentsi)

                for xij in xi:
                    for xijk in xij:
                        word_counter[xijk] += len(para['qas'])
                        lower_word_counter[xijk.lower()] += len(para['qas'])
                        for xijkl in xijk:
                            char_counter[xijkl] += len(para['qas'])

                for qa in para['qas']:
                    # get question words
                    qaid = qa["id"]
                    qi = word_tokenize(qa['question'])
                    cqi = [list(qij) for qij in qi]
                    answer_loc_list = []
                    # if(len(qa['answers'])>1):
                    #     continue
                    answer = qa['answers'][0]
                    # for answer in qa['answers']:
                    answer_text = answer['text']
                    ansi = word_tokenize(answer_text)
                    answer_location = answer['answer_start']
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    answer_loc_list.append(get_sent_loc_idx(context_sent_len, answer_start, answer_stop))
                    score = get_score(answer_loc_list, len(sentsi), args.kernel_size)
                    label = get_label(answer_loc_list, len(sentsi))
                    for si in range(len(sentsi)):
                        if(len(sentsi[si]) > 60 or noise_flag(sentsi[si])):
                            outfile.write(' '.join(sentsi[si])+'\n')
                            enum+=1
                            continue
                        sents.append([sentsi[si]])
                        sentslen.append(len(sentsi[si]))
                        csents.append([csentsi[si]])
                        q.append(qi)
                        cq.append(cqi)
                        scores.append(score[si])
                        labels.append(label[si])
                        ids.append(qaid)

                    for qij in qi:
                        word_counter[qij] += 1
                        lower_word_counter[qij.lower()] += 1
                        for qijk in qij:
                            char_counter[qijk] += 1

    else:
        fi = 0
        qlen = []
        slen = []
        for file_size in ['0-400', '400-700', '700-']:
            source_path = os.path.join(args.source_dir, "{0}/{1}.seq.json".format(file_size, data_type))
            source_data = json.load(open(source_path, 'r'))
            start_ai = int(round(len(source_data['data']) * start_ratio))
            stop_ai = int(round(len(source_data['data']) * stop_ratio))
            for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
                xp, cxp = [], []
                sents.append(xp)
                csents.append(cxp)
                for pi, para in enumerate(article['paragraphs']):
                    # wordss
                    context = para['context']
                    context = context.replace("''", '" ')
                    context = context.replace("``", '" ')
                    xi = list(map(word_tokenize, sent_tokenize(context)))
                    xi = [process_tokens(tokens) for tokens in xi]  # process tokens
                    xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
                    # context in sent-level
                    contexti = sent_tokenize(context)
                    context_sent_len = []
                    len_cur = 0
                    for cidx, c in enumerate(contexti):
                        len_cur += len(c) + 1
                        if(len(xi[cidx]) < 200):
                            slen.append(len(xi[cidx]))
                        context_sent_len.append(len_cur)
                    assert len_cur-1 == len(context), (len_cur, len(context))

                    # sentences in word-level
                    sentsi = xi
                    
                    # sentences in char-level
                    csentsi = [[list(xijk) for xijk in xij] for xij in xi]
                    xp.append([[sent] for sent in sentsi])
                    cxp.append([[csent] for csent in csentsi])

                    if args.debug:
                        print(sentsi)

                    for xij in xi:
                        for xijk in xij:
                            word_counter[xijk] += len(para['qas'])
                            lower_word_counter[xijk.lower()] += len(para['qas'])
                            for xijkl in xijk:
                                char_counter[xijkl] += len(para['qas'])

                    for qa in para['qas']:
                        # get question words
                        total += 1
                        qaid = qa["id"]
                        qi = word_tokenize(qa['question'])
                        for qw in qi:
                            oflag = False
                            for xs in xi[0]:
                                if qw not in STOPWORDS and qw in xs:
                                    overlap += 1
                                    oflag = True
                                    break
                            if(oflag):
                                break
                        qlen.append(len(qi))
                        cqi = [list(qij) for qij in qi]
                        answer_loc_list = []
                        # if(len(qa['answers'])>1):
                        #     continue
                        answer = qa['answers'][0]
                        # for answer in qa['answers']:
                        answer_text = answer['text']
                        ansi = word_tokenize(answer_text)
                        answer_location = answer['answer_location']
                        api = []
                        for ans_idx, answer_start in enumerate(answer_location):
                            answer_stop = answer_start + len(ansi[ans_idx])
                            answer_loc_senti = get_sent_loc_idx(context_sent_len, answer_start, answer_stop)
                            answer_loc_list.append(answer_loc_senti)
                        label = get_label(answer_loc_list, len(sentsi))
                        for si in range(len(sentsi)):
                            if(len(sentsi[si]) > 60 or noise_flag(sentsi[si])):
                                outfile.write(' '.join(sentsi[si])+'\n')
                                enum+=1
                                continue
                            rsentsi = [ai+fi, pi, si]
                            rx = rsentsi
                            assert(sentsi[si] == sents[rx[0]][rx[1]][rx[2]][0])
                            #sents.append([sentsi[si]])
                            sentslen.append(len(sentsi[si]))
                            #csents.append([csentsi[si]])
                            q.append(qi)
                            cq.append(cqi)
                            labels.append(label[si])
                            ids.append(qaid)
                            rsents.append(rsentsi)
                            rcsents.append(rsentsi)

                        for qij in qi:
                            word_counter[qij] += 1
                            lower_word_counter[qij.lower()] += 1
                            for qijk in qij:
                                char_counter[qijk] += 1

                if args.debug:
                    break

            fi += stop_ai-start_ai

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    print(len(q), len(cq), len(labels))
    print(float(overlap)/total)
    print(enum)
    data = {'q': q, 'cq': cq, '*sents': rsents, '*csents': rcsents, 'label': labels, "id": ids, 
        "sentslen": sentslen}
    shared = {'sents': sents, 'csents': csents, 'word_counter': word_counter, 'char_counter': char_counter,
     'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    # print("saving ...")
    # save(args, data, shared, out_name)

    plt.figure()
    sns.set( palette="muted", color_codes=True)  
    sns.distplot(qlen, kde_kws={"label":"Question Length Distribution"})
    plt.savefig("qld")
    plt.figure()
    sns.distplot(slen, kde_kws={"label":"Sentence Length Distribution"})
    plt.savefig("sld")
    plt.show()
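
The example above accumulates context_sent_len (each sentence length plus one separator character) and maps answer character spans to sentence indices through get_sent_loc_idx, which is not shown. A minimal sketch of such a lookup, simplified to use only the start offset (the real helper also receives the stop offset and may handle boundary-spanning answers differently):

import bisect

# Hypothetical sketch of the sentence-lookup helper assumed above (not the original).
def get_sent_loc_idx(context_sent_len, answer_start, answer_stop):
    # context_sent_len[i] is the cumulative character offset just past sentence i,
    # so the answer belongs to the first sentence whose cumulative end exceeds
    # the answer's start offset (answer_stop is ignored in this simplification).
    return bisect.bisect_right(context_sent_len, answer_start)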
Exemplo n.º 13
0
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    """

    :param args: configurations
    :param data_type: train or dev
    :param start_ratio:
    :param stop_ratio:
    :param out_name: train, dev, test
    :param in_path:
    :return:
    """
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    # 1. load data
    source_path = in_path or os.path.join(
        args.source_dir, "{}-v{}.json".format(data_type, args.version))
    source_data = json.load(open(source_path, 'r'))
    # load the train or dev split for the requested version

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_at_index = int(round(len(source_data['data']) * start_ratio))
    stop_at_index = int(round(len(source_data['data']) * stop_ratio))

    # for each article
    for article_index, article in enumerate(
            tqdm(source_data['data'][start_at_index:stop_at_index])):
        xp, cxp = [], []
        pp = []
        x.append(xp)  # article_paragraph_sentence_wordlist
        cx.append(cxp)  # article_paragraph_sentence_word_charlist
        p.append(pp)  # article_contextlist

        # for each paragrph of the article
        for paragraph_index, paragraph in enumerate(article['paragraphs']):
            # wordss
            context = paragraph['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            list_of_wordlist = list(map(word_tokenize, sent_tokenize(context)))
            list_of_wordlist = [
                process_tokens(tokens) for tokens in list_of_wordlist
            ]  # process tokens
            # list_of_wordlist holds the words of each sentence
            # given the words, build the per-word character lists
            list_of_charlist = [[list(word) for word in word_list]
                                for word_list in list_of_wordlist]
            # list_of_charlist holds the characters of each word
            xp.append(list_of_wordlist)  # paragraph_sentence_wordlist
            cxp.append(list_of_charlist)  # paragraph_sentence_word_charlist
            pp.append(context)  # contextlist

            # update the counter to plus the number of questions
            for wordlist in list_of_wordlist:
                for word in wordlist:
                    word_counter[word] += len(paragraph['qas'])
                    lower_word_counter[word.lower()] += len(paragraph['qas'])
                    for char in word:
                        char_counter[char] += len(paragraph['qas'])

            rxi = [article_index, paragraph_index]
            assert len(x) - 1 == article_index
            assert len(x[article_index]) - 1 == paragraph_index
            for question in paragraph['qas']:
                # get words
                question_wordslist = word_tokenize(question['question'])
                question_charslist = [list(qij) for qij in question_wordslist]
                yi = []
                cyi = []
                answers = []
                for answer in question['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, list_of_wordlist,
                                             answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(list_of_wordlist[yi0[0]]) > yi0[1]
                    assert len(list_of_wordlist[yi1[0]]) >= yi1[1]
                    w0 = list_of_wordlist[yi0[0]][yi0[1]]
                    w1 = list_of_wordlist[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, list_of_wordlist, yi0)
                    i1 = get_word_idx(context, list_of_wordlist,
                                      (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in question_wordslist:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(question_wordslist)  # question_wordlist,
                cq.append(question_charslist)  # question_word_charlist
                y.append(yi)  #
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(question['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,  # list of word lists, one per question, e.g. [['who', 'are', 'you'], ...]
        'cq': cq,
        # [<class 'list'>: [['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ['V', 'i', 'r', 'g', 'i', 'n'], ['M', 'a', 'r', 'y'], ['a', 'l', 'l', 'e', 'g', 'e', 'd', 'l', 'y'], ['a', 'p', 'p', 'e', 'a', 'r'], ['i', 'n'], ['1', '8', '5', '8'], ['i', 'n'], ['L', 'o', 'u', 'r', 'd', 'e', 's'], ['F', 'r', 'a', 'n', 'c', 'e'], ['?']] , ...]
        'y': y,  # list of <class 'list'>: [[(0, 108), (0, 111)]]
        '*x': rx,  # list of [article_index, paragraph_index] pairs, e.g. [0, 21]
        '*cx': rcx,  # same reference pairs as rx, used for the character view
        'cy': cy,  # char offsets of the answer within its boundary words
        'idxs': idxs,  # running index of each question
        'ids': ids,  # the id of each question (a uuid-like string)
        'answerss': answerss,  # the answer strings for each question
        '*p': rx  # reference into shared['p'] (same pairs as rx)
    }
    # the following variables are shared by several question,
    shared = {
        'x': x,  # words of each paragraph
        'cx': cx,  # characters of each paragraph
        'p': p,  # the content of each paragraph
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
Exemplo n.º 14
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        import jieba  # needed for the Chinese word segmentation below
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            # original English tokenizer:
            # return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
            return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

#    if not args.split:
#        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r', encoding='utf-8'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            
            ################### added by zhijing: normalize full-width characters to their ASCII equivalents
            table = {ord(f):ord(t) for f,t in zip(
            	',。!?【】()%#@&1234567890',
            	',.!?[]()%#@&1234567890')}
            context = context.translate(table)
            ################### add by zhijing
            print(context)
            print(len(sent_tokenize(context)))
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            print('xi')
            print(xi)
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
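
The translate table added in the example above maps full-width punctuation and digits to their ASCII equivalents before sentence splitting. A tiny illustration of the mechanism (not taken from the source):

# Illustration of full-width -> ASCII normalization via str.translate.
table = {ord(f): ord(t) for f, t in zip('，。！？１２３', ',.!?123')}
print('你好，世界！１２３'.translate(table))  # -> 你好,世界!123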
Exemplo n.º 15
0
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                answer = qa['answer']

                yi.append(answer)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                def put():
                    q.append(qi)
                    cq.append(cqi)
                    y.append(yi)
                    rx.append(rxi)
                    rcx.append(rxi)
                    ids.append(qa['id'])
                    idxs.append(len(idxs))
                    answerss.append(answers)

                put()
                if data_type == 'train' and answer:
                    for i in range(3):
                        put()

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
Exemplo n.º 16
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-qar_squad_all.jsonl".format(data_type))
    rfp = open(source_path, 'r')

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    #start_ai = int(round(len(source_data['data']) * start_ratio))
    #stop_ai = int(round(len(source_data['data']) * stop_ratio))
    pi = 0
    ai = 0
    xp, cxp = [], []
    pp = []
    x.append(xp)
    cx.append(cxp)
    p.append(pp)

    for line in tqdm(rfp):
        para = json.loads(line)
        context = para['context']
        context = context.replace("''", '" ')
        context = context.replace("``", '" ')
        xi = list(map(word_tokenize, sent_tokenize(context)))
        # xi = context.split()
        xi = [process_tokens(tokens) for tokens in xi]  # process tokens
        # given xi, add chars
        cxi = [[list(xijk) for xijk in xij] for xij in xi]
        xp.append(xi)
        cxp.append(cxi)
        pp.append(context)

        for xij in xi:
            for xijk in xij:
                word_counter[xijk] += len(para['qas'])
                lower_word_counter[xijk.lower()] += len(para['qas'])
                for xijkl in xijk:
                    char_counter[xijkl] += len(para['qas'])

        rxi = [ai, pi]
        assert len(x) - 1 == ai
        assert len(x[ai]) - 1 == pi
        for qa in para['qas']:
            # get words
            qa_text = qa['question']

            qa_text = qa_text.replace("''", '" ')
            qa_text = qa_text.replace("``", '" ')

            qi = word_tokenize(qa_text)

            # qi = qa['question'].split()
            cqi = [list(qij) for qij in qi]
            yi = []
            cyi = []
            answers = []
            for answer in qa['answers']:
                flag = False
                answer_text = answer['text']

                answer_text = answer_text.replace("''", '" ')
                answer_text = answer_text.replace("``", '" ')

                answers.append(answer_text)
                answer_start = answer['answer_start']
                answer_stop = answer_start + len(answer_text)
                # TODO : put some function that gives word_start, word_stop here
                yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                # yi0 = answer['answer_word_start'] or [0, 0]
                # yi1 = answer['answer_word_stop'] or [0, 1]
                assert len(xi[yi0[0]]) > yi0[1]
                assert len(xi[yi1[0]]) >= yi1[1]
                w0 = xi[yi0[0]][yi0[1]]
                w1 = xi[yi1[0]][yi1[1]-1]

                if len(w1) == 0 and len(xi[yi1[0]][yi1[1]-2]) != 0:
                    flag = True
                    w1 = xi[yi1[0]][yi1[1]-2]

                i0 = get_word_idx(context, xi, yi0)
                i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                cyi0 = answer_start - i0
                cyi1 = answer_stop - i1 - 1
                # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)

                if flag:
                    assert answer_text[-2] == w1[cyi1], (answer_text, w1, cyi1)
                else:
                    assert answer_text[-1] == w1[cyi1], (answer_text, w1, cyi1)

                assert cyi0 < 32, (answer_text, w0)
                assert cyi1 < 32, (answer_text, w1)

                yi.append([yi0, yi1])
                cyi.append([cyi0, cyi1])

            for qij in qi:
                word_counter[qij] += 1
                lower_word_counter[qij.lower()] += 1
                for qijk in qij:
                    char_counter[qijk] += 1

            q.append(qi)
            cq.append(cqi)
            y.append(yi)
            cy.append(cyi)
            rx.append(rxi)
            rcx.append(rxi)
            ids.append(qa['id'])
            idxs.append(len(idxs))
            answerss.append(answers)

        if args.debug:
            break

        pi += 1


    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("Saving ...")
    save(args, data, shared, out_name)
    print("Saving complete!")
Exemplo n.º 17
0
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    print("Preprocessing data type %s" % data_type)
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.csv".format(data_type))
    print("Reading data from source path %s" % source_path)
    source_data = pd.read_csv(source_path,
                              encoding='utf-8',
                              dtype=dict(is_answer_absent=float),
                              na_values=dict(question=[],
                                             story_text=[],
                                             validated_answers=[]),
                              keep_default_na=False)

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []  # Gold standard answers
    span_answerss = []  # Answers from our spans
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))
    data_rows = source_data.iterrows()
    story_ids_to_idx = {}
    idx_to_story_ids = {}

    for ai, data_point in enumerate(tqdm(data_rows)):
        question_index, question_info = data_point[0], data_point[1]
        story_id = question_info['story_id']
        context = question_info['story_text']
        context = context.replace("''", '" ')
        context = context.replace("``", '" ')
        question = question_info['question']
        question_id = ai
        answer_char_ranges = question_info['answer_char_ranges']

        # Copy get answer script from the newsqa dataset
        baseline_answers = []
        # Prefer validated answers.
        # If there are no validated answers, use the ones that are provided.
        if 'validated_answers' not in question_info or not question_info[
                'validated_answers']:
            # Ignore per selection splits.
            char_ranges = question_info['answer_char_ranges'].replace(
                '|', ',').split(',')
        else:
            validated_answers_dict = json.loads(
                question_info['validated_answers'])
            char_ranges = []
            for k, v in validated_answers_dict.items():
                char_ranges += v * [k]

        for char_range in char_ranges:
            if char_range.lower() == 'none':
                baseline_answers.append('NONE')
            elif ':' in char_range:
                start, end = map(int, char_range.split(':'))
                answer = question_info['story_text'][start:end]
                baseline_answers.append(answer)
        paragraph_ptr = -1
        pi = 0
        if story_id not in story_ids_to_idx:
            paragraph_ptr = len(story_ids_to_idx)
            story_ids_to_idx[story_id] = paragraph_ptr
            idx_to_story_ids[paragraph_ptr] = story_id
            xp, cxp = [], []
            pp = []
            x.append(xp)
            cx.append(cxp)
            p.append(pp)

            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens

            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += 1
                    lower_word_counter[xijk.lower()] += 1
                    for xijkl in xijk:
                        char_counter[xijkl] += 1

        else:
            paragraph_ptr = story_ids_to_idx[story_id]
        rxi = [paragraph_ptr, pi]
        """
        print("TEST")
        print("TEST")
        print(story_ids_to_idx)
        print(len(xp))
        print(paragraph_ptr)
        """
        xi = x[paragraph_ptr][pi]

        qi = word_tokenize(question)
        cqi = [list(qij) for qij in qi]
        yi = []
        cyi = []
        answers = []
        answer_char_ranges_split = answer_char_ranges.split("|")
        for answer in answer_char_ranges_split:
            if answer == 'None':
                continue
            answer_char_range = answer.split(",")[0].split(":")
            answer_start = int(answer_char_range[0])
            answer_stop = int(answer_char_range[-1])
            answer_text = context[answer_start:answer_stop].strip()

            if answer_text == "":
                print("BAD ANSWER GIVEN %s" % answer_char_range)
                continue

            answers.append(answer_text)

            # TODO : put some function that gives word_start, word_stop here
            yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
            # yi0 = answer['answer_word_start'] or [0, 0]
            # yi1 = answer['answer_word_stop'] or [0, 1]
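            # yi0/yi1 are (sentence, word) indices of the answer's first word and of
            # one past its last word; cyi0/cyi1 below are character offsets of the
            # answer's start and end inside those boundary words.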

            assert len(xi[yi0[0]]) > yi0[1]
            assert len(xi[yi1[0]]) >= yi1[1]
            w0 = xi[yi0[0]][yi0[1]]
            w1 = xi[yi1[0]][yi1[1] - 1]

            i0 = get_word_idx(context, xi, yi0)
            i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
            cyi0 = answer_start - i0
            cyi1 = answer_stop - i1 - 1

            #print(question, answer_text, w0[cyi0:], w1[:cyi1+1])
            #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
            #assert answer_text[-1] == w1[-1]
            assert cyi0 < 32, (answer_text, w0)
            assert cyi1 < 32, (answer_text, w1)

            yi.append([yi0, yi1])
            cyi.append([cyi0, cyi1])

        for qij in qi:
            word_counter[qij] += 1
            lower_word_counter[qij.lower()] += 1
            for qijk in qij:
                char_counter[qijk] += 1

        q.append(qi)
        cq.append(cqi)
        y.append(yi)
        cy.append(cyi)
        rx.append(rxi)
        rcx.append(rxi)
        ids.append(question_id)
        idxs.append(len(idxs))
        answerss.append(baseline_answers)
        span_answerss.append(answers)
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        'span_answerss': span_answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'story_ids_to_idx': story_ids_to_idx,
        'idx_to_story_ids': idx_to_story_ids,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
Example #18
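# Note: besides the usual helpers (os, json, Counter, tqdm, process_tokens,
# get_word2vec, save), this variant relies on matplotlib.pyplot being imported
# as plt and on a get_sent_loc_idx helper which, judging from its usage below,
# maps a character range to the index of the sentence containing it.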
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    sent_tokenize0 = lambda para: [para]
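    # Defined but not used below; other variants reassign sent_tokenize to this
    # whole-paragraph fallback when args.split is off.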

    # source_path = in_path or os.path.join(args.source_dir, "{}.seq.json".format(data_type))
    # source_data = json.load(open(source_path, 'r'))

    total = 0
    debug_out = []
    debug_q = Counter()
    false_num = 0
    fnum = 0
    q, cq = [], []
    y = []
    sents, csents = [], []
    rsents, rcsents = [], []
    ids = []
    answerss = []
    q_counter, q_counter0 = {}, {}
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    source_path = os.path.join(args.source_dir,
                               "{}.seq.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))
    filter_dict = json.load(open(args.filter_file, 'r'))
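    # filter_dict presumably maps a question id to the sentence indices kept by an
    # upstream sentence-selection step; only questions with an entry are processed.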
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        sents.append(xp)
        csents.append(cxp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
            # context in sent-level
            contexti = sent_tokenize(context)
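            # Cumulative character offset of each sentence end, assuming sentences
            # are re-joined with single spaces (hence the +1 and the assert below).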
            context_sent_len = []
            len_cur = 0
            for cidx, c in enumerate(contexti):
                len_cur += len(c) + 1
                context_sent_len.append(len_cur)
            assert len_cur - 1 == len(context), (len_cur, len(context))

            # sentences in word-level
            sentsi = xi

            # sentences in char-level
            csentsi = [[list(xijk) for xijk in xij] for xij in xi]

            xp.append(sentsi)
            cxp.append(csentsi)

            for qa in para['qas']:
                # get question words
                qaid = qa["id"]
                q_counter[qa['question']] = q_counter.get(qa['question'], 0) + 1
                total += 1

                if qaid in filter_dict:
                    valid_sentid = sorted(filter_dict[qaid])
                    inv_sentid = {k: v for v, k in enumerate(valid_sentid)}
                    rsentsi = [ai, pi, valid_sentid]
                    qi = word_tokenize(qa['question'])
                    cqi = [list(qij) for qij in qi]
                    # xi = list(map(word_tokenize, sent_tokenize(context)))
                    newxi = [xi[sentid] for sentid in valid_sentid]
                    word_num = list(map(len, newxi))
                    newxi = [[x for s in newxi for x in s]]
                    cnewxi = [[list(xijk) for xijk in xij] for xij in newxi]
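                    # The selected sentences are flattened into a single pseudo-sentence;
                    # word_num keeps per-sentence word counts so answer positions can be
                    # re-based into the flattened token list.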

                    yi = []
                    answers = []
                    for answer in qa['answers']:
                        yii = []
                        answer_text = answer['text']
                        ansi = word_tokenize(answer_text)
                        answer_location = answer['answer_location']
                        not_complete = False
                        for ans_idx, answer_start in enumerate(
                                answer_location):
                            answer_stop = answer_start + len(ansi[ans_idx])
                            answer_loc_senti = get_sent_loc_idx(
                                context_sent_len, answer_start, answer_stop)
                            if answer_loc_senti not in valid_sentid:
                                not_complete = True
                                break
                            start = sum(word_num[:inv_sentid[answer_loc_senti]])
                            end = sum(word_num[:inv_sentid[answer_loc_senti] + 1])
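                            # start/end bound the answer's sentence within the
                            # flattened token list newxi[0].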
                            try:
                                pos = newxi[0].index(ansi[ans_idx], start, end)
                            except ValueError:
                                # Answer token not found inside its expected window.
                                not_complete = True
                                false_num += 1
                                print(xi[answer_loc_senti],
                                      newxi[0][start - 5:end + 5], word_num,
                                      start, end, newxi[0][start:end], ansi)
                                break
                            yii.append(pos)
                        if not_complete:
                            continue
                        yi.append(yii)
                        answers.append(answer_text)

                    if len(yi) == 0:
                        fnum += 1
                        q_counter0[qa['question']] = q_counter0.get(qa['question'], 0) + 1
                        continue

                    for xij in newxi:
                        for xijk in xij:
                            word_counter[xijk] += 1
                            lower_word_counter[xijk.lower()] += 1
                            for xijkl in xijk:
                                char_counter[xijkl] += 1

                    for qij in qi:
                        word_counter[qij] += 1
                        lower_word_counter[qij.lower()] += 1
                        for qijk in qij:
                            char_counter[qijk] += 1

                    q.append(qi)
                    cq.append(cqi)
                    y.append(yi)
                    ids.append(qa['id'])
                    rsents.append(rsentsi)
                    rcsents.append(rsentsi)
                    answerss.append(answers)


    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    qx, qy = [], []
    print("{0}/{1}".format(len(q), total))
    for k in q_counter0.keys():
        if float(q_counter0[k]) / q_counter[k] > 0.05:
            qx.append(q_counter0[k])
            qy.append(q_counter[k])
            print(k, "{}/{}".format(q_counter0[k], q_counter[k]))

    xaxis = list(range(len(qx)))
    plt.bar(xaxis, qx, width=0.5)
    plt.bar(xaxis, qy, width=0.2)
    plt.show()
    data = {
        'q': q,
        'cq': cq,
        '*x': rsents,
        '*cx': rcsents,
        'y': y,
        "id": ids,
        "answer": answerss
    }
    shared = {
        'x': sents,
        'cx': csents,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }
    # print(debug_q)
    # print("saving ...")
    # with open("debug_out.json", "w") as fh:
    #     json.dump(debug_out, fh)
    save(args, data, shared, out_name)