Example #1
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    print("Preprocessing data type %s" % data_type)
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.csv".format(data_type))
    print("Reading data from source path %s" % source_path)
    source_data = pd.read_csv(source_path,
                              encoding='utf-8',
                              dtype=dict(is_answer_absent=float),
                              na_values=dict(question=[],
                                             story_text=[],
                                             validated_answers=[]),
                              keep_default_na=False)

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []  # Gold standard answers
    span_answerss = []  # Answers from our spans
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))
    data_rows = source_data.iloc[start_ai:stop_ai].iterrows()
    story_ids_to_idx = {}
    idx_to_story_ids = {}

    for ai, data_point in enumerate(tqdm(data_rows)):
        question_index, question_info = data_point[0], data_point[1]
        story_id = question_info['story_id']
        context = question_info['story_text']
        context = context.replace("''", '" ')
        context = context.replace("``", '" ')
        question = question_info['question']
        question_id = ai
        answer_char_ranges = question_info['answer_char_ranges']

        # Copy get answer script from the newsqa dataset
        baseline_answers = []
        # Prefer validated answers.
        # If there are no validated answers, use the ones that are provided.
        if 'validated_answers' not in question_info or not question_info['validated_answers']:
            # Ignore per-selection splits.
            char_ranges = question_info['answer_char_ranges'].replace('|', ',').split(',')
        else:
            validated_answers_dict = json.loads(
                question_info['validated_answers'])
            char_ranges = []
            for k, v in validated_answers_dict.items():
                char_ranges += v * [k]

        for char_range in char_ranges:
            if char_range.lower() == 'none':
                baseline_answers.append('NONE')
            elif ':' in char_range:
                start, end = map(int, char_range.split(':'))
                answer = question_info['story_text'][start:end]
                baseline_answers.append(answer)
        paragraph_ptr = -1
        pi = 0
        if story_id not in story_ids_to_idx:
            paragraph_ptr = len(story_ids_to_idx)
            story_ids_to_idx[story_id] = paragraph_ptr
            idx_to_story_ids[paragraph_ptr] = story_id
            xp, cxp = [], []
            pp = []
            x.append(xp)
            cx.append(cxp)
            p.append(pp)

            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens

            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += 1
                    lower_word_counter[xijk.lower()] += 1
                    for xijkl in xijk:
                        char_counter[xijkl] += 1

        else:
            paragraph_ptr = story_ids_to_idx[story_id]
        rxi = [paragraph_ptr, pi]
        """
        print("TEST")
        print("TEST")
        print(story_ids_to_idx)
        print(len(xp))
        print(paragraph_ptr)
        """
        xi = x[paragraph_ptr][pi]

        qi = word_tokenize(question)
        cqi = [list(qij) for qij in qi]
        yi = []
        cyi = []
        answers = []
        answer_char_ranges_split = answer_char_ranges.split("|")
        for answer in answer_char_ranges_split:
            if answer == 'None':
                continue
            answer_char_range = answer.split(",")[0].split(":")
            answer_start = int(answer_char_range[0])
            answer_stop = int(answer_char_range[-1])
            answer_text = context[answer_start:answer_stop].strip()

            if answer_text == "":
                print("BAD ANSWER GIVEN %s" % answer_char_range)
                continue

            answers.append(answer_text)

            # TODO : put some function that gives word_start, word_stop here
            yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
            # yi0 = answer['answer_word_start'] or [0, 0]
            # yi1 = answer['answer_word_stop'] or [0, 1]

            assert len(xi[yi0[0]]) > yi0[1]
            assert len(xi[yi1[0]]) >= yi1[1]
            w0 = xi[yi0[0]][yi0[1]]
            w1 = xi[yi1[0]][yi1[1] - 1]

            i0 = get_word_idx(context, xi, yi0)
            i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
            cyi0 = answer_start - i0
            cyi1 = answer_stop - i1 - 1

            #print(question, answer_text, w0[cyi0:], w1[:cyi1+1])
            #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
            #assert answer_text[-1] == w1[-1]
            assert cyi0 < 32, (answer_text, w0)
            assert cyi1 < 32, (answer_text, w1)

            yi.append([yi0, yi1])
            cyi.append([cyi0, cyi1])

        for qij in qi:
            word_counter[qij] += 1
            lower_word_counter[qij.lower()] += 1
            for qijk in qij:
                char_counter[qijk] += 1

        q.append(qi)
        cq.append(cqi)
        y.append(yi)
        cy.append(cyi)
        rx.append(rxi)
        rcx.append(rxi)
        ids.append(question_id)
        idxs.append(len(idxs))
        answerss.append(baseline_answers)
        span_answerss.append(answers)
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        'span_answerss': span_answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'story_ids_to_idx': story_ids_to_idx,
        'idx_to_story_ids': idx_to_story_ids,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
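
All of the examples above rely on two helpers, get_word_span and get_word_idx, that come from the repository's utility module and are not shown here. The sketch below is only an assumption about their behaviour, inferred from how the return values are used (yi1[1] is treated as an exclusive word index and get_word_idx as a character offset); the real helpers may differ in details.

def get_2d_spans(text, tokenss):
    """Character (start, stop) span of every token, found left to right in `text`."""
    spanss = []
    cur = 0
    for tokens in tokenss:
        spans = []
        for token in tokens:
            cur = text.find(token, cur)
            assert cur >= 0, (token, text)
            spans.append((cur, cur + len(token)))
            cur += len(token)
        spanss.append(spans)
    return spanss


def get_word_span(context, wordss, start, stop):
    """Map a character range [start, stop) onto word indices.

    Returns ((sent_idx, first_word_idx), (sent_idx, last_word_idx + 1)),
    i.e. the second word index is exclusive, which is why the callers above
    take xi[yi1[0]][yi1[1] - 1] as the last word.
    """
    spanss = get_2d_spans(context, wordss)
    idxs = []
    for sent_idx, spans in enumerate(spanss):
        for word_idx, (w_start, w_stop) in enumerate(spans):
            if w_stop > start and w_start < stop:  # token overlaps the answer range
                idxs.append((sent_idx, word_idx))
    assert idxs, (context, start, stop)
    return idxs[0], (idxs[-1][0], idxs[-1][1] + 1)


def get_word_idx(context, wordss, loc):
    """Character offset in `context` at which the token at (sent_idx, word_idx) starts."""
    spanss = get_2d_spans(context, wordss)
    return spanss[loc[0]][loc[1]][0]
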
Example #2
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r', encoding="utf-8"))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    #add
		    
                    #print("i0 :",i0, "i1 :",i1, "cyi0 :", cyi0, "w0 :",w0 )
                    #print("xi :", xi)
                    #print( "yi0",yi0, "(yi1[0], yi1[1]-1) :",(yi1[0], yi1[1]-1) )
                    #print("answer_text",answer_text)
                    #print("cyi1:",cyi1)
                    #print("answer_text[0] :",answer_text[0])
                    #print("answer_text[-1] :",answer_text[-1])
                    #print("w0 :",w0)
                    #print("w1 :",w1)
                    #so far

                    #print(":):):)")
                    #print("answer_text:",answer_text,"\nstart:", w0[cyi0:],"\nend:", w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx, 'na': na}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
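
For reference, every prepro_each variant above expects an argparse-style args object. The driver below is a minimal usage sketch for the SQuAD-style variants; the target_dir and glove_* attribute names are assumptions about what get_word2vec and save read, so check the repository's prepro script for the authoritative flag list.

from types import SimpleNamespace

args = SimpleNamespace(
    tokenizer="PTB",          # or "Stanford" (then url/port must point at a CoreNLP server)
    url="localhost", port=8000,
    split=True,               # False treats each paragraph as a single "sentence"
    source_dir="data/squad",
    target_dir="data/squad/out",           # assumed: where save() writes its output
    suffix="",                # source file becomes "{data_type}-{suffix}v1.1.json"
    debug=False,
    glove_dir="data/glove", glove_corpus="6B", glove_vec_size=100,  # assumed GloVe flags
)

# Typical usage: build train/dev shards from train-v1.1.json and a test shard from dev-v1.1.json.
prepro_each(args, "train", out_name="train", stop_ratio=0.9)
prepro_each(args, "train", out_name="dev", start_ratio=0.9)
prepro_each(args, "dev", out_name="test")
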
Example #3
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    parser = StanfordParser(model_path=os.getenv("StanfordParser_model_path"))
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    # if not args.split:
    #     sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(
        args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    q_syn_seq = []
    na = []
    cy = []
    x, cx = [], []
    syn_seq = []
    rsyn_seq = []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    syn_counter = Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    abandon = 0
    # NOTE: hard-coded resume from article 221; start_ratio / stop_ratio are not applied here
    for ai, article in enumerate(tqdm(source_data['data'][221:])):
        xp, cxp = [], []
        syn_seqp = []
        pp = []
        x.append(xp)
        cx.append(cxp)
        syn_seq.append(syn_seqp)
        p.append(pp)
        p_i = -1
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ').replace("``", '" ').replace(
                'e.g.', 'e-g,')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            try:
                syn_seqpp = sents_parser(xi, parser)
                p_i += 1
            except Exception:
                abandon += 1
                continue
            for sent in syn_seqpp:
                for word in sent:
                    for syn in word:
                        syn_counter[syn] += 1

            syn_seqp.append(syn_seqpp)
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai + 221, p_i]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == p_i
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                qi = process_tokens(qi)
                try:
                    q_syn_seqq = sent_parser(qi, parser)
                except Exception:
                    continue
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start,
                                             answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                q_syn_seq.append(q_syn_seqq)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                rsyn_seq.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
        print('abandoned {} paragraphs so far'.format(abandon))
        if args.debug:
            break
    for sent in q_syn_seq:
        for word in sent:
            for syn in word:
                syn_counter[syn] += 1
    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        '*syn_seq': rsyn_seq,
        'cy': cy,
        'q_syn_seq': q_syn_seq,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx,
        'na': na
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'syn_seq': syn_seq,
        'syn_counter': syn_counter,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }
    print("saving ...")
    save(args, data, shared, out_name)
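
Another helper every example calls but none defines is process_tokens. The sketch below assumes its job is to split tokens further on dashes, slashes and curly quotes left attached by the PTB tokenizer; this is inferred from how the outputs are used (Examples #4 and #5 filter out empty strings afterwards, which such a split can produce) and is not the repository's actual code.

import re

def process_tokens(temp_tokens):
    """Split each token further on dash/slash/quote characters (sketch)."""
    seps = ("-", "\u2212", "\u2014", "\u2013", "/", "~",
            '"', "'", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
    tokens = []
    for token in temp_tokens:
        # keep the separators as their own tokens; may yield empty strings
        tokens.extend(re.split("([{}])".format("".join(seps)), token))
    return tokens
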
Example #4
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.seq.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    ansi = word_tokenize(answer_text)
                    answer_location = answer['answer_location']
                    yii = []
                    for ans_idx, answer_start in enumerate(answer_location):
                        answer_stop = answer_start + len(ansi[ans_idx])
                        yi0, _ = get_word_span(context, xi, answer_start,
                                               answer_stop)
                        assert len(xi[yi0[0]]) > yi0[1]
                        w0 = xi[yi0[0]][yi0[1]]
                        assert ansi[ans_idx] == w0, (ansi[ans_idx], w0)
                        yii.append(yi0)

                    yi.append(yii)

                    # answer_start = answer['answer_start']
                    # answer_stop = answer_start + len(answer_text)
                    # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # # yi0 = answer['answer_word_start'] or [0, 0]
                    # # yi1 = answer['answer_word_stop'] or [0, 1]
                    # assert len(xi[yi0[0]]) > yi0[1]
                    # assert len(xi[yi1[0]]) >= yi1[1]
                    # w0 = xi[yi0[0]][yi0[1]]
                    # w1 = xi[yi1[0]][yi1[1]-1]
                    # i0 = get_word_idx(context, xi, yi0)
                    # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    # cyi0 = answer_start - i0
                    # cyi1 = answer_stop - i1 - 1
                    # # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    # assert answer_text[-1] == w1[cyi1]
                    # assert cyi0 < 32, (answer_text, w0)
                    # assert cyi1 < 32, (answer_text, w1)
                    # yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
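
Example #4 departs from the SQuAD layout: each answer carries an answer_location list with one character offset per answer token, and yi stores per-token (sentence, word) positions instead of a (start, stop) span. A made-up record illustrating the assumed input shape:

qa = {
    "id": "q0",
    "question": "Who wrote Emma?",
    "answers": [{
        "text": "Jane Austen",
        # one character offset per token of the answer, in context order
        "answer_location": [42, 47],
    }],
}
# After the loop above, yi for this question would look like
# [[(0, 7), (0, 8)]]  -- one inner list per answer, one (sent_idx, word_idx) per answer token.
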
Example #5
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    totalnum0 = 0
    falsenum0 = 0
    falsenum1 = 0
    truenum0 = 0
    truenum1 = 0
    outlist = []
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.seq.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            ctx = para['context']
            if (len(ctx.split()) <= 800):
                cut = -1
            else:
                cut = sum(map(len, ctx.split()[:800])) + 800
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                totalnum0 += 1
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                flag = False   # set True if at least one answer lies fully within the first 800 words
                Flag = True    # stays True only if every answer extends past the 800-token character cut
                if (cut > -1):
                    for ans in qa['answers']:
                        if (max(ans['answer_location'])) < cut:
                            Flag = False
                            break
                else:
                    Flag = False
                if (Flag):
                    falsenum1 += 1

                for answer in qa['answers']:
                    flag1 = True
                    answer_text = answer['text']
                    answers.append(answer_text)
                    ansi = word_tokenize(answer_text)
                    answer_location = answer['answer_location']
                    yii = []
                    for ans_idx, answer_start in enumerate(answer_location):
                        answer_stop = answer_start + len(ansi[ans_idx])
                        yi0, _ = get_word_span(context, xi, answer_start,
                                               answer_stop)
                        if (yi0[1] >= 800):
                            flag1 = False
                        assert len(xi[yi0[0]]) > yi0[1]
                        w0 = xi[yi0[0]][yi0[1]]
                        assert ansi[ans_idx] == w0, (ansi[ans_idx], w0)
                        yii.append(yi0)

                    if (flag1):
                        flag = True

                    yi.append(yii)
                if (flag):
                    truenum0 += 1

                if (flag == Flag):
                    print(ctx, qa, yi, cut)
                    outlist.append([ctx, qa])

                    # answer_start = answer['answer_start']
                    # answer_stop = answer_start + len(answer_text)
                    # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # # yi0 = answer['answer_word_start'] or [0, 0]
                    # # yi1 = answer['answer_word_stop'] or [0, 1]
                    # assert len(xi[yi0[0]]) > yi0[1]
                    # assert len(xi[yi1[0]]) >= yi1[1]
                    # w0 = xi[yi0[0]][yi0[1]]
                    # w1 = xi[yi1[0]][yi1[1]-1]
                    # i0 = get_word_idx(context, xi, yi0)
                    # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    # cyi0 = answer_start - i0
                    # cyi1 = answer_stop - i1 - 1
                    # # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    # assert answer_text[-1] == w1[cyi1]
                    # assert cyi0 < 32, (answer_text, w0)
                    # assert cyi1 < 32, (answer_text, w1)
                    # yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    print(truenum0, totalnum0, float(truenum0) / totalnum0)
    print(falsenum1, totalnum0, 1 - float(falsenum1) / totalnum0)
    with open('debugcnt.json', 'w') as f:
        json.dump(outlist, f)
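
A note on the cut computation in Example #5: sum(map(len, ctx.split()[:800])) + 800 approximates the character offset just past the 800th whitespace-separated token by adding one separator character per token. A quick sanity check of that arithmetic, assuming single-space-separated tokens:

ctx = "w1 w2 w3 " * 400                       # toy context with 1200 two-character tokens
tokens = ctx.split()
cut = sum(map(len, tokens[:800])) + 800       # 800 * 2 + 800 = 2400
assert ctx[:cut].split() == tokens[:800]      # exact when every token is followed by a single space
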
Example #6
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    '''This is where they read the json'''
    #TODO: load fever and try to preprocess it
    source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))

    with open(source_path,"r") as f:

        jlr= JSONLineReader()
        all_claims_ev=jlr.process(f)
        logging.info(all_claims_ev["verifiable"])
        sys.exit(1)

    source_data = json.load(open(source_path, 'r'))
    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        logging.info("reached here1")
        logging.info(p)
        logging.info(article)

        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            logging.info(context)
            sys.exit(1)

            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
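
Example #6 reads a FEVER-style claims file through a JSONLineReader class that is not shown. Below is a minimal sketch of such a reader, assuming process(fp) simply parses one JSON object per line and returns the resulting list; the project's real class may do more.

import json

class JSONLineReader:
    """Parse a JSON-lines file: one JSON object per line (sketch)."""
    def process(self, fp):
        return [json.loads(line) for line in fp if line.strip()]
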
Example #7
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-qar_squad_all.jsonl".format(data_type))
    rfp = open(source_path, 'r')

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    #start_ai = int(round(len(source_data['data']) * start_ratio))
    #stop_ai = int(round(len(source_data['data']) * stop_ratio))
    pi = 0
    ai = 0
    xp, cxp = [], []
    pp = []
    x.append(xp)
    cx.append(cxp)
    p.append(pp)

    for line in tqdm(rfp):
        para = json.loads(line)
        context = para['context']
        context = context.replace("''", '" ')
        context = context.replace("``", '" ')
        xi = list(map(word_tokenize, sent_tokenize(context)))
        # xi = context.split()
        xi = [process_tokens(tokens) for tokens in xi]  # process tokens
        # given xi, add chars
        cxi = [[list(xijk) for xijk in xij] for xij in xi]
        xp.append(xi)
        cxp.append(cxi)
        pp.append(context)

        for xij in xi:
            for xijk in xij:
                word_counter[xijk] += len(para['qas'])
                lower_word_counter[xijk.lower()] += len(para['qas'])
                for xijkl in xijk:
                    char_counter[xijkl] += len(para['qas'])

        rxi = [ai, pi]
        assert len(x) - 1 == ai
        assert len(x[ai]) - 1 == pi
        for qa in para['qas']:
            # get words
            qa_text = qa['question']

            qa_text = qa_text.replace("''", '" ')
            qa_text = qa_text.replace("``", '" ')

            qi = word_tokenize(qa_text)

            # qi = qa['question'].split()
            cqi = [list(qij) for qij in qi]
            yi = []
            cyi = []
            answers = []
            for answer in qa['answers']:
                flag = False
                answer_text = answer['text']

                answer_text = answer_text.replace("''", '" ')
                answer_text = answer_text.replace("``", '" ')

                answers.append(answer_text)
                answer_start = answer['answer_start']
                answer_stop = answer_start + len(answer_text)
                # TODO : put some function that gives word_start, word_stop here
                yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                # yi0 = answer['answer_word_start'] or [0, 0]
                # yi1 = answer['answer_word_stop'] or [0, 1]
                assert len(xi[yi0[0]]) > yi0[1]
                assert len(xi[yi1[0]]) >= yi1[1]
                w0 = xi[yi0[0]][yi0[1]]
                w1 = xi[yi1[0]][yi1[1]-1]

                if len(w1) == 0 and len(xi[yi1[0]][yi1[1]-2]) != 0:
                    flag = True
                    w1 = xi[yi1[0]][yi1[1]-2]

                i0 = get_word_idx(context, xi, yi0)
                i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                cyi0 = answer_start - i0
                cyi1 = answer_stop - i1 - 1
                # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)

                if flag:
                    assert answer_text[-2] == w1[cyi1], (answer_text, w1, cyi1)
                else:
                    assert answer_text[-1] == w1[cyi1], (answer_text, w1, cyi1)

                assert cyi0 < 32, (answer_text, w0)
                assert cyi1 < 32, (answer_text, w1)

                yi.append([yi0, yi1])
                cyi.append([cyi0, cyi1])

            for qij in qi:
                word_counter[qij] += 1
                lower_word_counter[qij.lower()] += 1
                for qijk in qij:
                    char_counter[qijk] += 1

            q.append(qi)
            cq.append(cqi)
            y.append(yi)
            cy.append(cyi)
            rx.append(rxi)
            rcx.append(rxi)
            ids.append(qa['id'])
            idxs.append(len(idxs))
            answerss.append(answers)

        if args.debug:
            break

        pi += 1


    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("Saving ...")
    save(args, data, shared, out_name)
    print("Saving complete!")
Example #8
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk.tokenize as nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from corenlp import CoreNLPClient
        interface = CoreNLPClient(annotators="tokenize ssplit".split())
    else:
        raise Exception()
    """
    if not args.split:
        sent_tokenize = lambda para: [para]
    """
    source_path = in_path or os.path.join(
        args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            while True:
                try:
                    temp = interface.annotate(context)
                    break
                except Exception as e:
                    time.sleep(0.2)
            context_s = []
            for sent in temp.sentence:
                sent = [word.originalText for word in sent.token]
                # Manual workaround for "\xa0", a non-breaking space that CoreNLP
                # fails to tokenize: split any token containing it into its pieces.
                fixed = []
                for tok in sent:
                    fixed.extend(tok.split("\xa0") if "\xa0" in tok else [tok])
                sent = fixed
                context_s.append(sent)
            xi = context_s
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                while True:
                    try:
                        temp = interface.annotate(qa['question']).sentence[0]
                        break
                    except Exception as e:
                        time.sleep(0.2)
                #print(temp.token[0])
                #exit(-1)
                qi = [t_s.originalText for t_s in temp.token]
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start,
                                             answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx,
        'na': na
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
Example #9
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    """


    :param args:            arguments
    :param data_type:       train, dev or all
    :param start_ratio:     default is 0.0
    :param stop_ratio:      default is 1.0
    :param out_name:        train, dev or test
    :param in_path:         default is None, not sure about what is this
    :return:
    """

    # 1. tokenize and sent tokenize

    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            """
            firstly word_tokenize the tokens and replace some
            chars, and return a list
            :param tokens:
            :return:
            """
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]  # input is para, turn it to a list

    # 2. load data from disk
    source_path = in_path or os.path.join(args.source_dir,
                                          "{}-v{}.json".format(data_type, args.version))
    source_data = json.load(open(file=source_path, mode='r'))

    # 3. initiate some counter and some lists
    q, cq, rx, rcx = [], [], [], []
    # question, char_question, context, char_context
    y, cy, ids, idxs = [], [], [], []
    x, cx = [], []
    answerss, p = [], []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_at_index = int(round(len(source_data['data']) * start_ratio))
    stop_at_index = int(round(len(source_data['data']) * stop_ratio))

    # 4. iterate the dataset
    max_ques_size = 0
    max_context_size = 0
    max_word_size = 0

    for article_index, article in enumerate(tqdm(source_data['data'][start_at_index:stop_at_index])):
        xp, cxp, pp = [], [], []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)

        for paragraph_index, paragraph in enumerate(article['paragraphs']):
            context = paragraph['context']
            context = context.replace("''", '" ')
            # note the trailing space: the replacement keeps the context length unchanged
            context = context.replace("``", '" ')

            # context is a str here
            list_of_wordlist = list(map(word_tokenize, sent_tokenize(context)))
            # sent_tokenize returns a list of sentences (a single one when args.split is False);
            # word_tokenize is then applied to each sentence, giving a list of word lists:
            # [[words of sentence 1], [words of sentence 2], ...]

            list_of_wordlist = [process_tokens(tokens) for tokens in list_of_wordlist]
            # list_of_wordlist is 2-d: sentences x words
            for wordlist in list_of_wordlist:
                max_context_size = max(max_context_size, len(wordlist))

            list_of_charlist = [[list(word) for word in wordlist] for wordlist in list_of_wordlist]
            # list_of_charlist is 3-d: sentence, word, char

            xp.append(list_of_wordlist)
            # 3d, paragraph, sentence, words
            cxp.append(list_of_charlist)
            # 4d, paragraph, sentence, words, chars
            pp.append(context)
            # 2d, paragraph, context

            ## update counters
            num_qas = len(paragraph['qas'])
            for wordlist in list_of_wordlist:
                for word in wordlist:
                    word_counter[word] += num_qas
                    lower_word_counter[word.lower()] += num_qas
                    for char in word:
                        char_counter[char] += num_qas

            rxi = [article_index, paragraph_index]
            assert len(x) - 1 == article_index
            # x stores xp; xp is 3-d: paragraph, sentence, words
            assert len(x[article_index]) - 1 == paragraph_index

            for question in paragraph['qas']:
                question_wordslist = word_tokenize(question['question'])
                max_ques_size = max(max_ques_size, len(question_wordslist))
                # it's a list of words
                question_charslist = [list(word) for word in question_wordslist]
                # it's a list of charlist
                yi = []
                cyi = []
                answers = []  # the content of each answers

                for answer in question['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start_index = answer['answer_start']
                    answer_end_index = answer_start_index + len(answer_text)
                    yi0, yi1 = get_word_span(context,
                                             list_of_wordlist,  # 2-d: sentences, words
                                             answer_start_index,
                                             answer_end_index)
                    # e.g. yi0 == (0, 108), yi1 == (0, 111): 0 is the sentence index,
                    # 108 and 111 are the start and (exclusive) end word indices

                    assert len(list_of_wordlist[yi0[0]]) > yi0[1]
                    # the sentence must contain more words than the start word index
                    assert len(list_of_wordlist[yi1[0]]) >= yi1[1]
                    # and at least as many words as the (exclusive) end word index

                    w0 = list_of_wordlist[yi0[0]][yi0[1]]  # the start words of the answer
                    w1 = list_of_wordlist[yi1[0]][yi1[1] - 1]  # the last word of the answer

                    i0 = get_word_idx(context, list_of_wordlist, yi0)
                    i1 = get_word_idx(context, list_of_wordlist, (yi1[0], yi1[1] - 1))
                    # i0 is the character index at which the answer's first word starts (e.g. 515);
                    # i1 is the character index at which the answer's last word starts
                    # (for 'Saint Bernadette Soubirous', i1 is the index of the 'S' in 'Soubirous')
                    cyi0 = answer_start_index - i0
                    # it should be 0 here since start index is 515, and i0 should also be 515
                    cyi1 = answer_end_index - i1 - 1
                    # cyi1 seems to be the length of last word -1, or because some other issues

                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    # sanity check: the answer's first/last chars match the chars at the computed offsets in w0 and w1
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    #
                    yi.append([yi0, yi1])  # word-level (sentence, word) start/end indices
                    cyi.append([cyi0, cyi1])
                    # char offsets of the answer's first and last chars inside the first/last words

                # update counters
                for word in question_wordslist:
                    word_counter[word] += 1
                    lower_word_counter[word.lower()] += 1
                    for char in word:
                        char_counter[char] += 1

                q.append(question_wordslist)  # q: one word list per question (2-d overall)
                cq.append(question_charslist)  # cq: per-question, per-word char lists (3-d)
                y.append(yi)  # word-level start/end span pairs for this question
                cy.append(cyi)  # char-offset pairs for this question
                rx.append(rxi)  # [article_index, paragraph_index] reference
                rcx.append(rxi)
                ids.append(question['id'])  # id of each question
                idxs.append(len(idxs))  # running index of each question
                answerss.append(answers)  # answer strings for this question

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    for word in word_counter:
        max_word_size = max(max_word_size, len(word))


    # add context here
    data = {
        'q': q,  # word lists of the questions, e.g. [['who', 'are', 'you'], ...]
        'cq': cq,
        # per-question, per-word char lists, e.g. [[['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ...], ...]
        'y': y,  # word-level spans, e.g. [[(0, 108), (0, 111)]]
        '*x': rx,  # [article_index, paragraph_index] references, e.g. [0, 21] = article 0, paragraph 21
        '*cx': rcx,  # same references as rx, used for the character view
        'cy': cy,  # char offsets inside the first/last answer words
        'idxs': idxs,  # running question indices
        'ids': ids,  # question ids (uuid-like strings)
        'answerss': answerss,  # answer strings of each question
        '*p': rx  # same references, used to look up the raw context
    }
    shared = {
        'x': x,  # words of each paragraph
        'cx': cx,  # characters of each paragraph's words
        'p': p,  # the content of each paragraph
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
Example #10
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    # source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_path = in_path or os.path.join(args.source_dir, "{}.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    # start_ai = int(round(len(source_data['data']) * start_ratio))
    # stop_ai = int(round(len(source_data['data']) * stop_ratio))
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))

    # for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
    for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        # print(article.keys()) # paragraph, title
        # raise
        # print(article) # {'question', 'answer', 'context', 'answer_list'}
        # raise
        # for pi, para in enumerate(article['paragraphs']):
        for pi, para in enumerate([article]):
            # print(para.keys()) # qas, context
            # raise
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # print(xi) # [['archi', ',', 'the', 'school']]
            # raise
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            # print(len(para['qas'])) # 5
            # print(para['qas']) # [{'answers': [{'text', 'answer_start'}], 'id', 'question'}]
            # raise
            for xij in xi:
                for xijk in xij:
                    # word_counter[xijk] += len(para['qas'])
                    # lower_word_counter[xijk.lower()] += len(para['qas'])
                    word_counter[xijk] += 1
                    lower_word_counter[xijk.lower()] += 1
                    for xijkl in xijk:
                        # char_counter[xijkl] += len(para['qas'])
                        char_counter[xijkl] += 1

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            # for qa in para['qas']:
            for qa in [article]:
                # get words
                # qi = word_tokenize(qa['question'])
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                # for answer in qa['answers']:
                try:
                    answer = qa['answer']
                    answer_text = qa['answer_list'][answer-1]
                except KeyError:
                    answer_text = ' '.join(qa['answer_list'])
                for _ in [answer_text]:
                    # answer_text = answer['text']
                    answers.append(answer_text)
                    # answer_start = answer['answer_start']
                    try:
                        answer_start = context.index(answer_text)
                        answer_stop = answer_start + len(answer_text)
                    except ValueError:
                        answer_start = 0
                        answer_stop = len(context)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    # context: str
                    # xi: [[word, word, word, ...]]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, '|', w0[cyi0:], '|', w1[:cyi1+1])
                    # raise
                    #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    #assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                # ids.append(qa['id'])
                ids.append(qa['question'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
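
Note: save() itself is not shown in any of these snippets. Both data and shared are built from plain lists, dicts, and Counter objects (which serialize as ordinary dicts), so a plausible stand-in, assuming a hypothetical args.target_dir attribute and file-name scheme, is simply two json.dump calls:

import json
import os

def save_sketch(args, data, shared, out_name):
    # Hypothetical stand-in for the save() helper used by these examples.
    os.makedirs(args.target_dir, exist_ok=True)
    with open(os.path.join(args.target_dir, "data_{}.json".format(out_name)), 'w') as f:
        json.dump(data, f)
    with open(os.path.join(args.target_dir, "shared_{}.json".format(out_name)), 'w') as f:
        json.dump(shared, f)
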
Example #11
0
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    """

    :param args: configurations
    :param data_type: train or dev
    :param start_ratio:
    :param stop_ratio:
    :param out_name: train, dev, test
    :param in_path:
    :return:
    """
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    # 1. load data
    source_path = in_path or os.path.join(
        args.source_dir, "{}-v{}.json".format(data_type, args.version))
    source_data = json.load(open(source_path, 'r'))
    # load the requested split (train or dev) of the versioned SQuAD-style JSON

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_at_index = int(round(len(source_data['data']) * start_ratio))
    stop_at_index = int(round(len(source_data['data']) * stop_ratio))

    # for each article
    for article_index, article in enumerate(
            tqdm(source_data['data'][start_at_index:stop_at_index])):
        xp, cxp = [], []
        pp = []
        x.append(xp)  # article_paragraph_sentence_wordlist
        cx.append(cxp)  # article_paragraph_sentence_word_charlist
        p.append(pp)  # article_contextlist

        # for each paragraph of the article
        for paragraph_index, paragraph in enumerate(article['paragraphs']):
            # wordss
            context = paragraph['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            list_of_wordlist = list(map(word_tokenize, sent_tokenize(context)))
            list_of_wordlist = [
                process_tokens(tokens) for tokens in list_of_wordlist
            ]  # process tokens
            # list_of_wordlist holds the words of each sentence
            # given the words, also build the character view
            list_of_charlist = [[list(word) for word in word_list]
                                for word_list in list_of_wordlist]
            # list_of_charlist holds the characters of each word
            xp.append(list_of_wordlist)  # paragraph_sentence_wordlist
            cxp.append(list_of_charlist)  # paragraph_sentence_word_charlist
            pp.append(context)  # contextlist

            # weight the counters by the number of questions asked about this paragraph
            for wordlist in list_of_wordlist:
                for word in wordlist:
                    word_counter[word] += len(paragraph['qas'])
                    lower_word_counter[word.lower()] += len(paragraph['qas'])
                    for char in word:
                        char_counter[char] += len(paragraph['qas'])

            rxi = [article_index, paragraph_index]
            assert len(x) - 1 == article_index
            assert len(x[article_index]) - 1 == paragraph_index
            for question in paragraph['qas']:
                # get words
                question_wordslist = word_tokenize(question['question'])
                question_charslist = [list(qij) for qij in question_wordslist]
                yi = []
                cyi = []
                answers = []
                for answer in question['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, list_of_wordlist,
                                             answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(list_of_wordlist[yi0[0]]) > yi0[1]
                    assert len(list_of_wordlist[yi1[0]]) >= yi1[1]
                    w0 = list_of_wordlist[yi0[0]][yi0[1]]
                    w1 = list_of_wordlist[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, list_of_wordlist, yi0)
                    i1 = get_word_idx(context, list_of_wordlist,
                                      (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in question_wordslist:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(question_wordslist)  # question word list
                cq.append(question_charslist)  # per-word char lists of the question
                y.append(yi)  # word-level answer spans
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(question['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,  # word lists of the questions, e.g. [['who', 'are', 'you'], ...]
        'cq': cq,
        # per-question, per-word char lists, e.g. [[['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ...], ...]
        'y': y,  # word-level spans, e.g. [[(0, 108), (0, 111)]]
        '*x': rx,  # [article_index, paragraph_index] references, e.g. [0, 21] = article 0, paragraph 21
        '*cx': rcx,  # same references as rx, used for the character view
        'cy': cy,  # char offsets inside the first/last answer words
        'idxs': idxs,  # running question indices
        'ids': ids,  # question ids (uuid-like strings)
        'answerss': answerss,  # answer strings of each question
        '*p': rx  # same references, used to look up the raw context
    }
    # the following variables are shared by several questions
    shared = {
        'x': x,  # words of each paragraph
        'cx': cx,  # characters of each paragraph's words
        'p': p,  # the content of each paragraph
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
Example #12
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
 #           return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
            return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

#    if not args.split:
#        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r', encoding='utf-8'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            
            ################### added by zhijing
            table = {ord(f):ord(t) for f,t in zip(
            	',。!?【】()%#@&1234567890',
            	',.!?[]()%#@&1234567890')}
            context = context.translate(table)
            ################### added by zhijing
            print(context)
            print(len(sent_tokenize(context)))
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            print('xi')
            print(xi)
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
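
The translation table added in this example normalizes full-width (CJK) punctuation and digits to their ASCII counterparts, presumably so the English-oriented sentence splitter can find boundaries such as '.' and '!'. A standalone illustration of the same str.translate mapping:

# Map full-width punctuation and digits to ASCII, as in the example above.
table = {ord(f): ord(t) for f, t in zip(
    ',。!?【】()%#@&1234567890',
    ',.!?[]()%#@&1234567890')}
print('你好!这是2018年。'.translate(table))  # -> 你好!这是2018年.
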
Example #13
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):

    word_tokenize, sent_tokenize = get_sent_tokenize()

    source_data = []
    f = open(os.path.join(args.source_dir, 'WikiQA-%s.txt' % data_type), 'r', encoding='utf-8')
    curr_question = None
    lines = (f.read()).rsplit('\n')
    for i, line in enumerate(lines):
        if line == '' : continue
        t = tuple(line.rsplit('\t'))
        assert len(t)==3, t
        question, sentence, correct = t
        if not curr_question == question:
            if not (curr_question is None or answer_list == []):
                context = ' '.join(context_list)
                context = context.replace(' .', '.')
                answers = [{'answer_start':0, 'text':answer_list}]
                _id = len(source_data)
                qas = [{'answers':answers, 'id':_id, 'question':curr_question}]
                dic = {'context' : context, 'qas' : qas}
                source_data.append({'paragraphs' : [dic]})
            context_list = []
            answer_list = []
        curr_question = question
        if not sentence.endswith('.'):
            sentence += '.'
        context_list.append(sentence)
        if correct == '1':
            answer_list.append(sentence)

    json.dump({'data' : source_data}, open(os.path.join(args.source_dir, '%s.json' % data_type), 'w'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    #assert answer_text[-1] == w1[cyi1]
                    #assert cyi0 < 32, (answer_text, w0)
                    #assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
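
This example first converts WikiQA's tab-separated question<TAB>candidate-sentence<TAB>label lines into the SQuAD-style layout the rest of the pipeline expects: consecutive lines sharing a question are joined into one context, and the sentences labeled '1' become the answer list. Note that 'text' here is a list of sentences rather than a character span, which is why the char-level assertions are commented out, and that, as written, the final question group in the file appears never to be flushed into source_data. A worked illustration with made-up lines:

# Hypothetical input lines in the WikiQA-{split}.txt format the loop expects.
lines = [
    "who wrote hamlet\tHamlet is a tragedy by William Shakespeare.\t1",
    "who wrote hamlet\tIt was probably written around 1600.\t0",
    "how tall is everest\tMount Everest is Earth's highest mountain.\t0",
]
# After the grouping loop, the first question yields roughly:
# {'paragraphs': [{'context': 'Hamlet is a tragedy by William Shakespeare. '
#                             'It was probably written around 1600.',
#                  'qas': [{'answers': [{'answer_start': 0,
#                                        'text': ['Hamlet is a tragedy by William Shakespeare.']}],
#                           'id': 0,
#                           'question': 'who wrote hamlet'}]}]}
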
Example #14
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
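
Finally, a hedged usage sketch. These functions read only a handful of attributes from args; the values below are illustrative assumptions rather than the repository's actual defaults, and get_word2vec and save read further attributes (e.g. a GloVe path and an output directory) that are not visible in these snippets.

from argparse import Namespace

# Illustrative configuration; attribute names/values are assumptions.
args = Namespace(
    tokenizer="PTB",          # or 'Stanford', which also needs args.url / args.port
    split=True,               # sentence-split each context
    source_dir="data/squad",  # where the source JSON files live
    version="1.1",            # used by the variant that formats "{}-v{}.json"
    url=None, port=None,
    debug=False,
)

# prepro_each(args, "train", out_name="train")
# prepro_each(args, "dev", 0.0, 1.0, out_name="dev")
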