Example #1
def process_data(df):
    print('Formatting data...')
    data = {}

    n = len(df)

    for idx, row in df.iterrows():
        # every 10000 rows, flush the accumulated documents and report progress
        if idx > 0 and idx % 10000 == 0:
            write_data(data)
            data = {}
            print('{} / {}'.format(idx, n))

        doc_id = '{}'.format(idx)
        a = {}
        text = row['ab']

        if not isinstance(text, str) or len(text) == 0:
            continue

        a['title'] = row['ti']
        a['text'] = text
        a['sample_size'] = row['num_randomized']
        a['sents'] = utils.sent_tokenize(text)

        data[doc_id] = a

    return data
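
process_data relies on a write_data helper and a utils.sent_tokenize wrapper that are defined elsewhere in its repository. The sketch below is a hypothetical, self-contained driver: the stubs only illustrate the shapes the function appears to expect and are not the original helpers.

import pandas as pd
from nltk.tokenize import sent_tokenize  # may require nltk.download('punkt') once


def write_data(batch):
    # stand-in sink: the real write_data presumably persists each flushed batch
    print('flushing {} documents'.format(len(batch)))


class utils:
    # stand-in namespace for the repository's utils module
    sent_tokenize = staticmethod(sent_tokenize)


df = pd.DataFrame({
    'ti': ['A randomized trial of drug X'],
    'ab': ['Background. We randomized 120 patients. Drug X reduced symptoms.'],
    'num_randomized': [120],
})
print(process_data(df))
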
Example #2
def categorize_message(message):  # The function name doesn't matter here
    utterances = sent_tokenize(message.text)
    response = requests.post(url, json={'sentences': utterances}).json()
    reply = "\n".join([f"\"{u}\" - {r}" for u, r in zip(utterances, response)])
    logging.info(
        f"Chat id: {message.chat.id} | Utterances: {utterances} | Reply: {reply}"
    )
    bot.send_message(message.chat.id, reply)
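
The handler reads like a pyTelegramBotAPI (telebot) message handler that sends each sentence of an incoming message to an external classification service; that framing, together with the token and URL below, is an assumption rather than something stated in the source. A minimal sketch of the surrounding wiring:

import logging

import requests
import telebot
from nltk.tokenize import sent_tokenize

logging.basicConfig(level=logging.INFO)
url = "http://localhost:8000/classify"   # hypothetical classification endpoint
bot = telebot.TeleBot("YOUR-BOT-TOKEN")  # placeholder token


@bot.message_handler(content_types=["text"])
def categorize_message(message):  # handler body as in the example above
    utterances = sent_tokenize(message.text)
    response = requests.post(url, json={'sentences': utterances}).json()
    reply = "\n".join([f"\"{u}\" - {r}" for u, r in zip(utterances, response)])
    logging.info(
        f"Chat id: {message.chat.id} | Utterances: {utterances} | Reply: {reply}"
    )
    bot.send_message(message.chat.id, reply)


bot.infinity_polling()
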
Example #3
def process_t5():
    srcf = open(args.out_src, 'w')
    reff = open(args.out_ref, 'w')
    decf = open(args.out_dec, 'w')

    sent_tokenizer = utils.get_sent_tokenizer(tokenizer='spacy')
    for i, line in enumerate(open(args.infile, 'r')):
        doc = json.loads(line.strip())

        src = utils.sent_list_to_tagged_str(
            utils.sent_tokenize(doc['article'], sent_tokenizer))
        ref = utils.sent_list_to_tagged_str(
            utils.sent_tokenize(doc['reference'], sent_tokenizer))
        dec = utils.sent_list_to_tagged_str(
            utils.sent_tokenize(doc['decoded'], sent_tokenizer))

        srcf.write(src + '\n')
        reff.write(ref + '\n')
        decf.write(dec + '\n')
        print(f"processed line {i}", end='\r')

    srcf.close()
    reff.close()
    decf.close()
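
process_t5 reads its input and output paths from a module-level args object. A plausible argparse setup is sketched below; the actual flag names and help strings in the source repository may differ.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--infile', required=True,
                    help='JSON-lines file with article/reference/decoded fields')
parser.add_argument('--out_src', required=True, help='output path for tagged source articles')
parser.add_argument('--out_ref', required=True, help='output path for tagged references')
parser.add_argument('--out_dec', required=True, help='output path for tagged decoded summaries')
args = parser.parse_args()

process_t5()
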
Example #4
    def split_data(self, filename):
        self.load_data(filename)
        sub_dir = filename.split('-')[0]

        # create a subdirectory for Train and Dev data
        if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
            os.makedirs(os.path.join(self.data_dir, sub_dir))

        with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file:

            # loop over the data
            for article_id in tqdm.tqdm(range(len(self.data['data']))):
                list_paragraphs = self.data['data'][article_id]['paragraphs']
                # loop over the paragraphs
                for paragraph in list_paragraphs:
                    context = paragraph['context']
                    context = clean_text(context)
                    context_tokens = word_tokenize(context)
                    if config.paragraph and (
                            len(context_tokens) < config.min_len_context
                            or len(context_tokens) > config.max_len_context):
                        continue
                    context_sentences = sent_tokenize(context)
                    spans = convert_idx(context, context_tokens)
                    num_tokens = 0
                    first_token_sentence = []
                    for sentence in context_sentences:
                        first_token_sentence.append(num_tokens)
                        num_tokens += len(sentence)
                    qas = paragraph['qas']
                    # loop over Q/A
                    for qa in qas:
                        question = qa['question']
                        question = clean_text(question)
                        question_tokens = word_tokenize(question)
                        if (question_tokens[-1] != "?"
                                or len(question_tokens) < config.min_len_question
                                or len(question_tokens) > config.max_len_question):
                            continue
                        if sub_dir == "train":
                            # select only one ground truth, the top answer, if any answer
                            answer_ids = 1 if qa['answers'] else 0
                        else:
                            answer_ids = len(qa['answers'])
                        if answer_ids:
                            for answer_id in range(answer_ids):
                                answer = qa['answers'][answer_id]['text']
                                answer = clean_text(answer)
                                answer_tokens = word_tokenize(answer)
                                answer_start = qa['answers'][answer_id]['answer_start']
                                answer_stop = answer_start + len(answer)

                                # Getting spans of the answer in the context
                                answer_span = []
                                for idx, span in enumerate(spans):
                                    if not (answer_stop <= span[0]
                                            or answer_start >= span[1]):
                                        answer_span.append(idx)
                                if not answer_span:
                                    continue

                                # Getting the sentence where we have the answer
                                sentence_tokens = []
                                for idx, start in enumerate(first_token_sentence):
                                    if answer_span[0] >= start:
                                        sentence_tokens = context_sentences[idx]
                                        answer_sentence_span = [
                                            span - start for span in answer_span
                                        ]
                                    else:
                                        break
                                if not sentence_tokens:
                                    print("Sentence cannot be found")
                                    raise Exception()

                            # write to file
                            context_file.write(" ".join([
                                token + u"│" + ("1" if idx in answer_span else "0")
                                for idx, token in enumerate(context_tokens)
                            ]) + "\n")
                            sentence_file.write(" ".join([
                                token + u"│" + ("1" if idx in answer_sentence_span else "0")
                                for idx, token in enumerate(sentence_tokens)
                            ]) + "\n")
                            question_file.write(" ".join(question_tokens) + "\n")
                            answer_file.write(" ".join(answer_tokens) + "\n")
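
The token│label format written at the end of split_data pairs every context token with 1 if its index lies inside the answer span and 0 otherwise. A tiny self-contained illustration:

context_tokens = ["The", "capital", "of", "France", "is", "Paris", "."]
answer_span = [5]  # token index of "Paris"

tagged = " ".join(
    token + u"│" + ("1" if idx in answer_span else "0")
    for idx, token in enumerate(context_tokens)
)
print(tagged)
# The│0 capital│0 of│0 France│0 is│0 Paris│1 .│0
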
Example #5
    def split_data(self, filename):
        self.load_data(filename)

        envs = ["train", "dev"]
        for sub_dir in envs:
            # create a subdirectory for Train and Dev data
            if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
                os.makedirs(os.path.join(self.data_dir, sub_dir))

            with open(os.path.join(self.data_dir, sub_dir, sub_dir + ".context"), "w", encoding="utf-8") as context_file,\
                 open(os.path.join(self.data_dir, sub_dir, sub_dir + ".sentence"), "w", encoding="utf-8") as sentence_file,\
                 open(os.path.join(self.data_dir, sub_dir, sub_dir + ".question"), "w", encoding="utf-8") as question_file,\
                 open(os.path.join(self.data_dir, sub_dir, sub_dir + ".answer"), "w", encoding="utf-8") as answer_file:

                # loop over the data
                for article in tqdm.tqdm(self.data["data"]):
                    context = article["text"]
                    context_tokens = word_tokenize(context)
                    context_sentences = sent_tokenize(context)
                    if config.paragraph and (
                            len(context_tokens) < config.min_len_context
                            or len(context_tokens) > config.max_len_context):
                        continue
                    spans = convert_idx(context, context_tokens)
                    num_tokens = 0
                    first_token_sentence = []
                    for sentence in context_sentences:
                        first_token_sentence.append(num_tokens)
                        num_tokens += len(sentence)
                    if article["type"] != sub_dir:
                        continue
                    for question in article["questions"]:
                        if question.get("isQuestionBad") == 0 and question["consensus"].get("s"):
                            q = question["q"].strip()
                            if (q[-1] != "?"
                                    or len(q.split()) < config.min_len_question
                                    or len(q.split()) > config.max_len_question):
                                continue
                            answer_start = question["consensus"]["s"]
                            answer = context[question["consensus"]["s"]:question["consensus"]["e"]]
                            answer = answer.strip(".| ").strip("\n")
                            answer_stop = answer_start + len(answer)

                            # Getting spans of the answer in the context
                            answer_span = []
                            for idx, span in enumerate(spans):
                                if not (answer_stop <= span[0]
                                        or answer_start >= span[1]):
                                    answer_span.append(idx)
                            if not answer_span:
                                continue

                            # Getting the sentence where we have the answer
                            sentence_tokens = []
                            for idx, start in enumerate(first_token_sentence):
                                if answer_span[0] >= start:
                                    sentence_tokens = context_sentences[idx]
                                    answer_sentence_span = [
                                        span - start for span in answer_span
                                    ]
                                else:
                                    break

                            # write to file
                            sent = []
                            for idx, token in enumerate(sentence_tokens):
                                if token.strip("\n").strip():
                                    if idx in answer_sentence_span:
                                        sent.append(token + u"│" + "1")
                                    else:
                                        sent.append(token + u"│" + "0")
                            sent = " ".join(sent)
                            sent = sent.strip()
                            index = sent.find("(│0 CNN│0 )│0 --│0 ")
                            if index > -1:
                                sent = sent[index +
                                            len("(│0 CNN│0 )│0 --│0 "):]

                            ctxt = []
                            for idx, token in enumerate(context_tokens):
                                if token.strip("\n").strip():
                                    if idx in answer_span:
                                        ctxt.append(token + u"│" + "1")
                                    else:
                                        ctxt.append(token + u"│" + "0")
                            ctxt = " ".join(ctxt)
                            ctxt = ctxt.strip()
                            index = ctxt.find("(│0 CNN│0 )│0 --│0 ")
                            if index > -1:
                                ctxt = ctxt[index +
                                            len("(│0 CNN│0 )│0 --│0 "):]

                            context_file.write(ctxt + "\n")
                            sentence_file.write(sent + "\n")
                            question_file.write(q + "\n")
                            answer_file.write(answer + "\n")
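
The "(│0 CNN│0 )│0 --│0 " handling above drops the tagged "(CNN) --" byline that many CNN news stories open with, so the byline tokens never reach the written context or sentence files. A minimal illustration of that stripping step:

sent = "(│0 CNN│0 )│0 --│0 A│0 storm│1 hit│0 .│0"
prefix = "(│0 CNN│0 )│0 --│0 "
index = sent.find(prefix)
if index > -1:
    sent = sent[index + len(prefix):]
print(sent)  # A│0 storm│1 hit│0 .│0
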
Example #6
    def split_sentence_question(self, filename, data_type):
        data = self.load_data(filename)
        with open(os.path.join(self.save_dir, data_type + '.sentence'), 'w', encoding="utf-8") as sentence_file,\
             open(os.path.join(self.save_dir, data_type + '.question'), 'w', encoding="utf-8") as question_file:

            articles = data
            for article in tqdm(articles):
                paragraphs = article['paragraphs']
                for paragraph in paragraphs:
                    context = paragraph['context']

                    context = clean_text(context)
                    context_tokens = word_tokenize(context)
                    context_sentences = sent_tokenize(context)

                    spans = convert_idx(context, context_tokens)
                    num_tokens = 0
                    first_token_sentence = []
                    for sentence in context_sentences:
                        first_token_sentence.append(num_tokens)
                        num_tokens += len(sentence)

                    question_and_answer_list = paragraph['qas']
                    for question_and_answer in question_and_answer_list:
                        question = question_and_answer['question']
                        question = clean_text(question)
                        question_tokens = word_tokenize(question)

                        if (len(question_tokens) > MAX_QUESTION_LENGTH
                                or len(question_tokens) < MIN_QUESTION_LENGHT):
                            continue

                        if not question_and_answer['answers']: continue
                        answer = question_and_answer['answers'][0]
                        answer_text = answer['text']
                        answer_text = clean_text(answer_text)
                        answer_tokens = word_tokenize(answer_text)
                        answer_start = answer['answer_start']
                        answer_stop = answer_start + len(answer_text)

                        answer_span = []
                        for idx, span in enumerate(spans):
                            if not (answer_stop <= span[0]
                                    or answer_start >= span[1]):
                                answer_span.append(idx)
                        if not answer_span: continue

                        sentence_tokens = []
                        for idx, start in enumerate(first_token_sentence):
                            if answer_span[0] >= start:
                                sentence_tokens = context_sentences[idx]
                                answer_sentence_span = [
                                    span - start for span in answer_span
                                ]
                            else:
                                break
                        if not sentence_tokens:
                            print("Sentence cannot be found")
                            raise Exception()

                        if (len(sentence_tokens) > MAX_SENTENCE_LENGTH
                                or len(sentence_tokens) < MIN_SENTENCE_LENGTH):
                            continue

                        sentence_file.write(" ".join([
                            token + u"│" + ("1" if idx in answer_sentence_span else "0")
                            for idx, token in enumerate(sentence_tokens)
                        ]) + "\n")
                        question_file.write(" ".join(question_tokens) + "\n")
Example #7
    sys.setdefaultencoding('utf-8')

    # define the delimiter: U+0964 is the Devanagari danda ("।"),
    # the sentence-terminating punctuation mark used in Hindi text
    danda_ = int("0964", 16)
    delim = unichr(danda_)

    # read in the data
    f = open("data/eng-hin-modified.txt", "r+")
    s = f.readlines()
    f.close()

    sentences = []

    # tokenize the whole thing into sentences
    for line in s[1:2000]:
        t_ = sent_tokenize(line, delim)
        t_ = [x for x in t_ if x != "\n"]
        sentences += t_

    # tokenize the whole thing into words
    words = []
    for sent in sentences:
        tok_ = Tokenizer(sent)
        tok_.tokenize()
        words += tok_.tokens

    unigrams = unigrammatize(words)
    unigrams = freq_sorted_unigrams(unigrams)

    #stopwords = []
    for gram in unigrams:
Example #8
    return utils.sent_list_to_tagged_str(sents)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-ref", required=True, help="path to ref file")
    parser.add_argument("-out", required=True, help="path to out file")
    parser.add_argument("-out_dir",
                        required=True,
                        help="path to output dir: processed/bart")
    args = parser.parse_args()

    ref = utils.read_file(args.ref)
    out = utils.read_file(args.out)
    out = [line.split('\t')[1].strip() for line in out]
    assert len(ref) == len(out)

    ref = [tokenize(line) for line in ref]
    sent_tokenizer = utils.get_sent_tokenizer(tokenizer='spacy')
    out = [
        utils.sent_list_to_tagged_str(utils.sent_tokenize(line, sent_tokenizer))
        for line in out
    ]
    assert len(out) == len(ref)

    with open(join(args.out_dir, "ref.txt"), 'w') as f:
        f.write('\n'.join(ref) + '\n')

    with open(join(args.out_dir, "out.txt"), 'w') as f:
        f.write('\n'.join(out) + '\n')
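
Given the argparse flags defined above, the script would be invoked along these lines (the script name and paths are placeholders); the -out file is expected to be tab-separated with the summary text in its second column, and ref.txt and out.txt are written into -out_dir:

    python make_bart_eval_files.py -ref data/test.ref -out outputs/bart.tsv -out_dir processed/bart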