Example no. 1
import logging
import os
import random
import xml.etree.ElementTree as ET
from typing import Dict, List

import stanza

# MEDQUAD_RAW_DIR and array_to_string are assumed to be defined elsewhere in the repository.
def read_medquad_raw_dataset() -> List[Dict]:
    logging.basicConfig(level=logging.INFO)

    tokenizer = stanza.Pipeline(lang='en', processors='tokenize')
    ds = []
    nb_generate_data = 0

    for subset_dir in os.listdir(MEDQUAD_RAW_DIR):
        dirpath = f"{MEDQUAD_RAW_DIR}/{subset_dir}"
        if os.path.isdir(dirpath):
            for xml_file in os.listdir(dirpath):
                filepath = f"{dirpath}/{xml_file}"
                if os.path.isfile(filepath) and xml_file.endswith(".xml"):
                    parsed = ET.parse(filepath)
                    qa_pairs = parsed.getroot().find('QAPairs')
                    pair_tag = "QAPair"
                    q_tag = "Question"
                    a_tag = "Answer"
                    if qa_pairs is None:
                        # Some documents use lowercase tag names but otherwise have the same structure
                        qa_pairs = parsed.getroot().find('qaPairs')
                        pair_tag = "pair"
                        q_tag = "question"
                        a_tag = "answer"
                        if qa_pairs is None:
                            logging.warning(f"No QAPairs tag in {ET.tostring(parsed.getroot())}")
                            continue
                    for qa in qa_pairs.findall(pair_tag):
                        question = qa.find(q_tag).text
                        answer = qa.find(a_tag).text
                        if not isinstance(question, str) or not isinstance(answer, str) or len(question) == 0 or \
                                len(answer) == 0:
                            logging.warning(f"Issue with QA pair: \n'{question}' \n'{answer}")
                            continue
                        question_tokens = tokenizer.process(question).sentences[0].tokens
                        paragraph = tokenizer.process(answer)
                        for i in range(0, len(paragraph.sentences), 2):
                            # Takes 2 sentences at a time
                            if i + 1 < len(paragraph.sentences):
                                tokens = paragraph.sentences[i].tokens + paragraph.sentences[i+1].tokens
                            else:
                                tokens = paragraph.sentences[i].tokens
                            answer_content = array_to_string(list(tok.text for tok in tokens))
                            question_content = array_to_string(list(tok.text for tok in question_tokens)).lower()
                            ds.append({
                                'question': question_content,
                                'answer': answer_content,
                                'sub_dataset': subset_dir,
                                'filename': xml_file
                            })
                            nb_generate_data += 1
                            if nb_generate_data % 10 == 0:
                                logging.info(f"Processed {nb_generate_data}")
    random.shuffle(ds)
    return ds
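A minimal usage sketch of the function above. The helper array_to_string is not shown in these snippets; here it is assumed to simply space-join tokens, and the driver below, its output paths, and the split ratio are hypothetical.

import pandas as pd

# Assumed behaviour of the helper used throughout these snippets (not shown in the source).
def array_to_string(tokens):
    return " ".join(tokens)

# Hypothetical driver: split the shuffled QA pairs into dev/train CSV files.
def save_medquad_splits(output_dir: str, dev_ratio: float = 0.1) -> None:
    ds = read_medquad_raw_dataset()
    dev_size = int(len(ds) * dev_ratio)
    pd.DataFrame(ds[:dev_size]).to_csv(f"{output_dir}/medquad_dev.csv", index=False, sep="|")
    pd.DataFrame(ds[dev_size:]).to_csv(f"{output_dir}/medquad_train.csv", index=False, sep="|")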
    def create_pos_sequences(self):
        pos_sequences = []
        for passage in self.passages:
            # Creates the POS sequence
            pos_sequence = []
            for word in passage:
                pos_sequence.append(word.xpos)
            pos_sequences.append(array_to_string(pos_sequence))
        return np.array(pos_sequences)
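self.passages is assumed to hold, for each passage, a flat list of stanza Word objects (so that .text, .xpos and .parent are available). A standalone sketch of how such a passage and its POS sequence could be produced:

import stanza

# Assumes the English models have already been fetched via stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos')
doc = nlp("The patient was given aspirin. Symptoms improved overnight.")
# Flatten the document into one list of Word objects, as the class is assumed to do
passage = [word for sentence in doc.sentences for word in sentence.words]
pos_sequence = [word.xpos for word in passage]
print(" ".join(pos_sequence))  # e.g. "DT NN VBD VBN NN . NNS VBD RB ."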
Example no. 3
import pandas as pd
import stanza

# MEDQA_HANDMADE_FILEPATH and array_to_string are assumed to be defined elsewhere in the repository.
def generate_medqa_handmade_dataset(ds_path):
    ds_raw = pd.read_csv(ds_path, sep='|')
    tokenizer = stanza.Pipeline(lang='en', processors='tokenize')
    ds = []
    for question, answer in zip(ds_raw['question'], ds_raw['answer']):
        question_tokens = tokenizer.process(question).sentences[0].tokens
        paragraph = tokenizer.process(answer)
        for i in range(0, len(paragraph.sentences), 2):
            # Takes 2 sentences at a time
            if i + 1 < len(paragraph.sentences):
                tokens = paragraph.sentences[i].tokens + paragraph.sentences[i + 1].tokens
            else:
                tokens = paragraph.sentences[i].tokens
            answer_content = array_to_string(list(tok.text for tok in tokens))
            question_content = array_to_string(list(tok.text for tok in question_tokens)).lower()
            ds.append({
                'question': question_content,
                'answer': answer_content,
            })
    pd.DataFrame(ds).to_csv(MEDQA_HANDMADE_FILEPATH, index=False, sep="|")
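A minimal sketch of the expected input: a '|'-separated CSV with question and answer columns. The path below is hypothetical.

import pandas as pd

raw_path = "data/medqa_handmade_raw.csv"  # hypothetical location
pd.DataFrame([
    {"question": "What causes anemia?",
     "answer": "Anemia has many causes. Iron deficiency is the most common one."},
]).to_csv(raw_path, index=False, sep="|")

generate_medqa_handmade_dataset(raw_path)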
    def create_case_sequences(self) -> np.ndarray:
        """
        :return: The casing sequence for each passage ('UP' when the word's first letter is capitalized, 'LOW' otherwise).
        """
        case_seqs = []
        for passage in self.passages:
            case_seq = np.array(list("LOW" for _ in range(len(passage))))
            case_indices = np.where(
                list(str.isupper(word.text[0]) for word in passage))
            case_seq[case_indices] = "UP"
            case_seqs.append(array_to_string(case_seq))
        return np.array(case_seqs)
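The UP/LOW logic can be exercised in isolation on plain strings; a small standalone sketch, not part of the class:

import numpy as np

words = ["The", "FDA", "approved", "aspirin", "in", "1899"]
case_seq = np.array(["LOW"] * len(words))
case_seq[np.where([w[0].isupper() for w in words])] = "UP"
print(case_seq.tolist())  # ['UP', 'UP', 'LOW', 'LOW', 'LOW', 'LOW']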
def create_ner_sequence(enhanced_ner, passage, ner_mapping=None):
    # Takes care of creating the NER sequence
    ner_sequence = np.full(shape=len(passage), fill_value='O', dtype=object)
    for i, word in enumerate(passage):
        # Strips the positional prefix from the token-level tag (e.g. 'B-PERSON' -> 'PERSON'); 'O' is kept as is
        token_ner = word.parent._ner if len(word.parent._ner) == 1 else word.parent._ner[2:]
        # Keeps either the newer (enhanced) NER tag or maps it to the tag set used in the original NQG paper
        ner_sequence[i] = token_ner if enhanced_ner else ner_mapping(token_ner)
    return array_to_string(ner_sequence)
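When enhanced_ner is False, a ner_mapping callable must be supplied. A hypothetical sketch of such a mapping, coarsening tags in the spirit of the original NQG feature set; the repository's real _ner_mapping may differ:

def ner_mapping(tag: str) -> str:
    # Hypothetical coarse mapping; not the repository's actual implementation
    coarse = {"PERSON": "PER", "ORG": "ORG", "GPE": "LOC", "LOC": "LOC"}
    if tag == "O":
        return "O"
    return coarse.get(tag, "MISC")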
    def create_bio_sequences(self, answer_starts: np.ndarray,
                             answer_lengths: np.ndarray) -> np.ndarray:
        """
        :param answer_starts: Indices of where the answers start for each passage.
        :param answer_lengths: The lengths (number of words) of each answer.
        :return: The BIO sequence of each passage.
        """
        bio_seqs = []
        for passage, answer_start, answer_length in zip(
                self.passages, answer_starts, answer_lengths):
            bio = list('O' for _ in range(len(passage)))
            bio[answer_start] = 'B'
            if answer_length > 1:
                for i in range(answer_start + 1, answer_start + answer_length):
                    bio[i] = 'I'
            bio_seqs.append(array_to_string(bio))
        return np.array(bio_seqs)
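For a passage of n words, the produced sequence contains exactly one 'B' at the answer start, followed by 'I' tags for the rest of the answer span; a small standalone check:

passage_length, answer_start, answer_length = 8, 3, 2
bio = ['O'] * passage_length
bio[answer_start] = 'B'
for i in range(answer_start + 1, answer_start + answer_length):
    bio[i] = 'I'
print(" ".join(bio))  # O O O B I O O O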
    def create_ner_sequences(self, enhanced_ner):
        ner_sequences = []
        for passage in self.passages:
            # Takes care of creating the NER sequence
            ner_sequence = np.full(shape=len(passage),
                                   fill_value='O',
                                   dtype=object)
            for i, word in enumerate(passage):
                token_ner = word.parent._ner if len(word.parent._ner) == 1 else word.parent._ner[2:]
                # Keeps either the newer (enhanced) NER tag or maps it to the tag set used in the original NQG paper
                ner_sequence[i] = token_ner if enhanced_ner else self._ner_mapping(token_ner)
            ner_sequences.append(array_to_string(ner_sequence))

        return np.array(ner_sequences)
Example no. 8
import os

import numpy as np

# NQG_DATA_HOME and array_to_string are assumed to be defined elsewhere in the repository.
def generate_bio_features(mode: str, ds_name: str, answer_mode: str):
    assert answer_mode in ("none", "guess")
    source_dir = f"{NQG_DATA_HOME}/{ds_name}/{mode}"
    target_dir = f"{NQG_DATA_HOME}/{ds_name}"
    if answer_mode == "none":
        target_dir += "_NA"
    else:
        target_dir += "_GA"
    assert os.path.exists(source_dir) and os.path.isdir(source_dir)

    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    if not os.path.exists(f"{target_dir}/{mode}"):
        os.mkdir(f"{target_dir}/{mode}")

    if answer_mode == "none":
        bios = []
        source_passages = np.loadtxt(f"{source_dir}/data.txt.source.txt", dtype=str, delimiter='\n', comments=None)
        for passage in source_passages:
            # answer_mode "none": tag the whole passage as the answer span (one 'B' followed by 'I's)
            bio = ["I" for _ in range(len(passage.split(" ")))]
            bio[0] = "B"
            bios.append(array_to_string(bio))

    if answer_mode == "guess":
        corpus_named_entities = np.loadtxt(f"{source_dir}/data.txt.ner", dtype=str, delimiter='\n', comments=None)
        corpus_pos_tags = np.loadtxt(f"{source_dir}/data.txt.pos", dtype=str, delimiter='\n', comments=None)
        bios = []
        for named_entities, pos_tags in zip(corpus_named_entities, corpus_pos_tags):
            named_entities = named_entities.split(' ')
            # Finds the longest contiguous run of named-entity tags (stored as token indices)
            longest_ne_seq = []
            current_seq = []
            for i, ne in enumerate(named_entities):
                if ne != 'O':
                    current_seq.append(i)
                else:
                    if len(current_seq) > len(longest_ne_seq):
                        longest_ne_seq = current_seq
                    current_seq = []
            if len(current_seq) > len(longest_ne_seq):
                # Handles a passage that ends with a named entity
                longest_ne_seq = current_seq
            if len(longest_ne_seq) == 0:
                # No named entities in this passage, so we take the first noun phrase
                pos_tags = pos_tags.split(' ')
                bio = ["O" for _ in range(len(pos_tags))]
                i = 0
                while i < len(pos_tags):
                    if pos_tags[i].startswith("NN"):
                        bio[i] = "B"
                        i += 1
                        break
                    i += 1
                while i < len(pos_tags) and pos_tags[i].startswith("NN"):
                    bio[i] = "I"
                    i += 1
                if "B" not in bio:
                    # No noun either, so we fall back on using the full passage as the answer
                    bio = ['B'] + ['I' for _ in range(len(named_entities) - 1)]
            else:
                bio = ['O' for _ in range(len(named_entities))]
                bio[longest_ne_seq[0]] = "B"
                for i in longest_ne_seq[1:]:
                    bio[i] = "I"
            bios.append(array_to_string(bio))

    np.savetxt(f"{target_dir}/{mode}/data.txt.bio", bios, fmt="%s")
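A minimal usage sketch; NQG_DATA_HOME is a module constant assumed to point at the NQG data directory, and the dataset name and modes below are hypothetical.

# Expects {NQG_DATA_HOME}/squad/{mode}/data.txt.source.txt (plus .ner and .pos for "guess" mode)
for mode in ("train", "dev"):
    generate_bio_features(mode=mode, ds_name="squad", answer_mode="guess")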
    def uncased_sequences(self):
        # Lowercased, space-joined version of each passage
        return list(
            array_to_string(list(word.text.lower() for word in sequence))
            for sequence in self.passages)
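A hypothetical sketch of how these per-passage feature sequences might be written out side by side, one line per passage, following the data.txt.* naming used above; the builder interface and the .case file name are assumptions, not the repository's confirmed layout.

import numpy as np

def write_feature_files(builder, target_dir: str) -> None:
    # `builder` is assumed to expose the methods shown in these snippets
    np.savetxt(f"{target_dir}/data.txt.source.txt", builder.uncased_sequences(), fmt="%s")
    np.savetxt(f"{target_dir}/data.txt.pos", builder.create_pos_sequences(), fmt="%s")
    np.savetxt(f"{target_dir}/data.txt.case", builder.create_case_sequences(), fmt="%s")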