示例#1
0
 def test_init_lines_no_repeats(self):
     """Check the start offsets from Line.init_lines for text without repeated lines.

     The sample is expected to contain no duplicate lines, so the first
     occurrence located by ``str.index`` serves as the reference offset
     for each line.
     """
     expected_offsets = [
         sample_text_1.index(chunk) for chunk in sample_text_1.split('\n')
     ]
     observed_offsets = [obj.index for obj in Line.init_lines(sample_text_1)]
     self.assertListEqual(observed_offsets, expected_offsets)
示例#2
0
def add_rel_to_brat(ann_path,
                    rel_path,
                    txt_path,
                    null_ent_1="null",
                    null_ent_2="null"):
    """
    Given the full paths for an ann file, a rel file, and the txt file that those annotations represent,
    converts the incoming rel data to brat and appends it to the ann file. Input ann file must not contain any
    rel data, because relation numbering always starts again at R1.

    rel annotations typically refer to entities that have already been annotated. If this is not the case,
    new entities are appended to the ann file. However, the rel format only allows us to determine the character
    spans and entity text and not the entity type. Two parameters exist for this function to specify what those
    entity types should be.

    New T numbers continue after the number found on the last T line of the ann file (starting at 0 when the
    file has no T lines). Invalid rel lines are logged and skipped; blank lines are skipped silently.

    :param ann_path: The full path to the ann file
    :param rel_path: The full path to the rel file
    :param txt_path: The full path to the text file
    :param null_ent_1: What type of entity the first item in a relation pair should be called if not found in the
        input file.
    :param null_ent_2: What type of entity the second item in a relation pair should be called if not found in the
        input file.
    :return: None
    """

    # The existing annotations supply both the known entities and the next free T number
    with open(ann_path, "r") as f:
        old_ann_text_lines = f.read().split("\n")

    # Go to the last T line of the old ann file to figure out what T number we're starting at
    t = 0
    for line in reversed(old_ann_text_lines):
        if line.startswith("T"):
            # Raw string: "\d" inside a plain string literal is an invalid escape sequence
            t = int(re.findall(r"T\d+", line)[0][1:]) + 1
            break

    # Build an Entity for every T line so incoming relation arguments can be matched against them
    all_entities = []
    for line in old_ann_text_lines:
        if line.startswith("T"):
            d = line_to_dict(line)
            all_entities.append(
                Entity(d["id_num"], d["data_type"], d["start_ind"],
                       d["end_ind"], d["data_item"]))

    # Get the text of the rel file
    with open(rel_path, "r") as f:
        rel_text = f.read()

    # Get the text file that we have the annotations of and create line objects for it
    with open(txt_path, "r") as f:
        text = f.read()
    text_lines = Line.init_lines(text)

    output_parts = []  # Pieces of brat text, joined once before writing
    r = 1

    # Iterate over all the relation lines
    for line in rel_text.split("\n"):
        if line == "":
            continue  # Skip blank lines
        if not is_valid_rel(line):
            logging.warning("Invalid rel text was skipped: %s" % line)
            continue

        # Using regex to pick apart the line of input
        c_items = re.findall(r'c="([^"]*)"', line)
        c1, c2 = c_items[0], c_items[1]
        all_spans = re.findall(r'\d+:\d+', line)
        r_item = re.findall(r'r="([^"]*)"', line)[0]

        # NOTE(review): spans 0 and 2 are presumably the start positions of the first and second
        # concept (each concept carrying a start and an end span) -- confirm against the rel format
        start_ind_1 = get_absolute_index(text_lines, all_spans[0], c1)
        start_ind_2 = get_absolute_index(text_lines, all_spans[2], c2)

        # Create new Entity objects for the incoming data, which don't come with T numbers or entity types
        new_ent_1 = Entity(0, null_ent_1, start_ind_1, start_ind_1 + len(c1),
                           c1)
        new_ent_2 = Entity(0, null_ent_2, start_ind_2, start_ind_2 + len(c2),
                           c2)

        # Look for entities in the ann file that equal the incoming ones
        match_1 = next((e for e in all_entities if new_ent_1 == e), None)
        match_2 = next((e for e in all_entities if new_ent_2 == e), None)

        # Reuse the existing entity when there is one; otherwise assign the next T number and emit
        # the new entity. NOTE(review): newly created entities are not added to all_entities, so an
        # unannotated entity appearing in several relations is emitted once per relation.
        if match_1 is not None:
            new_ent_1 = match_1
        else:
            new_ent_1.t = t
            t += 1
            output_parts.append(str(new_ent_1))
        if match_2 is not None:
            new_ent_2 = match_2
        else:
            new_ent_2.t = t
            t += 1
            # Emit the SECOND entity here (the first one was previously written twice by mistake)
            output_parts.append(str(new_ent_2))

        # Whether the entities are new or not, we still need to add the new relationship data
        output_parts.append(
            f"R{r}\t{r_item} Arg1:T{new_ent_1.t} Arg2:T{new_ent_2.t}\n")
        r += 1

    # Write the new data to file, and we're done
    with open(ann_path, "a") as f:
        f.write("".join(output_parts))
示例#3
0
def convert_brat_to_con(brat_file_path, text_file_path=None):
    """
    Build the con-format equivalent of a brat annotation file and return it as a string.

    Comment lines (starting with '#') and blank lines are ignored. Lines rejected by is_valid_brat are
    logged, counted in the module-level num_skipped_regex, and skipped. Each converted line increments the
    module-level num_lines.

    :param brat_file_path: The path to the brat file; not the file itself. If the path is not valid, the argument
        will be assumed to be text of the brat file itself.
    :param text_file_path: The path to the text file; if not provided, assumed to be a file with the same path as
        the brat file ending in '.txt' instead of '.ann'.
    :return: A string (not a file) of the con equivalent of the brat file.
    :raises FileNotFoundError: If the text file (given or derived) does not exist.
    """

    global num_lines, num_skipped_regex

    # Settle on the text file first: derive it from the brat path when none was given,
    # and refuse to continue if the chosen path is not an existing file
    if text_file_path is None:
        text_file_path = switch_extension(brat_file_path, ".txt")
        if not os.path.isfile(text_file_path):
            raise FileNotFoundError(
                "No text file path was provided and no matching text file was found in the input"
                " directory")
    elif not os.path.isfile(text_file_path):
        raise FileNotFoundError(
            "No text file path was provided or the file was not found."
            " Note that direct string input of the source text is not supported."
        )

    with open(text_file_path, 'r') as source_file:
        text = source_file.read()
    text_lines = Line.init_lines(text)

    # If brat_file_path is actually a path, read it; otherwise treat the argument as the brat text itself
    if os.path.isfile(brat_file_path):
        with open(brat_file_path, 'r') as ann_file:
            brat_text = ann_file.read()
    else:
        brat_text = brat_file_path

    converted = []  # One con line per valid brat line, joined at the end

    for line in brat_text.split('\n'):

        # Comments and blank lines can be skipped without warning
        if not line or line.startswith("#"):
            continue

        if not is_valid_brat(line):
            logging.warning(
                "Incorrectly formatted line in %s was skipped: \"%s\"." %
                (brat_file_path, line))
            num_skipped_regex += 1
            continue

        d = line_to_dict(line)

        # con positions are "<line>:<word>", with the line number reported one-based
        start_line_num = find_line_num(text, d["start_ind"])
        start_word_num = get_word_num(text_lines[start_line_num],
                                      d["start_ind"])
        start_str = f"{start_line_num + 1}:{start_word_num}"

        # The end word is the start word advanced by one per whitespace_pattern match in the annotated text
        end_line_num = find_line_num(text, d["end_ind"])
        word_gaps = re.findall(whitespace_pattern, d["data_item"])
        end_str = f"{end_line_num + 1}:{start_word_num + len(word_gaps)}"

        converted.append("c=\"%s\" %s %s||t=\"%s\"\n" %
                         (d["data_item"], start_str, end_str, d['data_type']))

        num_lines += 1

    return "".join(converted)
示例#4
0
 def test_init_lines_with_repeats(self):
     """Check the start offsets from Line.init_lines for text that repeats lines."""
     # The offsets are hard-coded: looking a repeated line up with str.index
     # would always find its first occurrence, so it cannot act as the oracle.
     expected_offsets = [0, 30, 54, 69, 90, 105, 146, 161]
     observed_offsets = []
     for line_obj in Line.init_lines(sample_text_2):
         observed_offsets.append(line_obj.index)
     self.assertListEqual(observed_offsets, expected_offsets)
示例#5
0
def convert_con_to_brat(con_file_path, text_file_path=None):
    """
    Build the brat-format equivalent of a con annotation file and return it as a string.

    Blank lines and comment lines (starting with '#') are ignored. Lines rejected by is_valid_con are logged,
    counted in the module-level num_skipped_regex, and skipped. Annotations whose absolute index comes back
    as -1 are counted in num_skipped_value_error and skipped. The module-level num_lines is increased by the
    number of line objects created for the text file. When the annotated text differs from the text found
    between the computed spans, the text from the document is used in the output.

    :param con_file_path: Path to the con file being converted. If a valid path is not provided but the argument is a
        string, it will be parsed as if it were a representation of the con file itself.
    :param text_file_path: Path to the text file associated with the con file. If not provided, the function will look
        for a text file in the same directory with the same name except for the extension switched to 'txt'.
        Note that no conversion can be performed without the text file.
    :return: A string representation of the brat file, which can then be written to file if desired.
    :raises FileNotFoundError: If the text file (given or derived) does not exist.
    """

    global num_lines, num_skipped_regex, num_skipped_value_error

    # Settle on the text file first: derive it from the con path when none was given,
    # and refuse to continue if the chosen path is not an existing file
    if text_file_path is None:
        text_file_path = switch_extension(con_file_path, ".txt")
        if not os.path.isfile(text_file_path):
            raise FileNotFoundError(
                "No text file path was provided and no matching text file was found in the input"
                " directory")
    elif not os.path.isfile(text_file_path):
        raise FileNotFoundError(
            "No text file path was provided or the file was not found."
            " Note that direct string input of the source text is not supported."
        )

    with open(text_file_path, 'r') as source_file:
        text = source_file.read()
    text_lines = Line.init_lines(text)

    num_lines += len(text_lines)

    # If con_file_path is actually a path, read it; otherwise treat the argument as the con text itself
    if os.path.isfile(con_file_path):
        with open(con_file_path, 'r') as con_file:
            con_text = con_file.read()
    else:
        con_text = con_file_path

    brat_rows = []  # One brat line per converted annotation, joined at the end

    for line in con_text.split('\n'):
        if not line or line.startswith("#"):
            continue
        if not is_valid_con(line):
            logging.warning(
                "Incorrectly formatted line in %s was skipped: \"%s\"." %
                (con_file_path, line))
            num_skipped_regex += 1
            continue

        d = line_to_dict(line)
        start_ind = get_absolute_index(text_lines, d["start_ind"],
                                       d["data_item"])
        if start_ind == -1:
            # Data that could not be converted is counted and left out
            num_skipped_value_error += 1
            continue
        end_ind = start_ind + len(d["data_item"])

        # check_same_text hands back a string when the document text between the spans differs
        # from the annotation; in that case the document's version wins
        is_match = check_same_text(d['data_item'], start_ind, end_ind, text)
        if isinstance(is_match, str):
            logging.info(
                f"Annotation in file '{con_file_path}' did not match text between spans: '{d['data_item']}' != '{is_match}'"
            )
            d['data_item'] = is_match

        # T numbers are one-based and only advance for annotations that were actually converted
        brat_rows.append("T%s\t%s %s %s\t%s\n" %
                         (len(brat_rows) + 1, d["data_type"], start_ind,
                          end_ind, d["data_item"]))

    return "".join(brat_rows)
示例#6
0
def add_ast_to_brat(ast_file_path, ann_file_path, txt_file_path):
    """
    Adds the assertion annotations to a given ann file.

    Each valid line of the ast file is split into its entity part (everything before '||a') and its assertion
    value. The entity is matched against the entities already in the ann file; if none is equal, the new
    entity is appended to the ann file as well. An ``A<n>`` line referring to the entity is appended for every
    assertion, with n starting at 1. Blank lines are skipped silently; other invalid lines are logged and
    skipped. If the ast file is empty or contains only whitespace, the ann file is left untouched.

    :param ast_file_path: The assertion file to get the assertion annotations from
    :param ann_file_path: The ann file to add the assertion annotations to
    :param txt_file_path: The text file that the previous two are annotating
    :return: None
    """

    with open(txt_file_path) as f:
        text = f.read()

    with open(ast_file_path) as f:
        ast_text = f.read()

    # Nothing to convert; a file holding only whitespace/newlines counts as empty too
    if not ast_text.strip():
        logging.info(
            f"There were no assertions in file {ast_file_path}, no conversion was performed"
        )
        return

    text_lines = Line.init_lines(text)
    entities = Entity.init_from_doc(ann_file_path)

    a = 1  # used to keep track of the assertion number
    new_ann_parts = []  # Pieces of brat text, joined once before writing

    for line in ast_text.split('\n'):

        # Blank lines (e.g. the one after a trailing newline) are not errors and need no warning
        if not line.strip():
            continue

        if not is_valid_assert(line):
            logging.warning(
                f"Invalid line of ast text in file {ast_file_path} was skipped: {line}"
            )
            continue

        logging.debug("Converting assertion line: %s", line)

        # Get the part of the assertion annotation that is an entity (up to the '||a')
        a_part_index = line.index('||a')
        # Skip the five characters of '||a="' and drop the final character (the closing quote)
        assertion_text = line[a_part_index + 5:-1]
        entity_part = line[:a_part_index]
        # Break up entity_part into named substrings
        ent_dict = line_to_dict(entity_part)
        # Get the BRAT-formatted (relative to the start of the document) index for the start of the entity
        ent_text = ent_dict['data_item']
        ent_type = ent_dict['data_type']
        start_ind = get_absolute_index(text_lines, ent_dict['start_ind'],
                                       ent_text)
        end_ind = start_ind + len(ent_text)

        # Get the text of the entity as it appears in the document, since it might not match what's provided
        # in the assertion file; the document's version is the one that is kept
        real_ent_text = text[start_ind:end_ind]
        if real_ent_text != ent_text:
            logging.info(
                f"Enity text in document {ast_file_path} didn't match; expected '{ent_text}', actual {real_ent_text}"
            )
        ent_text = real_ent_text

        ent = Entity(ent_type, start_ind, end_ind, ent_text)

        # See if the entity already exists in the ann file
        ent_match = next((e for e in entities if ent == e), None)

        if ent_match is None:
            # Not annotated yet, so the entity itself has to be written as well
            new_ann_parts.append(str(ent) + '\n')
        else:
            # If the ent does have a match, we will use that from now on instead of the one we made
            ent = ent_match

        new_ann_parts.append(f"A{a}\t{assertion_text} T{ent.num}\n")
        a += 1

    add_to_ann = "".join(new_ann_parts)
    logging.debug("Appending to %s: %s", ann_file_path, add_to_ann)
    with open(ann_file_path, 'a') as f:
        f.write(add_to_ann)