Пример #1
0
def importann(pathtofile):
    """
    Import an annotation file together with its companion .txt document.

    The annotations are loaded with ``readannfile(pathtofile)``. The text
    file is located by replacing the extension of ``pathtofile`` with
    ``.txt``. One ``Sentence`` is built per line of the text file and the
    annotations are then attached to the sentences through ``_join``.

    :param pathtofile: (string) path handed to ``readannfile``; its
    extension is swapped for ``.txt`` to find the document text.
    :return: a tuple ``(sentences, text)`` where ``sentences`` is the list
    of ``Sentence`` objects (one per line) and ``text`` is the complete
    content of the .txt file as a single string.
    """
    annotations = readannfile(pathtofile)
    path, _ = os.path.splitext(pathtofile)

    # Read the document once and close the handle deterministically; the
    # lines keep their trailing newline, so joining them restores the text.
    with open(path + ".txt", encoding='utf-8') as input_file:
        lines = input_file.readlines()

    sentences = []
    char_index = 0
    for sent_index, line in enumerate(lines):
        sentences.append(Sentence(sent_index, line, char_index))
        # The newline is part of ``line``, so this is the offset of the
        # next line within the decoded text.
        char_index += len(line)

    # NOTE(review): _join is defined elsewhere; presumably it links each
    # annotation to the sentences it overlaps -- confirm.
    _join(annotations.values(), sentences)

    return sentences, "".join(lines)
Пример #2
0
def importann(pathtofile):
    """
    Import an annotation file and split its companion .txt into blocks.

    The text file shares the base name of ``pathtofile`` with a ``.txt``
    extension. Its content is split on blank lines (two consecutive
    newline characters) rather than on single newlines, so that newlines
    inside a block are preserved in the resulting ``Sentence`` objects.

    :param pathtofile: (string) path handed to ``readannfile``; its
    extension is swapped for ``.txt`` to find the document text.
    :return: the list of ``Sentence`` objects, one per blank-line
    separated block, after ``_join`` has been applied to them.
    """
    annotations = readannfile(pathtofile)
    base = os.path.splitext(pathtofile)[0]

    with open(base + ".txt", encoding='utf8') as handle:
        blocks = handle.read().split('\n\n')

    # Each block is followed by the separator that was removed by split(),
    # so the next block starts that many characters further on.
    separator_length = len('\n\n')

    sentences = []
    offset = 0
    for number, block in enumerate(blocks):
        sentences.append(Sentence(number, block, offset))
        offset += len(block) + separator_length

    # NOTE(review): _join is defined elsewhere; presumably it links each
    # annotation to the sentences it overlaps -- confirm.
    _join(annotations.values(), sentences)
    return sentences
Пример #3
0
def importann(pathtofile):
    """
    Import an annotation file and the lines of its companion .txt file.

    The text path is derived from ``pathtofile`` by replacing everything
    after the last dot with ``txt``; its content is obtained through
    ``_readcontext`` and split into lines, one ``Sentence`` per line.

    :param pathtofile: (string) path handed to ``readannfile``; the part after its last dot is replaced by ``txt``.
    :return: a tuple ``(annotations, sentences)``: the object returned by ``readannfile`` and the list of ``Sentence`` objects.
    """
    annotations = readannfile(pathtofile)

    # Drop the last dot-separated component and append "txt" in its place.
    stem_parts = pathtofile.split(".")[:-1]
    context = _readcontext(".".join(stem_parts + ["txt"]))

    sentences = []
    offset = 0
    for number, line in enumerate(context.splitlines()):
        sentences.append(Sentence(number, line, offset))
        # splitlines() strips the line break, so one character is added
        # back for it when advancing to the next line.
        offset += len(line) + 1

    # NOTE(review): _couple is defined elsewhere; presumably it attaches
    # annotations to the sentences they fall in -- confirm.
    _couple(annotations.values(), sentences)

    return annotations, sentences
def importann(pathtofile):
    """
    Import an annotation file together with its companion .txt document.

    The annotations are loaded with ``readannfile(pathtofile)``. The text
    file is located by replacing the extension of ``pathtofile`` with
    ``.txt`` and is decoded as UTF-8, silently dropping undecodable bytes.
    One ``Sentence`` is built per line and ``_join`` is then applied to
    the annotations and the sentences.

    :param pathtofile: (string) path handed to ``readannfile``; its
    extension is swapped for ``.txt`` to find the document text.
    :return: the list of ``Sentence`` objects, one per line of the text
    file.
    """
    annotations = readannfile(pathtofile)
    path, _ = os.path.splitext(pathtofile)

    sentences = []
    char_index = 0

    # Use a context manager so the file handle is closed even if building
    # a Sentence raises.
    with open(path + ".txt", errors='ignore',
              encoding="utf-8") as input_file:
        for sent_index, line in enumerate(input_file):
            sentences.append(Sentence(sent_index, line, char_index))
            # NOTE(review): ``line`` already ends with its newline, so the
            # extra +1 advances one character more than the decoded text.
            # This may be deliberate (e.g. CRLF sources whose offsets count
            # two characters per line break) -- confirm before changing.
            char_index += len(line) + 1

    # NOTE(review): _join is defined elsewhere; presumably it links each
    # annotation to the sentences it overlaps -- confirm.
    _join(annotations.values(), sentences)
    return sentences
Пример #5
0
def importxml(filename):
    """
    Load sentences and annotations from an XML file written by this program.

    The document root is expected to have exactly two children: the
    sentences element followed by the annotations element. Annotations are
    read in two passes: the first creates every ``Annotation`` and collects
    its words, the second fills in labels and links, so that a link can
    refer to an annotation that appears later in the file.

    :param filename: (string) the path to the file to be imported.
    :return: A tuple ``(anndict, sentobjects)``: an ``OrderedDict`` mapping
    annotation ids to ``Annotation`` objects, and the list of ``Sentence``
    objects in document order.
    """
    with codecs.open(filename, 'r', encoding='utf-8') as handle:
        raw_xml = handle.read()

    root = etree.fromstring(raw_xml)
    sentences_node, annotations_node = root.getchildren()

    # Rebuild each sentence line from the text of its child elements.
    sentobjects = []
    for sent_node in sentences_node.getchildren():
        line_text = " ".join([word.text for word in sent_node.getchildren()])
        sentobjects.append(
            Sentence(key=sent_node.get('id').split(".")[1],
                     line=line_text,
                     start=int(sent_node.get("start"))))

    # First pass: create the annotations and gather their words.
    anndict = OrderedDict()
    for node in annotations_node.getchildren():
        ann_id = unicode(node.get('id')[3:])  # drop the 3-character prefix
        surface = unicode(node.get('repr'))
        # "a|b,c|d" -> [[a, b], [c, d]]
        spans = [[int(bound) for bound in chunk.split("|")]
                 for chunk in node.get('spans').split(",")]

        ann = Annotation(ann_id, surface, spans)

        for span in ann.spans:
            for sentence in sentobjects:
                start, end = span
                ann.words.extend(sentence.getwordsinspan(start, end))

        anndict[ann_id] = ann

    # Second pass: every remaining attribute is either a link or a label.
    reserved = ["id", "repr", "spans", "words"]
    for node in annotations_node.getchildren():
        ann = anndict[unicode(node.get('id')[3:])]

        extras = {
            unicode(name): unicode(value)
            for name, value in node.attrib.items()
            if name not in reserved
        }

        for name, value in extras.items():
            if not name.startswith("link."):
                ann.labels[name].append(value)
                continue

            # The value is a whitespace-separated list of prefixed ids.
            ann.links[name[5:]].extend(
                [anndict[key[3:]] for key in value.split()])

    return anndict, sentobjects