def importann(pathtofile):
    """
    Import an annotation file together with its companion .txt file.

    The text file is located by replacing the extension of ``pathtofile``
    with ``.txt``.

    :param pathtofile: (string) the path handed to ``readannfile``; the same
        path with its extension swapped for ``.txt`` is read as the text.
    :return: a tuple ``(sentences, text)``: a list holding one ``Sentence``
        per line of the text file, and the whole content of that file as a
        single string.
    """
    annotations = readannfile(pathtofile)
    path, _ = os.path.splitext(pathtofile)

    # Read the text file exactly once and close it deterministically; the
    # lines keep their trailing newline, so joining them restores the text.
    with open(path + ".txt", encoding='utf-8') as input_file:
        lines = input_file.readlines()

    sentences = []
    char_index = 0
    for sent_index, line in enumerate(lines):
        sentences.append(Sentence(sent_index, line, char_index))
        # The newline is part of ``line``, so the running total is the
        # character offset at which the next line starts.
        char_index += len(line)

    # presumably attaches each annotation to the sentences it covers --
    # confirm against _join
    _join(annotations.values(), sentences)

    text = "".join(lines)
    return sentences, text
def importann(pathtofile):
    """
    Import an annotation file together with its companion .txt file.

    The text file is located by replacing the extension of ``pathtofile``
    with ``.txt``. Its content is cut into blocks at every blank line
    (two consecutive newlines) and each block becomes one ``Sentence``.

    :param pathtofile: (string) the path handed to ``readannfile``; the same
        path with its extension swapped for ``.txt`` is read as the text.
    :return: the list of ``Sentence`` objects built from the text file.
    """
    # Splitting on blank lines instead of single newlines keeps the single
    # newlines inside each block, so they survive in the Sentence text.
    separator = '\n\n'

    parsed = readannfile(pathtofile)
    stem = os.path.splitext(pathtofile)[0]

    with open(stem + ".txt", encoding='utf8') as handle:
        blocks = handle.read().split(separator)

    collected = []
    offset = 0
    for number, block in enumerate(blocks):
        collected.append(Sentence(number, block, offset))
        # Skip the block itself plus the separator that split() removed.
        offset += len(block) + len(separator)

    # presumably attaches each annotation to the sentences it covers --
    # confirm against _join
    _join(parsed.values(), collected)
    return collected
def importann(pathtofile):
    """
    Import an annotation file together with its companion .txt file.

    The text file is located by replacing the extension of ``pathtofile``
    with ``.txt``.

    :param pathtofile: (string) the path handed to ``readannfile``; the same
        path with its extension swapped for ``.txt`` is read as the text.
    :return: a tuple ``(annotations, sentences)``: the object returned by
        ``readannfile`` and a list holding one ``Sentence`` per line of the
        text.
    """
    # Function-scope import keeps this block self-contained.
    import os

    annotations = readannfile(pathtofile)

    # Only the extension of the final path component is replaced. Splitting
    # the whole path on "." produced "txt" for a path without any dot and
    # cut the path short when a directory name contained a dot.
    context = _readcontext(os.path.splitext(pathtofile)[0] + ".txt")

    sentences = []
    index = 0
    for sindex, line in enumerate(context.splitlines()):
        sentences.append(Sentence(sindex, line, index))
        # splitlines() drops the line terminator; +1 puts it back into the
        # running offset (assumes a one-character terminator -- TODO confirm
        # that _readcontext never yields "\r\n").
        index += len(line) + 1

    # presumably attaches each annotation to the sentences it covers --
    # confirm against _couple
    _couple(annotations.values(), sentences)
    return annotations, sentences
def importann(pathtofile):
    """
    Import an annotation file together with its companion .txt file.

    The text file is located by replacing the extension of ``pathtofile``
    with ``.txt``. It is decoded as UTF-8 and bytes that cannot be decoded
    are silently dropped (``errors='ignore'``).

    :param pathtofile: (string) the path handed to ``readannfile``; the same
        path with its extension swapped for ``.txt`` is read as the text.
    :return: the list of ``Sentence`` objects, one per line of the text file.
    """
    annotations = readannfile(pathtofile)
    path, _ = os.path.splitext(pathtofile)

    sentences = []
    char_index = 0
    # The context manager closes the file even if Sentence() raises.
    with open(path + ".txt", errors='ignore', encoding="utf-8") as text_file:
        for sent_index, line in enumerate(text_file):
            sentences.append(Sentence(sent_index, line, char_index))
            # NOTE(review): ``line`` still carries its trailing newline, so
            # the extra +1 advances one character more than was read. Kept
            # unchanged because downstream offsets may rely on it --
            # possibly compensating for "\r\n" sources; confirm the intent.
            char_index += len(line) + 1

    # presumably attaches each annotation to the sentences it covers --
    # confirm against _join
    _join(annotations.values(), sentences)
    return sentences
def importxml(filename):
    """
    Load sentences and annotations from an XML file in this program's format.

    The document root is expected to have exactly two children: the first
    holds the sentences, the second the annotations.

    :param filename: (string) the path to the file to be imported.
    :return: a tuple ``(anndict, sentobjects)``: an ``OrderedDict`` mapping
        each annotation id to its ``Annotation``, and a list of ``Sentence``
        objects rebuilt from the file.
    """
    anndict = OrderedDict()
    sentobjects = []

    with codecs.open(filename, 'r', encoding='utf-8') as handle:
        raw = handle.read()
    root = etree.fromstring(raw)
    sentence_nodes, annotation_nodes = root.getchildren()

    # Rebuild every sentence by joining the text of its child elements
    # with single spaces.
    for sent_node in sentence_nodes.getchildren():
        surface = " ".join([word.text for word in sent_node.getchildren()])
        sentobjects.append(
            Sentence(key=sent_node.get('id').split(".")[1],
                     line=surface,
                     start=int(sent_node.get("start"))))

    # First pass: create every annotation and collect its words, so that
    # the second pass can resolve links to annotations defined later on.
    for ann_node in annotation_nodes.getchildren():
        # The first three characters of the id attribute are dropped.
        ann_id = unicode(ann_node.get('id')[3:])
        ann_text = unicode(ann_node.get('repr'))
        # "a|b,c|d" -> [[a, b], [c, d]]
        span_list = [[int(bound) for bound in chunk.split("|")]
                     for chunk in ann_node.get('spans').split(",")]
        ann = Annotation(ann_id, ann_text, span_list)
        for span in ann.spans:
            for sent in sentobjects:
                start, end = span
                ann.words.extend(sent.getwordsinspan(start, end))
        anndict[ann_id] = ann

    # Second pass: every remaining attribute is either a link
    # ("link.<name>") or a label.
    reserved = ["id", "repr", "spans", "words"]
    for ann_node in annotation_nodes.getchildren():
        ann = anndict[unicode(ann_node.get('id')[3:])]
        extras = dict(
            (unicode(attr), unicode(value))
            for attr, value in ann_node.attrib.items()
            if attr not in reserved)
        for attr, value in extras.items():
            if not attr.startswith("link."):
                ann.labels[attr].append(value)
                continue
            # The value is a whitespace-separated list of annotation ids,
            # each carrying the same three-character prefix as above.
            ann.links[attr[5:]].extend(
                [anndict[key[3:]] for key in value.split()])

    return anndict, sentobjects