def correct_annotations(orig_fn, ann_fn, change_fn):
    with annotation.TextAnnotations(ann_fn) as anns:
        orig_text = anns.get_document_text()
        with annotation.open_textfile(change_fn, 'r') as f:
            changed_text = f.read()
        diffs = diff_match_patch().diff_main(orig_text, changed_text)

        orig_offset = 0
        change_offset = 0
        offsets = []
        for diff in diffs:
            kind = diff[0]
            text = diff[1]
            size = len(text)
            delta = size * kind
            offsets.append((orig_offset, delta))
            if kind != 1:
                orig_offset += size
        offsets = offsets[::-1]

        tbs = list(anns.get_textbounds())
        indices = []
        for tbi, tb in enumerate(tbs):
            for spani, span in enumerate(tb.spans):
                indices.append((span[0], tbi, spani, 0))
                indices.append((span[1], tbi, spani, 1))
        indices.sort(reverse=True)

        for orig_offset, delta in offsets:
            for index in indices:
                if index[0] < orig_offset:
                    break
                frag = list(tbs[index[1]].spans[index[2]])
                frag[index[3]] += delta
                tbs[index[1]].spans[index[2]] = tuple(frag)

        for tb in tbs:
            if isinstance(tb, annotation.TextBoundAnnotationWithText):
                tb.text = annotation.DISCONT_SEP.join((changed_text[start:end]
                                                       for start, end in tb.spans))
    copy(change_fn, orig_fn)
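# The offset-remapping above is easier to see on a tiny input. The following is a
# minimal, self-contained sketch (assuming only the third-party diff_match_patch
# package; the sample strings and the shift_offset() helper are illustrative and
# not part of brat): it builds the same (orig_offset, delta) pairs and applies
# them to a single annotation offset expressed in original-text coordinates.
from diff_match_patch import diff_match_patch

def shift_offset(offset, deltas):
    # deltas is a list of (orig_offset, delta) pairs in original-text
    # coordinates; every offset at or past orig_offset moves by delta.
    return offset + sum(delta for orig_offset, delta in deltas
                        if offset >= orig_offset)

orig_text = "The cat sat."
changed_text = "The big cat sat."
deltas = []
orig_offset = 0
for kind, text in diff_match_patch().diff_main(orig_text, changed_text):
    # kind is -1 (deletion), 0 (equality) or 1 (insertion); len(text) * kind is
    # the signed length change contributed at this point of the original text.
    deltas.append((orig_offset, len(text) * kind))
    if kind != 1:
        orig_offset += len(text)

# A span covering "cat" at (4, 7) in the original text now covers (8, 11).
print(shift_offset(4, deltas))   # -> 8
print(shift_offset(7, deltas))   # -> 11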
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)

    #hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
    #        ('Content-Disposition', 'inline; filename=%s' % fname)]
    hdrs = [('Content-Type', 'application/octet-stream'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]

    if allowed_to_read(fpath):
        if not exists(fpath):
            data = ""
            if extension == "zip":
                import zipfile
                zipf = zipfile.ZipFile(fpath, 'w')
                zipf.close()
                with open(fpath, 'rb') as txt_file:
                    data = txt_file.read()
        else:
            if extension != "zip":
                with open_textfile(fpath, 'r') as txt_file:
                    data = txt_file.read().encode('utf-8')
            else:
                with open(fpath, 'rb') as txt_file:
                    data = txt_file.read()
    else:
        data = "Access Denied"

    raise NoPrintJSONError(hdrs, data)
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None): if raw_text is not None: # looks like somebody read this already; nice text = raw_text else: # need to read raw text try: with open_textfile(txt_file_path, "r") as txt_file: text = txt_file.read() except IOError: raise UnableToReadTextFile(txt_file_path) except UnicodeDecodeError: Messager.error("Error reading text file: nonstandard encoding or binary?", -1) raise UnableToReadTextFile(txt_file_path) j_dic["text"] = text from logging import info as log_info tokeniser = options_get_tokenization(dirname(txt_file_path)) # First, generate tokenisation if tokeniser == "mecab": from tokenise import jp_token_boundary_gen tok_offset_gen = jp_token_boundary_gen elif tokeniser == "whitespace": from tokenise import whitespace_token_boundary_gen tok_offset_gen = whitespace_token_boundary_gen elif tokeniser == "ptblike": from tokenise import gtb_token_boundary_gen tok_offset_gen = gtb_token_boundary_gen else: Messager.warning("Unrecognized tokenisation option " ", reverting to whitespace tokenisation.") from tokenise import whitespace_token_boundary_gen tok_offset_gen = whitespace_token_boundary_gen j_dic["token_offsets"] = [o for o in tok_offset_gen(text)] ssplitter = options_get_ssplitter(dirname(txt_file_path)) if ssplitter == "newline": from ssplit import newline_sentence_boundary_gen ss_offset_gen = newline_sentence_boundary_gen elif ssplitter == "regex": from ssplit import regex_sentence_boundary_gen ss_offset_gen = regex_sentence_boundary_gen else: Messager.warning("Unrecognized sentence splitting option " ", reverting to newline sentence splitting.") from ssplit import newline_sentence_boundary_gen ss_offset_gen = newline_sentence_boundary_gen j_dic["sentence_offsets"] = [o for o in ss_offset_gen(text)] return True
def __read_or_default(filename, default):
    try:
        f = open_textfile(filename, 'r')
        r = f.read()
        f.close()
        return r
    except:
        # TODO: specific exception handling and reporting
        return default
def download_file(document, collection, extension): directory = collection real_dir = real_directory(directory) fname = "%s.%s" % (document, extension) fpath = path_join(real_dir, fname) hdrs = [("Content-Type", "text/plain; charset=utf-8"), ("Content-Disposition", "inline; filename=%s" % fname)] with open_textfile(fpath, "r") as txt_file: data = txt_file.read().encode("utf-8") raise NoPrintJSONError(hdrs, data)
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    from logging import info as log_info

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option '
                         ', reverting to whitespace tokenisation.')
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option '
                         ', reverting to newline sentence splitting.')
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)

    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]
    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read().encode('utf-8')
    raise NoPrintJSONError(hdrs, data)
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)

    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]
    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read()
    raise NoPrintJSONError(hdrs, data)
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option '
                         ', reverting to whitespace tokenisation.')
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option '
                         ', reverting to newline sentence splitting.')
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client currently crashing on
    # chrome for two or more consecutive space, so replace every
    # second with literal non-breaking space. Note that this is just
    # for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + unichr(0x00A0))

    j_dic['text'] = text

    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry, the client currently crashing on
    # chrome for two or more consecutive space, so replace every
    # second with literal non-breaking space. Note that this is just
    # for the client display -- server-side storage is not affected.
    # NOTE: it might be possible to fix this in a principled way by
    # having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' '+unichr(0x00A0))

    j_dic['text'] = text

    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
def __create_span(ann_obj, mods, type, start, end, txt_file_path, projectconf,
                  attributes):
    # TODO: Rip this out!
    start = int(start)
    end = int(end)

    # Before we add a new trigger, does it already exist?
    found = None
    for tb_ann in ann_obj.get_textbounds():
        try:
            if (tb_ann.start == start and tb_ann.end == end
                    and tb_ann.type == type):
                found = tb_ann
                break
        except AttributeError:
            # Not a trigger then
            pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T') #XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()[start:end]

        #TODO: Data tail should be optional
        if '\n' not in text:
            ann = TextBoundAnnotationWithText(start, end, new_id, type, text)
            ann_obj.add_annotation(ann)
            mods.addition(ann)
        else:
            ann = None
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E') #XXX: Cons
            event = EventAnnotation(ann.id, [], unicode(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
def save_import(title, text, docid, collection=None): ''' TODO: DOC: ''' directory = collection if directory is None: dir_path = DATA_DIR else: #XXX: These "security" measures can surely be fooled if (directory.count('../') or directory == '..'): raise InvalidDirError(directory) dir_path = real_directory(directory) # Is the directory a directory and are we allowed to write? if not isdir(dir_path): raise InvalidDirError(dir_path) if not access(dir_path, W_OK): raise NoWritePermissionError(dir_path) base_path = join_path(dir_path, docid) txt_path = base_path + '.' + TEXT_FILE_SUFFIX ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF dat_path = base_path + '.' + DATA_FILE_SUFFIX # Before we proceed, verify that we are not overwriting for path in (txt_path, ann_path): if isfile(path): raise FileExistsError(path) # Make sure we have a valid POSIX text file, i.e. that the # file ends in a newline. if text != "" and text[-1] != '\n': text = text + '\n' with open_textfile(txt_path, 'w') as txt_file: txt_file.write(title + '\n' + text) # Touch the ann file so that we can edit the file later with open(ann_path, 'w') as _: pass # Touch the dat file so that we can edit the file later with open(dat_path, 'w') as _: pass return { 'document': docid }
def __create_span(ann_obj, mods, type, offsets, txt_file_path, projectconf,
                  attributes):
    # Before we add a new trigger, does an equivalent one already exist?
    found = None
    for tb_ann in ann_obj.get_textbounds():
        try:
            if _offsets_equal(tb_ann.spans, offsets) and tb_ann.type == type:
                found = tb_ann
                break
        except AttributeError:
            # Not a trigger then
            pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T') #XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            # TODO discont: use offsets instead (note need for int conversion)
            text = _text_for_offsets(txt_file.read(), offsets)

        #TODO: Data tail should be optional
        if '\n' not in text:
            ann = TextBoundAnnotationWithText(offsets[:], new_id, type, text)
            ann_obj.add_annotation(ann)
            mods.addition(ann)
        else:
            ann = None
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E') #XXX: Cons
            event = EventAnnotation(ann.id, [], unicode(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)

    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]

    #Folia conversion added by Sander Naert
    from brat2folia import convert
    if extension == 'xml':
        convert(real_dir, document) #convert to folia

    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read().encode('utf-8')
    raise NoPrintJSONError(hdrs, data)
def save_import(title, text, docid, collection=None): ''' TODO: DOC: ''' directory = collection if directory is None: dir_path = DATA_DIR else: #XXX: These "security" measures can surely be fooled if (directory.count('../') or directory == '..'): raise InvalidDirError(directory) dir_path = real_directory(directory) # Is the directory a directory and are we allowed to write? if not isdir(dir_path): raise InvalidDirError(dir_path) if not access(dir_path, W_OK): raise NoWritePermissionError(dir_path) base_path = join_path(dir_path, docid) txt_path = base_path + '.' + TEXT_FILE_SUFFIX ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF # Before we proceed, verify that we are not overwriting for path in (txt_path, ann_path): if isfile(path): raise FileExistsError(path) # Make sure we have a valid POSIX text file, i.e. that the # file ends in a newline. if text != "" and text[-1] != '\n': text = text + '\n' with open_textfile(txt_path, 'w') as txt_file: txt_file.write(title + '\n' + text) # Touch the ann file so that we can edit the file later with open(ann_path, 'w') as _: pass return { 'document': docid }
def download_file(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, extension)
    fpath = path_join(real_dir, fname)

    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]

    #Folia conversion added by Sander Naert
    from brat2folia import convert
    if extension == 'xml':
        convert(real_dir, document) #convert to folia

    with open_textfile(fpath, 'r') as txt_file:
        data = txt_file.read().encode('utf-8')
    raise NoPrintJSONError(hdrs, data)
def convert(data, src):
    # Fail early if we don't have a converter
    try:
        conv_text, conv_ann = CONV_BY_SRC[src]
    except KeyError:
        raise InvalidSrcFormat

    # Note: Due to a lack of refactoring we need to write to disk to read
    #   annotations; once this is fixed, the below code needs some clean-up
    tmp_dir = None
    try:
        tmp_dir = mkdtemp()
        doc_base = path_join(tmp_dir, 'tmp')
        with open_textfile(doc_base + '.txt', 'w') as txt_file:
            txt_file.write(conv_text(data))
        with open(doc_base + '.ann', 'w'):
            pass

        with Annotations(doc_base) as ann_obj:
            for ann in conv_ann(data):
                ann_obj.add_annotation(ann)

        json_dic = _document_json_dict(doc_base)

        # Note: Blank the comments, they rarely do anything good but whine
        #   about configuration when we use the tool solely for visualisation
        #   purposes
        json_dic['comments'] = []

        # Note: This is an ugly hack... we want to ride along with the
        #   Stanford tokenisation and sentence splits when returning their
        #   output rather than relying on the ones generated by brat.
        if src.startswith('stanford-'):
            json_dic['token_offsets'] = stanford_token_offsets(data)
            json_dic['sentence_offsets'] = stanford_sentence_offsets(data)

        return json_dic
    finally:
        if tmp_dir is not None:
            rmtree(tmp_dir)
def export_document(document, collection, extension):
    directory = collection
    real_dir = real_directory(directory)
    fname = '%s.%s' % (document, 'txt')
    fpath = path_join(real_dir, fname)
    rr = None

    # Default headers so that the "Access Denied" response below is well-formed.
    hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            ('Content-Disposition', 'inline; filename=%s' % fname)]

    if allowed_to_read(fpath):
        rr = ReadProject()
        owlfile, ttlfile = rr.read_project(real_dir, document, extension)
        fpaths = owlfile if extension[0:3] == 'owl' else ttlfile

        if extension[-1] == 's':
            fname = '%s.%s' % (document, "zip")
            hdrs = [('Content-Type', 'application/octet-stream'),
                    ('Content-Disposition', 'inline; filename=%s' % fname)]
            from zipfile import ZipFile
            with ZipFile(path_join(real_dir, document) + ".zip", "w") as outfile:
                for f in fpaths:
                    with open(f) as infile:
                        outfile.writestr(f.split('/')[-1], infile.read())
            with open(path_join(real_dir, document) + ".zip", 'rb') as tmp_file:
                data = tmp_file.read()
        else:
            fname = '%s.%s' % (document, extension)
            #hdrs = [('Content-Type', 'text/plain; charset=utf-8'),
            #        ('Content-Disposition', 'inline; filename=%s' % fname)]
            hdrs = [('Content-Type', 'application/octet-stream'),
                    ('Content-Disposition', 'inline; filename=%s' % fname)]
            with open_textfile(fpaths, 'r') as txt_file:
                data = txt_file.read().encode('utf-8')
    else:
        data = "Access Denied"

    try:
        raise NoPrintJSONError(hdrs, data)
    finally:
        if rr:
            rr.clean_up()
        if isfile(path_join(real_dir, '%s.%s' % (document, 'zip'))):
            os.remove(path_join(real_dir, '%s.%s' % (document, 'zip')))
def build_text_structure(ann, txt_file_path):
    '''
    Will split a text file in paragraphs, sentences and words and return
    the folia document.
    For every word it will check 2 main things:
    1) is the word part of some entities? if so it will add them to a list
       of lists of words
    2) is there an entity that ends with this word? if so it will create
       the entity with the right words out of the list and delete this
       element after it took the words out.
    After every sentence and paragraph, all the entities that started and
    ended within that structure will be added into the EntitiesLayer.
    '''
    from annotation import open_textfile
    from tokenise import gtb_token_boundary_gen

    def add_list_entities(struct, folia_entities):
        #will check if any entities have to be added and add if needed
        if folia_entities:
            layer = struct.append(folia.EntitiesLayer)
            for folia_entity in folia_entities:
                layer.append(folia_entity)
                for attr in attributes[folia_entity.id]:
                    folia_entity.append(folia.Feature(doc, subset=attr.type,
                                                      cls=str(attr.value)))

    try:
        #Sort entities on offset instead of id
        entities = sorted(ann.get_textbounds(),
                          key=lambda entity: (entity.start, -entity.end))
        index = 0
        doc = folia.Document(id='brat')
        attributes = build_entities_attr(ann)
        folia_text = doc.append(folia.Text)
        paragraph = folia_text.append(folia.Paragraph)
        folia_sentence = 0
        par_start = 0
        #fictive sets
        doc.annotationdefaults[folia.AnnotationType.ENTITY] = {"entiteit_set.xml": {}}
        doc.annotations.append((folia.AnnotationType.ENTITY, "entiteit_set.xml"))
        doc.annotationdefaults[folia.AnnotationType.MORPHOLOGICAL] = {"morph_set.xml": {}}
        doc.annotations.append((folia.AnnotationType.MORPHOLOGICAL, "morph_set.xml"))
        entity = entities[index]
        entities_words = []
        inner_index = 0
        entities_words.append([])
        folia_entitiesLayer_par = []
        folia_entitiesLayer_sen = []
        folia_entitiesLayer_txt = []

        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()
        offsets = [o for o in regex_sentence_boundary_gen(text)]

        for start, end, sentence in _text_by_offsets_gen(text, offsets):
            if start == end and text[start-1] == '\n':
                add_list_entities(paragraph, folia_entitiesLayer_par)
                folia_entitiesLayer_par = []
                paragraph = folia_text.append(folia.Paragraph)
                par_start = start
            elif sentence != "":
                add_list_entities(folia_sentence, folia_entitiesLayer_sen)
                folia_entitiesLayer_sen = []
                folia_sentence = paragraph.append(folia.Sentence, sentence)
                offsetsw = [o for o in gtb_token_boundary_gen(sentence)]
                for tok in _text_by_offsets_gen(sentence, offsetsw):
                    entity = entities[index]
                    inner_index = 0
                    folia_word = folia_sentence.append(folia.Word, tok[2])
                    morph_layer = ""
                    #check if word is part of the entity and if so remember folia word
                    while entity.start <= entities[index].end:
                        while (len(entities_words) <= inner_index):
                            entities_words.append([])
                        for span_start, span_end in entity.spans:
                            if (span_start <= tok[0]+start and tok[1]+start <= span_end):
                                entities_words[inner_index].append(doc[folia_word.id])
                            #entity ends within the word
                            elif (tok[1]+start >= span_end and span_end > tok[0]+start):
                                offset_start = span_start - (start+tok[0])
                                if offset_start < 0:
                                    # entity started before this word
                                    offset_start = 0
                                offset_end = span_end - (start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(
                                    folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string,
                                                               offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                            #entity starts within the word
                            elif (tok[1]+start > span_start and span_start >= tok[0]+start):
                                offset_start = span_start - (start+tok[0])
                                offset_end = span_end - (start+tok[0])
                                string = tok[2][offset_start:offset_end]
                                if not morph_layer:
                                    morph_layer = folia_word.append(folia.MorphologyLayer)
                                morph = morph_layer.append(
                                    folia.Morpheme(doc, generate_id_in=folia_word))
                                morph.append(folia.TextContent(doc, value=string,
                                                               offset=offset_start))
                                entities_words[inner_index].append(doc[morph.id])
                        inner_index = inner_index + 1
                        if len(entities) > index + inner_index:
                            entity = entities[index+inner_index]
                        else:
                            break

                    entity = entities[index]
                    inner_index = 0
                    #check for end of an entity and append entity to either text,
                    #paragraph or sentence depending on start of the entity
                    current_index = index
                    while entity.start <= entities[current_index].end:
                        if entity.end <= start + tok[1] and entity.start <= start + tok[0]:
                            if (entity.start >= start):
                                folia_entitiesLayer = folia_entitiesLayer_sen
                            elif (entity.start >= par_start):
                                folia_entitiesLayer = folia_entitiesLayer_par
                            else:
                                folia_entitiesLayer = folia_entitiesLayer_txt
                            if entities_words[inner_index]:
                                folia_entity = folia.Entity(doc, cls=entity.type,
                                                            id=entity.id,
                                                            contents=entities_words[inner_index])
                                folia_entitiesLayer.append(folia_entity)
                            elif not any(x.id == entity.id for x in folia_entitiesLayer):
                                #see if entity is already added
                                try:
                                    doc[entity.id]
                                except KeyError:
                                    raise EntityNotFoundError(entity)
                            if (inner_index == 0):
                                entities_words.pop(0)
                                if len(entities) > index+1:
                                    index = index + 1
                                for i in range(0, len(entities_words)):
                                    if (not entities_words[0]):
                                        entities_words.pop(0)
                                        index = index + 1
                                    else:
                                        break
                            elif (inner_index > 0):
                                entities_words[inner_index] = []
                                inner_index = inner_index + 1
                        else:
                            inner_index = inner_index + 1
                        if len(entities) > index + inner_index:
                            entity = entities[index+inner_index]
                        else:
                            break

        add_list_entities(paragraph, folia_entitiesLayer_par)
        add_list_entities(folia_sentence, folia_entitiesLayer_sen)
        add_list_entities(folia_text, folia_entitiesLayer_txt)
        return doc
    except IOError:
        pass # Most likely a broken pipe
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None): if raw_text is not None: # looks like somebody read this already; nice text = raw_text else: # need to read raw text try: with open_textfile(txt_file_path, 'r') as txt_file: text = txt_file.read() except IOError: raise UnableToReadTextFile(txt_file_path) except UnicodeDecodeError: Messager.error( 'Error reading text file: nonstandard encoding or binary?', -1) raise UnableToReadTextFile(txt_file_path) # TODO XXX huge hack, sorry, the client currently crashing on # chrome for two or more consecutive space, so replace every # second with literal non-breaking space. Note that this is just # for the client display -- server-side storage is not affected. # NOTE: it might be possible to fix this in a principled way by # having xml:space="preserve" on the relevant elements. text = text.replace(" ", ' ' + unichr(0x00A0)) j_dic['text'] = text from logging import info as log_info tokeniser = options_get_tokenization(dirname(txt_file_path)) # First, generate tokenisation if tokeniser == 'mecab': from tokenise import jp_token_boundary_gen tok_offset_gen = jp_token_boundary_gen elif tokeniser == 'whitespace': from tokenise import whitespace_token_boundary_gen tok_offset_gen = whitespace_token_boundary_gen elif tokeniser == 'ptblike': from tokenise import gtb_token_boundary_gen tok_offset_gen = gtb_token_boundary_gen else: Messager.warning('Unrecognized tokenisation option ' ', reverting to whitespace tokenisation.') from tokenise import whitespace_token_boundary_gen tok_offset_gen = whitespace_token_boundary_gen j_dic['token_offsets'] = [o for o in tok_offset_gen(text)] ssplitter = options_get_ssplitter(dirname(txt_file_path)) if ssplitter == 'newline': from ssplit import newline_sentence_boundary_gen ss_offset_gen = newline_sentence_boundary_gen elif ssplitter == 'regex': from ssplit import regex_sentence_boundary_gen ss_offset_gen = regex_sentence_boundary_gen else: Messager.warning('Unrecognized sentence splitting option ' ', reverting to newline sentence splitting.') from ssplit import newline_sentence_boundary_gen ss_offset_gen = newline_sentence_boundary_gen j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)] return True
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None): if raw_text is not None: # looks like somebody read this already; nice text = raw_text else: # need to read raw text try: with open_textfile(txt_file_path, 'r') as txt_file: text = txt_file.read() except IOError: raise UnableToReadTextFile(txt_file_path) except UnicodeDecodeError: Messager.error('Error reading text file: nonstandard encoding or binary?', -1) raise UnableToReadTextFile(txt_file_path) # TODO XXX huge hack, sorry, the client currently crashing on # chrome for two or more consecutive space, so replace every # second with literal non-breaking space. Note that this is just # for the client display -- server-side storage is not affected. # NOTE: it might be possible to fix this in a principled way by # having xml:space="preserve" on the relevant elements. text = text.replace(" ", ' '+unichr(0x00A0)) j_dic['text'] = text from logging import info as log_info tokeniser = options_get_tokenization(dirname(txt_file_path)) # First, generate tokenisation if tokeniser == 'mecab': from tokenise import jp_token_boundary_gen tok_offset_gen = jp_token_boundary_gen elif tokeniser == 'whitespace': from tokenise import whitespace_token_boundary_gen tok_offset_gen = whitespace_token_boundary_gen elif tokeniser == 'ptblike': from tokenise import gtb_token_boundary_gen tok_offset_gen = gtb_token_boundary_gen else: Messager.warning('Unrecognized tokenisation option ' ', reverting to whitespace tokenisation.') from tokenise import whitespace_token_boundary_gen tok_offset_gen = whitespace_token_boundary_gen j_dic['token_offsets'] = [o for o in tok_offset_gen(text)] ssplitter = options_get_ssplitter(dirname(txt_file_path)) if ssplitter == 'newline': from ssplit import newline_sentence_boundary_gen ss_offset_gen = newline_sentence_boundary_gen elif ssplitter == 'regex': from ssplit import regex_sentence_boundary_gen ss_offset_gen = regex_sentence_boundary_gen else: Messager.warning('Unrecognized sentence splitting option ' ', reverting to newline sentence splitting.') from ssplit import newline_sentence_boundary_gen ss_offset_gen = newline_sentence_boundary_gen j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)] return True
def get_directory_information(collection): directory = collection real_dir = real_directory(directory) assert_allowed_to_read(real_dir) # Get the document names base_names = [fn[0:-4] for fn in _listdir(real_dir) if fn.endswith('txt')] doclist = base_names[:] doclist_header = [("Document", "string")] # Then get the modification times doclist_with_time = [] for file_name in doclist: file_path = path_join(DATA_DIR, real_dir, file_name + "." + JOINED_ANN_FILE_SUFF) doclist_with_time.append([file_name, _getmtime(file_path)]) doclist = doclist_with_time doclist_header.append(("Modified", "time")) try: stats_types, doc_stats = get_statistics(real_dir, base_names) except OSError: # something like missing access permissions? raise CollectionNotAccessibleError doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))] doclist_header += stats_types dirlist = [dir for dir in _listdir(real_dir) if isdir(path_join(real_dir, dir))] # just in case, and for generality dirlist = [[dir] for dir in dirlist] # check whether at root, ignoring e.g. possible trailing slashes if normpath(real_dir) != normpath(DATA_DIR): parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:] # to get consistent processing client-side, add explicitly to list dirlist.append([".."]) else: parent = None # combine document and directory lists, adding a column # differentiating files from directories and an unused column (can # point to a specific annotation) required by the protocol. The # values filled here for the first are "c" for "collection" # (i.e. directory) and "d" for "document". combolist = [] for i in dirlist: combolist.append(["c", None]+i) for i in doclist: combolist.append(["d", None]+i) # plug in the search config too search_config = get_search_config(real_dir) # ... and the disambiguator config ... this is getting a bit much disambiguator_config = get_disambiguator_config(real_dir) # ... and the normalization config (TODO: rethink) normalization_config = get_normalization_config(real_dir) # read in README (if any) to send as a description of the # collection try: with open_textfile(path_join(real_dir, "README")) as txt_file: readme_text = txt_file.read() except IOError: readme_text = None # fill in a flag for whether annotator logging is active so that # the client knows whether to invoke timing actions ann_logging = annotation_logging_active(real_dir) # fill in NER services, if any ner_taggers = get_annotator_config(real_dir) return _inject_annotation_type_conf(real_dir, json_dic={ 'items': combolist, 'header' : doclist_header, 'parent': parent, 'messages': [], 'description': readme_text, 'search_config': search_config, 'disambiguator_config' : disambiguator_config, 'normalization_config' : normalization_config, 'annotation_logging': ann_logging, 'ner_taggers': ner_taggers, })
if __name__ == '__main__':
    from sys import argv
    from annotation import open_textfile

    def _text_by_offsets_gen(text, offsets):
        for start, end in offsets:
            yield text[start:end]

    if len(argv) > 1:
        try:
            for txt_file_path in argv[1:]:
                print
                print '### Splitting:', txt_file_path
                with open_textfile(txt_file_path, 'r') as txt_file:
                    text = txt_file.read()
                print '# Original text:'
                print text.replace('\n', '\\n')
                offsets = [o for o in newline_sentence_boundary_gen(text)]
                print '# Offsets:'
                print offsets
                print '# Sentences:'
                for sentence in _text_by_offsets_gen(text, offsets):
                    # These should only be allowed when coming from original
                    # explicit newlines.
                    #assert sentence, 'blank sentences disallowed'
                    #assert not sentence[0].isspace(), (
                    #    'sentence may not start with white-space "%s"' % sentence)
                    print '"%s"' % sentence.replace('\n', '\\n')
        except IOError:
            pass # Most likely a broken pipe
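# The ssplit/tokenise generators used throughout these snippets share one
# interface: given the full text they yield (start, end) character offsets, and
# callers recover each unit as text[start:end]. A minimal sketch of a
# newline-based splitter following that interface (an illustrative stand-in,
# not brat's actual newline_sentence_boundary_gen):
def simple_newline_boundary_gen(text):
    start = 0
    for line in text.split('\n'):
        yield (start, start + len(line))
        # +1 skips the newline character itself
        start += len(line) + 1

# Example: list(simple_newline_boundary_gen("ab\ncd")) -> [(0, 2), (3, 5)]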
def __create_span(ann_obj, mods, type, offsets, txt_file_path, projectconf,
                  attributes):
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                        and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T') #XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            # TODO discont: use offsets instead (note need for int conversion)
            text = _text_for_offsets(txt_file.read(), offsets)

        # The below code resolves cases where there are newlines in the
        # offsets by creating discontinuous annotations for each span
        # separated by newlines. For most cases it preserves the offsets.
        seg_offsets = []
        for o_start, o_end in offsets:
            pos = o_start
            for text_seg in text.split('\n'):
                if not text_seg:
                    # Double new-line, skip ahead
                    pos += 1
                    continue
                end = pos + len(text_seg)
                seg_offsets.append((pos, end))
                # Our current position is after the newline
                pos = end + 1

        ann = TextBoundAnnotationWithText(seg_offsets, new_id, type,
                # Replace any newlines with the discontinuous separator
                MUL_NL_REGEX.sub(DISCONT_SEP, text))
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E') #XXX: Cons
            event = EventAnnotation(ann.id, [], unicode(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
def get_directory_information(collection): directory = collection real_dir = real_directory(directory) assert_allowed_to_read(real_dir) # Get the document names base_names = [fn[0:-4] for fn in _listdir(real_dir) if fn.endswith('txt')] doclist = base_names[:] doclist_header = [("Document", "string")] # Then get the modification times from os.path import getmtime, join doclist_with_time = [] for file in doclist: try: from annotation import JOINED_ANN_FILE_SUFF mtime = getmtime( join(DATA_DIR, join(real_dir, file + "." + JOINED_ANN_FILE_SUFF))) except: # The file did not exist (or similar problem) mtime = -1 doclist_with_time.append([file, mtime]) doclist = doclist_with_time doclist_header.append(("Modified", "time")) try: stats_types, doc_stats = get_statistics(real_dir, base_names) except OSError: # something like missing access permissions? raise CollectionNotAccessibleError doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))] doclist_header += stats_types dirlist = [ dir for dir in _listdir(real_dir) if isdir(path_join(real_dir, dir)) ] # just in case, and for generality dirlist = [[dir] for dir in dirlist] if real_dir != DATA_DIR: parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:] # to get consistent processing client-side, add explicitly to list dirlist.append([".."]) else: parent = None # combine document and directory lists, adding a column # differentiating files from directories and an unused column (can # point to a specific annotation) required by the protocol. The # values filled here for the first are "c" for "collection" # (i.e. directory) and "d" for "document". combolist = [] for i in dirlist: combolist.append(["c", None] + i) for i in doclist: combolist.append(["d", None] + i) event_types, entity_types, attribute_types, relation_types, unconf_types = get_span_types( real_dir) # read in README (if any) to send as a description of the # collection try: with open_textfile(path_join(real_dir, "README")) as txt_file: readme_text = txt_file.read() except IOError: readme_text = None json_dic = { 'items': combolist, 'header': doclist_header, 'parent': parent, 'messages': [], 'event_types': event_types, 'entity_types': entity_types, 'attribute_types': attribute_types, 'relation_types': relation_types, 'unconfigured_types': unconf_types, 'description': readme_text, } return json_dic
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            (filepath, tempfilename) = os.path.split(txt_file_path)
            (filename, extension) = os.path.splitext(tempfilename)
            r = RepoModel(filepath)
            r.save_xml(filepath)
            # xml_save(filepath, filename, filename)
            xml_file_path = os.path.join(filepath, filename + '.xml')
            # print("xml_file_path::::", r, file=sys.stderr)
            # if xml_file_path:
            #     pass
            # else:
            #     xml_save(filepath, filename, filename)
            with open(xml_file_path, 'r') as xml_file:
                xml = xml_file.read()
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
            j_dic['xml'] = xml
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option '
                         ', reverting to whitespace tokenisation.')
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option '
                         ', reverting to newline sentence splitting.')
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
def get_directory_information(collection):
    directory = collection
    real_dir = real_directory(directory)

    assert_allowed_to_read(real_dir)

    # Get the document names
    user = get_session().get('user')
    if user is None or user == 'guest':
        base_names = []
    # Users can also be obtained from the configuration file.
    elif user in USER_PASSWORD:
        base_names = [fn[0:-4] for fn in _listdir(real_dir)
                      if fn.endswith('txt')]
    else:
        db = DBlite()
        base_names = db.get_AnnNull_files(directory)
        names_ING = db.get_AnnING_files(directory, user)
        print("names_ING", names_ING, file=sys.stderr)
        base_names.extend(names_ING)

    doclist = base_names[:]
    doclist_header = [("文档", "string")]

    # Then get the modification times
    doclist_with_time = []
    for file_name in doclist:
        file_path = path_join(DATA_DIR, real_dir,
                              file_name + "." + JOINED_ANN_FILE_SUFF)
        doclist_with_time.append([file_name, _getmtime(file_path)])
    doclist = doclist_with_time
    doclist_header.append(("修改时间", "time"))

    """
    stats_types: [('Entities', 'int'), ('Relations', 'int'), ('Events', 'int')]
    doc_stats: [[29, 0, 0], [97, 0, 0], [22, 0, 0], [8, 0, 0], [17, 0, 0],
                [22, 0, 0], [14, 0, 0], [24, 0, 0], [22, 0, 0], [21, 0, 0]]
    doclist: [['ned.train-doc-184', 1555259780.624325, 29, 0, 0],
              ['ned.train-doc-181', 1555259780.623239, 97, 0, 0],
              ['ned.train-doc-236'
    """
    try:
        stats_types, doc_stats = get_statistics(real_dir, base_names)
        print("stats_types:", stats_types, file=sys.stderr)
        print("doc_stats:", doc_stats, file=sys.stderr)
    except OSError:
        # something like missing access permissions?
        raise CollectionNotAccessibleError

    doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))]
    print("doclist:", doclist, file=sys.stderr)
    doclist_header += stats_types
    # doclist_header.append(("修改者", "string"))
    print("doclist_header:", doclist_header, file=sys.stderr)

    if user is None or user == 'guest':
        dirlist = []
    elif user in USER_PASSWORD:
        dirlist = [dir for dir in _listdir(real_dir)
                   if isdir(path_join(real_dir, dir))]
    else:
        # for user ACL
        dirlist = [dir for dir in _listdir(real_dir)
                   if isdir(path_join(real_dir, dir))]
    # just in case, and for generality
    dirlist = [[dir] for dir in dirlist]

    # print("---------------dirlist------------------", dirlist, file=sys.stderr)
    # Shown when opening the last level of the file directory structure:
    #   file name, modification time, entities, relations, events
    # [['esp.train-doc-46', 1555259780.6167455, 104, 0, 0],
    #  ['esp.train-doc-989', 1555259780.6174483, 34, 0, 0],
    # print(doclist, file=sys.stderr)

    # check whether at root, ignoring e.g. possible trailing slashes
    if normpath(real_dir) != normpath(DATA_DIR):
        parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:]
        # to get consistent processing client-side, add explicitly to list
        dirlist.append([".."])
    else:
        parent = None

    # combine document and directory lists, adding a column
    # differentiating files from directories and an unused column (can
    # point to a specific annotation) required by the protocol. The
    # values filled here for the first are "c" for "collection"
    # (i.e. directory) and "d" for "document".
    combolist = []
    for i in dirlist:
        combolist.append(["c", None] + i)
    for i in doclist:
        combolist.append(["d", None] + i)

    # plug in the search config too
    search_config = get_search_config(real_dir)

    # ... and the disambiguator config ... this is getting a bit much
    disambiguator_config = get_disambiguator_config(real_dir)

    # ... and the normalization config (TODO: rethink)
    normalization_config = get_normalization_config(real_dir)

    # read in README (if any) to send as a description of the
    # collection
    try:
        with open_textfile(path_join(real_dir, "README")) as txt_file:
            readme_text = txt_file.read()
    except IOError:
        readme_text = None

    # fill in a flag for whether annotator logging is active so that
    # the client knows whether to invoke timing actions
    ann_logging = annotation_logging_active(real_dir)

    # fill in NER services, if any
    ner_taggers = get_annotator_config(real_dir)

    return _inject_annotation_type_conf(real_dir, json_dic={
            'items': combolist,
            'header': doclist_header,
            'parent': parent,
            'messages': [],
            'description': readme_text,
            'search_config': search_config,
            'disambiguator_config': disambiguator_config,
            'normalization_config': normalization_config,
            'annotation_logging': ann_logging,
            'ner_taggers': ner_taggers,
            })
if __name__ == "__main__": from sys import argv from annotation import open_textfile def _text_by_offsets_gen(text, offsets): for start, end in offsets: yield text[start:end] if len(argv) > 1: try: for txt_file_path in argv[1:]: print print "### Splitting:", txt_file_path with open_textfile(txt_file_path, "r") as txt_file: text = txt_file.read() print "# Original text:" print text.replace("\n", "\\n") offsets = [o for o in newline_sentence_boundary_gen(text)] print "# Offsets:" print offsets print "# Sentences:" for sentence in _text_by_offsets_gen(text, offsets): # These should only be allowed when coming from original # explicit newlines. # assert sentence, 'blank sentences disallowed' # assert not sentence[0].isspace(), ( # 'sentence may not start with white-space "%s"' % sentence) print '"%s"' % sentence.replace("\n", "\\n") except IOError:
def save_import(text, docid, collection=None, anntext=None):
    '''
    TODO: DOC:
    '''
    if len(docid) > 4 and docid[-4] == '.':
        docid = docid[:-4]

    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)
        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    ############################
    from session import get_session
    try:
        username = get_session()['user']
    except KeyError:
        username = None
    if username != 'admin':
        if (not username) or username + '/' not in dir_path:
            raise NoWritePermissionError(dir_path)
    ############################

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            raise FileExistsError(path)

    # Make sure we have a valid POSIX text file, i.e. that the
    # file ends in a newline.
    newtext = ''
    for line in text.splitlines():
        if line:
            newtext += line + '\n'
    text = newtext
    if text != "" and text[-1] != '\n':
        text = text + '\n'

    with open_textfile(txt_path, 'w') as txt_file:
        txt_file.write(text)

    if anntext:
        with open(ann_path, 'w') as ann_file:
            ann_file.write(anntext)
    else:
        # Touch the ann file so that we can edit the file later
        with open(ann_path, 'w') as _:
            pass

    return { 'document': docid }
def __create_span(ann_obj, mods, type, offsets, txt_file_path, projectconf,
                  attributes):
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                        and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T')  # XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            text = txt_file.read()
            text_span = _text_for_offsets(text, offsets)

        # The below code resolves cases where there are newlines in the
        # offsets by creating discontinuous annotations for each span
        # separated by newlines. For most cases it preserves the offsets.
        seg_offsets = []
        for o_start, o_end in offsets:
            pos = o_start
            for text_seg in text_span.split('\n'):
                if not text_seg and o_start != o_end:
                    # Double new-line, skip ahead
                    pos += 1
                    continue
                start = pos
                end = start + len(text_seg)

                # For the next iteration the position is after the newline.
                pos = end + 1

                # Adjust the offsets to compensate for any potential leading
                # and trailing whitespace.
                start += len(text_seg) - len(text_seg.lstrip())
                end -= len(text_seg) - len(text_seg.rstrip())

                # If there is any segment left, add it to the offsets.
                if start != end:
                    seg_offsets.append((start, end, ))

        # if we're dealing with a null-span
        if not seg_offsets:
            seg_offsets = offsets

        ann_text = DISCONT_SEP.join((text[start:end]
                                     for start, end in seg_offsets))
        ann = TextBoundAnnotationWithText(seg_offsets, new_id, type, ann_text)
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E')  # XXX: Cons
            event = EventAnnotation(
                ann.id, [], str(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
def __create_span(ann_obj, mods, type, offsets, txt_file_path, projectconf,
                  attributes):
    # For event types, reuse trigger if a matching one exists.
    found = None
    if projectconf.is_event_type(type):
        for tb_ann in ann_obj.get_textbounds():
            try:
                if (_offsets_equal(tb_ann.spans, offsets)
                        and tb_ann.type == type):
                    found = tb_ann
                    break
            except AttributeError:
                # Not a trigger then
                pass

    if found is None:
        # Get a new ID
        new_id = ann_obj.get_new_id('T') #XXX: Cons
        # Get the text span
        with open_textfile(txt_file_path, 'r') as txt_file:
            # TODO discont: use offsets instead (note need for int conversion)
            text = _text_for_offsets(txt_file.read(), offsets)

        # The below code resolves cases where there are newlines in the
        # offsets by creating discontinuous annotations for each span
        # separated by newlines. For most cases it preserves the offsets.
        seg_offsets = []
        for o_start, o_end in offsets:
            pos = o_start
            for text_seg in text.split('\n'):
                if not text_seg and o_start != o_end:
                    # Double new-line, skip ahead
                    pos += 1
                    continue
                end = pos + len(text_seg)
                seg_offsets.append((pos, end))
                # Our current position is after the newline
                pos = end + 1

        ann = TextBoundAnnotationWithText(
            seg_offsets, new_id, type,
            # Replace any newlines with the discontinuous separator
            MUL_NL_REGEX.sub(DISCONT_SEP, text))
        ann_obj.add_annotation(ann)
        mods.addition(ann)
    else:
        ann = found

    if ann is not None:
        if projectconf.is_physical_entity_type(type):
            # TODO: alert that negation / speculation are ignored if set
            event = None
        else:
            # Create the event also
            new_event_id = ann_obj.get_new_id('E') #XXX: Cons
            event = EventAnnotation(ann.id, [], unicode(new_event_id), type, '')
            ann_obj.add_annotation(event)
            mods.addition(event)
    else:
        # We got a newline in the span, don't take any action
        event = None

    return ann, event
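# The newline-splitting loop in the __create_span variants above turns one
# requested span whose text crosses line breaks into several contiguous
# sub-spans, one per line. A self-contained sketch of that idea (the
# split_at_newlines() helper and the sample text are illustrative, not part of
# brat; the real code also handles multiple requested spans, empty segments and
# surrounding whitespace):
def split_at_newlines(text, start, end):
    seg_offsets = []
    pos = start
    for seg in text[start:end].split('\n'):
        if seg:
            seg_offsets.append((pos, pos + len(seg)))
        # advance past the segment and the newline that followed it
        pos += len(seg) + 1
    return seg_offsets

# "foo\nbar" selected as a single span (0, 7) becomes two sub-spans:
print(split_at_newlines("foo\nbar baz", 0, 7))   # -> [(0, 3), (4, 7)]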
def get_directory_information(collection): directory = collection real_dir = real_directory(directory) assert_allowed_to_read(real_dir) # Get the document names base_names = [fn[0:-4] for fn in _listdir(real_dir) if fn.endswith('txt')] doclist = base_names[:] doclist_header = [("Document", "string")] # Then get the modification times doclist_with_time = [] for file_name in doclist: file_path = path_join(DATA_DIR, real_dir, file_name + "." + JOINED_ANN_FILE_SUFF) doclist_with_time.append([file_name, _getmtime(file_path)]) doclist = doclist_with_time doclist_header.append(("Modified", "time")) try: stats_types, doc_stats = get_statistics(real_dir, base_names) except OSError: # something like missing access permissions? raise CollectionNotAccessibleError doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))] doclist_header += stats_types dirlist = [ dir for dir in _listdir(real_dir) if isdir(path_join(real_dir, dir)) ] # just in case, and for generality dirlist = [[dir] for dir in dirlist] # check whether at root, ignoring e.g. possible trailing slashes if normpath(real_dir) != normpath(DATA_DIR): parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:] # to get consistent processing client-side, add explicitly to list dirlist.append([".."]) else: parent = None # combine document and directory lists, adding a column # differentiating files from directories and an unused column (can # point to a specific annotation) required by the protocol. The # values filled here for the first are "c" for "collection" # (i.e. directory) and "d" for "document". combolist = [] for i in dirlist: combolist.append(["c", None] + i) for i in doclist: combolist.append(["d", None] + i) # plug in the search config too search_config = get_search_config(real_dir) # ... and the disambiguator config ... this is getting a bit much disambiguator_config = get_disambiguator_config(real_dir) # ... and the normalization config (TODO: rethink) normalization_config = get_normalization_config(real_dir) # read in README (if any) to send as a description of the # collection try: with open_textfile(path_join(real_dir, "README")) as txt_file: readme_text = txt_file.read() except IOError: readme_text = None # fill in a flag for whether annotator logging is active so that # the client knows whether to invoke timing actions ann_logging = annotation_logging_active(real_dir) # fill in NER services, if any ner_taggers = get_annotator_config(real_dir) #send logging directory: logging = options_get_annlogfile(real_dir) return _inject_annotation_type_conf(real_dir, json_dic={ 'items': combolist, 'header': doclist_header, 'parent': parent, 'messages': [], 'description': readme_text, 'search_config': search_config, 'disambiguator_config': disambiguator_config, 'normalization_config': normalization_config, 'annotation_logging': ann_logging, 'ner_taggers': ner_taggers, 'logging': logging, })
def get_directory_information(collection): directory = collection real_dir = real_directory(directory) assert_allowed_to_read(real_dir) # Get the document names base_names = [fn[0:-4] for fn in _listdir(real_dir) if fn.endswith('txt')] doclist = base_names[:] doclist_header = [("Document", "string")] # Then get the modification times from os.path import getmtime, join doclist_with_time = [] for file in doclist: try: from annotation import JOINED_ANN_FILE_SUFF mtime = getmtime(join(DATA_DIR, join(real_dir, file + "." + JOINED_ANN_FILE_SUFF))) except: # The file did not exist (or similar problem) mtime = -1 doclist_with_time.append([file, mtime]) doclist = doclist_with_time doclist_header.append(("Modified", "time")) try: stats_types, doc_stats = get_statistics(real_dir, base_names) except OSError: # something like missing access permissions? raise CollectionNotAccessibleError doclist = [doclist[i] + doc_stats[i] for i in range(len(doclist))] doclist_header += stats_types dirlist = [dir for dir in _listdir(real_dir) if isdir(path_join(real_dir, dir))] # just in case, and for generality dirlist = [[dir] for dir in dirlist] if real_dir != DATA_DIR: parent = abspath(path_join(real_dir, '..'))[len(DATA_DIR) + 1:] # to get consistent processing client-side, add explicitly to list dirlist.append([".."]) else: parent = None # combine document and directory lists, adding a column # differentiating files from directories and an unused column (can # point to a specific annotation) required by the protocol. The # values filled here for the first are "c" for "collection" # (i.e. directory) and "d" for "document". combolist = [] for i in dirlist: combolist.append(["c", None]+i) for i in doclist: combolist.append(["d", None]+i) event_types, entity_types, attribute_types, relation_types, unconf_types = get_span_types(real_dir) # read in README (if any) to send as a description of the # collection try: with open_textfile(path_join(real_dir, "README")) as txt_file: readme_text = txt_file.read() except IOError: readme_text = None json_dic = { 'items': combolist, 'header' : doclist_header, 'parent': parent, 'messages': [], 'event_types': event_types, 'entity_types': entity_types, 'attribute_types': attribute_types, 'relation_types': relation_types, 'unconfigured_types': unconf_types, 'description': readme_text, } return json_dic
if __name__ == '__main__':
    from sys import argv
    from annotation import open_textfile

    def _text_by_offsets_gen(text, offsets):
        for start, end in offsets:
            yield text[start:end]

    if len(argv) > 1:
        try:
            for txt_file_path in argv[1:]:
                print
                print '### Splitting:', txt_file_path
                with open_textfile(txt_file_path, 'r') as txt_file:
                    text = txt_file.read()
                print '# Original text:'
                print text.replace('\n', '\\n')
                offsets = [o for o in en_sentence_boundary_gen(text)]
                print '# Offsets:'
                print offsets
                print '# Sentences:'
                for sentence in _text_by_offsets_gen(text, offsets):
                    # These should only be allowed when coming from original
                    # explicit newlines.
                    #assert sentence, 'blank sentences disallowed'
                    #assert not sentence[0].isspace(), (
                    #    'sentence may not start with white-space "%s"' % sentence)
                    print '"%s"' % sentence.replace('\n', '\\n')
        except IOError:
            pass # Most likely a broken pipe
# location = join_path(dir_path, 'input.json')
# data = getFileData(location)
try:
    json_resp = loads(data)
except ValueError, e:
    raise FormatError(apiUrl, e)

# Make sure we have a valid POSIX text file, i.e. that the
# file ends in a newline.
response = json_resp[1]
text = response['doc']
if text != "" and text[-1] != '\n':
    text = text + '\n'

with open_textfile(txt_path, 'w') as txt_file:
    txt_file.write(text)

annotations = ""
index = 1
for sentence in response['annotatedSentences']:
    for annotation in sentence['spans']:
        if len(annotation['tokens']) > 0:
            token = annotation['tokens'][0]
            type = token['namedEntity']
            if len(annotation['annotations']) > 0:
                type = annotation['annotations'].keys()[0].split('.')[-1]
            annotations += 'T' + str(index) + '\t' + str(type) + ' ' + str(
                token['start']) + ' ' + str(token['end']) + '\t' + str(