def main(args):
    argp = ARGPARSER.parse_args(args[1:])

    if not argp.no_cache:
        # We can't do it iteratively listening to stdin, read it all
        doc = Document('<classify>', [], [], '<classify>')
        for _string in (l.rstrip('\n') for l in argp.input):
            doc.abstract.append(_string_to_ann_sent(_string))
        docs = (doc, )
    else:
        docs = (Document('Line: %s' % i, [], [_string_to_ann_sent(_string)],
                         '<stdin>')
                for i, _string in enumerate(
                    (l.rstrip('\n') for l in argp.input), start=1))

    # Cache the strings for speed
    if not argp.no_cache:
        cache_simstring((docs, ), verbose=argp.verbose)

    with open(argp.model_path, 'r') as model_file:
        classifier = pickle_load(model_file)

    # TODO: Faster to do it in a batch instead
    for doc in docs:
        for sent in doc:
            for ann in sent:
                print '%s\t%s' % (sent.annotation_text(ann),
                                  str(classifier.classify(doc, sent, ann,
                                                          ranked=True)))
def __handleDocument(self, xmlDocument):
    d = Document()
    root = xmlDocument.firstChild
    maxNumParag = 2
    counter = 0
    if root.hasChildNodes():
        for xmlNode in root.childNodes:
            #print 'xmlNode.nodeName ' + str(xmlNode.nodeName)
            if xmlNode.nodeName == 'DESC' or xmlNode.nodeName == 'TIME':
                pass
            if xmlNode.nodeName == 'P':
                counter += 1
                p = Paragraph()
                print 'before handle_paragraph'
                self.__handleParagraph(xmlNode, p)
                print 'after handle_paragraph'
                d.addParagraph(p)
                p.document = d
                print p.idx
                if counter >= maxNumParag:
                    break
    print d, d.paragraphs
    """
    # After all the paragraphs, and therefore sentences, words and markables,
    # have been parsed, each markable has its slice expressed as word indexes
    # within the sentence (the first word being index 0 and the last
    # numWords-1). Transform these indexes into absolute references based on
    # the word objects' ID attributes.
    for paragraph in d._paragraphs:
        for sentence in paragraph._sentences:
            for markable in sentence._markables:
                sliceIndex = markable._slice
                fromIndex, toIndex = sliceIndex.split(":")[0], sliceIndex.split(":")[1]
                try:
                    fromId = sentence._words[int(fromIndex)]._id
                    toId = sentence._words[int(toIndex)]._id
                    #print fromId + " " + toId
                    markable._slice = fromId + ":" + toId
                except ValueError:
                    # A ValueError here means the index couldn't be converted
                    # to an int, in which case it has already been converted by
                    # another markable and can be used as is
                    markable._slice = fromIndex + ":" + toIndex
    """
    return d
def add_document(self):
    schema = DocumentSchema()
    form = Form(schema, buttons=('submit',))

    if 'submit' in self.request.POST:
        # Make a new Document
        title = self.request.POST['title']
        content = self.request.POST['content']
        name = str(randint(0, 999999))
        new_document = Document(title, content)
        new_document.__name__ = name
        new_document.__parent__ = self.context
        self.context[name] = new_document

        # Redirect to the new document
        url = self.request.resource_url(new_document)
        return HTTPFound(location=url)

    return {"form": form.render()}
def _tab_separated_input_to_doc(input):
    # Create a dataset out of the input
    doc = Document(input.name, [], [], '<%s>' % input.name)
    for _string, _type in (l.rstrip('\n').split('\t') for l in input):
        doc.abstract.append(
            Sentence(_string, [
                Annotation(0, len(_string), _type),
            ]))
    return doc
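# Hedged usage sketch for _tab_separated_input_to_doc above: the
# 'examples.tsv' file name is hypothetical, and each of its lines is assumed
# to hold "text<TAB>type"; Document/Sentence/Annotation are the classes used
# throughout this section.
with open('examples.tsv', 'r') as tsv_file:
    doc = _tab_separated_input_to_doc(tsv_file)
    # Each input line becomes one Sentence in doc.abstract, carrying a single
    # Annotation that covers the whole sentence text
    print len(doc.abstract)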
def add_document_view(self):
    # Make a new Document
    title = self.request.POST['document_title']
    name = str(randint(0, 999999))
    new_document = Document(name, self.context, title)
    self.context[name] = new_document

    # Redirect to the new document
    url = self.request.resource_url(new_document)
    return HTTPFound(location=url)
def add_document(self):
    schema = DocumentSchema()
    form = Form(schema, buttons=('submit',))

    if 'submit' in self.request.POST:
        # Make a new Document
        title = self.request.POST['title']
        content = self.request.POST['content']
        doc_id = self.document_map.new_docid()
        name = "document%s" % doc_id
        new_document = Document(title, content)
        new_document.__name__ = name
        new_document.__parent__ = self.context
        self.context[name] = new_document

        # Map the object path to its catalog docid
        path = resource_path(new_document)
        self.document_map.add(path, doc_id)

        # Index the new document
        self.catalog.index_doc(doc_id, new_document)

        # Redirect to the new document
        url = self.request.resource_url(new_document)
        return HTTPFound(location=url)

    return {"form": form.render()}
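# Hedged sketch of how the catalog and document_map used in add_document
# above might be created at application start-up. repoze.catalog is assumed
# here, and the 'title' field index is an illustrative guess rather than
# anything taken from the snippet.
from repoze.catalog.catalog import Catalog
from repoze.catalog.document import DocumentMap
from repoze.catalog.indexes.field import CatalogFieldIndex

catalog = Catalog()
catalog['title'] = CatalogFieldIndex('title')  # assumed indexed attribute
document_map = DocumentMap()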
def _get_documents(dir):
    for id, txt_path, ss_path, a1_path, a2_path in _get_aligned_resources(dir):
        #print id
        # First we align the text and the sentences since we need to map the
        # offsets of the stand-off to the sentences in the sentence split file
        #with open(txt_path, 'r') as txt_file, open(ss_path, 'r') as ss_file:
        with open(txt_path, 'r') as txt_file:
            if ENCODE_WRAP:
                txt_file = _encode_wrap(txt_file)
            with open(ss_path, 'r') as ss_file:
                if ENCODE_WRAP:
                    ss_file = _encode_wrap(ss_file)
                #sentences, s_offset_by_sentence = (
                s_starts_and_sentences = _get_sentences_and_offsets(txt_file,
                                                                    ss_file)

        #XXX: HACK!
        if a2_path is None:
            a2_path = '/dev/null'

        #with open(a1_path, 'r') as a1_file, open(a2_path, 'r') as a2_file:
        with open(a1_path, 'r') as a1_file:
            if ENCODE_WRAP:
                a1_file = _encode_wrap(a1_file)
            with open(a2_path, 'r') as a2_file:
                if ENCODE_WRAP:
                    a2_file = _encode_wrap(a2_file)

                for line in (l.rstrip('\n') for l in chain(a1_file, a2_file)):
                    # We ignore everything apart from the text-bound annotations
                    match = TB_SO_REGEX.match(line)
                    if match is not None:
                        g_dict = match.groupdict()
                        ann_start = int(g_dict['start'])
                        ann_end = int(g_dict['end'])

                        # Find the sentence and its index containing the annotation
                        s_idx, sentence = _find_containing_idx(
                            ann_start, s_starts_and_sentences)

                        # XXX: There are cases where an annotation is cut-off
                        #      by a sentence break. If this is the case, merge
                        #      the sentences.
                        if ann_end > s_idx + len(sentence.text):
                            next_s_idx, next_sentence = _find_containing_idx(
                                ann_end, s_starts_and_sentences)
                            # Merge the next sentence into this one
                            # XXX: Just assumes a space! May be wrong!
                            sentence = Sentence(
                                sentence.text + ' ' + next_sentence.text,
                                sentence.annotations + next_sentence.annotations)
                            # Remove the old one
                            s_starts_and_sentences.remove(
                                (next_s_idx, next_sentence))

                        # Create an annotation object but adjust the indices to
                        # be relative to the sentence and not to the file
                        new_ann_start = ann_start - s_idx
                        assert 0 <= new_ann_start < len(sentence.text), (
                            '0 <= {} < {} ({}, {}) {} "{}" {}'.format(
                                new_ann_start, len(sentence.text), s_idx,
                                g_dict['start'], id, g_dict['text'], s_idx))
                        new_ann_end = ann_end - s_idx
                        assert 0 < new_ann_end <= len(sentence.text), (
                            '0 < {} <= {} ({}, {}) {} {}'.format(
                                new_ann_end, len(sentence.text), s_idx,
                                g_dict['end'], id, g_dict['text']))
                        assert new_ann_start < new_ann_end

                        annotation = Annotation(ann_start - s_idx,
                                                ann_end - s_idx,
                                                g_dict['type'])

                        # If we have a text span in the stand-off we sanity
                        # check it against what is in the sentence
                        #XXX: Run this again!
                        if g_dict['text'] is not None:
                            g_dict['text'] = unicode(
                                g_dict['text'].strip('\r\n'), encoding='utf-8')
                            #XXX: Regex is not perfect, it leaves spaces around
                            target_ann_text = sentence.annotation_text(annotation)
                            assert target_ann_text == g_dict['text'], (
                                'text span mismatch in {} '
                                'target: "{}" != source: "{}" {} "{}" {} {} {}'
                                ).format(id, target_ann_text, g_dict['text'],
                                         annotation, sentence.text, g_dict,
                                         type(target_ann_text),
                                         type(g_dict['text']))

                        sentence.add_annotation(annotation)
                    #else:
                    #    assert False, line.replace(' ', '\s').replace('\t', '\\t')

        yield Document(id, [],
                       [sentence for _, sentence in s_starts_and_sentences],
                       txt_path)
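# The loop in _get_documents above keeps only text-bound annotations that
# TB_SO_REGEX matches. The real pattern is defined elsewhere; the following is
# only a hedged guess at what it might look like for brat-style stand-off
# lines such as "T1<TAB>Protein 0 8<TAB>Specific", using the named groups
# ('type', 'start', 'end', 'text') the code above relies on.
import re

TB_SO_REGEX = re.compile(
    r'^T\d+\t(?P<type>\S+) (?P<start>\d+) (?P<end>\d+)(?:\t(?P<text>.*))?$')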