def test_review_response_with_invalid_role(self):
    """A review response sent by a user lacking the reviewer role raises."""
    specs = [
        ('Submission', 'First submission', 'submission'),
        ('request_1', 'First request', 'request_1'),
        ('request_2', 'Second request', 'request_2'),
        ('response_1', 'First response', 'response_1'),
        ('response_2', 'Second response', 'response_2'),
    ]
    ids = {}
    for title, description, basename in specs:
        doc = Document(title, description, 1,
                       ['/tmp/edms/samples/{}.pdf'.format(basename)], 'pdf')
        ids[basename] = self._document_manager.add_document(doc)
    review = self._review_manager
    review.select_document(ids['submission'])
    review.submit_document(self._author_id, self._manager_id,
                           ids['submission'])
    review.send_reviewing_request_1(self._manager_id, self._reviewer_1_id,
                                    ids['request_1'])
    review.send_reviewing_request_2(self._manager_id, self._reviewer_2_id,
                                    ids['request_2'])
    review.send_review_1(self._reviewer_1_id, self._manager_id,
                         ids['response_1'])
    # The second review is sent by the author, not a reviewer.
    with self.assertRaises(ValueError):
        review.send_review_2(self._author_id, self._manager_id,
                             ids['response_2'])
    review.save_review()
def importbib(db, bibfile, tags=None, overwrite=False):
    """Import every entry of *bibfile* into *db*.

    Entries are matched to existing documents by bib key and by any
    recognized source ids; unmatched entries create new documents.
    Exits the process with status 1 if any entry failed, 0 otherwise.

    Fixes: Python 2 ``print >>`` statements (SyntaxError on Python 3)
    converted to ``print(..., file=...)``; mutable default ``tags=[]``
    replaced by the None sentinel.
    """
    if tags is None:
        tags = []
    errors = []
    sources = Sources()
    for entry in sorted(Bibtex(bibfile), key=lambda entry: entry.key):
        print(entry.key, file=sys.stderr)
        try:
            docs = []
            # check for doc with this bibkey
            bdoc = db.doc_for_bib(entry.key)
            if bdoc:
                docs.append(bdoc)
            # check for known sids
            for source in sources.scan_bibentry(entry):
                sdoc = db.doc_for_source(source.sid)
                # FIXME: why can't we match docs in list?
                if sdoc and sdoc.docid not in [doc.docid for doc in docs]:
                    docs.append(sdoc)
            if len(docs) == 0:
                doc = Document(db)
            elif len(docs) > 0:
                if len(docs) > 1:
                    print(" Multiple distinct docs found for entry. Using first found.", file=sys.stderr)
                doc = docs[0]
                print(" Updating id:%d..." % (doc.docid), file=sys.stderr)
            doc.add_bibentry(entry)
            filepath = entry.get_file()
            if filepath:
                print(" Adding file: %s" % filepath, file=sys.stderr)
                doc.add_file(filepath)
            doc.add_tags(tags)
            doc.sync()
        except BibtexError as e:
            print(" Error processing entry %s: %s" % (entry.key, e),
                  file=sys.stderr)
            print(file=sys.stderr)
            errors.append(entry.key)
    if errors:
        print(file=sys.stderr)
        # end=' ' reproduces the Python 2 trailing-comma print behaviour.
        print("Failed to import %d" % (len(errors)), end=' ', file=sys.stderr)
        if len(errors) == 1:
            print("entry", end=' ', file=sys.stderr)
        else:
            print("entries", end=' ', file=sys.stderr)
        print("from bibtex:", file=sys.stderr)
        for error in errors:
            print(" %s" % (error), file=sys.stderr)
        sys.exit(1)
    else:
        sys.exit(0)
def test_visibility(self):
    """A new document starts private and visibility toggles cleanly."""
    doc = Document('title1', 'desc1', 1, ['1.pdf', '2.pdf'], 'pdf')
    self.assertFalse(doc.is_public())
    transitions = [
        (doc.make_public, self.assertTrue),
        (doc.make_private, self.assertFalse),
    ]
    for toggle, check in transitions:
        toggle()
        check(doc.is_public())
class Test(TestCase):
    """Generate a four-exercise fraction worksheet and render it to TeX."""

    # (generator method name, exercise description) per difficulty level.
    # The original repeated the build-questions/add-exercise sequence four
    # times; a data-driven loop removes the duplication.
    _LEVELS = [
        ('pattern_fraction_lv0020', '분수끼리의 덧셈 (초5 1학기)'),
        ('pattern_fraction_lv0030', '분수끼리의 뺄셈 (초5 1학기)'),
        ('pattern_fraction_lv0040', '분수끼리의 곱셈 (초5 2학기)'),
        ('pattern_fraction_lv0050', '분수끼리의 나눗셈 (초5 2학기)'),
    ]

    def setUp(self):
        import time
        self.document = Document()
        self.fract = Fraction(seed=time.time())

    def test_gerenate(self):
        # NOTE(review): name keeps the original typo ("gerenate") so test
        # discovery and any external references are unaffected.
        for method_name, description in self._LEVELS:
            generate = getattr(self.fract, method_name)
            questions = [generate() for _ in range(10)]
            self.document.add_exercise({'description': description,
                                        'pattern': questions})
        print(self.document.generate_tex())
def test_document_update_with_invalid_id(self):
    """update_document() with a nonexistent id must raise ValueError."""
    a = Document(
        'A', 'description of A', 1,
        ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf')
    document_id = self._document_manager.add_document(a)
    b = Document('B', 'description of B', 2, ['/tmp/edms/samples/b.doc'],
                 'doc')
    # Only the call under test sits inside assertRaises.  Previously the
    # whole setup ran inside the context manager, so a ValueError raised
    # by Document() or add_document() would have passed the test for the
    # wrong reason.
    with self.assertRaises(ValueError):
        self._document_manager.update_document(document_id + 1, b)
def test_document_update(self):
    """Updating a stored document replaces its fields with the new ones."""
    original = Document('A', 'description of A', 1,
                        ['/tmp/edms/samples/a1.pdf',
                         '/tmp/edms/samples/a2.pdf'], 'pdf')
    doc_id = self._document_manager.add_document(original)
    replacement = Document('B', 'description of B', 2,
                           ['/tmp/edms/samples/b.doc'], 'doc')
    self._document_manager.update_document(doc_id, replacement)
    stored = self._document_manager.find_document_by_id(doc_id)
    self.assertEqual(stored.author, replacement.author)
    self.assertEqual(stored.doc_format, replacement.doc_format)
def restore(self, log=False):
    """Restore a database from an existing root.

    Walks every numeric directory under the root and re-adds the bibtex,
    PDF files and tags found there to the corresponding document,
    creating the document when it is not already in the database.

    Fix: Python 2 ``print >>`` statements converted to Python 3
    ``print(..., file=...)`` calls.
    """
    docdirs = os.listdir(self.root)
    docdirs.sort()
    for ddir in docdirs:
        if ddir == '.xapers':
            continue
        docdir = os.path.join(self.root, ddir)
        if not os.path.isdir(docdir):
            # skip things that aren't directories
            continue
        if log:
            print(docdir, file=sys.stderr)
        # if we can't convert the directory name into an integer,
        # assume it's not relevant to us and continue
        try:
            docid = int(ddir)
        except ValueError:
            continue
        docfiles = os.listdir(docdir)
        if not docfiles:
            # skip empty directories
            continue
        if log:
            print(' docid:', docid, file=sys.stderr)
        try:
            doc = self[docid]
        except xapian.DocNotFoundError:
            doc = Document(self, docid=docid)
        for dfile in docfiles:
            dpath = os.path.join(docdir, dfile)
            if dfile == 'bibtex':
                if log:
                    print(' adding bibtex', file=sys.stderr)
                doc.add_bibtex(dpath)
            elif os.path.splitext(dpath)[1] == '.pdf':
                if log:
                    print(' adding file:', dfile, file=sys.stderr)
                doc.add_file(dpath)
            elif dfile == 'tags':
                if log:
                    print(' adding tags', file=sys.stderr)
                with open(dpath, 'r') as f:
                    tags = f.read().strip().split('\n')
                doc.add_tags(tags)
        # Persist once per document, after all of its files are processed.
        doc.sync()
def add_copy_for_doc(self, original: dc.Document, copy_count):
    """Add *copy_count* copies to *original* and log the action.

    Only privilege level 3 may do this; any other privilege level is
    silently ignored, exactly as before (the old empty ``else: return``
    branch is now a guard clause).
    """
    if self.get_priv() != 3:
        return
    original.add_copy(copy_count)
    db.update(title=original.get_title(),
              copies=original.get_count_of_copies())
    date = get_date()
    # %s applies str() to each value, matching the old manual str()+concat.
    db.insert_log("%s | Librarian(%s) added copy for book: %s"
                  % (date, self.get_id(), original.get_title()))
def test_review_request_1(self):
    """Submitting a document and sending the first review request succeeds."""
    submission = Document('Submission', 'First submission', 1,
                          ['/tmp/edms/samples/submission.pdf'], 'pdf')
    submission_id = self._document_manager.add_document(submission)
    request = Document('request_1', 'First request', 1,
                       ['/tmp/edms/samples/request_1.pdf'], 'pdf')
    request_id = self._document_manager.add_document(request)
    review = self._review_manager
    review.select_document(submission_id)
    review.submit_document(self._author_id, self._manager_id, submission_id)
    review.send_reviewing_request_1(self._manager_id, self._reviewer_1_id,
                                    request_id)
    review.save_review()
def test_document_remove(self):
    """Removing a document shrinks the collection and keeps the rest."""
    first = Document('A', 'description of A', 1,
                     ['/tmp/edms/samples/a1.pdf',
                      '/tmp/edms/samples/a2.pdf'], 'pdf')
    first_id = self._document_manager.add_document(first)
    second = Document('B', 'description of B', 2,
                      ['/tmp/edms/samples/b.doc'], 'doc')
    second_id = self._document_manager.add_document(second)
    self.assertEqual(self._document_manager.count_documents(), 2)
    self._document_manager.remove_document(first_id)
    # sleep(1000)
    survivor = self._document_manager.find_document_by_id(second_id)
    self.assertEqual(survivor.title, 'B')
    self.assertEqual(self._document_manager.count_documents(), 1)
def predict(files, model_path, output_dir, format):
    """Predict concept labels for each text file and write .con output.

    files      -- input .txt paths
    model_path -- path to the pickled model
    output_dir -- directory receiving the .con prediction files
    format     -- output format name; only 'i2b2' is supported

    Fix: Python 2 print statements converted to Python 3 print() calls.
    """
    # Must specify output format
    if format not in ['i2b2']:
        print('\n\tError: Must specify output format', file=sys.stderr)
        print('\tAvailable formats: i2b2', file=sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # Load model
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    # Tell user if not predicting
    if not files:
        print("\n\tYou did not supply any input files\n", file=sys.stderr)
        exit(1)

    # For each file, predict concept labels
    n = len(files)
    for i, txt in enumerate(sorted(files)):
        note = Document(txt)
        fname = os.path.splitext(os.path.basename(txt))[0] + '.con'
        out_path = os.path.join(output_dir, fname)

        # Skip files that already have a prediction on disk.
        if os.path.exists(out_path):
            #print('\tWARNING: prediction file already exists (%s)' % out_path)
            continue

        print('-' * 30)
        print('\n\t%d of %d' % (i + 1, n))
        print('\t', txt, '\n')

        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)

        # Output the concept predictions
        print('\n\nwriting to: ', out_path)
        with open(out_path, 'w') as f:
            print(output, file=f)
        print()
def test_properties(self):
    """Constructor arguments are exposed unchanged through the properties."""
    doc = Document('title1', 'desc1', 1, ['1.pdf', '2.pdf'], 'pdf')
    expected = {
        'title': 'title1',
        'description': 'desc1',
        'author': 1,
        'files': ['1.pdf', '2.pdf'],
        'doc_format': 'pdf',
    }
    for attribute, value in expected.items():
        self.assertEqual(getattr(doc, attribute), value)
def test_invalid_id(self):
    """Looking up an id that was never issued raises ValueError."""
    doc = Document('title1', 'desc1', 1,
                   ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'],
                   'pdf')
    known_id = self._document_manager.add_document(doc)
    missing_id = known_id + 1
    with self.assertRaises(ValueError):
        self._document_manager.find_document_by_id(missing_id)
def test_find_documents_by_author(self):
    """find_documents_by_author() returns only that author's documents."""
    by_author_1 = Document('A', 'description of A', 1,
                           ['/tmp/edms/samples/a1.pdf',
                            '/tmp/edms/samples/a2.pdf'], 'pdf')
    self._document_manager.add_document(by_author_1)
    by_author_2 = Document('B', 'description of B', 2,
                           ['/tmp/edms/samples/b.doc'], 'doc')
    self._document_manager.add_document(by_author_2)
    matches = self._document_manager.find_documents_by_author(1)
    self.assertEqual(len(matches), 1)
    found = matches[0]
    for attribute in ('title', 'description', 'author', 'doc_format'):
        self.assertEqual(getattr(by_author_1, attribute),
                         getattr(found, attribute))
    # The stored file names are the basenames of the input paths.
    self.assertEqual(found.files, ['a1.pdf', 'a2.pdf'])
def test_document_remove_with_invalid_id(self):
    """Removing a nonexistent document id raises ValueError."""
    doc = Document('A', 'description of A', 1,
                   ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'],
                   'pdf')
    stored_id = self._document_manager.add_document(doc)
    with self.assertRaises(ValueError):
        self._document_manager.remove_document(stored_id + 1)
def train(training_list, model_path, format, logfile=None):
    """Train a ClinerModel on (txt, con) file pairs and pickle it.

    training_list -- iterable of (text_file, concept_file) path pairs
    model_path    -- destination for the pickled model
    format        -- data format name (accepted for interface parity;
                     unused in this function)
    logfile       -- optional extra destination for the training log

    Fix: Python 2 print statements converted to Python 3 print() calls;
    dead commented-out exception handling removed.
    """
    # Read the data into Document objects
    docs = []
    for txt, con in training_list:
        docs.append(Document(txt, con))

    if not docs:
        print('Error: Cannot train on 0 files. Terminating train.')
        exit(1)

    # Create a Machine Learning model
    model = ClinerModel()

    # Train the model using the Document's data
    model.fit_from_documents(docs)

    # Pickle dump
    print('\nserializing model to %s\n' % model_path)
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    model.log(logfile, model_file=model_path)
    model.log(sys.stdout, model_file=model_path)
def restore(self, log=False):
    """Restore a database from an existing root.

    Walks every numeric directory under the root and re-adds the bibtex,
    PDF files and tags found there to the corresponding document,
    creating the document when it is not already in the database.

    Fix: Python 2 ``print >>`` statements converted to Python 3
    ``print(..., file=...)`` calls.
    """
    docdirs = os.listdir(self.root)
    docdirs.sort()
    for ddir in docdirs:
        if ddir == '.xapers':
            continue
        docdir = os.path.join(self.root, ddir)
        if not os.path.isdir(docdir):
            # skip things that aren't directories
            continue
        if log:
            print(docdir, file=sys.stderr)
        # if we can't convert the directory name into an integer,
        # assume it's not relevant to us and continue
        try:
            docid = int(ddir)
        except ValueError:
            continue
        docfiles = os.listdir(docdir)
        if not docfiles:
            # skip empty directories
            continue
        if log:
            print(' docid:', docid, file=sys.stderr)
        try:
            doc = self[docid]
        except xapian.DocNotFoundError:
            doc = Document(self, docid=docid)
        for dfile in docfiles:
            dpath = os.path.join(docdir, dfile)
            if dfile == 'bibtex':
                if log:
                    print(' adding bibtex', file=sys.stderr)
                doc.add_bibtex(dpath)
            elif os.path.splitext(dpath)[1] == '.pdf':
                if log:
                    print(' adding file:', dfile, file=sys.stderr)
                doc.add_file(dpath)
            elif dfile == 'tags':
                if log:
                    print(' adding tags', file=sys.stderr)
                with open(dpath, 'r') as f:
                    tags = f.read().strip().split('\n')
                doc.add_tags(tags)
        # Persist once per document, after all of its files are processed.
        doc.sync()
def test_submission(self):
    """An author can submit a selected document to the manager."""
    doc = Document('Submission', 'First submission', 1,
                   ['/tmp/edms/samples/submission.pdf'], 'pdf')
    doc_id = self._document_manager.add_document(doc)
    review = self._review_manager
    review.select_document(doc_id)
    review.submit_document(self._author_id, self._manager_id, doc_id)
    review.save_review()
def test_find_documents_by_format(self):
    """find_documents_by_format() returns exactly the pdf documents."""
    catalog = [
        Document('A', 'description of A', 1,
                 ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'],
                 'pdf'),
        Document('B', 'description of B', 2, ['/tmp/edms/samples/b.doc'],
                 'doc'),
        Document('C', 'description of A', 1, [
            '/tmp/edms/samples/c1.pdf', '/tmp/edms/samples/c2.pdf',
            '/tmp/edms/samples/c3.pdf'
        ], 'pdf'),
    ]
    for doc in catalog:
        self._document_manager.add_document(doc)
    pdf_documents = self._document_manager.find_documents_by_format('pdf')
    self.assertEqual(len(pdf_documents), 2)
    titles = {document.title for document in pdf_documents}
    for expected_title in ('A', 'C'):
        self.assertIn(expected_title, titles)
def _doc_for_term(self, term):
    """Return the Document for the first match of *term*, or None."""
    enquire = xapian.Enquire(self.xapian)
    enquire.set_query(xapian.Query(term))
    # Ask for up to two matches so a duplicate could be detected.
    # FIXME: need to throw an exception if more than one match found
    matches = enquire.get_mset(0, 2)
    if not matches:
        return None
    return Document(self, matches[0].document)
def test_find_document_by_id(self):
    """Each stored document is retrievable by its own id."""
    blueprints = [
        ('A', 'description of A', 1,
         ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf'),
        ('B', 'description of B', 2, ['/tmp/edms/samples/b.doc'], 'doc'),
        ('C', 'description of C', 1, [
            '/tmp/edms/samples/c1.html', '/tmp/edms/samples/c2.png',
            '/tmp/edms/samples/c3.png'
        ], 'html'),
    ]
    ids = [self._document_manager.add_document(Document(*spec))
           for spec in blueprints]
    # Look the documents up in reverse insertion order.
    for doc_id, spec in reversed(list(zip(ids, blueprints))):
        found = self._document_manager.find_document_by_id(doc_id)
        self.assertEqual(found.title, spec[0])
def create_document():
    """Create a document from the JSON request body and persist it."""
    body = request.get_json()
    # Only the subscript lookups can raise KeyError; .get() never does,
    # so the optional fields are read outside the try block.
    try:
        title = body["title"]
        content = body["content"]
    except KeyError:
        return "Data missing from request"
    document = Document(title=title,
                        content=content,
                        parent=body.get("parent"),
                        children=body.get("children"))
    document.save()
    return respond(document=document.to_mongo())
def test_submission_with_invalid_roles(self):
    """Submission fails once the author role has been revoked."""
    doc = Document('Submission', 'First submission', 1,
                   ['/tmp/edms/samples/submission.pdf'], 'pdf')
    doc_id = self._document_manager.add_document(doc)
    self._review_manager.select_document(doc_id)
    # Revoke the role that submit_document() requires.
    self._user_manager.remove_role(self._author_id, 'author')
    with self.assertRaises(ValueError):
        self._review_manager.submit_document(self._author_id,
                                             self._manager_id, doc_id)
    self._review_manager.save_review()
def test_retrieve_last_document(self):
    """A just-added document round-trips through the manager."""
    stored = Document(
        'title1', 'desc1', 1,
        ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf')
    doc_id = self._document_manager.add_document(stored)
    retrieved = self._document_manager.find_document_by_id(doc_id)
    expectations = {
        'title': 'title1',
        'description': 'desc1',
        'author': 1,
        'files': ['a1.pdf', 'a2.pdf'],
        'doc_format': 'pdf',
    }
    for attribute, value in expectations.items():
        self.assertEqual(getattr(retrieved, attribute), value)
def get_documents_from_file(file_name, neutral):
    # Parse the ``.aa`` annotation file and let handle_dom() populate the
    # module-level ``index_list``; entries appear to come in triples of
    # (start offset, end offset, label) -- TODO confirm against handle_dom().
    dom = parse(file_name + '.aa')
    handle_dom(dom, neutral)
    i = 0
    while i < len(index_list):
        # A label entry is used only when the two preceding entries are
        # numeric offsets.  NOTE(review): for i < 2 the ``i - 1``/``i - 2``
        # indices wrap around to the END of the list (Python negative
        # indexing), so the first entries are matched against the tail --
        # verify this is intended.
        if (((index_list[i] == "positive") or (index_list[i] == "negative")
             or (neutral and (index_list[i] == "none")))
                and index_list[i - 1].isdigit()
                and index_list[i - 2].isdigit()):
            # Extract the labelled span from the ``.ac`` text file and store
            # it as a classified Document in the module-level ``true_list``.
            text = fetch_text(file_name + ".ac", int(index_list[i - 2]),
                              int(index_list[i - 1]))
            true_list.append(
                Document(build_vector(text, neutral), index_list[i]))
        i = i + 1
def test_multiple_document_addition(self):
    """Adding 100 documents yields a count of 100."""
    # Generate the nine sample files once up front; the original
    # regenerated every random file on each of the 100 iterations.
    if not path.exists('/tmp/edms/samples'):
        makedirs('/tmp/edms/samples')
    document_generator = generator.DocumentGenerator()
    for name in [
            'a1.pdf', 'a2.pdf', 'b.doc', 'c1.html', 'c2.png', 'c3.png',
            'c1.pdf', 'c2.pdf', 'c3.pdf'
    ]:
        document_generator.generate_random_file(
            '/tmp/edms/samples/{}'.format(name))
    for i in range(100):
        document = Document(
            'Title {}'.format(i), 'Desc {}'.format(i), 1,
            ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf')
        self._document_manager.add_document(document)
    self.assertEqual(self._document_manager.count_documents(), 100)
def read_doc_from_file(config, filename, name=None):
    """
    Create a document from file.

    Returns None when the Python source has syntax problems, or when
    'acceptempty' is disabled and the file contains (almost) no code
    (the module node is always the root of a Python AST).  *name*
    defaults to *filename*.
    """
    if not name:
        name = filename
    # ``with`` guarantees the handle is closed on every exit path,
    # replacing the manual open()/try/finally/close() pairing.
    try:
        with open(filename, 'rb') as o_file:
            text = ast(config, o_file.read())
        if not config['acceptempty'] and len(text) <= 2:
            return None
        return Document(name, text)
    except (SyntaxError, UnicodeEncodeError):
        return None
def test_reviewing_process_with_evaluation(self):
    """Drive a complete review cycle through to the final evaluation."""
    def stored(title, description, basename):
        # Create a one-file pdf document and return its assigned id.
        doc = Document(title, description, 1,
                       ['/tmp/edms/samples/%s.pdf' % basename], 'pdf')
        return self._document_manager.add_document(doc)

    submission_id = stored('Submission', 'First submission', 'submission')
    request_1_id = stored('request_1', 'First request', 'request_1')
    request_2_id = stored('request_2', 'Second request', 'request_2')
    response_1_id = stored('response_1', 'First response', 'response_1')
    response_2_id = stored('response_2', 'Second response', 'response_2')
    evaluation_result_id = stored('response_2', 'Second response',
                                  'evaluation_result')

    review = self._review_manager
    review.select_document(submission_id)
    # Each workflow step is persisted before the next one runs.
    steps = [
        (review.submit_document,
         (self._author_id, self._manager_id, submission_id)),
        (review.send_reviewing_request_1,
         (self._manager_id, self._reviewer_1_id, request_1_id)),
        (review.send_reviewing_request_2,
         (self._manager_id, self._reviewer_2_id, request_2_id)),
        (review.send_review_1,
         (self._reviewer_1_id, self._manager_id, response_1_id)),
        (review.send_review_2,
         (self._reviewer_2_id, self._manager_id, response_2_id)),
        (review.send_evaluation,
         (self._manager_id, self._author_id, evaluation_result_id)),
    ]
    for action, arguments in steps:
        action(*arguments)
        review.save_review()
def importbib(db, bibfile, tags=None, overwrite=False):
    """Import every entry of *bibfile* into *db*.

    Entries are matched to existing documents by bib key and by any
    recognized source ids; unmatched entries create new documents.
    Exits the process with status 1 if any entry failed, 0 otherwise.

    Fixes: Python 2 ``print >>`` statements (SyntaxError on Python 3)
    converted to ``print(..., file=...)``; mutable default ``tags=[]``
    replaced by the None sentinel.
    """
    if tags is None:
        tags = []
    errors = []
    sources = Sources()
    for entry in sorted(Bibtex(bibfile), key=lambda entry: entry.key):
        print(entry.key, file=sys.stderr)
        try:
            docs = []
            # check for doc with this bibkey
            bdoc = db.doc_for_bib(entry.key)
            if bdoc:
                docs.append(bdoc)
            # check for known sids
            for source in sources.scan_bibentry(entry):
                sdoc = db.doc_for_source(source.sid)
                # FIXME: why can't we match docs in list?
                if sdoc and sdoc.docid not in [doc.docid for doc in docs]:
                    docs.append(sdoc)
            if len(docs) == 0:
                doc = Document(db)
            elif len(docs) > 0:
                if len(docs) > 1:
                    print(" Multiple distinct docs found for entry. Using first found.", file=sys.stderr)
                doc = docs[0]
                print(" Updating id:%d..." % (doc.docid), file=sys.stderr)
            doc.add_bibentry(entry)
            filepath = entry.get_file()
            if filepath:
                print(" Adding file: %s" % filepath, file=sys.stderr)
                doc.add_file(filepath)
            doc.add_tags(tags)
            doc.sync()
        except BibtexError as e:
            print(" Error processing entry %s: %s" % (entry.key, e),
                  file=sys.stderr)
            print(file=sys.stderr)
            errors.append(entry.key)
    if errors:
        print(file=sys.stderr)
        # end=' ' reproduces the Python 2 trailing-comma print behaviour.
        print("Failed to import %d" % (len(errors)), end=' ', file=sys.stderr)
        if len(errors) == 1:
            print("entry", end=' ', file=sys.stderr)
        else:
            print("entries", end=' ', file=sys.stderr)
        print("from bibtex:", file=sys.stderr)
        for error in errors:
            print(" %s" % (error), file=sys.stderr)
        sys.exit(1)
    else:
        sys.exit(0)
def add(db, query_string, infile=None, sid=None, tags=None, prompt=False):
    """Add or update a document in *db*.

    query_string -- if given, must match exactly one existing document,
                    which is then updated in place
    infile       -- path of a file to attach; True means "retrieve the
                    file from the source"
    sid          -- source identifier, or a path to a bibtex file
    tags         -- tags to add to the document
    prompt       -- interactively prompt for file, source and tags

    Returns the docid of the added/updated document; exits the process
    on fatal errors.  Fix: Python 2 ``print >>`` statements converted to
    Python 3 ``print(..., file=...)`` calls.
    """
    doc = None
    bibtex = None
    sources = Sources()
    doc_sid = sid
    source = None
    file_data = None

    if infile and infile is not True:
        infile = os.path.expanduser(infile)

    ##################################
    # if query provided, find single doc to update
    if query_string:
        if db.count(query_string) != 1:
            print("Search '%s' did not match a single document." % query_string, file=sys.stderr)
            print("Aborting.", file=sys.stderr)
            sys.exit(1)
        for doc in db.search(query_string):
            break

    ##################################
    # do fancy option prompting
    if prompt:
        doc_sids = []
        if doc_sid:
            doc_sids = [doc_sid]
        # scan the file for source info
        if infile is not True:
            infile = prompt_for_file(infile)
            print("Scanning document for source identifiers...", file=sys.stderr)
            try:
                ss = sources.scan_file(infile)
            except ParseError as e:
                print("\n", file=sys.stderr)
                print("Parse error: %s" % e, file=sys.stderr)
                sys.exit(1)
            if len(ss) == 0:
                print("0 source ids found.", file=sys.stderr)
            else:
                if len(ss) == 1:
                    print("1 source id found:", file=sys.stderr)
                else:
                    print("%d source ids found:" % (len(ss)), file=sys.stderr)
                for sid in ss:
                    print("  %s" % (sid), file=sys.stderr)
                doc_sids += [s.sid for s in ss]
        doc_sid = prompt_for_source(db, doc_sids)
        tags = prompt_for_tags(db, tags)

    if not query_string and not infile and not doc_sid:
        print("Must specify file or source to import, or query to update existing document.", file=sys.stderr)
        sys.exit(1)

    ##################################
    # process source and get bibtex

    # check if source is a file, in which case interpret it as bibtex
    if doc_sid and os.path.exists(doc_sid):
        bibtex = doc_sid
    elif doc_sid:
        # get source object for sid string
        try:
            source = sources.match_source(doc_sid)
        except SourceError as e:
            print(e, file=sys.stderr)
            sys.exit(1)
        # check that the source doesn't match an existing doc
        sdoc = db.doc_for_source(source.sid)
        if sdoc:
            if doc and sdoc != doc:
                print("A different document already exists for source '%s'." % (doc_sid), file=sys.stderr)
                print("Aborting.", file=sys.stderr)
                sys.exit(1)
            print("Source '%s' found in database. Updating existing document..." % (doc_sid), file=sys.stderr)
            doc = sdoc
        try:
            print("Retrieving bibtex...", end=' ', file=sys.stderr)
            bibtex = source.fetch_bibtex()
            print("done.", file=sys.stderr)
        except SourceError as e:
            print("\n", file=sys.stderr)
            print("Could not retrieve bibtex: %s" % e, file=sys.stderr)
            sys.exit(1)
        if infile is True:
            try:
                print("Retrieving file...", end=' ', file=sys.stderr)
                file_name, file_data = source.fetch_file()
                print("done.", file=sys.stderr)
            except SourceError as e:
                print("\n", file=sys.stderr)
                print("Could not retrieve file: %s" % e, file=sys.stderr)
                sys.exit(1)
    elif infile is True:
        print("Must specify source with retrieve file option.", file=sys.stderr)
        sys.exit(1)

    if infile and not file_data:
        with open(infile, 'r') as f:
            file_data = f.read()
        file_name = os.path.basename(infile)

    ##################################
    # if we still don't have a doc, create a new one
    if not doc:
        doc = Document(db)

    ##################################
    # add stuff to the doc
    if bibtex:
        try:
            print("Adding bibtex...", end=' ', file=sys.stderr)
            doc.add_bibtex(bibtex)
            print("done.", file=sys.stderr)
        except BibtexError as e:
            print("\n", file=sys.stderr)
            print(e, file=sys.stderr)
            print("Bibtex must be a plain text file with a single bibtex entry.", file=sys.stderr)
            sys.exit(1)
        except:
            # intentionally broad: emit a newline, then re-raise unchanged
            print("\n", file=sys.stderr)
            raise

    # add source sid if it hasn't been added yet
    if source and not doc.get_sids():
        doc.add_sid(source.sid)

    if infile:
        try:
            print("Adding file...", end=' ', file=sys.stderr)
            doc.add_file_data(file_name, file_data)
            print("done.", file=sys.stderr)
        except ParseError as e:
            print("\n", file=sys.stderr)
            print("Parse error: %s" % e, file=sys.stderr)
            sys.exit(1)
        except:
            print("\n", file=sys.stderr)
            raise

    if tags:
        try:
            print("Adding tags...", end=' ', file=sys.stderr)
            doc.add_tags(tags)
            print("done.", file=sys.stderr)
        except:
            print("\n", file=sys.stderr)
            raise

    ##################################
    # sync the doc to db and disk
    try:
        print("Syncing document...", end=' ', file=sys.stderr)
        doc.sync()
        print("done.\n", end='', file=sys.stderr)
    except:
        print("\n", file=sys.stderr)
        raise

    print_doc_summary(doc)
    return doc.docid
def add(db, query_string, infile=None, sid=None, tags=None, prompt=False):
    """Add or update a document in *db*.

    query_string -- if given, must match exactly one existing document,
                    which is then updated in place
    infile       -- path of a file to attach; True means "retrieve the
                    file from the source"
    sid          -- source identifier, or a path to a bibtex file
    tags         -- tags to add to the document
    prompt       -- interactively prompt for file, source and tags

    Returns the docid of the added/updated document; exits the process
    on fatal errors.  Fix: Python 2 ``print >>`` statements converted to
    Python 3 ``print(..., file=...)`` calls.
    """
    doc = None
    bibtex = None
    sources = Sources()
    doc_sid = sid
    source = None
    file_data = None

    if infile and infile is not True:
        infile = os.path.expanduser(infile)

    ##################################
    # if query provided, find single doc to update
    if query_string:
        if db.count(query_string) != 1:
            print("Search '%s' did not match a single document." % query_string, file=sys.stderr)
            print("Aborting.", file=sys.stderr)
            sys.exit(1)
        for doc in db.search(query_string):
            break

    ##################################
    # do fancy option prompting
    if prompt:
        doc_sids = []
        if doc_sid:
            doc_sids = [doc_sid]
        # scan the file for source info
        if infile is not True:
            infile = prompt_for_file(infile)
            print("Scanning document for source identifiers...", file=sys.stderr)
            try:
                ss = sources.scan_file(infile)
            except ParseError as e:
                print("\n", file=sys.stderr)
                print("Parse error: %s" % e, file=sys.stderr)
                sys.exit(1)
            if len(ss) == 0:
                print("0 source ids found.", file=sys.stderr)
            else:
                if len(ss) == 1:
                    print("1 source id found:", file=sys.stderr)
                else:
                    print("%d source ids found:" % (len(ss)), file=sys.stderr)
                for sid in ss:
                    print("  %s" % (sid), file=sys.stderr)
                doc_sids += [s.sid for s in ss]
        doc_sid = prompt_for_source(db, doc_sids)
        tags = prompt_for_tags(db, tags)

    if not query_string and not infile and not doc_sid:
        print("Must specify file or source to import, or query to update existing document.", file=sys.stderr)
        sys.exit(1)

    ##################################
    # process source and get bibtex

    # check if source is a file, in which case interpret it as bibtex
    if doc_sid and os.path.exists(doc_sid):
        bibtex = doc_sid
    elif doc_sid:
        # get source object for sid string
        try:
            source = sources.match_source(doc_sid)
        except SourceError as e:
            print(e, file=sys.stderr)
            sys.exit(1)
        # check that the source doesn't match an existing doc
        sdoc = db.doc_for_source(source.sid)
        if sdoc:
            if doc and sdoc != doc:
                print("A different document already exists for source '%s'." % (doc_sid), file=sys.stderr)
                print("Aborting.", file=sys.stderr)
                sys.exit(1)
            print("Source '%s' found in database. Updating existing document..." % (doc_sid), file=sys.stderr)
            doc = sdoc
        try:
            print("Retrieving bibtex...", end=' ', file=sys.stderr)
            bibtex = source.fetch_bibtex()
            print("done.", file=sys.stderr)
        except SourceError as e:
            print("\n", file=sys.stderr)
            print("Could not retrieve bibtex: %s" % e, file=sys.stderr)
            sys.exit(1)
        if infile is True:
            try:
                print("Retrieving file...", end=' ', file=sys.stderr)
                file_name, file_data = source.fetch_file()
                print("done.", file=sys.stderr)
            except SourceError as e:
                print("\n", file=sys.stderr)
                print("Could not retrieve file: %s" % e, file=sys.stderr)
                sys.exit(1)
    elif infile is True:
        print("Must specify source with retrieve file option.", file=sys.stderr)
        sys.exit(1)

    if infile and not file_data:
        with open(infile, 'r') as f:
            file_data = f.read()
        file_name = os.path.basename(infile)

    ##################################
    # if we still don't have a doc, create a new one
    if not doc:
        doc = Document(db)

    ##################################
    # add stuff to the doc
    if bibtex:
        try:
            print("Adding bibtex...", end=' ', file=sys.stderr)
            doc.add_bibtex(bibtex)
            print("done.", file=sys.stderr)
        except BibtexError as e:
            print("\n", file=sys.stderr)
            print(e, file=sys.stderr)
            print("Bibtex must be a plain text file with a single bibtex entry.", file=sys.stderr)
            sys.exit(1)
        except:
            # intentionally broad: emit a newline, then re-raise unchanged
            print("\n", file=sys.stderr)
            raise

    # add source sid if it hasn't been added yet
    if source and not doc.get_sids():
        doc.add_sid(source.sid)

    if infile:
        try:
            print("Adding file...", end=' ', file=sys.stderr)
            doc.add_file_data(file_name, file_data)
            print("done.", file=sys.stderr)
        except ParseError as e:
            print("\n", file=sys.stderr)
            print("Parse error: %s" % e, file=sys.stderr)
            sys.exit(1)
        except:
            print("\n", file=sys.stderr)
            raise

    if tags:
        try:
            print("Adding tags...", end=' ', file=sys.stderr)
            doc.add_tags(tags)
            print("done.", file=sys.stderr)
        except:
            print("\n", file=sys.stderr)
            raise

    ##################################
    # sync the doc to db and disk
    try:
        print("Syncing document...", end=' ', file=sys.stderr)
        doc.sync()
        print("done.\n", end='', file=sys.stderr)
    except:
        print("\n", file=sys.stderr)
        raise

    print_doc_summary(doc)
    return doc.docid
def test_export_documents(self):
    # End-to-end check of Repository.export_documents(): build a repo with
    # two users and two accepted public documents, export both, then verify
    # the payload files and the generated .edd descriptor contents.
    try:
        # Remove leftovers from earlier runs; a missing directory is fine.
        shutil.rmtree('/tmp/test_repo3')
        shutil.rmtree('/tmp/exported_documents')
    except OSError as error:
        pass
    # Stage the three sample payload files next to the importable fixtures,
    # copying them from export_files_for_test only when absent.
    for test_file in ['part1.pdf', 'part2.pdf', 'data.doc']:
        file_to_path = reduce(os.path.join,
                              [SAMPLE_DIR_PATH, 'importable', test_file])
        file_from_path = reduce(os.path.join, [
            SAMPLE_DIR_PATH, 'importable', 'export_files_for_test', test_file
        ])
        if not os.path.exists(file_to_path):
            copy2(file_from_path, file_to_path)
    repository = Repository('Empty', '/tmp/test_repo3')
    alice = User('Alice', 'Smith', date(1980, 10, 10), '*****@*****.**',
                 '****')
    bob = User('Bob', 'Marker', date(1970, 11, 11), '*****@*****.**', '****')
    alice_id = repository._user_manager.add_user(alice)
    bob_id = repository._user_manager.add_user(bob)
    first_document = Document(
        'Some important doc', 'Contains various documentations', alice_id, [
            SAMPLE_DIR_PATH + '/importable/part1.pdf',
            SAMPLE_DIR_PATH + '/importable/part2.pdf'
        ], 'pdf')
    # Documents are made public and driven to 'accepted' before export --
    # presumably a precondition of export_documents(); TODO confirm.
    first_document.make_public()
    first_document.change_state('pending')
    first_document.change_state('accepted')
    second_document = Document('Data report', 'Figures and graphs mainly',
                               bob_id,
                               [SAMPLE_DIR_PATH + '/importable/data.doc'],
                               'doc')
    second_document.make_public()
    second_document.change_state('pending')
    second_document.change_state('accepted')
    first_id = repository._document_manager.add_document(first_document)
    second_id = repository._document_manager.add_document(second_document)
    repository.export_documents([first_id, second_id],
                                '/tmp/exported_documents')
    # Payload files plus one .edd descriptor per exported document.
    self.assertTrue(os.path.exists('/tmp/exported_documents/part1.pdf'))
    self.assertTrue(os.path.exists('/tmp/exported_documents/part2.pdf'))
    self.assertTrue(os.path.exists('/tmp/exported_documents/data.doc'))
    self.assertTrue(os.path.exists('/tmp/exported_documents/1.edd'))
    self.assertTrue(os.path.exists('/tmp/exported_documents/2.edd'))
    with open('/tmp/exported_documents/1.edd') as edd_file:
        lines = edd_file.readlines()
    # The descriptor is INI-style with a [document] section; file entries
    # carry basenames only and the author appears as a full name.
    self.assertIn('[document]\n', lines)
    self.assertIn('title=Some important doc\n', lines)
    self.assertIn('description=Contains various documentations\n', lines)
    self.assertIn('author=Alice Smith\n', lines)
    self.assertIn("files=['part1.pdf', 'part2.pdf']\n", lines)
    self.assertIn('doc_format=pdf\n', lines)
    with open('/tmp/exported_documents/2.edd') as edd_file:
        lines = edd_file.readlines()
    self.assertIn('[document]\n', lines)
    self.assertIn('title=Data report\n', lines)
    self.assertIn('description=Figures and graphs mainly\n', lines)
    self.assertIn('author=Bob Marker\n', lines)
    self.assertIn("files=['data.doc']\n", lines)
    self.assertIn('doc_format=doc\n', lines)
    shutil.rmtree('/tmp/exported_documents')
    shutil.rmtree('/tmp/test_repo3')
def main():
    """Compare concept predictions against a gold standard and show diffs.

    Fix: Python 2 print statements (SyntaxError on Python 3) converted to
    print() calls; comment typo "gols" corrected.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument(
        "--txt",
        dest="txt",
        help="Glob of .txt files of discharge summaries",
    )
    parser.add_argument(
        "--predictions",
        dest="pred",
        help="Directory where predictions are stored.",
    )
    parser.add_argument(
        "--gold",
        dest="gold",
        help="Directory where gold standard is stored.",
    )
    parser.add_argument("--format", dest="format", help="Data format ( con )")
    parser.add_argument(
        "--output",
        dest="output",
        help="Write the evaluation to a file rather than STDOUT",
    )
    args = parser.parse_args()

    if not args.txt:
        print('\n\tERROR: must provide --txt argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.pred:
        print('\n\tERROR: must provide --pred argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)
    if not args.gold:
        print('\n\tERROR: must provide --gold argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    if args.format:
        format = args.format
    else:
        print('\n\tERROR: must provide --format argument\n')
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # Must specify output format
    if format not in ['i2b2']:
        print('\n\tError: Must specify output format', file=sys.stderr)
        print('\tAvailable formats: i2b2', file=sys.stderr)
        print('', file=sys.stderr)
        parser.print_help(sys.stderr)
        print('', file=sys.stderr)
        exit(1)

    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = tools.map_files(txt_files)
    wildcard = '*.con'

    # List of gold data
    ref_files = glob.glob(os.path.join(args.gold, wildcard))
    ref_files_map = tools.map_files(ref_files)

    # List of predictions
    pred_files = glob.glob(os.path.join(args.pred, wildcard))
    pred_files_map = tools.map_files(pred_files)

    # Grouping of text, predictions, gold
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append(
                (txt_files_map[k], pred_files_map[k], ref_files_map[k]))

    # txt <- medical text
    # annotations <- predictions
    # gold <- gold standard
    if len(files) == 0:
        print("No files to be evaluated")
        exit()

    print()
    for txt, annotations, gold in files:
        # Read predictions and gold standard data
        cnote = Document(txt, annotations)
        rnote = Document(txt, gold)
        sents = rnote.getTokenizedSentences()

        # Note - can also get first pass (IOB labels)
        ref = rnote.conlist()
        pred = cnote.conlist()

        for i, toks, pline, rline in zip(range(len(sents)), sents, pred, ref):
            for j, token, rlab, plab in zip(range(len(pline)), toks, rline,
                                            pline):
                if rlab != plab:
                    ind = max(0, j - 3)
                    #print 'ref: ', rline[j-3:j+3]
                    #print 'pred: ', pline[j-3:j+3]
                    print(token)
                    for k in range(ind, j):
                        # end=' ' matches the Python 2 trailing-comma print.
                        print(' ' * (len(toks[k]) + 4), end=' ')
                    print('<>')
                    print(toks[j - 3:j + 3])
                    print('\tpred: ', plab)
                    print('\tref: ', rlab)
                    print('\n')