示例#1
0
 def test_review_response_with_invalid_role(self):
     """The second review must be rejected when sent by the author.

     Runs the workflow up to and including the first review, then checks
     that ``send_review_2`` raises ``ValueError`` when the author id is
     passed as the sender.
     """
     documents = self._document_manager
     reviews = self._review_manager

     # Register every document the workflow refers to.
     submission_id = documents.add_document(
         Document('Submission', 'First submission', 1,
                  ['/tmp/edms/samples/submission.pdf'], 'pdf'))
     first_request_id = documents.add_document(
         Document('request_1', 'First request', 1,
                  ['/tmp/edms/samples/request_1.pdf'], 'pdf'))
     second_request_id = documents.add_document(
         Document('request_2', 'Second request', 1,
                  ['/tmp/edms/samples/request_2.pdf'], 'pdf'))
     first_response_id = documents.add_document(
         Document('response_1', 'First response', 1,
                  ['/tmp/edms/samples/response_1.pdf'], 'pdf'))
     second_response_id = documents.add_document(
         Document('response_2', 'Second response', 1,
                  ['/tmp/edms/samples/response_2.pdf'], 'pdf'))

     # Drive the review up to the first reviewer's response.
     reviews.select_document(submission_id)
     reviews.submit_document(
         self._author_id, self._manager_id, submission_id)
     reviews.send_reviewing_request_1(
         self._manager_id, self._reviewer_1_id, first_request_id)
     reviews.send_reviewing_request_2(
         self._manager_id, self._reviewer_2_id, second_request_id)
     reviews.send_review_1(
         self._reviewer_1_id, self._manager_id, first_response_id)

     # The author is not the second reviewer, so this must fail.
     with self.assertRaises(ValueError):
         reviews.send_review_2(
             self._author_id, self._manager_id, second_response_id)
     reviews.save_review()
示例#2
0
def importbib(db, bibfile, tags=None, overwrite=False):
    """Import every entry of a bibtex file into the database, then exit.

    Each entry is attached to an existing document when one is found,
    either by its bibkey or by a source id scanned from the entry;
    otherwise a new Document is created.  Progress and errors are
    written to stderr.

    Args:
        db: database handle; doc_for_bib() and doc_for_source() are
            called on it, and it is passed to Document() for new docs.
        bibfile: passed to Bibtex() to obtain the entries.
        tags: optional tags added to every imported document.
        overwrite: accepted but not used by this function.

    This function never returns normally: it calls sys.exit(1) when at
    least one entry raised BibtexError and sys.exit(0) otherwise.
    """
    # A list literal as the default would be shared between calls.
    if tags is None:
        tags = []

    errors = []

    sources = Sources()

    # Process entries in bibkey order.
    for entry in sorted(Bibtex(bibfile), key=lambda entry: entry.key):
        print >>sys.stderr, entry.key

        try:
            docs = []

            # check for doc with this bibkey
            bdoc = db.doc_for_bib(entry.key)
            if bdoc:
                docs.append(bdoc)

            # check for known sids
            for source in sources.scan_bibentry(entry):
                sdoc = db.doc_for_source(source.sid)
                # FIXME: why can't we match docs in list?
                if sdoc and sdoc.docid not in [d.docid for d in docs]:
                    docs.append(sdoc)

            if docs:
                if len(docs) > 1:
                    print >>sys.stderr, "  Multiple distinct docs found for entry.  Using first found."
                doc = docs[0]
                print >>sys.stderr, "  Updating id:%d..." % (doc.docid)
            else:
                # nothing matched: start a fresh document
                doc = Document(db)

            doc.add_bibentry(entry)

            filepath = entry.get_file()
            if filepath:
                print >>sys.stderr, "  Adding file: %s" % filepath
                doc.add_file(filepath)

            doc.add_tags(tags)

            doc.sync()

        except BibtexError as e:
            # Record the failure and carry on with the remaining entries.
            print >>sys.stderr, "  Error processing entry %s: %s" % (entry.key, e)
            print >>sys.stderr
            errors.append(entry.key)

    if errors:
        noun = "entry" if len(errors) == 1 else "entries"
        print >>sys.stderr
        print >>sys.stderr, "Failed to import %d %s from bibtex:" % (len(errors), noun)
        for error in errors:
            print >>sys.stderr, "  %s" % (error)
        sys.exit(1)
    else:
        sys.exit(0)
示例#3
0
 def test_visibility(self):
     """A new document is private; make_public/make_private toggle it."""
     doc = Document('title1', 'desc1', 1, ['1.pdf', '2.pdf'], 'pdf')

     # Private by default.
     self.assertFalse(doc.is_public())

     # Publishing flips the flag ...
     doc.make_public()
     self.assertTrue(doc.is_public())

     # ... and it can be flipped back.
     doc.make_private()
     self.assertFalse(doc.is_public())
示例#4
0
class Test(TestCase):
    """Smoke test: fill a document with fraction exercises and print it."""

    def setUp(self):
        from time import time
        self.document = Document()
        # Seed from the wall clock, so every run produces new questions.
        self.fract = Fraction(seed=time())

    def test_gerenate(self):
        """Add ten questions per fraction pattern, then print the TeX."""
        # (generator method name, exercise description), in output order.
        exercises = (
            ('pattern_fraction_lv0020', '분수끼리의 덧셈 (초5 1학기)'),
            ('pattern_fraction_lv0030', '분수끼리의 뺄셈 (초5 1학기)'),
            ('pattern_fraction_lv0040', '분수끼리의 곱셈 (초5 2학기)'),
            ('pattern_fraction_lv0050', '분수끼리의 나눗셈 (초5 2학기)'),
        )
        for method_name, description in exercises:
            questions = [getattr(self.fract, method_name)()
                         for _ in range(10)]
            self.document.add_exercise(
                {'description': description, 'pattern': questions})

        print(self.document.generate_tex())
 def test_document_update_with_invalid_id(self):
     """Updating under an id other than the stored one raises ValueError.

     One document is stored and the update is attempted with
     ``document_id + 1``.
     """
     a = Document(
         'A', 'description of A', 1,
         ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'],
         'pdf')
     document_id = self._document_manager.add_document(a)
     b = Document('B', 'description of B', 2,
                  ['/tmp/edms/samples/b.doc'], 'doc')
     # Only the update call may raise: with the setup outside the context
     # manager, a ValueError from Document() or add_document() fails the
     # test instead of satisfying the assertion.
     with self.assertRaises(ValueError):
         self._document_manager.update_document(document_id + 1, b)
 def test_document_update(self):
     """Updating a stored document replaces its author and format."""
     manager = self._document_manager

     original = Document(
         'A', 'description of A', 1,
         ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf')
     document_id = manager.add_document(original)

     replacement = Document(
         'B', 'description of B', 2, ['/tmp/edms/samples/b.doc'], 'doc')
     manager.update_document(document_id, replacement)

     # Read the document back and compare against the replacement.
     stored = manager.find_document_by_id(document_id)
     self.assertEqual(stored.author, replacement.author)
     self.assertEqual(stored.doc_format, replacement.doc_format)
示例#7
0
    def restore(self, log=False):
        """Rebuild documents from the directories found under self.root.

        A sub-directory is processed when its name parses as an integer
        (used as the docid) and it contains at least one entry.  Within
        it, a file named 'bibtex', any '*.pdf' file and a file named
        'tags' are added to the document, which is then synced.

        With log=True, progress is written to stderr.
        """
        for entry in sorted(os.listdir(self.root)):
            if entry == '.xapers':
                continue
            doc_path = os.path.join(self.root, entry)
            # anything that is not a directory is ignored
            if not os.path.isdir(doc_path):
                continue

            if log:
                print >> sys.stderr, doc_path

            # directories with a non-integer name are assumed to be
            # unrelated to the database
            try:
                docid = int(entry)
            except ValueError:
                continue

            contents = os.listdir(doc_path)
            # nothing to restore from an empty directory
            if not contents:
                continue

            if log:
                print >> sys.stderr, '  docid:', docid

            # reuse the existing document if there is one
            try:
                doc = self[docid]
            except xapian.DocNotFoundError:
                doc = Document(self, docid=docid)

            for name in contents:
                path = os.path.join(doc_path, name)
                extension = os.path.splitext(path)[1]
                if name == 'bibtex':
                    if log:
                        print >> sys.stderr, '  adding bibtex'
                    doc.add_bibtex(path)
                elif extension == '.pdf':
                    if log:
                        print >> sys.stderr, '  adding file:', name
                    doc.add_file(path)
                elif name == 'tags':
                    if log:
                        print >> sys.stderr, '  adding tags'
                    # one tag per line
                    with open(path, 'r') as tag_file:
                        tags = tag_file.read().strip().split('\n')
                    doc.add_tags(tags)
            doc.sync()
示例#8
0
    def add_copy_for_doc(self, original: dc.Document, copy_count):
        """Add copies to a document and log it; privilege level 3 only.

        When self.get_priv() is not 3 nothing happens.  Otherwise the
        copies are added to ``original``, the record is updated through
        ``db.update`` and a log line is written with ``db.insert_log``.
        """
        if self.get_priv() != 3:
            return

        original.add_copy(copy_count)
        db.update(title=original.get_title(),
                  copies=original.get_count_of_copies())

        # Assemble the log line: "<date> | Librarian(<id>) added copy ..."
        librarian = str(self.get_id())
        stamp = get_date()
        book = str(original.get_title())
        message = stamp + " | Librarian(" + librarian
        message = message + ") added copy for book: " + book
        db.insert_log(message)
示例#9
0
 def test_review_request_1(self):
     """A submitted document can be sent to the first reviewer."""
     documents = self._document_manager
     reviews = self._review_manager

     submission_id = documents.add_document(
         Document('Submission', 'First submission', 1,
                  ['/tmp/edms/samples/submission.pdf'], 'pdf'))
     request_id = documents.add_document(
         Document('request_1', 'First request', 1,
                  ['/tmp/edms/samples/request_1.pdf'], 'pdf'))

     # Submit, then ask the first reviewer for a review.
     reviews.select_document(submission_id)
     reviews.submit_document(
         self._author_id, self._manager_id, submission_id)
     reviews.send_reviewing_request_1(
         self._manager_id, self._reviewer_1_id, request_id)
     reviews.save_review()
 def test_document_remove(self):
     """Removing one document leaves the other one stored and findable."""
     manager = self._document_manager

     first_id = manager.add_document(Document(
         'A', 'description of A', 1,
         ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf'))
     second_id = manager.add_document(Document(
         'B', 'description of B', 2, ['/tmp/edms/samples/b.doc'], 'doc'))
     self.assertEqual(manager.count_documents(), 2)

     manager.remove_document(first_id)

     # The surviving document is still reachable by its id.
     survivor = manager.find_document_by_id(second_id)
     self.assertEqual(survivor.title, 'B')
     self.assertEqual(manager.count_documents(), 1)
示例#11
0
def predict(files, model_path, output_dir, format):
    # Must specify output format
    if format not in ['i2b2']:
        print >>sys.stderr, '\n\tError: Must specify output format'
        print >>sys.stderr,   '\tAvailable formats: i2b2'
        print >>sys.stderr, ''
        exit(1)

    # Load model
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    # Tell user if not predicting
    if not files:
        print >>sys.stderr, "\n\tYou did not supply any input files\n"
        exit(1)

    # For each file, predict concept labels
    n = len(files)
    for i,txt in enumerate(sorted(files)):
        note = Document(txt)

        fname = os.path.splitext(os.path.basename(txt))[0] + '.con'
        out_path = os.path.join(output_dir, fname)
        #'''
        if os.path.exists(out_path):
            #print '\tWARNING: prediction file already exists (%s)' % out_path
            continue
        #'''


        print '-' * 30
        print '\n\t%d of %d' % (i+1,n)
        print '\t', txt, '\n'


        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)


        # Output the concept predictions
        print '\n\nwriting to: ', out_path
        with open(out_path, 'w') as f:
            print >>f, output
        print
示例#12
0
 def test_properties(self):
     """Constructor arguments are exposed unchanged as attributes."""
     document = Document('title1', 'desc1', 1, ['1.pdf', '2.pdf'], 'pdf')
     expectations = (
         ('title', 'title1'),
         ('description', 'desc1'),
         ('author', 1),
         ('files', ['1.pdf', '2.pdf']),
         ('doc_format', 'pdf'),
     )
     for attribute, expected in expectations:
         self.assertEqual(getattr(document, attribute), expected)
 def test_invalid_id(self):
     """Looking up the id after the only stored one raises ValueError."""
     stored_id = self._document_manager.add_document(Document(
         'title1', 'desc1', 1,
         ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf'))
     missing_id = stored_id + 1
     with self.assertRaises(ValueError):
         self._document_manager.find_document_by_id(missing_id)
 def test_find_documents_by_author(self):
     """Searching by author returns only that author's documents."""
     manager = self._document_manager

     by_author_1 = Document(
         'A', 'description of A', 1,
         ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf')
     manager.add_document(by_author_1)
     by_author_2 = Document(
         'B', 'description of B', 2, ['/tmp/edms/samples/b.doc'], 'doc')
     manager.add_document(by_author_2)

     found = manager.find_documents_by_author(1)
     self.assertEqual(len(found), 1)

     match = found[0]
     self.assertEqual(by_author_1.title, match.title)
     self.assertEqual(by_author_1.description, match.description)
     self.assertEqual(by_author_1.author, match.author)
     # The returned document lists bare file names, not full paths.
     self.assertEqual(match.files, ['a1.pdf', 'a2.pdf'])
     self.assertEqual(by_author_1.doc_format, match.doc_format)
 def test_document_remove_with_invalid_id(self):
     """Removing the id after the only stored one raises ValueError."""
     stored_id = self._document_manager.add_document(Document(
         'A', 'description of A', 1,
         ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf'))
     missing_id = stored_id + 1
     with self.assertRaises(ValueError):
         self._document_manager.remove_document(missing_id)
示例#16
0
文件: train.py 项目: wboag/CliNER
def train(training_list, model_path, format, logfile=None):
    """Train a ClinerModel on (text, concept) file pairs and pickle it.

    Each pair in training_list is read into a Document.  The fitted
    model is pickled to model_path and its log is written to `logfile`
    and then to stdout.  Exits with status 1 when training_list yields
    no documents.  `format` is accepted but not used here.
    """
    # Read every (text, concept) pair into a Document
    docs = [Document(txt, con) for txt, con in training_list]

    # Refuse to train on nothing
    if not docs:
        print 'Error: Cannot train on 0 files. Terminating train.'
        exit(1)

    # Fit a fresh model on the documents
    model = ClinerModel()
    model.fit_from_documents(docs)

    # Serialize the fitted model
    print '\nserializing model to %s\n' % model_path
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Log to the requested target first, then to stdout
    for stream in (logfile, sys.stdout):
        model.log(stream, model_file=model_path)
示例#17
0
    def restore(self, log=False):
        """Re-create database documents from the contents of self.root.

        Every sub-directory with an integer name and at least one entry
        is loaded as the document with that docid: its 'bibtex' file,
        its '*.pdf' files and its 'tags' file are added, and the
        document is synced.  Progress goes to stderr when log is true.
        """
        names = os.listdir(self.root)
        names.sort()
        for dirname in names:
            if dirname == '.xapers':
                continue
            dirpath = os.path.join(self.root, dirname)
            if not os.path.isdir(dirpath):
                # plain files in the root are not documents
                continue

            if log:
                print >>sys.stderr, dirpath

            # a directory whose name is not an integer is not ours
            try:
                docid = int(dirname)
            except ValueError:
                continue

            members = os.listdir(dirpath)
            if not members:
                # empty directory: nothing to restore
                continue

            if log:
                print >>sys.stderr, '  docid:', docid

            # update the stored document, or start a new one
            try:
                doc = self[docid]
            except xapian.DocNotFoundError:
                doc = Document(self, docid=docid)

            for member in members:
                member_path = os.path.join(dirpath, member)
                suffix = os.path.splitext(member_path)[1]
                if member == 'bibtex':
                    if log:
                        print >>sys.stderr, '  adding bibtex'
                    doc.add_bibtex(member_path)
                elif suffix == '.pdf':
                    if log:
                        print >>sys.stderr, '  adding file:', member
                    doc.add_file(member_path)
                elif member == 'tags':
                    if log:
                        print >>sys.stderr, '  adding tags'
                    # the tags file holds one tag per line
                    with open(member_path, 'r') as handle:
                        tags = handle.read().strip().split('\n')
                    doc.add_tags(tags)
            doc.sync()
示例#18
0
 def test_submission(self):
     """An author can submit a stored document to the manager."""
     reviews = self._review_manager
     submission_id = self._document_manager.add_document(
         Document('Submission', 'First submission', 1,
                  ['/tmp/edms/samples/submission.pdf'], 'pdf'))

     reviews.select_document(submission_id)
     reviews.submit_document(
         self._author_id, self._manager_id, submission_id)
     reviews.save_review()
 def test_find_documents_by_format(self):
     """Searching by format returns exactly the documents of that format."""
     samples = (
         Document('A', 'description of A', 1,
                  ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'],
                  'pdf'),
         Document('B', 'description of B', 2,
                  ['/tmp/edms/samples/b.doc'], 'doc'),
         Document('C', 'description of A', 1,
                  ['/tmp/edms/samples/c1.pdf', '/tmp/edms/samples/c2.pdf',
                   '/tmp/edms/samples/c3.pdf'],
                  'pdf'),
     )
     for sample in samples:
         self._document_manager.add_document(sample)

     pdf_documents = self._document_manager.find_documents_by_format('pdf')
     self.assertEqual(len(pdf_documents), 2)

     # Both PDF documents, and only those, must come back.
     found_titles = {found.title for found in pdf_documents}
     for expected_title in ('A', 'C'):
         self.assertIn(expected_title, found_titles)
示例#20
0
 def _doc_for_term(self, term):
     """Return a Document for the first match of `term`, or None.

     At most two matches are requested from xapian; only the first one
     is used.
     """
     enquire = xapian.Enquire(self.xapian)
     enquire.set_query(xapian.Query(term))
     matches = enquire.get_mset(0, 2)
     # FIXME: need to throw an exception if more than one match found
     if not matches:
         return None
     return Document(self, matches[0].document)
 def test_find_document_by_id(self):
     """Each stored document can be found again by the id it was given."""
     manager = self._document_manager

     doc_a = Document(
         'A', 'description of A', 1,
         ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf')
     doc_b = Document(
         'B', 'description of B', 2, ['/tmp/edms/samples/b.doc'], 'doc')
     doc_c = Document(
         'C', 'description of C', 1,
         ['/tmp/edms/samples/c1.html', '/tmp/edms/samples/c2.png',
          '/tmp/edms/samples/c3.png'], 'html')

     a_id = manager.add_document(doc_a)
     b_id = manager.add_document(doc_b)
     c_id = manager.add_document(doc_c)

     # Look the documents up in reverse insertion order.
     for document_id, title in ((c_id, 'C'), (b_id, 'B'), (a_id, 'A')):
         found = manager.find_document_by_id(document_id)
         self.assertEqual(found.title, title)
示例#22
0
def create_document():
    """Create and save a Document from the JSON body of the request.

    The body must be a JSON object with "title" and "content"; "parent"
    and "children" are optional and default to None.

    Returns:
        The string "Data missing from request" when the body is absent,
        is not a JSON object, or lacks a required key; otherwise the
        result of respond() for the saved document.
    """
    body = request.get_json()
    try:
        title = body["title"]
        content = body["content"]
    except (KeyError, TypeError):
        # KeyError: a required key is missing.  TypeError: the body cannot
        # be indexed by key (e.g. None when no JSON was sent, or a list).
        return "Data missing from request"

    # Optional fields: .get() yields None when they are absent.
    document = Document(
        title=title,
        content=content,
        parent=body.get("parent"),
        children=body.get("children"),
    )
    document.save()

    return respond(document=document.to_mongo())
示例#23
0
 def test_submission_with_invalid_roles(self):
     """Submitting fails once the 'author' role is removed from the user."""
     reviews = self._review_manager
     submission_id = self._document_manager.add_document(
         Document('Submission', 'First submission', 1,
                  ['/tmp/edms/samples/submission.pdf'], 'pdf'))
     reviews.select_document(submission_id)

     # Strip the role that the submission step is expected to require.
     self._user_manager.remove_role(self._author_id, 'author')

     with self.assertRaises(ValueError):
         reviews.submit_document(
             self._author_id, self._manager_id, submission_id)
     reviews.save_review()
 def test_retrieve_last_document(self):
     """A stored document is read back with the values it was given."""
     manager = self._document_manager
     document_id = manager.add_document(Document(
         'title1', 'desc1', 1,
         ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'], 'pdf'))

     stored = manager.find_document_by_id(document_id)

     self.assertEqual(stored.title, 'title1')
     self.assertEqual(stored.description, 'desc1')
     self.assertEqual(stored.author, 1)
     # Files come back as bare names rather than the original paths.
     self.assertEqual(stored.files, ['a1.pdf', 'a2.pdf'])
     self.assertEqual(stored.doc_format, 'pdf')
def get_documents_from_file(file_name, neutral):
    """Append a Document to true_list for every labelled span in a file.

    Parses ``file_name + '.aa'`` and passes the DOM to handle_dom(), then
    scans the module-level index_list (presumably filled by handle_dom();
    verify there).  Whenever an accepted label is directly preceded by
    two all-digit entries, those two numbers are passed to fetch_text()
    together with ``file_name + '.ac'`` and the resulting text becomes a
    Document carrying that label.

    Args:
        file_name: path without extension; '.aa' and '.ac' are appended.
        neutral: when true, the label "none" is accepted in addition to
            "positive" and "negative"; also passed on to handle_dom()
            and build_vector().
    """
    dom = parse(file_name + '.aa')
    handle_dom(dom, neutral)

    if neutral:
        accepted_labels = ("positive", "negative", "none")
    else:
        accepted_labels = ("positive", "negative")

    # Start at 2: a label needs two entries in front of it.  At index 0 or
    # 1 the look-behind (i - 1, i - 2) would wrap around to the END of the
    # list and could pair a label with unrelated numbers.
    for i in range(2, len(index_list)):
        label = index_list[i]
        if (label in accepted_labels
                and index_list[i - 1].isdigit()
                and index_list[i - 2].isdigit()):
            text = fetch_text(file_name + ".ac", int(index_list[i - 2]),
                              int(index_list[i - 1]))
            true_list.append(Document(build_vector(text, neutral), label))
 def test_multiple_document_addition(self):
     """One hundred documents can be added and are all counted."""
     sample_dir = '/tmp/edms/samples'
     sample_names = [
         'a1.pdf', 'a2.pdf', 'b.doc', 'c1.html', 'c2.png', 'c3.png',
         'c1.pdf', 'c2.pdf', 'c3.pdf'
     ]
     for index in range(100):
         document = Document(
             'Title {}'.format(index), 'Desc {}'.format(index), 1,
             ['/tmp/edms/samples/a1.pdf', '/tmp/edms/samples/a2.pdf'],
             'pdf')
         # The sample files are regenerated before every addition.
         file_generator = generator.DocumentGenerator()
         for sample_name in sample_names:
             if not path.exists(sample_dir):
                 makedirs(sample_dir)
             file_generator.generate_random_file(
                 '/tmp/edms/samples/{}'.format(sample_name))
         self._document_manager.add_document(document)

     self.assertEqual(self._document_manager.count_documents(), 100)
def read_doc_from_file(config, filename, name=None):
    """Create a document from a file.

    Returns None when processing the source raises SyntaxError or
    UnicodeEncodeError.

    Unless config['acceptempty'] is enabled, None is also returned when
    the processed text has a length of 2 or less (source files without
    code; a module is always the root node of a Python AST).

    When `name` is not given, the document is named after `filename`.
    """
    doc_name = name if name else filename
    # The context manager closes the file on every exit path.
    with open(filename, 'rb') as source:
        try:
            text = ast(config, source.read())
            if not config['acceptempty'] and len(text) <= 2:
                return None
            return Document(doc_name, text)
        except (SyntaxError, UnicodeEncodeError):
            return None
示例#28
0
 def test_reviewing_process_with_evaluation(self):
     """Run the review workflow end to end, saving after every step.

     Steps: submission, two reviewing requests, two reviews, and the
     final evaluation sent from the manager to the author.
     """
     documents = self._document_manager
     reviews = self._review_manager

     submission_id = documents.add_document(
         Document('Submission', 'First submission', 1,
                  ['/tmp/edms/samples/submission.pdf'], 'pdf'))
     first_request_id = documents.add_document(
         Document('request_1', 'First request', 1,
                  ['/tmp/edms/samples/request_1.pdf'], 'pdf'))
     second_request_id = documents.add_document(
         Document('request_2', 'Second request', 1,
                  ['/tmp/edms/samples/request_2.pdf'], 'pdf'))
     first_response_id = documents.add_document(
         Document('response_1', 'First response', 1,
                  ['/tmp/edms/samples/response_1.pdf'], 'pdf'))
     second_response_id = documents.add_document(
         Document('response_2', 'Second response', 1,
                  ['/tmp/edms/samples/response_2.pdf'], 'pdf'))
     # NOTE(review): title and description repeat those of the second
     # response -- possibly a copy-paste leftover; confirm before changing.
     evaluation_id = documents.add_document(
         Document('response_2', 'Second response', 1,
                  ['/tmp/edms/samples/evaluation_result.pdf'], 'pdf'))

     reviews.select_document(submission_id)

     reviews.submit_document(
         self._author_id, self._manager_id, submission_id)
     reviews.save_review()

     reviews.send_reviewing_request_1(
         self._manager_id, self._reviewer_1_id, first_request_id)
     reviews.save_review()

     reviews.send_reviewing_request_2(
         self._manager_id, self._reviewer_2_id, second_request_id)
     reviews.save_review()

     reviews.send_review_1(
         self._reviewer_1_id, self._manager_id, first_response_id)
     reviews.save_review()

     reviews.send_review_2(
         self._reviewer_2_id, self._manager_id, second_response_id)
     reviews.save_review()

     reviews.send_evaluation(
         self._manager_id, self._author_id, evaluation_id)
     reviews.save_review()
示例#29
0
def importbib(db, bibfile, tags=None, overwrite=False):
    """Import all entries of a bibtex file into the database and exit.

    For each entry, existing documents are looked up by bibkey and by
    the source ids scanned from the entry.  The first document found is
    updated; when none is found a new Document is created.  Progress
    and errors are written to stderr.

    Args:
        db: database handle; doc_for_bib() and doc_for_source() are
            called on it, and it is passed to Document() for new docs.
        bibfile: passed to Bibtex() to obtain the entries.
        tags: optional tags added to every imported document.
        overwrite: accepted but not used by this function.

    This function never returns normally: it calls sys.exit(1) when at
    least one entry raised BibtexError and sys.exit(0) otherwise.
    """
    # A list literal as the default would be shared between calls.
    if tags is None:
        tags = []

    errors = []

    sources = Sources()

    # Process entries in bibkey order.
    for entry in sorted(Bibtex(bibfile), key=lambda entry: entry.key):
        print >> sys.stderr, entry.key

        try:
            docs = []

            # check for doc with this bibkey
            bdoc = db.doc_for_bib(entry.key)
            if bdoc:
                docs.append(bdoc)

            # check for known sids
            for source in sources.scan_bibentry(entry):
                sdoc = db.doc_for_source(source.sid)
                # FIXME: why can't we match docs in list?
                if sdoc and sdoc.docid not in [d.docid for d in docs]:
                    docs.append(sdoc)

            if docs:
                if len(docs) > 1:
                    print >> sys.stderr, "  Multiple distinct docs found for entry.  Using first found."
                doc = docs[0]
                print >> sys.stderr, "  Updating id:%d..." % (doc.docid)
            else:
                # nothing matched: start a fresh document
                doc = Document(db)

            doc.add_bibentry(entry)

            filepath = entry.get_file()
            if filepath:
                print >> sys.stderr, "  Adding file: %s" % filepath
                doc.add_file(filepath)

            doc.add_tags(tags)

            doc.sync()

        except BibtexError as e:
            # Record the failure and carry on with the remaining entries.
            print >> sys.stderr, "  Error processing entry %s: %s" % (
                entry.key, e)
            print >> sys.stderr
            errors.append(entry.key)

    if errors:
        noun = "entry" if len(errors) == 1 else "entries"
        print >> sys.stderr
        print >> sys.stderr, "Failed to import %d %s from bibtex:" % (
            len(errors), noun)
        for error in errors:
            print >> sys.stderr, "  %s" % (error)
        sys.exit(1)
    else:
        sys.exit(0)
示例#30
0
def add(db, query_string, infile=None, sid=None, tags=None, prompt=False):
    """Add a new document to the database, or update an existing one.

    Progress and error messages are written to stderr; every handled
    error ends the process with sys.exit(1).

    Args:
        db: database handle; count(), search() and doc_for_source() are
            called on it, and it is passed to Document() for new docs.
        query_string: if given, must match exactly one document, which
            is then the document that gets updated.
        infile: path of a file to add, or True to retrieve the file
            from the source (which then requires a source).
        sid: source id string; if it names an existing path, that path
            is used as bibtex instead.
        tags: tags to add to the document.
        prompt: when true, prompt for the file (unless infile is True),
            the source and the tags.

    Returns:
        The docid of the synced document.
    """

    doc = None
    bibtex = None

    sources = Sources()
    doc_sid = sid
    source = None
    file_data = None

    # infile may be the literal True ("retrieve from source"); only a real
    # path gets '~' expanded
    if infile and infile is not True:
        infile = os.path.expanduser(infile)

    ##################################
    # if query provided, find single doc to update

    if query_string:
        if db.count(query_string) != 1:
            print >> sys.stderr, "Search '%s' did not match a single document." % query_string
            print >> sys.stderr, "Aborting."
            sys.exit(1)

        # take the first (and, per the count check above, only) result
        for doc in db.search(query_string):
            break

    ##################################
    # do fancy option prompting

    if prompt:
        doc_sids = []
        if doc_sid:
            doc_sids = [doc_sid]
        # scan the file for source info
        if infile is not True:
            infile = prompt_for_file(infile)

            print >> sys.stderr, "Scanning document for source identifiers..."
            try:
                ss = sources.scan_file(infile)
            except ParseError as e:
                print >> sys.stderr, "\n"
                print >> sys.stderr, "Parse error: %s" % e
                sys.exit(1)
            if len(ss) == 0:
                print >> sys.stderr, "0 source ids found."
            else:
                if len(ss) == 1:
                    print >> sys.stderr, "1 source id found:"
                else:
                    print >> sys.stderr, "%d source ids found:" % (len(ss))
                # NOTE: this loop rebinds the `sid` parameter; its original
                # value was already copied to doc_sid above
                for sid in ss:
                    print >> sys.stderr, "  %s" % (sid)
                doc_sids += [s.sid for s in ss]
        doc_sid = prompt_for_source(db, doc_sids)
        tags = prompt_for_tags(db, tags)

    # at least one of query, file or source is required
    if not query_string and not infile and not doc_sid:
        print >> sys.stderr, "Must specify file or source to import, or query to update existing document."
        sys.exit(1)

    ##################################
    # process source and get bibtex

    # check if source is a file, in which case interpret it as bibtex
    if doc_sid and os.path.exists(doc_sid):
        bibtex = doc_sid

    elif doc_sid:
        # get source object for sid string
        try:
            source = sources.match_source(doc_sid)
        except SourceError as e:
            print >> sys.stderr, e
            sys.exit(1)

        # check that the source doesn't match an existing doc
        sdoc = db.doc_for_source(source.sid)
        if sdoc:
            # a doc selected by the query must be the same one the source
            # already belongs to
            if doc and sdoc != doc:
                print >> sys.stderr, "A different document already exists for source '%s'." % (
                    doc_sid)
                print >> sys.stderr, "Aborting."
                sys.exit(1)
            print >> sys.stderr, "Source '%s' found in database.  Updating existing document..." % (
                doc_sid)
            doc = sdoc

        try:
            print >> sys.stderr, "Retrieving bibtex...",
            bibtex = source.fetch_bibtex()
            print >> sys.stderr, "done."
        except SourceError as e:
            print >> sys.stderr, "\n"
            print >> sys.stderr, "Could not retrieve bibtex: %s" % e
            sys.exit(1)

        # infile=True means: fetch the file from the source as well
        if infile is True:
            try:
                print >> sys.stderr, "Retrieving file...",
                file_name, file_data = source.fetch_file()
                print >> sys.stderr, "done."
            except SourceError as e:
                print >> sys.stderr, "\n"
                print >> sys.stderr, "Could not retrieve file: %s" % e
                sys.exit(1)

    elif infile is True:
        print >> sys.stderr, "Must specify source with retrieve file option."
        sys.exit(1)

    # no data fetched from a source: read the file given by infile
    if infile and not file_data:
        with open(infile, 'r') as f:
            file_data = f.read()
        file_name = os.path.basename(infile)

    ##################################

    # if we still don't have a doc, create a new one
    if not doc:
        doc = Document(db)

    ##################################
    # add stuff to the doc

    if bibtex:
        try:
            print >> sys.stderr, "Adding bibtex...",
            doc.add_bibtex(bibtex)
            print >> sys.stderr, "done."
        except BibtexError as e:
            print >> sys.stderr, "\n"
            print >> sys.stderr, e
            print >> sys.stderr, "Bibtex must be a plain text file with a single bibtex entry."
            sys.exit(1)
        except:
            # end the pending "Adding bibtex..." line, then re-raise
            print >> sys.stderr, "\n"
            raise

    # add source sid if it hasn't been added yet
    if source and not doc.get_sids():
        doc.add_sid(source.sid)

    if infile:
        try:
            print >> sys.stderr, "Adding file...",
            doc.add_file_data(file_name, file_data)
            print >> sys.stderr, "done."
        except ParseError as e:
            print >> sys.stderr, "\n"
            print >> sys.stderr, "Parse error: %s" % e
            sys.exit(1)
        except:
            # end the pending "Adding file..." line, then re-raise
            print >> sys.stderr, "\n"
            raise

    if tags:
        try:
            print >> sys.stderr, "Adding tags...",
            doc.add_tags(tags)
            print >> sys.stderr, "done."
        except:
            # end the pending "Adding tags..." line, then re-raise
            print >> sys.stderr, "\n"
            raise

    ##################################
    # sync the doc to db and disk

    try:
        print >> sys.stderr, "Syncing document...",
        doc.sync()
        print >> sys.stderr, "done.\n",
    except:
        # end the pending "Syncing document..." line, then re-raise
        print >> sys.stderr, "\n"
        raise

    print_doc_summary(doc)
    return doc.docid
示例#31
0
def add(db, query_string, infile=None, sid=None, tags=None, prompt=False):
    """Add a new document to the database, or update an existing one.

    Arguments:
      db           -- database handle; used here through count(), search()
                      and doc_for_source(), and passed to Document().
      query_string -- if given, it must match exactly one existing
                      document, which becomes the document to update.
      infile       -- path of a file to attach ('~' is expanded), or True
                      to retrieve the file from the source, or None.
      sid          -- source identifier string, or the path of an existing
                      file, in which case that file is used as bibtex.
      tags         -- tags handed to doc.add_tags().
      prompt       -- if true, interactively prompt for the file, the
                      source and the tags.

    Progress and error messages are written to stderr.  User-facing
    failures terminate the process with sys.exit(1); unexpected exceptions
    are re-raised after the pending progress line has been terminated.

    Returns the docid of the synced document.
    """

    doc = None
    bibtex = None

    sources = Sources()
    doc_sid = sid
    source = None
    file_data = None

    if infile and infile is not True:
        infile = os.path.expanduser(infile)

    ##################################
    # if query provided, find single doc to update

    if query_string:
        if db.count(query_string) != 1:
            print >>sys.stderr, "Search '%s' did not match a single document." % query_string
            print >>sys.stderr, "Aborting."
            sys.exit(1)

        # take the first (and, per the count check above, only) match
        doc = next(iter(db.search(query_string)), None)

    ##################################
    # do fancy option prompting

    if prompt:
        doc_sids = []
        if doc_sid:
            doc_sids = [doc_sid]
        # scan the file for source info
        if infile is not True:
            infile = prompt_for_file(infile)

            print >>sys.stderr, "Scanning document for source identifiers..."
            try:
                ss = sources.scan_file(infile)
            except ParseError as e:
                print >>sys.stderr, "\n"
                print >>sys.stderr, "Parse error: %s" % e
                sys.exit(1)
            if len(ss) == 0:
                print >>sys.stderr, "0 source ids found."
            else:
                if len(ss) == 1:
                    print >>sys.stderr, "1 source id found:"
                else:
                    print >>sys.stderr, "%d source ids found:" % (len(ss))
                # distinct loop name so the 'sid' argument is not clobbered
                for found in ss:
                    print >>sys.stderr, "  %s" % (found)
                doc_sids += [s.sid for s in ss]
        doc_sid = prompt_for_source(db, doc_sids)
        tags = prompt_for_tags(db, tags)

    if not query_string and not infile and not doc_sid:
        print >>sys.stderr, "Must specify file or source to import, or query to update existing document."
        sys.exit(1)

    ##################################
    # process source and get bibtex

    # check if source is a file, in which case interpret it as bibtex
    if doc_sid and os.path.exists(doc_sid):
        bibtex = doc_sid

    elif doc_sid:
        # get source object for sid string
        try:
            source = sources.match_source(doc_sid)
        except SourceError as e:
            print >>sys.stderr, e
            sys.exit(1)

        # check that the source doesn't match an existing doc
        sdoc = db.doc_for_source(source.sid)
        if sdoc:
            if doc and sdoc != doc:
                print >>sys.stderr, "A different document already exists for source '%s'." % (doc_sid)
                print >>sys.stderr, "Aborting."
                sys.exit(1)
            print >>sys.stderr, "Source '%s' found in database.  Updating existing document..." % (doc_sid)
            doc = sdoc

        try:
            print >>sys.stderr, "Retrieving bibtex...",
            bibtex = source.fetch_bibtex()
            print >>sys.stderr, "done."
        except SourceError as e:
            print >>sys.stderr, "\n"
            print >>sys.stderr, "Could not retrieve bibtex: %s" % e
            sys.exit(1)

        if infile is True:
            try:
                print >>sys.stderr, "Retrieving file...",
                file_name, file_data = source.fetch_file()
                print >>sys.stderr, "done."
            except SourceError as e:
                print >>sys.stderr, "\n"
                print >>sys.stderr, "Could not retrieve file: %s" % e
                sys.exit(1)

    # Retrieving the file needs a source object to fetch it from.  This is
    # checked on its own, after the chain above, so that it also catches the
    # case where the source argument was a bibtex file; otherwise execution
    # would reach open(True, ...) below.
    if infile is True and source is None:
        print >>sys.stderr, "Must specify source with retrieve file option."
        sys.exit(1)

    if infile and not file_data:
        # binary mode: the contents are handed on untouched as file data, so
        # no newline translation must be applied while reading
        with open(infile, 'rb') as f:
            file_data = f.read()
        file_name = os.path.basename(infile)

    ##################################

    # if we still don't have a doc, create a new one
    if not doc:
        doc = Document(db)

    ##################################
    # add stuff to the doc

    if bibtex:
        try:
            print >>sys.stderr, "Adding bibtex...",
            doc.add_bibtex(bibtex)
            print >>sys.stderr, "done."
        except BibtexError as e:
            print >>sys.stderr, "\n"
            print >>sys.stderr, e
            print >>sys.stderr, "Bibtex must be a plain text file with a single bibtex entry."
            sys.exit(1)
        except:
            # terminate the pending "..." progress line, then propagate
            print >>sys.stderr, "\n"
            raise

    # add source sid if it hasn't been added yet
    if source and not doc.get_sids():
        doc.add_sid(source.sid)

    if infile:
        try:
            print >>sys.stderr, "Adding file...",
            doc.add_file_data(file_name, file_data)
            print >>sys.stderr, "done."
        except ParseError as e:
            print >>sys.stderr, "\n"
            print >>sys.stderr, "Parse error: %s" % e
            sys.exit(1)
        except:
            print >>sys.stderr, "\n"
            raise

    if tags:
        try:
            print >>sys.stderr, "Adding tags...",
            doc.add_tags(tags)
            print >>sys.stderr, "done."
        except:
            print >>sys.stderr, "\n"
            raise

    ##################################
    # sync the doc to db and disk

    try:
        print >>sys.stderr, "Syncing document...",
        doc.sync()
        print >>sys.stderr, "done.\n",
    except:
        print >>sys.stderr, "\n"
        raise

    print_doc_summary(doc)
    return doc.docid
示例#32
0
    def test_export_documents(self):
        """Export two accepted, public documents and check what is written.

        A repository is created under /tmp/test_repo3, two documents owned
        by two different users are added to it and exported to
        /tmp/exported_documents.  The test then checks that the document
        files and one .edd descriptor per document exist, and that each
        descriptor contains the expected metadata lines.
        """
        repo_path = '/tmp/test_repo3'
        export_path = '/tmp/exported_documents'

        # Remove leftovers of earlier runs.  Each directory is handled on its
        # own so that a missing repository directory does not prevent a stale
        # export directory from being removed.
        for stale_path in (repo_path, export_path):
            shutil.rmtree(stale_path, ignore_errors=True)

        # Make sure the sample files to be exported are in place.
        importable_dir = os.path.join(SAMPLE_DIR_PATH, 'importable')
        for test_file in ['part1.pdf', 'part2.pdf', 'data.doc']:
            file_to_path = os.path.join(importable_dir, test_file)
            file_from_path = os.path.join(importable_dir,
                                          'export_files_for_test', test_file)
            if not os.path.exists(file_to_path):
                copy2(file_from_path, file_to_path)

        try:
            repository = Repository('Empty', repo_path)
            alice = User('Alice', 'Smith', date(1980, 10, 10),
                         '*****@*****.**', '****')
            bob = User('Bob', 'Marker', date(1970, 11, 11), '*****@*****.**',
                       '****')
            alice_id = repository._user_manager.add_user(alice)
            bob_id = repository._user_manager.add_user(bob)
            first_document = Document(
                'Some important doc', 'Contains various documentations',
                alice_id,
                [
                    SAMPLE_DIR_PATH + '/importable/part1.pdf',
                    SAMPLE_DIR_PATH + '/importable/part2.pdf'
                ], 'pdf')
            first_document.make_public()
            first_document.change_state('pending')
            first_document.change_state('accepted')
            second_document = Document(
                'Data report', 'Figures and graphs mainly', bob_id,
                [SAMPLE_DIR_PATH + '/importable/data.doc'], 'doc')
            second_document.make_public()
            second_document.change_state('pending')
            second_document.change_state('accepted')
            first_id = repository._document_manager.add_document(
                first_document)
            second_id = repository._document_manager.add_document(
                second_document)
            repository.export_documents([first_id, second_id], export_path)

            # Document files plus one descriptor per exported document.
            for exported_name in ('part1.pdf', 'part2.pdf', 'data.doc',
                                  '1.edd', '2.edd'):
                self.assertTrue(
                    os.path.exists(os.path.join(export_path, exported_name)))

            with open(os.path.join(export_path, '1.edd')) as edd_file:
                lines = edd_file.readlines()
                self.assertIn('[document]\n', lines)
                self.assertIn('title=Some important doc\n', lines)
                self.assertIn('description=Contains various documentations\n',
                              lines)
                self.assertIn('author=Alice Smith\n', lines)
                self.assertIn("files=['part1.pdf', 'part2.pdf']\n", lines)
                self.assertIn('doc_format=pdf\n', lines)

            with open(os.path.join(export_path, '2.edd')) as edd_file:
                lines = edd_file.readlines()
                self.assertIn('[document]\n', lines)
                self.assertIn('title=Data report\n', lines)
                self.assertIn('description=Figures and graphs mainly\n', lines)
                self.assertIn('author=Bob Marker\n', lines)
                self.assertIn("files=['data.doc']\n", lines)
                self.assertIn('doc_format=doc\n', lines)
        finally:
            # Clean up even when an assertion fails; errors are ignored here
            # so a missing directory cannot hide the real test failure.
            shutil.rmtree(export_path, ignore_errors=True)
            shutil.rmtree(repo_path, ignore_errors=True)
示例#33
0
def main():
    """Print every token whose predicted label differs from the gold label.

    Command line options (all but --output are required):
      --txt          glob of .txt files
      --predictions  directory holding the predicted *.con files
      --gold         directory holding the gold standard *.con files
      --format       data format; 'i2b2' is the only accepted value
      --output       path opened for writing; defaults to stdout

    Usage errors are written to stderr together with the parser help and
    end the process with exit status 1.  Text, prediction and gold files
    are grouped by the keys returned from tools.map_files(); for each group
    the two label sequences are compared token by token and every mismatch
    is printed to stdout with a few tokens of context.
    """

    # Parse command line arguments
    parser = argparse.ArgumentParser(prog='cliner evaluate')
    parser.add_argument(
        "--txt",
        dest="txt",
        help="Glob of .txt files of discharge summaries",
    )
    parser.add_argument(
        "--predictions",
        dest="pred",
        help="Directory where predictions  are stored.",
    )
    parser.add_argument(
        "--gold",
        dest="gold",
        help="Directory where gold standard is stored.",
    )
    parser.add_argument("--format", dest="format", help="Data format ( con )")
    parser.add_argument(
        "--output",
        dest="output",
        help="Write the evaluation to a file rather than STDOUT",
    )
    args = parser.parse_args()

    # The four inputs are mandatory.  They are checked in this order and the
    # first missing one is reported under the flag the user has to type (the
    # predictions option is spelled --predictions).  The message goes to
    # stderr, like the help text that follows it.
    required = (
        ('--txt', args.txt),
        ('--predictions', args.pred),
        ('--gold', args.gold),
        ('--format', args.format),
    )
    for flag, value in required:
        if not value:
            sys.stderr.write('\n\tERROR: must provide %s argument\n\n' % flag)
            parser.print_help(sys.stderr)
            sys.stderr.write('\n')
            sys.exit(1)
    format = args.format

    # Must specify output format.  Validated before the output file is
    # opened so that a usage error never creates or truncates that file.
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        parser.print_help(sys.stderr)
        sys.stderr.write('\n')
        sys.exit(1)

    # Is output destination specified?
    if args.output:
        args.output = open(args.output, "w")
    else:
        args.output = sys.stdout

    # List of medical text
    txt_files = glob.glob(args.txt)
    txt_files_map = tools.map_files(txt_files)
    wildcard = '*.con'

    # List of gold data
    ref_files = glob.glob(os.path.join(args.gold, wildcard))
    ref_files_map = tools.map_files(ref_files)

    # List of predictions
    pred_files = glob.glob(os.path.join(args.pred, wildcard))
    pred_files_map = tools.map_files(pred_files)

    # Grouping of text, predictions, gold: keep only the keys that are
    # present in all three maps
    files = []
    for k in txt_files_map:
        if k in pred_files_map and k in ref_files_map:
            files.append(
                (txt_files_map[k], pred_files_map[k], ref_files_map[k]))

    # txt          <- medical text
    # annotations  <- predictions
    # gold         <- gold standard

    if not files:
        print("No files to be evaluated")
        sys.exit()

    print('')
    for txt, annotations, gold in files:

        # Read predictions and gold standard data
        cnote = Document(txt, annotations)
        rnote = Document(txt, gold)

        sents = rnote.getTokenizedSentences()

        # Note - can also get first pass (IOB labels)
        ref = rnote.conlist()
        pred = cnote.conlist()

        for i, (toks, pline, rline) in enumerate(zip(sents, pred, ref)):
            for j, (token, rlab, plab) in enumerate(zip(toks, rline, pline)):
                if rlab != plab:
                    # first token of the context window, clamped to the
                    # start of the sentence
                    ind = max(0, j - 3)
                    print(token)
                    # one pad per context token before the mismatch, each
                    # four characters wider than the token, then the marker
                    pads = [' ' * (len(toks[k]) + 4) for k in range(ind, j)]
                    print(' '.join(pads + ['<>']))
                    # use the clamped start: a negative start index would
                    # count from the end of the sentence instead
                    print(toks[ind:j + 3])
                    print('\tpred:  %s' % (plab,))
                    print('\tref:   %s' % (rlab,))
                    print('\n')