def test_queue(api_client):
    """
    POSTing a work order to /queue should enqueue a single meta-job which,
    once performed, spools the individual work jobs.
    """

    # Seed the table with 100 document rows.
    for n in range(100):
        Document.create(path=str(n))

    work_order = {
        'model_import': 'osp.corpus.models.Document',
        'job_import': 'osp.corpus.jobs.ext_text',
        'worker_count': 20,
        'offset': 10,
    }

    api_client.post('/queue', data=work_order)

    # Only the meta-job should be on the queue at this point.
    assert config.rq.count == 1

    # Pop the meta-job off the queue and execute it.
    config.rq.dequeue().perform()

    # Each document in the page should line up with a queued work job.
    page = Document.page_cursor(20, 10)
    for position, doc in enumerate(page):
        job = config.rq.jobs[position]
        assert job.func == ext_text
        assert job.args == (doc.id, )
예제 #2
0
def insert_documents():

    """
    Register documents as database rows by delegating to
    `Document.insert_documents()`.
    """

    Document.insert_documents()
def test_insert_documents(mock_osp):

    """
    Document.insert_documents() should register one row per file in the
    mock corpus.
    """

    # Populate 10 segments with 10 files apiece.
    for segment in segment_range(10):
        for n in range(10):
            file_name = "-".join((segment, str(n)))
            mock_osp.add_file(segment=segment, name=file_name)

    Document.insert_documents()

    # One row per file: 10 x 10.
    total = Document.select().count()
    assert total == 100

    # Every file should be findable under its [segment]/[file] path.
    for segment in segment_range(10):
        for n in range(10):

            file_name = "-".join((segment, str(n)))
            expected_path = "/".join((segment, file_name))

            matching = Document.select().where(Document.path == expected_path)
            assert matching.count() == 1
def test_queue(api_client):

    """
    A POST to /queue should put one meta-job on the queue; performing that
    meta-job should spool the work jobs.
    """

    # Create 100 document rows to page over.
    for index in range(100):
        Document.create(path=str(index))

    payload = {
        'model_import': 'osp.corpus.models.Document',
        'job_import': 'osp.corpus.jobs.ext_text',
        'worker_count': 20,
        'offset': 10,
    }

    api_client.post('/queue', data=payload)

    # Exactly one job (the meta-job) should be queued.
    assert config.rq.count == 1

    # Dequeue and run the meta-job.
    meta_job = config.rq.dequeue()
    meta_job.perform()

    # The spooled jobs should mirror the documents in the page, in order.
    for slot, document in enumerate(Document.page_cursor(20, 10)):
        queued = config.rq.jobs[slot]
        assert queued.func == ext_text
        assert queued.args == (document.id,)
예제 #5
0
def queue_text():
    """
    Enqueue an `ext_text` job on `config.rq` for every document row.
    """

    documents = Document.select()

    for row in query_bar(documents):
        config.rq.enqueue(ext_text, row.id)
예제 #6
0
def queue_text():

    """
    For each document row, put an `ext_text` job (keyed by the row id) on
    the `config.rq` queue.
    """

    all_docs = Document.select()

    for record in query_bar(all_docs):
        config.rq.enqueue(ext_text, record.id)
예제 #7
0
def run_doc_to_inst():

    """
    Match documents -> institutions.

    Iterates over every document row (wrapped in `query_bar`) and calls
    `doc_to_inst` with the row id. Matching is best-effort: a failure on one
    document is skipped so that the remaining documents are still processed.
    """

    import logging

    log = logging.getLogger(__name__)

    for doc in query_bar(Document.select()):
        try:
            doc_to_inst(doc.id)
        except Exception:
            # Catch `Exception` instead of using a bare `except`, so that
            # KeyboardInterrupt / SystemExit can still stop a long run.
            # The failure is recorded at debug level rather than discarded.
            log.debug('doc_to_inst failed; skipping document', exc_info=True)
예제 #8
0
def run_doc_to_fields():

    """
    Match documents -> fields.

    Iterates over every document row (wrapped in `query_bar`) and calls
    `doc_to_fields` with the row id. Matching is best-effort: a failure on
    one document is skipped so that the remaining documents are still
    processed.
    """

    import logging

    log = logging.getLogger(__name__)

    for doc in query_bar(Document.select()):
        try:
            doc_to_fields(doc.id)
        except Exception:
            # Catch `Exception` instead of using a bare `except`, so that
            # KeyboardInterrupt / SystemExit can still stop a long run.
            # The failure is recorded at debug level rather than discarded.
            log.debug('doc_to_fields failed; skipping document', exc_info=True)
def run_doc_to_fields():
    """
    Match documents -> fields.

    Iterates over every document row (wrapped in `query_bar`) and calls
    `doc_to_fields` with the row id. Matching is best-effort: a failure on
    one document is skipped so that the remaining documents are still
    processed.
    """

    import logging

    log = logging.getLogger(__name__)

    for doc in query_bar(Document.select()):
        try:
            doc_to_fields(doc.id)
        except Exception:
            # Catch `Exception` instead of using a bare `except`, so that
            # KeyboardInterrupt / SystemExit can still stop a long run.
            # The failure is recorded at debug level rather than discarded.
            log.debug('doc_to_fields failed; skipping document', exc_info=True)
예제 #10
0
def run_doc_to_inst():
    """
    Match documents -> institutions.

    Iterates over every document row (wrapped in `query_bar`) and calls
    `doc_to_inst` with the row id. Matching is best-effort: a failure on one
    document is skipped so that the remaining documents are still processed.
    """

    import logging

    log = logging.getLogger(__name__)

    for doc in query_bar(Document.select()):
        try:
            doc_to_inst(doc.id)
        except Exception:
            # Catch `Exception` instead of using a bare `except`, so that
            # KeyboardInterrupt / SystemExit can still stop a long run.
            # The failure is recorded at debug level rather than discarded.
            log.debug('doc_to_inst failed; skipping document', exc_info=True)
예제 #11
0
def test_syllabus(mock_osp):

    """
    A document row's `syllabus` attribute should be a Syllabus instance
    whose path is the file the row refers to.
    """

    # Write the file `123` into segment `000`.
    file_path = mock_osp.add_file('000', name='123')

    # Create a row pointing at that file by its relative path.
    row = Document.create(path='000/123')

    assert isinstance(row.syllabus, Syllabus)
    assert row.syllabus.path == file_path
def test_format_counts():
    """
    Document_Format.format_counts() should return (format, count) pairs,
    with the most frequent format first.
    """

    # Six documents, created in path order.
    docs = [
        Document.create(path=path)
        for path in ('1', '2', '3', '4', '5', '6')
    ]

    # 1 doc with 'format1', 2 with 'format2', 3 with 'format3'.
    formats = (
        'format1',
        'format2', 'format2',
        'format3', 'format3', 'format3',
    )

    for doc, fmt in zip(docs, formats):
        Document_Format.create(document=doc, format=fmt)

    expected = [
        ('format3', 3),
        ('format2', 2),
        ('format1', 1),
    ]

    assert Document_Format.format_counts() == expected
def test_format_counts():

    """
    Document_Format.format_counts() should list each format with the number
    of documents that have it, largest count first.
    """

    # Create the six document rows first, in path order.
    documents = [
        Document.create(path=path)
        for path in ("1", "2", "3", "4", "5", "6")
    ]

    # Assign formats: 1 x 'format1', 2 x 'format2', 3 x 'format3'.
    assigned = ["format1"] + ["format2"] * 2 + ["format3"] * 3

    for document, file_format in zip(documents, assigned):
        Document_Format.create(document=document, format=file_format)

    counts = Document_Format.format_counts()

    assert counts == [("format3", 3), ("format2", 2), ("format1", 1)]
    def link(cls):

        """
        Link documents -> institutions.

        Builds a domain -> [(regex, institution), ...] index from the
        institution URLs, then, for each document, creates a `cls` row
        pointing at the institution whose regex produces the longest match
        against the document's syllabus URL. Documents with no match get no
        row. Any exception raised while handling a document is printed and
        the loop moves on.
        """

        # Index the (regex, institution) pairs by institution domain.
        candidates = defaultdict(list)

        for institution in ServerSide(Institution.select()):
            key, pattern = (
                parse_domain(institution.url),
                seed_to_regex(institution.url),
            )
            candidates[key].append((pattern, institution))

        for doc in query_bar(Document.select()):

            try:

                # TODO: Get rid of @property.
                doc_url = doc.syllabus.url

                # Collect (matched text, institution) for every regex
                # registered under the document's domain that hits the URL.
                hits = []
                for pattern, institution in candidates[parse_domain(doc_url)]:
                    found = pattern.search(doc_url)
                    if found:
                        hits.append((found.group(), institution))

                if not hits:
                    continue

                # Longest matched text first; the sort is stable, so ties
                # keep their original order.
                hits.sort(key=lambda hit: len(hit[0]), reverse=True)

                _, best = hits[0]
                cls.create(institution=best, document=doc)

            except Exception as e:
                print(e)
예제 #15
0
    def _doc(*args, **kwargs):

        """
        Add a file to the mock corpus (arguments are forwarded to
        `mock_osp.add_file`), insert its document row, run text extraction
        for it, and return the row.
        """

        # Create the file and wrap its path as a syllabus.
        syllabus = Syllabus(mock_osp.add_file(*args, **kwargs))

        # Register the row under the syllabus' relative path.
        row = Document.create(path=syllabus.relative_path)

        # Run text extraction for the new row.
        ext_text(row.id)

        return row
def test_text_extraction_fails(mock_osp):
    """
    ext_text() should not write a `Document_Text` row when the file is
    empty.
    """

    # Register a document that points at an empty file.
    empty_path = mock_osp.add_file(content='')
    doc = Document.create(path=empty_path)

    ext_text(doc.id)

    # The text table should still be empty.
    row_count = Document_Text.select().count()
    assert row_count == 0
    def _doc(*args, **kwargs):

        """
        Write a file via `mock_osp.add_file` (forwarding all arguments),
        create the matching document row, extract its text, and return the
        row.
        """

        # Put the file in the mock corpus.
        file_path = mock_osp.add_file(*args, **kwargs)

        # The row is keyed by the syllabus' relative path.
        relative_path = Syllabus(file_path).relative_path
        new_document = Document.create(path=relative_path)

        # Extract text for the freshly inserted row.
        ext_text(new_document.id)

        return new_document
예제 #18
0
def test_read_format(mock_osp):
    """
    ext_format() should record the file's format as a `Document_Format`
    row for the document.
    """

    # Register a document for a newly added file.
    file_path = mock_osp.add_file()
    doc = Document.create(path=file_path)

    ext_format(doc.id)

    # Look up the format row written for this document.
    belongs_to_doc = Document_Format.document == doc
    format_row = Document_Format.get(belongs_to_doc)

    assert format_row.format == 'text/plain'
def test_text_extraction_fails(mock_osp):

    """
    When a file has no content, ext_text() should leave the
    `Document_Text` table empty.
    """

    # A document backed by an empty file.
    blank_file = mock_osp.add_file(content='')
    record = Document.create(path=blank_file)

    ext_text(record.id)

    # No text rows should exist.
    written = Document_Text.select().count()
    assert written == 0
예제 #20
0
def ext_format(doc_id):

    """
    Store the syllabus' libmagic file type as a `Document_Format` row.

    Args:
        doc_id (int): The document id.

    Returns:
        The newly created `Document_Format` row.
    """

    document = Document.get(Document.id == doc_id)

    # Read the file type off the document's syllabus.
    file_type = document.syllabus.libmagic_file_type

    row = Document_Format.create(format=file_type, document=document)

    return row
def test_text_extraction_succeeds(mock_osp):
    """
    ext_text() should extract a document's text and store it as a
    `Document_Text` row.
    """

    # Register a document whose file contains 'text'.
    file_path = mock_osp.add_file(content='text')
    doc = Document.create(path=file_path)

    ext_text(doc.id)

    # Look up the text row written for this document.
    belongs_to_doc = Document_Text.document == doc
    text_row = Document_Text.get(belongs_to_doc)

    assert text_row.text == 'text'
def test_read_format(mock_osp):

    """
    After ext_format() runs, the document should have a `Document_Format`
    row holding the detected format.
    """

    # A document backed by a default mock file.
    added = mock_osp.add_file()
    record = Document.create(path=added)

    ext_format(record.id)

    # Fetch the format row linked to the document.
    stored = Document_Format.get(Document_Format.document == record)

    assert stored.format == 'text/plain'
def test_text_extraction_succeeds(mock_osp):

    """
    After ext_text() runs on a document with content, the document should
    have a `Document_Text` row holding that content.
    """

    # A document backed by a file containing 'text'.
    added = mock_osp.add_file(content='text')
    record = Document.create(path=added)

    ext_text(record.id)

    # Fetch the text row linked to the document.
    stored = Document_Text.get(Document_Text.document == record)

    assert stored.text == 'text'
def ext_text(doc_id):

    """
    Write the document as plain text.

    Args:
        doc_id (int): The document id.

    Returns:
        The new `Document_Text` row, or None when the syllabus yields no
        text (no row is written in that case).
    """

    doc = Document.get(Document.id == doc_id)

    # Read the text once and reuse it, rather than evaluating
    # `doc.syllabus.text` a second time when building the row.
    text = doc.syllabus.text

    if not text:
        return None

    return Document_Text.create(text=text, document=doc)
def doc_to_inst(doc_id):

    """
    Link a document to the first institution whose domain equals the
    domain of the document's syllabus. Nothing is written when no
    institution matches.

    Args:
        doc_id (int): The document id.
    """

    document = Document.get(Document.id == doc_id)

    # Institutions that share the syllabus' domain.
    same_domain = Institution.domain == document.syllabus.domain
    institution = Institution.select().where(same_domain).first()

    if not institution:
        return

    Institution_Document.create(institution=institution, document=document)
def test_insert_new_documents(mock_osp):

    """
    Running Document.insert_documents() again after more files are added
    should register only the new files, leaving existing rows alone.
    """

    # (segment, file name prefix, expected row total after the insert)
    rounds = (
        ("000", "000-", 10),
        ("001", "001-", 20),
    )

    for segment, prefix, expected_total in rounds:

        # Add 10 files to the segment.
        for n in range(10):
            mock_osp.add_file(segment=segment, name=prefix + str(n))

        # Each round should add exactly 10 rows.
        Document.insert_documents()
        assert Document.select().count() == expected_total
예제 #27
0
def insert_documents():
    """
    Register documents as database rows by delegating to
    `Document.insert_documents()`.
    """

    Document.insert_documents()