def test_insert_documents(mock_osp):

    """
    Document.insert_documents() should create one row for each file that
    was added to the mock corpus.
    """

    files_per_segment = 10

    # Populate 10 segments with 10 files apiece.
    for segment in segment_range(10):
        for index in range(files_per_segment):
            file_name = segment + "-" + str(index)
            mock_osp.add_file(segment=segment, name=file_name)

    # Register the corpus files in the database.
    Document.insert_documents()

    # 10 segments x 10 files -> 100 rows.
    assert Document.select().count() == 100

    # Every file should be present exactly once, keyed by its path.
    for segment in segment_range(10):
        for index in range(files_per_segment):

            # Paths take the form [segment]/[file].
            expected_path = segment + "/" + segment + "-" + str(index)

            matching = Document.select().where(Document.path == expected_path)
            assert matching.count() == 1
예제 #2
0
def queue_text():
    """
    Enqueue an `ext_text` job on `config.rq` for every document.
    """

    documents = Document.select()

    # One job per document, passing the document id as the job argument.
    for document in query_bar(documents):
        config.rq.enqueue(ext_text, document.id)
예제 #3
0
def queue_text():

    """
    Enqueue an `ext_text` job on `config.rq` for every document.
    """

    all_docs = Document.select()

    # One job per document, passing the document id as the job argument.
    for row in query_bar(all_docs):
        config.rq.enqueue(ext_text, row.id)
예제 #4
0
def run_doc_to_fields():

    """
    Match documents -> fields.

    Calls `doc_to_fields` once per document id. Matching is best-effort: a
    failure on one document is printed and the loop moves on to the next.
    """

    for doc in query_bar(Document.select()):
        try:
            doc_to_fields(doc.id)
        # Catch `Exception` rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit can still stop the run, and report
        # the error instead of discarding it silently.
        except Exception as e:
            print(e)
예제 #5
0
def run_doc_to_inst():

    """
    Match documents -> institutions.

    Calls `doc_to_inst` once per document id. Matching is best-effort: a
    failure on one document is printed and the loop moves on to the next.
    """

    for doc in query_bar(Document.select()):
        try:
            doc_to_inst(doc.id)
        # Catch `Exception` rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit can still stop the run, and report
        # the error instead of discarding it silently.
        except Exception as e:
            print(e)
예제 #6
0
def run_doc_to_inst():
    """
    Match documents -> institutions.

    Calls `doc_to_inst` once per document id. Matching is best-effort: a
    failure on one document is printed and the loop moves on to the next.
    """

    for doc in query_bar(Document.select()):
        try:
            doc_to_inst(doc.id)
        # Catch `Exception` rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit can still stop the run, and report
        # the error instead of discarding it silently.
        except Exception as e:
            print(e)
def run_doc_to_fields():
    """
    Match documents -> fields.

    Calls `doc_to_fields` once per document id. Matching is best-effort: a
    failure on one document is printed and the loop moves on to the next.
    """

    for doc in query_bar(Document.select()):
        try:
            doc_to_fields(doc.id)
        # Catch `Exception` rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit can still stop the run, and report
        # the error instead of discarding it silently.
        except Exception as e:
            print(e)
    def link(cls):

        """
        Link documents -> institutions.

        For each document, search the syllabus URL with the patterns of the
        institutions that share its domain, and call `cls.create()` for the
        institution whose pattern yields the longest matched text. Documents
        with no match are skipped; an exception raised while handling a
        document is printed and the loop continues.
        """

        # Group institutions by domain: domain -> [(regex, inst), ...]
        by_domain = defaultdict(list)

        for institution in ServerSide(Institution.select()):
            inst_domain = parse_domain(institution.url)
            inst_regex = seed_to_regex(institution.url)
            by_domain[inst_domain].append((inst_regex, institution))

        for doc in query_bar(Document.select()):

            try:

                # TODO: Get rid of @property.
                doc_url = doc.syllabus.url

                doc_domain = parse_domain(doc_url)

                # Collect (matched text, institution) for every pattern on
                # this domain that hits the URL.
                hits = []
                for regex, candidate in by_domain[doc_domain]:
                    found = regex.search(doc_url)
                    if found:
                        hits.append((found.group(), candidate))

                # Nothing matched; leave this document unlinked.
                if not hits:
                    continue

                # The longest matched text wins; on ties the earliest
                # candidate is kept.
                _, best = max(hits, key=lambda hit: len(hit[0]))

                cls.create(
                    institution=best,
                    document=doc,
                )

            except Exception as error:
                print(error)
def test_insert_new_documents(mock_osp):

    """
    Calling Document.insert_documents() again after more files have been
    added should register just the new files: the row count goes from 10 to
    20, not higher.
    """

    # (segment to fill with 10 files, total rows expected afterwards)
    phases = (
        ("000", 10),
        ("001", 20),
    )

    for segment, expected_total in phases:

        # 10 new files in this segment.
        for index in range(10):
            mock_osp.add_file(segment=segment, name=segment + "-" + str(index))

        # Should add exactly 10 docs on top of what was already there.
        Document.insert_documents()
        assert Document.select().count() == expected_total