def test_match_paths(add_doc, add_institution):

    """
    Documents with a path are matched "greedily": each one should end up
    linked to the institution that shares the longest path with it.
    """

    # Institutions registered at successively deeper paths.
    institutions = [
        add_institution(url=url)
        for url in (
            'http://yale.edu',
            'http://yale.edu/p1',
            'http://yale.edu/p1/p2',
            'http://yale.edu/p1/p2/p3',
        )
    ]

    # One syllabus located directly under each of those paths.
    documents = [
        add_doc(log={'url': url})
        for url in (
            'http://yale.edu/syllabus.pdf',
            'http://yale.edu/p1/syllabus.pdf',
            'http://yale.edu/p1/p2/syllabus.pdf',
            'http://yale.edu/p1/p2/p3/syllabus.pdf',
        )
    ]

    Institution_Document.link()

    # The n-th document should be paired with the n-th institution.
    for inst, doc in zip(institutions, documents):

        link = Institution_Document.select().where(
            Institution_Document.institution==inst,
            Institution_Document.document==doc,
        )

        assert link
def test_link(add_doc, add_institution):

    """
    Institution_Document.link() should pair each document with the
    institution on the same domain.
    """

    institutions = [
        add_institution(url=url)
        for url in (
            'http://d1.edu',
            'http://d2.edu',
            'http://d3.edu',
        )
    ]

    documents = [
        add_doc(log={'url': url})
        for url in (
            'http://d1.edu/syllabus.pdf',
            'http://d2.edu/syllabus.pdf',
            'http://d3.edu/syllabus.pdf',
        )
    ]

    Institution_Document.link()

    # Institutions and documents were created in matching order.
    for inst, doc in zip(institutions, documents):

        link = Institution_Document.select().where(
            Institution_Document.institution==inst,
            Institution_Document.document==doc,
        )

        assert link
def test_index_institution_refs(add_citation, add_institution):

    """
    Once a document is linked to an institution, the indexed citation should
    carry the institution's id, state and country.
    """

    citation = add_citation()
    institution = add_institution(state='CA', country='US')

    # Link inst -> citation.
    link = dict(institution=institution, document=citation.document)
    Institution_Document.create(**link)

    Citation_Index.es_insert()

    # Read the citation back out of the index.
    indexed = config.es.get(index='citation', id=citation.id)
    source = indexed['_source']

    assert source['institution_id'] == institution.id
    assert source['state'] == 'CA'
    assert source['country'] == 'US'
def doc_to_inst():

    """
    Run the document -> institution linker.

    Delegates to Institution_Document.link(); its result is discarded.
    """

    Institution_Document.link()
def test_index_institution_refs(add_citation, add_institution):
    """
    An indexed citation whose document is linked to an institution should
    expose that institution's id, state and country.
    """

    citation = add_citation()
    institution = add_institution(state='CA', country='US')

    # Link inst -> citation.
    Institution_Document.create(
        **{'institution': institution, 'document': citation.document}
    )

    Citation_Index.es_insert()

    source = config.es.get(index='citation', id=citation.id)['_source']

    # Checked one field at a time so a failure names the offending field.
    assert source['institution_id'] == institution.id
    assert source['state'] == 'CA'
    assert source['country'] == 'US'
def test_match(add_doc, add_institution):

    """
    A link row is written when the document's URL is on an institution's
    domain.
    """

    doc = add_doc(log=dict(url='http://yale.edu/syllabus.pdf'))

    yale = add_institution(name='Yale University', domain='yale.edu')

    # Decoy on a different domain; it must not receive a link.
    add_institution(name='Harvard University', domain='harvard.edu')

    doc_to_inst(doc.id)

    # Should write a link.
    link_count = Institution_Document.select().count()
    assert link_count == 1

    # Should link the right rows.
    yale_link = Institution_Document.select().where(
        Institution_Document.institution==yale,
        Institution_Document.document==doc,
    )
    assert yale_link
def test_match_subdomains(add_doc, add_institution):

    """
    Documents on subdomains are matched "greedily": each one should end up
    linked to the institution that shares the most subdomains with it.
    """

    # Institutions registered at successively deeper subdomains.
    institutions = [
        add_institution(url=url)
        for url in (
            'http://yale.edu',
            'http://sub1.yale.edu',
            'http://sub2.sub1.yale.edu',
            'http://sub3.sub2.sub1.yale.edu',
        )
    ]

    # One syllabus hosted on each of those hosts.
    documents = [
        add_doc(log={'url': url})
        for url in (
            'http://yale.edu/syllabus.pdf',
            'http://sub1.yale.edu/syllabus.pdf',
            'http://sub2.sub1.yale.edu/syllabus.pdf',
            'http://sub3.sub2.sub1.yale.edu/syllabus.pdf',
        )
    ]

    Institution_Document.link()

    # The n-th document should be paired with the n-th institution.
    for inst, doc in zip(institutions, documents):

        link = Institution_Document.select().where(
            Institution_Document.institution==inst,
            Institution_Document.document==doc,
        )

        assert link
def test_institution(add_doc, add_institution, add_citation):

    """
    A citation's `institution` attribute should resolve to the institution
    linked to the citation's document.
    """

    document = add_doc()
    institution = add_institution()

    # Link inst -> document.
    link = dict(institution=institution, document=document)
    Institution_Document.create(**link)

    # The citation is created after the link already exists.
    citation = add_citation(document=document)

    resolved = citation.institution
    assert resolved.id == institution.id
# Example no. 9
# 0
def test_unique_pairs(add_doc, add_institution):
    """
    Creating a second link for an already-linked doc -> inst pair should
    raise IntegrityError.
    """

    inst = add_institution()
    doc = add_doc()

    pair = dict(institution=inst, document=doc)

    # The first link for the pair is accepted.
    Institution_Document.create(**pair)

    # An identical second link must be rejected.
    with pytest.raises(IntegrityError):
        Institution_Document.create(**pair)
def doc_to_inst(doc_id):

    """
    Link a document to the institution on the same domain, if there is one.

    Looks up the document by id, takes the first institution whose domain
    equals the domain of the document's syllabus, and writes a link row for
    the pair. Nothing is written when no institution matches.
    """

    doc = Document.get(Document.id==doc_id)

    same_domain = Institution.select().where(
        Institution.domain==doc.syllabus.domain
    )

    inst = same_domain.first()

    # No institution on this domain - nothing to link.
    if not inst:
        return

    Institution_Document.create(institution=inst, document=doc)
def test_merge_included(add_institution, add_citation):
    """
    Institution ids passed via `include` should be merged into the baseline
    ranking: ids already on the default page are not repeated, and ids
    outside of it are added.
    """

    institutions = [
        add_institution(name=name)
        for name in ('Institution 1', 'Institution 2', 'Institution 3')
    ]
    i1, i2, i3 = institutions

    # Institutions 1, 2 and 3 get 3, 2 and 1 linked citations respectively.
    for inst, total in zip(institutions, (3, 2, 1)):
        for _ in range(total):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()
    Institution_Index.es_insert()

    facets = institution_facets(depth=2, include=[i2.id, i3.id])

    expected = [
        {'label': 'Institution 1', 'value': i1.id, 'count': 3},
        {'label': 'Institution 2', 'value': i2.id, 'count': 2},  # Dedupe 2.
        {'label': 'Institution 3', 'value': i3.id, 'count': 1},  # Append 3.
    ]

    assert facets == expected
def test_institution_facets(add_institution, add_citation):
    """
    institution_facets() should return label/value/count dicts, one per
    institution, with the most-cited institution first.
    """

    def link_citations(inst, total):
        # Create `total` citations and link each one's document to `inst`.
        for _ in range(total):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    i1 = add_institution(name='Institution 1')
    i2 = add_institution(name='Institution 2')
    i3 = add_institution(name='Institution 3')

    link_citations(i1, 3)
    link_citations(i2, 2)
    link_citations(i3, 1)

    Citation_Index.es_insert()
    Institution_Index.es_insert()

    expected = [
        dict(label=label, value=inst.id, count=total)
        for label, inst, total in (
            ('Institution 1', i1, 3),
            ('Institution 2', i2, 2),
            ('Institution 3', i3, 1),
        )
    ]

    assert institution_facets() == expected
# Example no. 13
# 0
def test_state_facets(add_institution, add_citation):
    """
    state_facets() should return label/value/count dicts, one per state,
    with the most-cited state first.
    """

    institutions = [
        add_institution(state=state)
        for state in ('CA', 'AL', 'MA')
    ]
    i1, i2, i3 = institutions

    # CA, AL and MA get 3, 2 and 1 linked citations respectively.
    for inst, total in zip(institutions, (3, 2, 1)):
        for _ in range(total):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    facets = state_facets()

    expected = [
        {'label': 'California', 'value': i1.state, 'count': 3},
        {'label': 'Alabama', 'value': i2.state, 'count': 2},
        {'label': 'Massachusetts', 'value': i3.state, 'count': 1},
    ]

    assert facets == expected
# Example no. 14
# 0
def test_filter_country(add_text, add_citation, add_institution):
    """
    Filtering the ranking on a country keyword should only count citations
    linked to institutions in that country.
    """

    t1, t2, t3 = [add_text() for _ in range(3)]

    usa = add_institution(country='USA')
    can = add_institution(country='CAN')

    # (text, institution, number of citations): t3 is only cited from CAN.
    plan = (
        (t1, usa, 3),
        (t2, usa, 2),
        (t3, can, 1),
    )

    for text, inst, total in plan:
        for _ in range(total):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking({'country': 'USA'})

    # t3 is absent: its only citation belongs to the CAN institution.
    expected = {
        str(t1.id): 3,
        str(t2.id): 2,
    }

    assert ranks == expected
# Example no. 15
# 0
def test_filter_institution(add_text, add_citation, add_institution):
    """
    Filtering the ranking on an institution id should only count citations
    linked to that institution.
    """

    t1, t2, t3 = [add_text() for _ in range(3)]

    target = add_institution()
    other = add_institution()

    # (text, institution, number of citations): t3 is only cited from `other`.
    plan = (
        (t1, target, 3),
        (t2, target, 2),
        (t3, other, 1),
    )

    for text, inst, total in plan:
        for _ in range(total):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    ranks = Citation_Index.compute_ranking({'institution_id': target.id})

    # t3 is absent: its only citation belongs to the other institution.
    expected = {
        str(t1.id): 3,
        str(t2.id): 2,
    }

    assert ranks == expected
def test_state_facets(add_institution, add_citation):

    """
    state_facets() should produce one label/value/count dict per state,
    ordered from the most-cited state to the least.
    """

    def link_citations(inst, total):
        # Create `total` citations and link each one's document to `inst`.
        for _ in range(total):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    california = add_institution(state='CA')
    alabama = add_institution(state='AL')
    massachusetts = add_institution(state='MA')

    link_citations(california, 3)
    link_citations(alabama, 2)
    link_citations(massachusetts, 1)

    Citation_Index.es_insert()

    expected = [
        dict(label=label, value=inst.state, count=total)
        for label, inst, total in (
            ('California', california, 3),
            ('Alabama', alabama, 2),
            ('Massachusetts', massachusetts, 1),
        )
    ]

    assert state_facets() == expected
def test_institution_facets(add_institution, add_citation):

    """
    institution_facets() should produce one label/value/count dict per
    institution, ordered from the most-cited institution to the least.
    """

    institutions = [
        add_institution(name=name)
        for name in ('Institution 1', 'Institution 2', 'Institution 3')
    ]
    i1, i2, i3 = institutions

    # Institutions 1, 2 and 3 get 3, 2 and 1 linked citations respectively.
    for inst, total in zip(institutions, (3, 2, 1)):
        for _ in range(total):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()
    Institution_Index.es_insert()

    facets = institution_facets()

    expected = [
        {'label': 'Institution 1', 'value': i1.id, 'count': 3},
        {'label': 'Institution 2', 'value': i2.id, 'count': 2},
        {'label': 'Institution 3', 'value': i3.id, 'count': 1},
    ]

    assert facets == expected
def test_country_facets(add_institution, add_citation):

    """
    country_facets() should produce one label/value/count dict per country,
    ordered from the most-cited country to the least.
    """

    institutions = [
        add_institution(country=country)
        for country in ('AU', 'CA', 'NZ')
    ]
    i1, i2, i3 = institutions

    # AU, CA and NZ get 3, 2 and 1 linked citations respectively.
    for inst, total in zip(institutions, (3, 2, 1)):
        for _ in range(total):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    facets = country_facets()

    expected = [
        {'label': 'Australia', 'value': i1.country, 'count': 3},
        {'label': 'Canada', 'value': i2.country, 'count': 2},
        {'label': 'New Zealand', 'value': i3.country, 'count': 1},
    ]

    assert facets == expected
def test_merge_included_facets(add_institution, add_citation):

    """
    Included facets that are already present in the ranking should not be
    duplicated.
    """

    institutions = [
        add_institution(name=name)
        for name in ('Institution 1', 'Institution 2', 'Institution 3')
    ]
    i1, i2, i3 = institutions

    # Institutions 1, 2 and 3 get 3, 2 and 1 linked citations respectively.
    for inst, total in zip(institutions, (3, 2, 1)):
        for _ in range(total):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    included = [i2.id, i3.id]
    counts = Citation_Index.count_facets('institution_id', include=included)

    # Dedupe 2 and 3.
    expected = [(i1.id, 3), (i2.id, 2), (i3.id, 1)]

    assert counts == expected
# Example no. 20
# 0
def test_country_facets(add_institution, add_citation):
    """
    country_facets() should return label/value/count dicts, one per country,
    with the most-cited country first.
    """

    def link_citations(inst, total):
        # Create `total` citations and link each one's document to `inst`.
        for _ in range(total):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    australia = add_institution(country='AU')
    canada = add_institution(country='CA')
    new_zealand = add_institution(country='NZ')

    link_citations(australia, 3)
    link_citations(canada, 2)
    link_citations(new_zealand, 1)

    Citation_Index.es_insert()

    expected = [
        dict(label=label, value=inst.country, count=total)
        for label, inst, total in (
            ('Australia', australia, 3),
            ('Canada', canada, 2),
            ('New Zealand', new_zealand, 1),
        )
    ]

    assert country_facets() == expected
def test_filter_country(add_text, add_citation, add_institution):

    """
    When the ranking is filtered on a country keyword, only citations linked
    to institutions in that country should be counted.
    """

    def cite(text, inst, total):
        # Create `total` citations of `text`, each linked to `inst`.
        for _ in range(total):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()

    usa = add_institution(country='USA')
    can = add_institution(country='CAN')

    cite(t1, usa, 3)
    cite(t2, usa, 2)
    cite(t3, can, 1)

    Citation_Index.es_insert()

    filters = {'country': 'USA'}
    ranks = Citation_Index.compute_ranking(filters)

    # t3 is absent: its only citation belongs to the CAN institution.
    assert ranks == {str(t1.id): 3, str(t2.id): 2}
def test_filter_institution(add_text, add_citation, add_institution):

    """
    When the ranking is filtered on an institution id, only citations linked
    to that institution should be counted.
    """

    def cite(text, inst, total):
        # Create `total` citations of `text`, each linked to `inst`.
        for _ in range(total):
            citation = add_citation(text=text)
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    t1 = add_text()
    t2 = add_text()
    t3 = add_text()

    target = add_institution()
    other = add_institution()

    cite(t1, target, 3)
    cite(t2, target, 2)
    cite(t3, other, 1)

    Citation_Index.es_insert()

    filters = {'institution_id': target.id}
    ranks = Citation_Index.compute_ranking(filters)

    # t3 is absent: its only citation belongs to the other institution.
    assert ranks == {str(t1.id): 3, str(t2.id): 2}
def test_append_included_facets(add_institution, add_citation):

    """
    When "included" facets have counts that place them below the baseline
    ranking, they should be appended to the bottom of the list.
    """

    institutions = [
        add_institution(name=name)
        for name in (
            'Institution 1',
            'Institution 2',
            'Institution 3',
            'Institution 4',
        )
    ]
    i1, i2, i3, i4 = institutions

    # Institutions 1 through 4 get 4, 3, 2 and 1 linked citations.
    for inst, total in zip(institutions, (4, 3, 2, 1)):
        for _ in range(total):
            citation = add_citation()
            Institution_Document.create(
                institution=inst,
                document=citation.document,
            )

    Citation_Index.es_insert()

    counts = Citation_Index.count_facets(
        'institution_id',
        include=[i3.id, i4.id],
        depth=2,
    )

    # The first two entries are the depth=2 baseline.
    baseline = [(i1.id, 4), (i2.id, 3)]

    # Include 3 and 4.
    appended = [(i3.id, 2), (i4.id, 1)]

    assert counts == baseline + appended
# Example no. 24
# 0
def doc_to_inst(out_file, n):

    """
    Write up to N institution -> document matches to `out_file` as CSV.

    The output starts with an `inst_url,doc_url` header row, followed by one
    row per match: the institution's URL and the URL of the document's
    syllabus.
    """

    writer = csv.DictWriter(out_file, ['inst_url', 'doc_url'])
    writer.writeheader()

    # Pull matches.
    for link in Institution_Document.select().limit(n):

        record = {
            'inst_url': link.institution.url,
            'doc_url': link.document.syllabus.url,
        }

        writer.writerow(record)
def test_no_match(add_doc, add_institution):

    """
    No link row is written when the document's URL matches no institution.
    """

    doc = add_doc(log=dict(url='http://yale.edu/syllabus.pdf'))

    # The only institution is on a different domain than the document.
    add_institution(name='Harvard University', domain='harvard.edu')

    doc_to_inst(doc.id)

    # Shouldn't write a link.
    link_count = Institution_Document.select().count()
    assert link_count == 0