Example #1
def test_train():
    mc = DocClassifier(picklefile)
    doc = Doc(url='http://umsu.de/papers/variations.pdf')
    doc.content = readfile(os.path.join(testdir, 'attitudes.txt'))
    mc.train([doc], [True])
    mc.save()
    assert True
Example #2
def test_not_SEP_article():
    url = 'http://plato.stanford.edu/index.html'
    status, r = util.request_url(url)
    r.encoding = 'utf-8'
    doc = Doc(url=url, r=r)
    doc.page = Webpage(url, html=r.text)

    res = pparser.parse(doc)
    assert res == False
Example #3
def test_process_file():
    doc = Doc(filetype='pdf')
    doc.link = Link(url='foo')
    doc.link.context = 'Lorem ipsum dolor sit amet'
    doc.link.anchortext = 'Lorem ipsum dolor sit amet'
    doc.source = Source(url='foo', html='<b>Lorem ipsum dolor sit amet</b>')
    doc.tempfile = os.path.join(testdir, 'simple.pdf')
    scraper.process_file(doc)
    assert doc.title == 'Lorem ipsum dolor sit amet'
Example #4
def test_SEP_ActionPerception():
    url = 'http://plato.stanford.edu/entries/action-perception/'
    status, r = util.request_url(url)
    r.encoding = 'utf-8'
    doc = Doc(url=url, r=r)
    doc.page = Webpage(url, html=r.text)

    res = pparser.parse(doc)
    assert res == True
    assert doc.authors == 'Robert Briscoe, Rick Grush'
    assert doc.title == 'Action-based Theories of Perception'
    assert doc.abstract[:10] == 'Action is '
    assert doc.abstract[-10:] == 'd of view.'
    assert 'The tactual ideas' in doc.content
    assert doc.numwords > 1000
Example #5
def test_SEP_Abilities():
    url = 'http://plato.stanford.edu/entries/abilities/'
    status, r = util.request_url(url)
    r.encoding = 'utf-8'
    doc = Doc(url=url, r=r)
    doc.page = Webpage(url, html=r.text)

    res = pparser.parse(doc)
    assert res == True
    assert doc.authors == 'John Maier'
    assert doc.title == 'Abilities'
    assert doc.abstract[:10] == 'In the acc'
    assert doc.abstract[-10:] == 'imes true.'
    assert 'General and specific abilities' in doc.content
    assert doc.numwords > 1000
Example #6
def run():
    """
    retrieve and process new blog posts that have been put in the db
    by opp-web:feedhandler
    """
    cur = db.cursor()
    query = "SELECT doc_id FROM docs WHERE doctype = 'blogpost' AND status = 0"
    cur.execute(query)
    debug(4, cur._last_executed)
    posts = cur.fetchall()
    if not posts:
        return debug(3, "no new blog posts")
    for (doc_id,) in posts:  # each row from fetchall() is a one-element tuple
        post = Doc(doc_id=doc_id)
        post.load_from_db()
        process_blogpost(post)
Example #7
def test_classify():
    mc = DocClassifier(picklefile)
    ham = Doc(url='http://umsu.de/papers/variations.pdf')
    ham.content = readfile(os.path.join(testdir, 'attitudes.txt'))
    spam = Doc(url='http://umsu.de/papers/spam.pdf')
    spam.content = """ 
       Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
       eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut
       enim ad minim veniam, quis nostrud exercitation ullamco laboris
       nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor
       in reprehenderit in voluptate velit esse cillum dolore eu
       fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
       proident, sunt in culpa qui officia deserunt mollit anim id est
       laborum. 
    """
    spam.content *= 50
    mc.train([ham, spam], [True, False])
    ham.content += 'foo bar'
    prob = mc.classify(ham)
    assert prob > 0.5
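
DocClassifier's internals are not shown in these examples; only its train(docs, labels) / classify(doc) interface is exercised. For orientation, here is a self-contained sketch with the same shape, backed by scikit-learn's MultinomialNB. This is an illustrative assumption, not the project's actual implementation, and the name ToyDocClassifier is made up.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

class ToyDocClassifier:
    """Hypothetical stand-in that mimics DocClassifier's train/classify shape."""
    def __init__(self):
        self.vectorizer = CountVectorizer()
        self.model = MultinomialNB()

    def train(self, docs, labels):
        # docs are objects with a .content string; labels are True/False
        X = self.vectorizer.fit_transform(d.content for d in docs)
        self.model.fit(X, labels)

    def classify(self, doc):
        # probability that doc belongs to the positive (True) class
        X = self.vectorizer.transform([doc.content])
        probs = self.model.predict_proba(X)[0]
        return probs[list(self.model.classes_).index(True)]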
Example #8
def test_Doc(testdb):
    doc = Doc(url='http://umsu.de/papers/magnetism.pdf')
    doc.update_db(authors='wo')
    assert doc.doc_id > 0
    doc2 = Doc(url='http://umsu.de/papers/magnetism.pdf')
    doc2.load_from_db()
    assert doc2.authors == 'wo'
Example #9
def test_cv():
    doc = Doc(url="http://umsu.de/papers/cv.pdf")
    doc.link = Link(url="http://umsu.de/papers/cv.pdf")
    doc.link.anchortext = "CV"
    doc.link.context = "CV"
    doc.content = readfile(os.path.join(testdir, "cv.txt"))
    doc.numwords = 10200
    doc.numpages = 22
    doc.meta_confidence = 92
    assert paperfilter.evaluate(doc) < 0.4
Example #10
def test_gooddoc_badlink():
    doc = Doc(url="http://umsu.de/papers/variations.pdf")
    doc.link = Link(url="http://umsu.de/papers/variations.pdf")
    doc.link.anchortext = "slides"
    doc.link.context = "The slides for my talk"
    doc.content = readfile(os.path.join(testdir, "attitudes.txt"))
    doc.numwords = 10200
    doc.numpages = 22
    doc.meta_confidence = 92
    assert paperfilter.evaluate(doc) < 0.8
Example #11
def test_gooddoc():
    doc = Doc(url="http://umsu.de/papers/variations.pdf")
    doc.link = Link(url="http://umsu.de/papers/variations.pdf")
    doc.link.anchortext = "Download"
    doc.link.context = "Foo bar"
    doc.content = readfile(os.path.join(testdir, "attitudes.txt"))
    doc.numwords = 10200
    doc.numpages = 22
    doc.meta_confidence = 92
    assert paperfilter.evaluate(doc) > 0.98
Example #12
def test_gooddoc(setups):
    doc = Doc(url='http://umsu.de/papers/variations.pdf')
    doc.content = readfile(os.path.join(testdir, 'attitudes.txt'))
    assert philosophyfilter.evaluate(doc) > 0.6
Example #13
def setups():
    """set up classifier if not yet trained"""
    if philosophyfilter.is_ready():
        return
    db.close()
    db.connection(db='test_opp')
    ham = Doc(url='http://umsu.de/papers/magnetism2.pdf')
    ham.load_from_db()
    ham.content = readfile(os.path.join(testdir, 'attitudes.txt'))
    ham.update_db()
    spam = Doc(url='http://umsu.de/papers/spam.pdf')
    spam.load_from_db()
    spam.content = """ 
       Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
       eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut
       enim ad minim veniam, quis nostrud exercitation ullamco laboris
       nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor
       in reprehenderit in voluptate velit esse cillum dolore eu
       fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
       proident, sunt in culpa qui officia deserunt mollit anim id est
       laborum. 
    """
    spam.update_db()
    cur = db.cursor()
    query = "SELECT cat_id FROM cats WHERE label=%s LIMIT 1"
    cur.execute(query, ('philosophy',))
    cat_id = cur.fetchall()[0][0]  # first column of the first row
    query = ("INSERT IGNORE INTO docs2cats (doc_id, cat_id, strength, is_training)"
             "VALUES (%s, %s, %s, %s)")
    cur.execute(query, (ham.doc_id, cat_id, 1, 1))
    cur.execute(query, (spam.doc_id, cat_id, 0, 1))
    philosophyfilter.update()
Example #14
def process_link(li, force_reprocess=False, redir_url=None, keep_tempfiles=False,
                 recurse=0):
    """
    Fetch url, check for http errors and steppingstones, filter spam,
    parse candidate papers, check for duplicates, check if published
    before last year.

    Links often lead to intermediate pages (e.g. on repositories) with
    another link to the actual paper. In this case, we only store the
    original link in the 'links' table, so the 'doc' entry has a url
    that doesn't match any link. To process the new link, process_link
    is called again, with redir_url set to the new url and recurse +=
    1.

    If force_reprocess is False and the link has already been checked
    at some point, if_modified_since and etag headers are sent.
    """

    # ignore links to old and published papers:
    li.context = li.html_context()
    debug(2, "link context: %s", li.context)
    if context_suggests_published(li.context):
        return li.update_db(status=1, doc_id=None)
    
    # fetch url and handle errors, redirects, etc.:
    url = redir_url or li.url
    r = li.fetch(url=url, only_if_modified=not(force_reprocess))
    # note: li.fetch() updates the link entry in case of errors
    if not r:
        return 0
        
    if r.url != url: # redirected
        url = util.normalize_url(r.url)
        # now we treat li as if it directly led to the redirected document

    if r.filetype not in ('html', 'pdf', 'doc', 'rtf'):
        li.update_db(status=error.code['unsupported filetype'])
        return debug(1, "unsupported filetype: %s", r.filetype)

    doc = Doc(url=url, r=r, link=li, source=li.source)
    
    if doc.load_from_db() and not force_reprocess:
        li.update_db(status=1, doc_id=doc.doc_id)
        return debug(1, "%s is already in docs table", url)
    
    if r.filetype == 'html':
        r.encoding = 'utf-8'
        try:
            doc.page = Webpage(url, html=r.text)
        except UnparsableHTMLException:
            li.update_db(status=error.code['unsupported filetype'])
            return debug(1, "unparsable html")

        debug(6, "\n====== %s ======\n%s\n======\n", url, r.text)

        # check for steppingstone pages with link to a paper:
        target_url = check_steppingstone(doc.page)
        if target_url and recurse < 3:
            debug(1, "steppingstone to %s", target_url)
            return process_link(li, redir_url=target_url, 
                                force_reprocess=force_reprocess, recurse=recurse+1)

        # Genuine papers are almost never in HTML format, and almost
        # every HTML page is not a paper. The few exceptions (such as
        # entries on SEP) tend to require special parsing. Hence the
        # following special treatment. If people start posting
        # articles on medium or in plain HTML, we might return to the
        # old procedure of converting the page to pdf and treating it
        # like any candidate paper.
        from .docparser import webpageparser as htmlparser
        if not htmlparser.parse(doc):
            debug(1, "page ignored")
            li.update_db(status=1)
            return 0

    else:
        try:
            doc.tempfile = save_local(r)
        except:
            return li.update_db(status=error.code['cannot save local file'])
        try:
            process_file(doc, keep_tempfiles=keep_tempfiles)
        except Exception as e:
            debug(1, 'could not process %s: %s', doc.tempfile, e)
            return li.update_db(status=error.code.get(str(e), 10))
            
    # estimate whether doc is a handout, cv etc.:
    from .doctyper import paperfilter
    paperprob = paperfilter.evaluate(doc)
    doc.is_paper = int(paperprob * 100)
    if doc.is_paper < 25:
        li.update_db(status=1)
        debug(1, "spam: paper score %s < 50", doc.is_paper)
        return 0
        
    # estimate whether doc is on philosophy:
    from .doctyper import classifier
    philosophyfilter = classifier.get_classifier('philosophy')
    try:
        doc.is_philosophy = int(philosophyfilter.classify(doc) * 100)
    except UntrainedClassifierException as e:
        doc.is_philosophy = 90
    if doc.is_philosophy < 25:
        li.update_db(status=1)
        debug(1, "spam: philosophy score %s < 50", doc.is_philosophy)
        return 0
        
    if li.doc_id:
        # check for revisions:
        olddoc = Doc(doc_id=li.doc_id)
        olddoc.load_from_db()
        if doc.content != olddoc.content:
            sm = SequenceMatcher(None, doc.content, olddoc.content)
            match_ratio = sm.ratio()
            if match_ratio < 0.8:
                debug(1, "substantive revisions, ratio %s", match_ratio)
                doc.earlier_id = olddoc.doc_id
        if not doc.earlier_id:
            li.update_db(status=1)
            debug(1, "no substantive revisions")
            return 0
    
    else:
        # check for duplicates:
        dupe = get_duplicate(doc)
        if dupe:
            debug(1, "duplicate of document %s", dupe.doc_id)
            li.update_db(status=1, doc_id=dupe.doc_id)
            return 0
    
        # ignore old and published paper:
        if paper_is_old(doc):
            li.update_db(status=1, doc_id=None)
            debug(1, "ignoring already published paper")
            return 0

        # flag for manual approval if confidence low or dubious relevance:
        if doc.is_paper < 60 or doc.is_philosophy < 60 or doc.meta_confidence < 60:
            debug(1, "flagging for manual approval")
            doc.hidden = True

        # don't show papers (incl HTML pages) from newly added source
        # pages in news feed:
        if doc.source.status == 0:
            debug(2, "new source page: setting found_date to 1970")
            doc.found_date = datetime(1970, 1, 1)
        
    doc.update_db()
    li.update_db(status=1, doc_id=doc.doc_id)

    # categorize, but only if doc has more than 700 words --
    # otherwise categorization is pretty random:
    if doc.numwords > 700:
        for (cat_id, cat) in categories():
            clf = classifier.get_classifier(cat)
            try:
                strength = int(clf.classify(doc) * 100)
                debug(3, "%s score %s", cat, strength)
            except UntrainedClassifierException as e:
                continue 
            doc.assign_category(cat_id, strength)

    return 1
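
In the revision check above, process_link compares the new and old document text with difflib.SequenceMatcher and treats a similarity ratio below 0.8 as a substantive revision. A minimal standalone sketch of that heuristic (standard library only; the helper name is made up for illustration, the 0.8 threshold comes from process_link):

from difflib import SequenceMatcher

def is_substantive_revision(old_content, new_content, threshold=0.8):
    # identical texts are never a revision
    if old_content == new_content:
        return False
    # ratio() is 1.0 for identical texts and approaches 0.0 for unrelated ones
    ratio = SequenceMatcher(None, new_content, old_content).ratio()
    return ratio < threshold

For example, is_substantive_revision(olddoc.content, doc.content) mirrors the branch that sets doc.earlier_id.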
Example #15
def test_get_duplicate(testdb):
    doc = Doc(url='http://umsu.de/papers/driver-2011.pdf')
    doc.link = Link(url='http://umsu.de/papers/driver-2011.pdf')
    doc.content = readfile(os.path.join(testdir, 'attitudes.txt'))
    doc.numwords = 13940
    doc.numpages = 26
    doc.authors = 'Wolfang Schwarz'
    doc.title = 'Lost memories and useless coins: Revisiting the absentminded driver'
    doc.update_db()
    doc2 = Doc(url='http://download.springer.com/static/pdf/307/art%253A10.1007%252Fs11229-015-0699-z.pdf')
    doc2.link = Link(url=doc2.url)
    doc2.content = 'abcdefghjik'+readfile(os.path.join(testdir, 'attitudes.txt'))
    doc2.numwords = 14130
    doc2.numpages = 29
    doc2.authors = 'Wolfang Schwarz'
    doc2.title = 'Lost memories and useless coins: revisiting the absentminded driver'
    dupe = scraper.get_duplicate(doc2)
    assert dupe.doc_id == doc.doc_id