Example #1
from opp.debug import debug, debuglevel

def test_debug(caplog):
    debuglevel(4)
    debug(4, 'hi there')
    assert 'hi there' in caplog.text
    debug(5, 'secret')
    assert 'secret' not in caplog.text
    debuglevel(5)
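These snippets all rely on the logging helpers in `opp.debug`: `debug(level, msg, ...)` emits a message only when `level` is within the current verbosity threshold, and `debuglevel(n)` sets that threshold (called with no argument, as in Examples #4 and #6, it returns it). Below is a minimal sketch of that interface, included only to make the examples self-contained; the real implementation lives in the project's `opp/debug.py` and may differ.

import logging

logger = logging.getLogger('opp')
_level = 1

def debuglevel(level=None):
    # Set the global verbosity threshold, or return it when called without an argument (sketch).
    global _level
    if level is None:
        return _level
    _level = level

def debug(level, msg, *args):
    # Forward the message to the 'opp' logger only if it is verbose enough (sketch).
    if level <= _level:
        logger.debug(msg, *args)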
Example #2
def test_linkcontext(basefile, title):
    debuglevel(5)
    html = readfile(os.path.join(testdir, basefile+'.html'))
    content = readfile(os.path.join(testdir, basefile+'.txt')).strip()
    doc = models.Doc(title=title)
    res = blogpostparser.extract_content(html, doc)
    assert re.sub(r'\s+', ' ', content) == re.sub(r'\s+', ' ', res)
Example #3
def test_linkcontext(page, link, context, caplog):
    caplog.set_level(logging.CRITICAL, logger='selenium')
    caplog.set_level(logging.DEBUG, logger='opp')
    debuglevel(5)
    curpath = os.path.abspath(os.path.dirname(__file__))
    testdir = os.path.join(curpath, 'sourcepages')
    browser = Browser(reuse_browser=True, use_virtual_display=VDISPLAY)
    src = 'file://'+testdir+'/'+page
    browser.goto(src)
    el = browser.find_elements_by_xpath('//a[@href="{}"]'.format(link))[0]
    li = Link(element=el)
    res = li.html_context()
    assert res == context
Example #4
File: doctyper.py Project: wo/opp-tools
def evaluate(doc):
    debug(4, 'trying to guess document type')
    probs = {
        'book': bookfilter.test(doc, debug=debuglevel()>3, smooth=False),
        'chapter': chapterfilter.test(doc, debug=debuglevel()>3, smooth=True),
        'thesis': thesisfilter.test(doc, debug=debuglevel()>3, smooth=False),
        'review': reviewfilter.test(doc, debug=debuglevel()>3, smooth=True)
    }
    debug(2, 'doctyper: %s', ', '.join(['{} {}'.format(k,v) for k,v in probs.items()]))
    if max(probs.values()) > 0.5:
        return max(probs, key=probs.get)
    else:
        return 'article'
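A standalone illustration of the selection logic above (not taken from the source): the highest-scoring type wins only if it clears the 0.5 threshold, otherwise the default 'article' is returned.

probs = {'book': 0.7, 'chapter': 0.2, 'thesis': 0.05, 'review': 0.05}
assert max(probs.values()) > 0.5
assert max(probs, key=probs.get) == 'book'   # evaluate() would return 'book'

probs = {'book': 0.4, 'chapter': 0.3, 'thesis': 0.2, 'review': 0.1}
assert max(probs.values()) <= 0.5            # evaluate() would return 'article'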
Example #5
def test_process_link(testdb, caplog):
    source = Source(url='http://umsu.de/papers/')
    source.load_from_db()
    browser = scraper.Browser(use_virtual_display=VDISPLAY)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    link = 'options.pdf'
    el = browser.find_element_by_xpath("//a[@href='{}']".format(link))
    url = source.make_absolute(link)
    li = Link(url=url, source=source, element=el)
    li.load_from_db()
    debuglevel(2)
    scraper.process_link(li, force_reprocess=True, keep_tempfiles=True)
    debuglevel(5)
    assert 'Options and Actions' in caplog.text
    assert 'But even if we know' in caplog.text
Example #6
File: paperparser.py Project: wo/opp-tools
def extractor(xmlfile):
    cmd = [PERL, join(path, 'Extractor.pm'), "-v{}".format(debuglevel()), xmlfile]
    debug(2, ' '.join(cmd))
    try:
        output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=60)
        output = output.decode('utf-8', 'ignore')
    except subprocess.CalledProcessError as e:
        debug(1, e.output)
        return False
    except subprocess.TimeoutExpired as e:
        debug(1, 'Extractor timeout!')
        return False
    json_separator = '=========== RESULT ===========\n'
    if json_separator not in output:
        debug(1, 'Extractor failed:\n%s', output)
        return False
    log, jsonstr = output.split(json_separator, 1)
    debug(1, log)
    res = json.loads(jsonstr)
    return res
Example #7
import sys
import logging
import argparse
from opp import debug, scraper

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

ap = argparse.ArgumentParser()
ap.add_argument('filename', help='file to process')
ap.add_argument('-d', '--debuglevel', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-u', '--url', type=str, help='link url')
ap.add_argument('-l', '--linkcontext', type=str, help='link context')
ap.add_argument('-a', '--anchortext', type=str, help='anchortext')
ap.add_argument('-s', '--sourcehtml', type=str, help='source page html')
args = ap.parse_args()

debug.debuglevel(args.debuglevel or 2)

# set up doc for processing:
filetype = 'pdf' if 'pdf' in args.filename else 'doc'
doc = scraper.Doc(filetype=filetype)
doc.link = scraper.Link(url=args.url or 'foo')
doc.link.context = args.linkcontext or 'foo'
doc.link.anchortext = args.anchortext or 'foo'
doc.source = scraper.Source(url='foo', html=(args.sourcehtml or 'foo'))
doc.tempfile = args.filename

# process
scraper.process_file(doc, keep_tempfiles=args.keep)
Example #8
def start(self):
    debuglevel(3)
    super().start()
    self.run()
Example #9
#!/usr/bin/env python3
import sys
import logging
import findmodules
from opp import db, debug
from opp.doctyper import classifier

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

debug.debuglevel(4)

cur = db.cursor()
query = ("SELECT label FROM cats")
cur.execute(query)
for row in cur.fetchall():
    classifier.update_classifier(row[0])
Example #10
#!/usr/bin/env python3
import pytest
import os.path
from opp.doctyper import doctyper
from opp import scraper
from opp.debug import debuglevel

debuglevel(3)

curpath = os.path.abspath(os.path.dirname(__file__))
testdir = os.path.join(curpath, 'testdocs')

def test_simplepaper():
    doc = scraper.Doc(url='http://umsu.de/papers/variations.pdf')
    doc.link = scraper.Link(url='http://umsu.de/papers/variations.pdf')
    doc.link.anchortext = 'Download'
    doc.link.context = 'Foo bar'
    doc.content = readfile(os.path.join(testdir, 'attitudes.txt'))
    doc.numwords = 10200
    doc.numpages = 22
    assert doctyper.evaluate(doc) == 'article'

def test_pretendbook():
    doc = scraper.Doc(url='http://umsu.de/papers/variations.pdf')
    doc.link = scraper.Link(url='http://umsu.de/papers/variations.pdf')
    doc.link.anchortext = 'Download'
    doc.link.context = 'Foo bar'
    doc.content = readfile(os.path.join(testdir, 'attitudes.txt')) * 10
    doc.numwords = 10200 * 10
    doc.numpages = 22 * 10
    assert doctyper.evaluate(doc) == 'book'
Example #11
File: paperfilter.py Project: wo/opp-tools
def evaluate(doc):
    debug(4, 'trying to guess if document is a paper')
    debugflag = debuglevel() > 3
    return classifier.test(doc, debug=debugflag, smooth=True)
Example #12
File: test_models.py Project: wo/opp-tools
import pytest
import os.path
import sys
import json
from datetime import datetime
from opp.models import Source, Link, Doc
from opp.debug import debuglevel
from opp import db

"""
To run these tests, create a test database called test_opp and
give the standard mysql user access to it.
"""

VDISPLAY = True

debuglevel(5)

curpath = os.path.abspath(os.path.dirname(__file__))
testdir = os.path.join(curpath, 'testdocs')

@pytest.fixture(scope='module')
def testdb():
    """set up test database"""
    db.close()
    db.connection(db='test_opp')
    cur = db.cursor()
    for t in ('sources', 'links', 'docs'):
        cur.execute('DELETE FROM {}'.format(t))
    db.commit()
    Source(
        url='http://umsu.de/papers/',
Example #13
File: scrape.py Project: wo/opp-tools
import sys
import logging
import argparse
from opp import db, scraper, debug

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

ap = argparse.ArgumentParser()
ap.add_argument('url', help='(part of) url of source page to scrape')
ap.add_argument('-d', '--debug_level', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-l', '--link', type=str, help='only process this link')
args = ap.parse_args()

debug.debuglevel(args.debug_level)

cur = db.dict_cursor()
query = "SELECT * FROM sources WHERE url LIKE %s LIMIT 1"
cur.execute(query, ('%'+args.url+'%',))
sources = cur.fetchall()
if not sources:
    raise Exception(args.url + ' not in sources table')
source = scraper.Source(**sources[0])

if args.link:
    browser = scraper.Browser(use_virtual_display=True)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    try:
        el = browser.find_element_by_xpath("//a[contains(@href, '{}')]".format(args.link))