示例#1
0
def test_process_file():
    """Check that processing the simple.pdf fixture yields the expected title.

    A pdf Doc is wired up with a link and a source page that both carry the
    same text, then passed to scraper.process_file; the title set on the
    doc must equal that text.
    """
    expected_text = 'Lorem ipsum dolor sit amet'
    pdf_path = os.path.join(testdir, 'simple.pdf')

    doc = Doc(filetype='pdf')
    link = Link(url='foo')
    doc.link = link
    # Link context and anchor text both repeat the expected title.
    link.context = expected_text
    link.anchortext = expected_text
    doc.source = Source(url='foo', html='<b>Lorem ipsum dolor sit amet</b>')
    doc.tempfile = pdf_path

    scraper.process_file(doc)

    assert doc.title == expected_text
示例#2
0
# Driver script: run scraper.process_file on a single local file, with
# optional link/source metadata supplied on the command line.

# Route everything logged under 'opp' (DEBUG and above) to stdout.
logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

# Command-line interface: one positional file plus optional metadata that
# is copied onto the doc's link and source below.
ap = argparse.ArgumentParser()
ap.add_argument('filename', help='file to process')
ap.add_argument('-d', '--debuglevel', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-u', '--url', type=str, help='link url')
ap.add_argument('-l', '--linkcontext', type=str, help='link context')
ap.add_argument('-a', '--anchortext', type=str, help='anchortext')
ap.add_argument('-s', '--sourcehtml', type=str, help='source page html')
args = ap.parse_args()

# NOTE(review): --debuglevel defaults to 1, so the `or 2` fallback only
# fires for an explicit `-d 0`, which is then raised to 2 -- confirm that
# this is intended.
debug.debuglevel(args.debuglevel or 2)

# set up doc for processing:
# Decide the type from the file extension (case-insensitively) instead of a
# substring match on the whole path, so that e.g. 'pdfs/paper.doc' is not
# treated as a pdf and 'PAPER.PDF' is.
filetype = 'pdf' if args.filename.lower().endswith('.pdf') else 'doc'
doc = scraper.Doc(filetype=filetype)
# Any metadata not given on the command line falls back to the dummy 'foo'.
doc.link = scraper.Link(url=args.url or 'foo')
doc.link.context = args.linkcontext or 'foo'
doc.link.anchortext = args.anchortext or 'foo'
doc.source = scraper.Source(url='foo', html=(args.sourcehtml or 'foo'))
doc.tempfile = args.filename

# process
scraper.process_file(doc, keep_tempfiles=args.keep)