def main():
  import getopt
  def usage():
    print '''usage: extract.py [-d)ebug] [-S)trict]
    [-t pat_threshold] [-T diffscore_threshold] [-M mainscore_threshold]
    [-c default_charset] [-C codec_out] [-a accept_pat] [-j reject_pat]
    [-P mangle_pat] patfile zipfile ...'''
    sys.exit(2)
  try:
    (opts, args) = getopt.getopt(sys.argv[1:], 'dSt:T:M:c:C:a:j:P:')
  except getopt.GetoptError:
    usage()
  (debug, pat_threshold, diffscore_threshold, mainscore_threshold, default_charset, codec_out, strict) = \
    (0, 0.8, 0.5, 50, 'iso-8859-1', 'utf-8', False)
  acldb = None
  mangle_pat = None
  for (k, v) in opts:
    if k == '-d': debug += 1
    elif k == '-S': strict = True
    elif k == '-t': pat_threshold = float(v)
    elif k == '-T': diffscore_threshold = float(v)
    elif k == '-M': mainscore_threshold = float(v)
    elif k == '-c': default_charset = v
    elif k == '-C': codec_out = v
    elif k == '-a':
      if not acldb: acldb = ACLDB()
      acldb.add_allow(v)
    elif k == '-j':
      if not acldb: acldb = ACLDB()
      acldb.add_deny(v)
    elif k == '-P': mangle_pat = v
  if not args: usage()
  patternset = LayoutPatternSet(debug=debug)
  fp = file(args[0])
  patternset.read(fp)
  fp.close()
  if mangle_pat:
    patternset.set_encoder(mangle_pat)
  del args[0]
  consumer = TextExtractor(patternset, pat_threshold, diffscore_threshold,
                           mainscore_threshold, default_charset=default_charset,
                           codec_out=codec_out, strict=strict, debug=debug)
  if not args:
    args = ['-']
  for fname in args:
    if fname.endswith('.zip'):
      ZipLoader(consumer, fname, acldb=acldb, debug=debug).run()
    elif fname == '-':
      consumer.feed_page('stdin', sys.stdin)
    else:
      fp = file(fname)
      consumer.feed_page(fname, fp)
      fp.close()
  return
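
# A possible invocation of the extract.py entry point defined above (a sketch
# based only on its usage string; the file names here are hypothetical):
#
#   $ python extract.py -c euc-jp -C utf-8 site.pat site.200601010000.zip
#
# The pattern file is the output of analyze.py, and each .zip argument is a
# crawl archive produced by textcrawler.py; when no zip file is given, a single
# page is read from stdin. The extracted text is presumably written to stdout
# in the codec given by -C.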
def main():
  import getopt
  def usage():
    print '''usage: analyze.py [-d] [-t cluster_threshold] [-T title_threshold]
    [-S score_threshold] [-L linkinfo] [-c default_charset]
    [-a accept_pat] [-j reject_pat] [-P mangle_pat] files ...'''
    sys.exit(2)
  try:
    (opts, args) = getopt.getopt(sys.argv[1:], 'dt:T:S:L:c:a:j:P:')
  except getopt.GetoptError:
    usage()
  (debug, cluster_threshold, title_threshold, score_threshold, default_charset) = (0, 0.97, 0.6, 100, 'utf-8')
  acldb = None
  mangle_pat = None
  linkinfo = 'linkinfo'
  for (k, v) in opts:
    if k == '-d': debug += 1
    elif k == '-t': cluster_threshold = float(v)
    elif k == '-T': title_threshold = float(v)
    elif k == '-S': score_threshold = float(v)
    elif k == '-L': linkinfo = ''
    elif k == '-c': default_charset = v
    elif k == '-a':
      if not acldb: acldb = ACLDB()
      acldb.add_allow(v)
    elif k == '-j':
      if not acldb: acldb = ACLDB()
      acldb.add_deny(v)
    elif k == '-P': mangle_pat = v
  if not args: usage()
  #
  analyzer = LayoutAnalyzer(debug=debug)
  if mangle_pat:
    analyzer.set_encoder(mangle_pat)
  print '### version=%s' % WEBSTEMMER_VERSION
  for fname in args:
    print '### fname=%r' % fname
    feeder = PageFeeder(analyzer, linkinfo=linkinfo, acldb=acldb,
                        default_charset=default_charset, debug=debug)
    if fname.endswith('.zip'):
      ZipLoader(feeder, fname, debug=debug).run()
    elif fname.endswith('.list') or fname == '-':
      if fname == '-':
        fp = sys.stdin
      else:
        fp = file(fname)
      for line in fp:
        name = line.strip()
        if debug:
          print >>sys.stderr, 'Loading: %r' % name
        fp2 = file(name)
        data = fp2.read()
        fp2.close()
        feeder.feed_page(name, data)
      fp.close()
    else:
      fp = file(fname)
      data = fp.read()
      fp.close()
      feeder.feed_page(fname, data)
    feeder.close()
  print '### cluster_threshold=%f' % cluster_threshold
  print '### title_threshold=%f' % title_threshold
  print '### pages=%d' % len(analyzer.pages)
  print
  if mangle_pat:
    print '!mangle_pat=%r' % mangle_pat
    print
  for c in analyzer.analyze(cluster_threshold, title_threshold):
    if c.pattern and score_threshold <= c.score:
      c.dump()
  return
def main():
  import getopt
  def usage():
    print(
      'usage: analyze.py [-d] [-t cluster_threshold] [-T title_threshold]'
      ' [-S score_threshold] [-m max_sample] [-L linkinfo] [-c default_charset]'
      ' [-a accept_pat] [-j reject_pat] [-P mangle_pat] files ...')
    sys.exit(2)
  try:
    (opts, args) = getopt.getopt(sys.argv[1:], 'dt:T:S:m:L:c:a:j:P:')
  except getopt.GetoptError:
    usage()
  debug = 0
  cluster_threshold = 0.97
  title_threshold = 0.6
  score_threshold = 100
  max_sample = 0
  default_charset = 'utf-8'
  acldb = None
  mangle_pat = None
  linkinfo = 'linkinfo'
  for (k, v) in opts:
    if k == '-d': debug += 1
    elif k == '-t': cluster_threshold = float(v)
    elif k == '-T': title_threshold = float(v)
    elif k == '-S': score_threshold = float(v)
    elif k == '-m': max_sample = int(v)
    elif k == '-L': linkinfo = ''
    elif k == '-c': default_charset = v
    elif k == '-a':
      if not acldb: acldb = ACLDB()
      acldb.add_allow(v)
    elif k == '-j':
      if not acldb: acldb = ACLDB()
      acldb.add_deny(v)
    elif k == '-P': mangle_pat = v
  if not args: usage()
  #
  analyzer = LayoutAnalyzer(debug=debug)
  if mangle_pat:
    analyzer.set_encoder(mangle_pat)
  print '### version=%s' % WEBSTEMMER_VERSION
  for fname in args:
    print '### fname=%r' % fname
    feeder = PageFeeder(analyzer, linkinfo=linkinfo, acldb=acldb,
                        default_charset=default_charset, debug=debug)
    if fname.endswith('.zip'):
      ZipLoader(feeder, fname, debug=debug).run()
    elif fname.endswith('.list') or fname == '-':
      if fname == '-':
        fp = sys.stdin
      else:
        fp = file(fname)
      for line in fp:
        name = line.strip()
        if debug:
          print >>sys.stderr, 'Loading: %r' % name
        fp2 = file(name)
        data = fp2.read()
        fp2.close()
        feeder.feed_page(name, data)
      fp.close()
    else:
      fp = file(fname)
      data = fp.read()
      fp.close()
      feeder.feed_page(fname, data)
    feeder.close()
  print '### cluster_threshold=%f' % cluster_threshold
  print '### title_threshold=%f' % title_threshold
  print '### pages=%d' % len(analyzer.pages)
  print
  if mangle_pat:
    print '!mangle_pat=%r' % mangle_pat
    print
  for c in analyzer.analyze(cluster_threshold, title_threshold, max_sample):
    if c.pattern and score_threshold <= c.score:
      c.dump()
  return
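
# A possible invocation of the analyze.py entry point defined above (a sketch
# based only on its usage string; the archive name is hypothetical):
#
#   $ python analyze.py -c euc-jp site.200601010000.zip > site.pat
#
# The '### ...' header lines and the clusters emitted by c.dump() presumably all
# go to stdout, so redirecting stdout yields the pattern file that extract.py
# consumes. A .list file (or '-') may be given instead of a .zip archive to read
# a newline-separated list of page file names.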
def main():
  import getopt
  def usage():
    print '''usage: textcrawler.py -o outfile [-d] [-b baseid]
    [-a accept_pat] [-j reject_pat] [-i index_html] [-m level]
    [-k cookie_file] [-c default_charset] [-U urldb] [-D delay]
    [-T timeout] [-L linkinfo] [url ...]'''
    sys.exit(2)
  try:
    (opts, args) = getopt.getopt(sys.argv[1:], 'db:a:j:i:m:k:c:C:U:o:D:T:L:')
  except getopt.GetoptError:
    usage()
  (debug, maxlevel, cookie_file, delay) = (0, 1, None, 0)
  (index_html, default_charset, urldb, timeout) = ('', 'iso-8859-1', None, 300)
  (baseid, outfile, linkinfo) = (None, None, 'linkinfo')
  reftxtdb = None
  acldb = None
  for (k, v) in opts:
    if k == '-d': debug += 1
    elif k == '-b': baseid = v
    elif k == '-a':
      if not acldb: acldb = ACLDB()
      acldb.add_allow(v)
    elif k == '-j':
      if not acldb: acldb = ACLDB()
      acldb.add_deny(v)
    elif k == '-m': maxlevel = int(v)
    elif k == '-i': index_html = v
    elif k == '-k': cookie_file = v
    elif k == '-c': default_charset = v
    elif k == '-U': urldb = URLDB(v)
    elif k == '-D': delay = int(v)
    elif k == '-o': outfile = v
    elif k == '-T': timeout = int(v)
    elif k == '-L': linkinfo = v
  if not args: usage()
  if not baseid:
    baseid = time.strftime('%Y%m%d%H%M')
  if not acldb:
    acldb = ACLDB()
    acldb.add_deny(r'\.(jpg|jpeg|gif|png|tiff|swf|mov|wmv|wma|ram|rm|rpm|gz|zip|class)\b')
    for starturl in args:
      acldb.add_allow('^' + re.escape(urljoin(starturl, '.')))
  if linkinfo:
    reftxtdb = RefTextDB(baseid)
  dumper = None
  if outfile:
    dumper = ZipDumper(outfile, baseid)
  else:
    dumper = NullDumper()  # crawling only
  for starturl in args:
    try:
      TextCrawler(dumper, starturl, baseid, reftxtdb=reftxtdb,
                  index_html=index_html, maxlevel=maxlevel,
                  cookie_file=cookie_file, default_charset=default_charset,
                  acldb=acldb, urldb=urldb, delay=delay, timeout=timeout,
                  debug=debug).run()
    except CrawlerFatalError:
      pass
  if linkinfo:
    dumper.feed_page(linkinfo, reftxtdb.dump())
  dumper.close()
  return
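
# A possible invocation of the textcrawler.py entry point defined above (a sketch
# based only on its usage string; the URL and output prefix are hypothetical):
#
#   $ python textcrawler.py -o site http://www.example.com/
#
# With -o, crawled pages are stored through ZipDumper (the archive name is
# presumably derived from outfile and the timestamp baseid); without -o,
# NullDumper discards them and the run is a crawl-only dry run, as the inline
# comment notes. Unless -a/-j are given, a default ACL rejects common binary
# file extensions and accepts only URLs under each start URL.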