f = ArchiveIterator(open(options.input, 'rb')) elif options.input == sys.stdin: f = ArchiveIterator(options.input.buffer) else: f = ArchiveIterator(open(options.input, 'rb')) if options.output == sys.stdout: fo = WARCWriter(options.output.buffer, gzip=True) else: fo = WARCWriter(open(options.output, 'wb'), gzip=True) if options.pdfpass is not None: po = WARCWriter(open(options.pdfpass, 'wb'), gzip=True) if not options.pdfpass and options.pdfextract: extractor = ExtrP() cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False) if options.output == sys.stdout: filename = options.input else: filename = options.output fo.write_record(fo.create_warcinfo_record(filename=filename, info={'software': 'bitextor/bitextor-warc2htmlwarc.py', 'format': 'WARC File Format 1.0'})) for record in f: # Initial checks if record.rec_type != 'response' and record.rec_type != 'resource': continue if record.rec_headers.get_header('WARC-Target-URI')[0] == '<' and record.rec_headers.get_header('WARC-Target-URI')[-1] == '>':
# Source WARC, opened in binary mode and walked record by record.
f = ArchiveIterator(open(options.input, 'rb'))

# The converted WARC goes to standard output when the option is either the
# stream object itself or the '-' alias; evaluated once and reused below.
output_is_stdout = options.output == sys.stdout or options.output == '-'

if output_is_stdout:
    # NOTE(review): this path always gzips; unlike the file path below it
    # does not consult options.disable_output_gzip -- confirm this is intended.
    fo = WARCWriter(sys.stdout.buffer, gzip=True)
else:
    output_handle = open(options.output, 'wb')
    fo = WARCWriter(output_handle, gzip=not options.disable_output_gzip)

# Optional second writer, created only when a pdfpass path was supplied.
if options.pdfpass is not None:
    pdfpass_handle = open(options.pdfpass, 'wb')
    po = WARCWriter(pdfpass_handle, gzip=not options.disable_pdfs_gzip)

# The extractor is built only when pdfextract is set and no pdfpass target
# is given; its module is imported lazily so it is needed only in that case.
if not options.pdfpass and options.pdfextract:
    from pdfextract.extract import Extractor as ExtrP
    extractor_settings = {
        'configFile': options.configFile,
        'sentenceJoinPath': options.sentenceJoinPath,
        'kenlmPath': options.kenlmPath,
    }
    extractor = ExtrP(**extractor_settings)

# The HTML cleaner is opt-in; `cleaner` stays None when cleaning is disabled.
if options.cleanhtml:
    from lxml.html.clean import Cleaner
    cleaner = Cleaner(
        style=True,
        links=True,
        add_nofollow=True,
        page_structure=False,
        safe_attrs_only=False,
    )
else:
    cleaner = None

# An empty filename is used when the output goes to stdout.
filename = "" if output_is_stdout else options.output