def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    out = sys.stdout

    if len(input_files) < 1:
        parser.error("no pattern")

    pattern, input_files = input_files[0], input_files[1:]
    invert = options.invert
    pattern = re.compile(pattern)

    if not input_files:
        fh = ArchiveRecord.open_archive(file_handle=sys.stdin, gzip=None)
        filter_archive(fh, options, pattern, out)
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            filter_archive(fh, options, pattern, out)
            fh.close()

    return 0
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])
    out = sys.stdout

    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir = options.output
    else:
        output_dir = os.getcwd()

    collisions = 0

    if len(args) < 1:
        log_file = sys.stdout if not options.log_file else open(options.log_file, 'wb')
        log_headers(log_file)
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            collisions += unpack_records('<stdin>', fh, output_dir, options.default_name, log_file, options.wayback)
    else:
        for filename in args:
            log_file = os.path.join(output_dir, os.path.basename(filename) + '.index.txt') if not options.log_file else options.log_file
            log_file = open(log_file, 'wb')
            log_headers(log_file)
            try:
                with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                    collisions += unpack_records(filename, fh, output_dir, options.default_name, log_file, options.wayback)
            except StandardError, e:
                print >> sys.stderr, "exception in handling", filename, e
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    out = sys.stdout

    if len(input_files) < 1:
        parser.error("no input warc file(s)")

    correct = True
    fh = None
    try:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            for (offset, record, errors) in fh.read_records(limit=None):
                if errors:
                    print >> sys.stderr, "warc errors at %s:%d" % (name, offset)
                    print >> sys.stderr, errors
                    correct = False
                    break
                elif record is not None and record.validate():  # ugh name, validate() returns errors
                    print >> sys.stderr, "warc errors at %s:%d" % (name, offset)
                    print >> sys.stderr, record.validate()
                    correct = False
                    break
    except StandardError, e:
        correct = False
def parse_metadata(self):
    fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r")
    for (offset, record, errors) in fh.read_records(limit=None, offsets=True):
        self.offset = offset
        if record:
            if record.type != 'metadata':
                continue
            ### precalculated data that is used multiple times
            self.headers, self.content = self.parse_headers_and_content(record)
            result = None
            if self.parseType == "hopinfo":
                result = self.get_hopinfo(record)
            elif self.parseType == "outlinks":
                result = self.get_outlinks(record)
            else:
                sys.exit("Invalid parseType option: " + self.parseType)
            if result:
                print result
        elif errors:
            sys.exit("Exiting with the following errors:\n" + str(errors))
        else:
            pass  # tail
    fh.close()
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    out = sys.stdout

    if len(input_files) < 1:
        parser.error("no input warc file(s)")

    print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length'
    for name in input_files:
        fh = ArchiveRecord.open_archive(name, gzip="auto")
        for (offset, record, errors) in fh.read_records(limit=None):
            if record:
                print name, offset, record.type, record.url, record.id, record.content_type, record.content_length
            elif errors:
                pass  # ignore
            else:
                pass  # no errors at tail
        fh.close()

    return 0
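# --- Illustrative sketch, not part of the original tools ---
# The functions in this file all follow the same warctools pattern:
# ArchiveRecord.open_archive(...) followed by read_records(), which yields
# (offset, record, errors) tuples. The hypothetical helper below is a minimal
# example of that pattern that just tallies record types per file; the import
# path is an assumption and should be adjusted to match the imports used
# elsewhere in this codebase.
import sys
from collections import defaultdict

from hanzo.warctools import ArchiveRecord  # assumed import path


def count_record_types(filenames):
    """Return a dict mapping WARC record type to count across the given files."""
    counts = defaultdict(int)
    for name in filenames:
        fh = ArchiveRecord.open_archive(name, gzip="auto")
        try:
            for (offset, record, errors) in fh.read_records(limit=None):
                if errors:
                    print >> sys.stderr, "warc errors at %s:%d" % (name, offset)
                elif record is not None:
                    counts[record.type] += 1
        finally:
            fh.close()
    return dict(counts)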
def expandWarcFile(warcFile):
    # if (len(argv) < 1):
    #     print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
    #     sys.exit()
    #
    # if (argv[0] == "-h" or len(argv) < 4):
    #     print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
    #     sys.exit()

    rootdir = os.path.dirname(warcFile)
    filename = os.path.basename(warcFile)
    filePath = warcFile

    if filename.endswith(".warc") or filename.endswith(".warc.gz"):  # or filename.endswith(".arc.gz"):
        # processWarcFile(filePath, collection_id, event, event_type)
        splitext = filePath.split('.')
        output_dir = splitext[0] + "/"
        log_file = os.path.join(output_dir, filePath[filePath.rfind("/") + 1:] + '.index.txt')
        # output_file = output_dir + filePath.split("/")[1] + ".index.txt"

        if os.path.exists(output_dir) == False:
            os.makedirs(output_dir)
            # unpackWarcAndRetrieveHtml(filePath, collection_id, event, event_type)
            # output_dir = filePath.split(".")[0] + "/"
            default_name = 'crawlerdefault'
            wayback = "http://wayback.archive-it.org/"
            collisions = 0
            #log_file = os.path.join(output_dir, filePath[filePath.rfind("/")+1:] + '.index.txt')
            log_fileh = open(log_file, 'w+b')
            warcunpack_ia.log_headers(log_fileh)
            try:
                with closing(ArchiveRecord.open_archive(filename=filePath, gzip="auto")) as fh:
                    collisions += warcunpack_ia.unpack_records(filePath, fh, output_dir, default_name, log_fileh, wayback)
            except StandardError, e:
                print "exception in handling", filePath, e
                return
        else:
            print "Directory Already Exists"

        #print "Warc unpack finished"
        html_files = parseLogFileForHtml(log_file)
        #print "Log file parsed for html file paths"
        #print len(html_files)
        # for i in html_files:
        #     extractTextAndIndexToSolr(i["file"], i["url"], i["wayback_url"], collection_id, event, event_type)
        tf, urls = extractText(html_files)
        #print "extracting Text finished"
        return tf, urls
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])
    out = sys.stdout

    if len(input_files) < 1:
        dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None), name="-", offsets=False)
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh, name)
            fh.close()

    return 0
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])
    out = sys.stdout

    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            dump_record(fh)
    else:
        filename = args[0]
        zipfilename = args[1]
        with ZipFile(zipfilename, "w") as outzip:
            with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                dump_record(fh, outzip)

    return 0
def _make_cdx(self, stats):
    self.out_file.write(b' CDX ' + self.format + b'\n')  # print header

    fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r")
    for (offset, record, errors) in fh.read_records(limit=None, offsets=True):
        if not record:
            if errors:
                raise ParseError(str(errors))
            continue  # tail

        stats['num_records_processed'] += 1
        handler = self.dispatcher.get_handler(record, offset=offset, cdx_writer=self)
        if not handler:
            continue

        ### arc files from the live web proxy can have a negative content length and a missing payload
        ### check the content_length from the arc header, not the computed payload size returned by record.content_length
        content_length_str = record.get_header(record.CONTENT_LENGTH)
        if content_length_str is not None and int(content_length_str) < 0:
            continue

        surt = handler.massaged_url
        if self.should_exclude(surt):
            stats['num_records_filtered'] += 1
            continue

        ### precalculated data that is used multiple times
        # self.headers, self.content = self.parse_headers_and_content(record)
        # self.mime_type = self.get_mime_type(record, use_precalculated_value=False)

        values = [b'-' if v is None else v for v in self.fieldgetter(handler)]
        self.out_file.write(b' '.join(values) + b'\n')
        #record.dump()
        stats['num_records_included'] += 1

    fh.close()
def main(argv):
    (options, input_files) = parser.parse_args()
    out = sys.stdout

    if not options.output_directory:
        parser.error("option -o is mandatory")

    if not os.path.isdir(options.output_directory):
        os.makedirs(options.output_directory)

    if len(input_files) < 1:
        parser.error("list of warc files is mandatory")
    else:
        for name in input_files:
            fh = ArchiveRecord.open_archive(name, gzip="auto")
            dump_archive(fh, name, options.output_directory)
            fh.close()

    return 0
def main(argv):
    (options, args) = parser.parse_args(args=argv[1:])
    out = sys.stdout

    if len(args) < 1:
        # dump the first record on stdin
        with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
            dump_record(fh)
    else:
        # dump a record from the filename, with optional offset
        filename = args[0]
        if len(args) > 1:
            offset = int(args[1])
        else:
            offset = 0

        with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
            fh.seek(offset)
            dump_record(fh)

    return 0
def main(argv):
    # if (len(argv) < 1):
    #     print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
    #     sys.exit()
    #
    # if (argv[0] == "-h" or len(argv) < 4):
    #     print >> sys.stderr, "usage: processWarcDir.py -d <directory> -i <collection_id> -e <event> -t <event_type>"
    #     sys.exit()

    # Done
    #argv = ["", "/home/mohamed/IACollections/rem", "", "3647", "", "Texas Fertilizer Plant Explosion", "", "Accident"]
    #argv = ["", "/home/mohamed/IACollections/3437", "", "3437", "", "Connecticut School Shooting", "", "Shooting"]
    #argv = ["", "/home/mohamed/IACollections/2305", "", "2305", "", "Tucson Shooting", "", "Shooting"]
    #argv = ["", "/home/mohamed/IACollections/2823", "", "2823", "", "Russia Plane Crash", "", "Plane_Crash"]
    #argv = ["", "/home/mohamed/IACollections/2379", "", "2379", "", "Youngstown Shooting", "", "Shooting"]
    #argv = ["", "/home/mohamed/IACollections/2772", "", "2772", "", "Norway Shooting", "", "Shooting"]
    #argv = ["", "/home/mohamed/IACollections/694", "", "694", "", "April 16 Shooting", "", "Shooting"]
    #argv = ["", "/home/mohamed/IACollections/2892", "", "2892", "", "Somalia_Bomb_Blast", "", "Bombing"]
    #argv = ["", "/home/mohamed/IACollections/2838", "", "2838", "", "Nevada_AirRace_Crash", "", "Plane_Crash"]
    #argv = ["", "/home/mohamed/IACollections/2822", "", "2822", "", "Texas_Wild_Fire", "", "Fire"]
    #argv = ["", "/home/mohamed/IACollections/2882", "", "2882", "", "Encephalitis", "", "Disease_Outbreak"]
    #argv = ["", "/home/mohamed/IACollections/2842", "", "2842", "", "China_Flood", "", "Flood"]
    #argv = ["", "/home/mohamed/IACollections/2836", "", "2836", "", "Pakistan_Flood", "", "Flood"]
    #argv = ["", "/home/mohamed/IACollections/3535", "", "3535", "", "Brazil_NightClub_Fire", "", "Fire"]
    #argv = ["", "/home/mohamed/IACollections/2316", "", "2316", "", "Haiti_Earthquake_Anniversary", "", "Earthquake"]
    #argv = ["", "/home/mohamed/IACollections/2406", "", "2406", "", "New_Zealand_Earthquake", "", "Earthquake"]
    #argv = ["", "/home/mohamed/IACollections/2821", "", "2821", "", "Virginia_Earthquake", "", "Earthquake"]

    # Not Yet
    argv = ["", "/home/mohamed/IACollections/2903", "", "2903", "", "Turkey_Earthquake", "", "Earthquake"]

    rootdir = argv[1]
    collection_id = argv[3]
    event = argv[5]
    event_type = argv[7]

    for root, subFolders, files in os.walk(rootdir):
        for filename in files:
            filePath = os.path.join(root, filename)
            if filename.endswith(".warc") or filename.endswith(".warc.gz"):  # or filename.endswith(".arc.gz"):
                # processWarcFile(filePath, collection_id, event, event_type)
                splitext = filePath.split('.')
                output_dir = splitext[0] + "/"
                log_file = os.path.join(output_dir, filePath[filePath.rfind("/") + 1:] + '.index.txt')
                # output_file = output_dir + filePath.split("/")[1] + ".index.txt"
                if os.path.exists(output_dir) == False:
                    os.makedirs(output_dir)

                # unpackWarcAndRetrieveHtml(filePath, collection_id, event, event_type)
                # output_dir = filePath.split(".")[0] + "/"
                default_name = 'crawlerdefault'
                wayback = "http://wayback.archive-it.org/"
                collisions = 0
                #log_file = os.path.join(output_dir, filePath[filePath.rfind("/")+1:] + '.index.txt')
                log_fileh = open(log_file, 'w+b')
                warcunpack_ia.log_headers(log_fileh)
                try:
                    with closing(ArchiveRecord.open_archive(filename=filePath, gzip="auto")) as fh:
                        collisions += warcunpack_ia.unpack_records(filePath, fh, output_dir, default_name, log_fileh, wayback)
                except StandardError, e:
                    print >> sys.stderr, "exception in handling", filePath, e
                    return

                print "Warc unpack finished"
                html_files = parseLogFileForHtml(log_file)
                print "Log file parsed for html file paths"
                # for i in html_files:
                #     extractTextAndIndexToSolr(i["file"], i["url"], i["wayback_url"], collection_id, event, event_type)
                extractTextAndIndexToSolr(html_files, collection_id, event, event_type)
                print "Storing and Indexing finished"
def make_cdx(self):
    if isinstance(self.out_file, basestring):
        self.out_file = open(self.out_file, 'wb')
    self.out_file.write(' CDX ' + self.format + '\n')  # print header

    if not self.all_records:
        # filter cdx lines if --all-records isn't specified
        allowed_record_types = set(['response', 'revisit'])
        disallowed_content_types = set(['text/dns'])

    stats = {
        'num_records_processed': 0,
        'num_records_included': 0,
        'num_records_filtered': 0,
    }

    fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r")
    for (offset, record, errors) in fh.read_records(limit=None, offsets=True):
        self.offset = offset

        if record:
            stats['num_records_processed'] += 1

            if self.screenshot_mode:
                if record.type != 'metadata':
                    continue
            elif not self.all_records and (record.type not in allowed_record_types or record.content_type in disallowed_content_types):
                continue

            ### arc files from the live web proxy can have a negative content length and a missing payload
            ### check the content_length from the arc header, not the computed payload size returned by record.content_length
            content_length_str = record.get_header(record.CONTENT_LENGTH)
            if content_length_str is not None and int(content_length_str) < 0:
                continue

            self.surt = self.get_massaged_url(record, use_precalculated_value=False)
            if self.should_exclude(self.surt):
                stats['num_records_filtered'] += 1
                continue

            ### precalculated data that is used multiple times
            self.headers, self.content = self.parse_headers_and_content(record)
            self.mime_type = self.get_mime_type(record, use_precalculated_value=False)
            self.response_code = self.get_response_code(record, use_precalculated_value=False)
            self.meta_tags = self.parse_meta_tags(record)

            s = u''
            for field in self.format.split():
                if field not in self.field_map:
                    raise ParseError('Unknown field: ' + field)
                endpoint = self.field_map[field].replace(' ', '_')
                response = getattr(self, 'get_' + endpoint)(record)
                #print self.offset
                #print record.compressed_record_size
                #print record.content_length
                #print record.headers
                #print len(self.content)
                #print repr(record.content[1])
                #print endpoint
                #print repr(response)
                s += response + ' '

            self.out_file.write(s.rstrip().encode('utf-8') + '\n')
            #record.dump()
            stats['num_records_included'] += 1
        elif errors:
            raise ParseError(str(errors))
        else:
            pass  # tail

    fh.close()

    if self.stats_file is not None:
        f = open(self.stats_file, 'w')
        json.dump(stats, f, indent=4)
        f.close()