def ShrinkRay():
    # TODO:
    # figure out length of video and develop number of frames to
    # drop out of every FPS interval.
    print(
        "********************* \n\n Shrinking Video. (This will take a while) \n\n*********************"
    )
    os.environ["FFREPORT"] = "file=ffmpeg-shrinking.log"
    # shrink; using the webm format at this resolution cuts the file size by
    # *about* an order of magnitude, while still maintaining more-or-less
    # perfectly crisp detail and motion. I'm thinking we don't need to drop
    # frames, and that cutting the resolution down to this ~240p-level
    # resolution is good enough.
    # We really need to check for resolution and select an output resolution
    # appropriately; this one-liner only works for 16:9 inputs
    ffmpegshrinkargs = shlex.split(
        "ffmpeg -i samplethis.flv -c:v libvpx -b:v 500K -c:a libvorbis -s 432x243 shrunken-to-webm.webm"
    )
    call(ffmpegshrinkargs)
    # The final size of snapshots and shrunken video is anywhere from a fifth
    # to a seventh of the original file size.
    os.environ["FFREPORT"] = ""
    # add ffmpeg log record
    # (warcinfo_record_ID, metadata_record_ID, truncated_record_ID and
    # new_warc_file are module-level globals set by process() below)
    ffmpegshrinkheader = warc.WARCHeader({
        "WARC-Type": "resource",
        "WARC-Warcinfo-ID": warcinfo_record_ID,
        "Content-Type": "text/plain",
        "WARC-Concurrent-To": metadata_record_ID
    })
    ffmpegshrinkpayload = StringIO(
        open("ffmpeg-shrinking.log").read()).getvalue()
    ffmpegshrinkrecord = warc.WARCRecord(headers=ffmpegshrinkheader,
                                         payload=ffmpegshrinkpayload)
    new_warc_file.write_record(ffmpegshrinkrecord)
    # add actual shrunken webm record (read in binary mode: this is video data)
    shrinkrecord = warc.WARCRecord(
        headers=warc.WARCHeader({
            "WARC-Type": "conversion",
            "Content-Type": "video/webm",
            "WARC-Refers-To": truncated_record_ID
        }),
        payload=StringIO(open("shrunken-to-webm.webm", "rb").read()).getvalue())
    new_warc_file.write_record(shrinkrecord)
    # remove the webm and the log file (snapshots.tar.gz was already removed
    # at the end of SnapShot(), so deleting it again here would fail)
    call(shlex.split("rm shrunken-to-webm.webm ffmpeg-shrinking.log"))

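# The "only works for 16:9 inputs" caveat above can be handled by letting
# ffmpeg compute the output height itself: scale=W:-2 keeps the input aspect
# ratio and rounds the height to an even number. A sketch, not part of the
# original pipeline; build_shrink_args and its defaults are hypothetical.
def build_shrink_args(infile="samplethis.flv",
                      outfile="shrunken-to-webm.webm", width=432):
    # -vf scale=432:-2 replaces the hard-coded -s 432x243
    return shlex.split(
        "ffmpeg -i {} -c:v libvpx -b:v 500K -c:a libvorbis "
        "-vf scale={}:-2 {}".format(infile, width, outfile))
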
def make_resp_dummy(resp, date, http_ver="1.1"):
    body = resp.raw.data
    temp = [
        bytes("HTTP/{} {} {}".format(http_ver, resp.status_code,
                                     RESPONSES[resp.status_code]), "ascii"),
    ]
    applied_keys = []
    for key in resp.headers:
        if key.lower() in ["transfer-encoding"]:
            continue
        elif (key.lower() == "content-length"
                and resp.headers["content-length"] != str(len(body))):
            # recalculate decoded size below
            continue
        temp.append(bytes("{}: {}".format(key, resp.headers[key]), "utf-8"))
        applied_keys.append(key.lower())
    if "content-length" not in applied_keys:
        temp.append(bytes("content-length: {}".format(len(body)), "ascii"))
    temp.append(b"")
    temp.append(body)
    dummy = b"\r\n".join(temp)
    header = warc.WARCHeader({
        "WARC-Type": "response",
        "WARC-Target-URI": resp.url,
        "WARC-Date": date.strftime("%Y-%m-%dT%H:%M:%SZ"),
    }, defaults=True)
    return warc.WARCRecord(header, payload=dummy)

def make_req_dummy(req, record, http_ver="1.1"):
    o = urlparse(req.url)
    path = o.path
    if not path:
        path = "/"
    temp = [
        bytes("{} {} HTTP/{}".format(req.method, path, http_ver), "ascii")
    ]
    for key in req.headers:
        temp.append(bytes("{}: {}".format(key, req.headers[key]), "utf-8"))
    temp.append(b"")
    if req.body:
        # req.body may be str or bytes depending on how the request was built;
        # b"\r\n".join() below needs bytes
        temp.append(req.body if isinstance(req.body, bytes)
                    else req.body.encode("utf-8"))
    dummy = b"\r\n".join(temp)
    header = warc.WARCHeader({
        "WARC-Type": "request",
        "WARC-Target-URI": req.url,
        # ISO 28500 Section 5.4 WARC-Date
        # > Multiple records written as part of a single capture event (see
        # > section 5.7) shall use the same WARC-Date, even though the times
        # > of their writing will not be exactly synchronized.
        "WARC-Date": record.header["WARC-Date"],
        "WARC-Concurrent-To": record.header["WARC-Record-ID"],
    }, defaults=True)
    return warc.WARCRecord(header, payload=dummy)

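# A sketch of how the two dummies above combine into one capture event: write
# the response record first, then a request record that points back at it via
# WARC-Concurrent-To and reuses its WARC-Date (per the ISO 28500 quote above).
# write_capture_pair, session, and the stream=True requirement (so that
# resp.raw.data still holds the body) are assumptions, not original code.
import datetime

def write_capture_pair(warc_file, session, url):
    resp = session.get(url, stream=True)
    resp_record = make_resp_dummy(resp, datetime.datetime.utcnow())
    req_record = make_req_dummy(resp.request, resp_record)
    warc_file.write_record(resp_record)
    warc_file.write_record(req_record)
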
def create(self, filename, fileobj=None, operator=None):
    """
    :rtype: warc.WARCFile
    """
    assert useragent.POLICY is not None
    if fileobj is None:
        fileobj = io.BytesIO()
    self.fileobj = fileobj
    self.warc = warc.WARCFile(fileobj=fileobj)
    header = warc.WARCHeader({
        "WARC-Type": "warcinfo",
        "WARC-Filename": filename,
    }, defaults=True)
    body = [
        b"software: owlbot/" + bytes(version.STR, "ascii"),
        b"format: WARC File Format 1.0",
        # policy from .OWLBOT_POLICY or os.environ["OWLBOT_POLICY"]
        b"robots: " + bytes(useragent.POLICY, "ascii"),
    ]
    if operator is not None:
        body.append(b"operator: " + operator.encode("utf-8"))
    self.warc.write_record(
        warc.WARCRecord(header, payload=b"\r\n".join(body))
    )
    # return the file so the documented :rtype: holds
    return self.warc

def createWarcInfoRecord(filename):
    H = warc.WARCHeader({"WARC-Type": "warcinfo",
                         "WARC-Filename": filename},
                        defaults=True)
    Content = ("software: WARCMerge/1.0" + "\r\n" +
               "format: WARC File Format 1.0" + "\r\n" +
               "description: Merging WARC files into a single one" + "\r\n" +
               "robots: ignore" + "\r\n")
    R = warc.WARCRecord(H, Content)
    return R

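# Minimal usage sketch for the record builder above; mergeWarcFiles and its
# arguments are illustrative, assuming the warc library can re-write records
# that were read from an input file.
import os

def mergeWarcFiles(outputPath, inputPaths):
    out = warc.open(outputPath, "w")
    out.write_record(createWarcInfoRecord(os.path.basename(outputPath)))
    for path in inputPaths:
        src = warc.open(path)
        for rec in src:
            out.write_record(rec)
        src.close()
    out.close()
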
def update_warc_metadata_from_item(record, item):
    """update a WARC metadata record from a scrapy Item"""
    # make empty header object to use for fields
    # XXX WARCHeader messes up capitalization here
    fields = warc.WARCHeader({}, defaults=False)
    fields['x-crawl-depth'] = item['depth']
    fields['hopsFromSeed'] = item['hops_from_seed']
    fields['x-source-anchor'] = item['source_anchor']
    fields['x-source-url'] = item['source_url']
    buf = BytesIO()
    fields.write_to(buf, version_line=False, extra_crlf=False)
    record.update_payload(buf.getvalue())

def update_warc_info_from_spider(record, spider):
    """update a WARC warcinfo record from a scrapy Spider"""
    # make empty header object to use for fields
    # XXX WARCHeader messes up capitalization here
    fields = warc.WARCHeader({}, defaults=False)
    fields['software'] = 'osp_scraper'
    fields['hostname'] = socket.getfqdn()
    fields['x-spider-name'] = spider.name
    fields['x-spider-run-id'] = spider.run_id
    fields['x-spider-revision'] = git_revision
    fields['x-spider-parameters'] = json.dumps(spider.get_parameters())
    buf = BytesIO()
    fields.write_to(buf, version_line=False, extra_crlf=False)
    record.update_payload(buf.getvalue())

def new_warc(kind):
    """return a new WARCRecord

    @arg kind: what flavor of WARC record to create; see
        `warc.WARCHeader.CONTENT_TYPES` for flavors
    """
    # ripped from WARCHeader.init_defaults()
    headers = {
        'WARC-Type': kind,
        'WARC-Record-ID': "<urn:uuid:%s>" % uuid.uuid1(),
        'Content-Type': warc.WARCHeader.CONTENT_TYPES[kind],
        'WARC-Date': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
    }
    return warc.WARCRecord(header=warc.WARCHeader(headers, defaults=False),
                           defaults=False)

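# Usage sketch for new_warc; assumes the same warc fork whose WARCRecord
# provides update_payload (used by update_warc_metadata_from_item above).
record = new_warc('metadata')
record.update_payload(b'x-example-field: demo\r\n')
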
def record_log(self, warc_info_id):
    # encode up front so Content-Length and the block digest both reflect
    # the byte length (ensure_ascii=False can emit multi-byte characters)
    log_payload = json.dumps(self.output_log,
                             ensure_ascii=False).encode('utf-8')
    log_header = {
        'Content-Length': str(len(log_payload)),
        'WARC-Target-URI': 'urn:X-archive-team-ftp-gov-deduplicate:log',
        'WARC-Date': time.strftime('%Y-%m-%dT%H:%M:%SZ'),
        'WARC-Block-Digest': "sha1:{}".format(
            base64.b32encode(hashlib.sha1(log_payload).digest()).decode()),
        'WARC-Record-ID': '<{}>'.format(uuid.uuid4().urn),
        'WARC-Warcinfo-ID': warc_info_id,
        'Content-Type': 'application/json',
        'WARC-Type': 'resource'
    }
    return warc.WARCRecord(
        header=warc.WARCHeader(log_header, defaults=False),
        payload=log_payload,
        defaults=False
    )

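# The labelled-digest convention used for WARC-Block-Digest above, pulled out
# as a standalone helper (a sketch; `block` is the raw record block as bytes):
import base64
import hashlib

def block_digest(block):
    # WARC digests are base32-encoded and prefixed with the algorithm label
    return 'sha1:{}'.format(
        base64.b32encode(hashlib.sha1(block).digest()).decode())
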
def resolve_dns(self, hostname, date):
    ttl = self.robot.ctx.check_ttl(hostname)
    cache = self.robot.ctx.resolve_dns(hostname)
    if ttl:
        header = warc.WARCHeader({
            "WARC-Type": "response",
            "WARC-Target-URI": "dns:{}".format(hostname),
            "WARC-Date": date.strftime("%Y-%m-%dT%H:%M:%SZ"),
            "Content-Type": "text/dns",
        }, defaults=True)
        body = (
            # RFC 2540 section 2.2 Text Format
            [cache.created_at.strftime("%Y%m%d%H%M%S")]
            + [x.to_text() for x in cache.answers]
        )
        record = warc.WARCRecord(header,
                                 payload=bytes("\r\n".join(body), "ascii"))
        self.warc.write_record(record)
    temp = []
    for answer in cache.answers:
        temp += [x for x in answer.items if x.rdtype == dns.rdatatype.A]
    return str(secrets.choice(temp))

urls = [
    'https://elpais.com/',
    'https://elpais.com/tag/gente/a',
    'https://politica.elpais.com/',
    'https://elpais.com/internacional/'
]

# write all responses into a single gzipped WARC
f = warc.open("test.warc.gz", "w")
for u in urls:
    fp = urllib.request.urlopen(u)
    mybytes = fp.read()
    fp.close()
    header = warc.WARCHeader({"WARC-Type": "response"}, defaults=True)
    header['WARC-Target-URI'] = u
    record = warc.WARCRecord(header, mybytes)
    f.write_record(record)
f.close()

# same responses again, appending one record per open/close cycle
# (the original fragment broke off here; the record creation below is
# assumed to mirror the loop above)
for u in urls:
    f = warc.open("test_trozos.warc.gz", "a")
    fp = urllib.request.urlopen(u)
    mybytes = fp.read()
    fp.close()
    header = warc.WARCHeader({"WARC-Type": "response"}, defaults=True)
    header['WARC-Target-URI'] = u
    record = warc.WARCRecord(header, mybytes)
    f.write_record(record)
    f.close()

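# A quick read-back sanity check (a sketch): iterate the freshly written
# file and print each record's target URI.
f = warc.open("test.warc.gz")
for record in f:
    print(record['WARC-Target-URI'])
f.close()
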
def SnapShot():
    # TODO:
    # figure out length of video and develop native-resolution frame
    # sampling rate based off of this length.
    print(
        "********************* \n\n Getting snapshots. \n\n*********************"
    )
    os.environ["FFREPORT"] = "file=ffmpeg-snapshots.log"
    # snapshot
    # This is the "proper" way to handle complex command lines with lots of args
    # https://stackoverflow.com/questions/8581140/python-subprocess-call-with-arguments-having-multiple-quotations
    ffmpegsnapshotargs = shlex.split(
        "ffmpeg -i samplethis.flv -vf fps=fps=1/15 -f image2 -q:v 1 images%05d.jpg"
    )
    call(ffmpegsnapshotargs)
    print(
        "********************* \n\n Compressing snapshots. \n\n*********************"
    )
    imagelist = glob.glob("*.jpg")
    imageliststring = ' '.join(imagelist)
    # compress all the snapshots
    tarcommand = "tar -czvf snapshots.tar.gz " + imageliststring
    tarargs = shlex.split(tarcommand)
    call(tarargs)
    # delete jpgs
    rmcommand = "rm " + imageliststring
    rmargs = shlex.split(rmcommand)
    call(rmargs)
    os.environ["FFREPORT"] = ""
    # Add ffmpeg log record
    # (warcinfo_record_ID, metadata_record_ID, truncated_record_ID and
    # new_warc_file are module-level globals set by process() below)
    ffmpegsampleheader = warc.WARCHeader({
        "WARC-Type": "resource",
        "WARC-Warcinfo-ID": warcinfo_record_ID,
        "Content-Type": "text/plain",
        "WARC-Concurrent-To": metadata_record_ID
    })
    ffmpegsamplepayload = StringIO(
        open("ffmpeg-snapshots.log").read()).getvalue()
    ffmpegsamplerecord = warc.WARCRecord(headers=ffmpegsampleheader,
                                         payload=ffmpegsamplepayload)
    new_warc_file.write_record(ffmpegsamplerecord)
    # Add the actual snapshot record (read in binary mode: this is a tarball)
    snapshotrecord = warc.WARCRecord(
        headers=warc.WARCHeader({
            "WARC-Type": "conversion",
            "Content-Type": "application/x-gtar",
            "WARC-Refers-To": truncated_record_ID
        }),
        payload=StringIO(open("snapshots.tar.gz", "rb").read()).getvalue())
    new_warc_file.write_record(snapshotrecord)
    # remove snapshots archive and log
    call(shlex.split("rm snapshots.tar.gz ffmpeg-snapshots.log"))

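# A sketch for the TODO above: derive the clip length with ffprobe so the
# snapshot interval can scale with duration instead of the fixed 1/15.
# probe_duration/snapshot_interval and the ~100-frame target are assumptions,
# not part of the original pipeline.
import subprocess

def probe_duration(path="samplethis.flv"):
    out = subprocess.check_output(shlex.split(
        "ffprobe -v error -show_entries format=duration "
        "-of default=noprint_wrappers=1:nokey=1 " + path))
    return float(out.decode("ascii").strip())

def snapshot_interval(path="samplethis.flv", target_frames=100):
    # e.g. a 1500-second video sampled down to ~100 frames -> one every 15s
    return max(1.0, probe_duration(path) / target_frames)
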
def process(self, item):
    # SnapShot() and ShrinkRay() above read these names, so they live at
    # module scope rather than as locals.
    global warcinfo_record_ID, metadata_record_ID, truncated_record_ID
    global new_warc_file

    # assert that this item is flagged for sampling. If not,
    # return immediately. We don't want to butcher uploads that
    # have been determined to be worth saving in their original
    # state.
    #
    # Presumably, the tracker is tagging these items as something
    # appropriate. Alternately, one could create a "Phase 3" grab
    # and know for a fact that we are only receiving videos that
    # should be sampled. In which case, one may skip the item_type
    # check and proceed directly to sampling.
    item_name = item['item_name']
    item_type, item_value = item_name.split(':', 1)
    item['item_type'] = item_type
    item['item_value'] = item_value
    assert item_type in ('video-bulk', 'url-bulk')

    # Item type is not marked as "video-bulk" by the tracker.
    # Carry on. Nothing to do here.
    if item_type != 'video-bulk':
        return

    # ok. This is an item that needs to be sampled.

    # remember where we started from so we can get back there and
    # not mess up the expectations for the rest of the stages in the
    # pipeline
    original_path = os.getcwd()

    # get to item_dir ; begin work
    os.chdir(item['item_dir'])

    # we will need some data from the warcfile
    warcinfo_record_ID = ""
    metadata_record_ID = ""
    truncated_record_ID = ""

    # set up old and new warc files for reading and writing, respectively.
    # If a file ends in *.gz for writing, the warc library handles gz
    # compression transparently.
    old_warc_file = warc.open("%(warc_file_base)s.warc.gz" % item)
    new_warc_file = warc.open(
        "%(warc_file_base)s-POSTPROCESSED.warc.gz" % item, "w")

    # ------------------------ Start of main for loop -------------------#
    # and here... we... go
    for record in old_warc_file:
        # Firstly, we detect whether the record we're iterating over holds
        # data we'll need later. If so, behave appropriately. After the
        # if-elif-elif dance, we proceed to copy each record into a new
        # record in the %(warc_file_base)s-POSTPROCESSED.warc.gz file,
        # modifying as necessary (truncating long records, etc.)

        # ------------------------ Check for data ----------------------#
        # Grab the lengthy payload (the flv file); if the content-length is
        # longer than ~5MiB, and the record is of the "response" type, then
        # this record *probably* has the flv file.
        if (long(record['Content-Length']) >= 5000000
                and record['WARC-Type'] == "response"):
            # need the record id of the original flv record. Will reference
            # it in the truncated record.
            truncated_record_ID = record['warc-record-id']

            # add "WARC-Truncated" to this record, indicating that it has
            # been truncated due to length.
            record['warc-truncated'] = "length"

            # extract the payload
            tempfile = open("intermediate.int", 'wb')
            for line in record.payload:
                tempfile.write(line)
            tempfile.close()

            # put the payload back; iterating through record.payload
            # invokes a generator on the payload that seems to
            # "eat up" the payload in the original file. I say so because
            # attempting to, say, write the payload out twice (to TWO files)
            # will fail, as will any attempt to read out the payload again
            # without first "putting it back." (I'd love an explanation for
            # just what's going on here; but for now, this hack works)
            # (for the record with the long content-length, we end up reading
            # the payload twice; once here, to get it to a separate file, and
            # once again, in COPY PAYLOAD, to write out a truncated version
            # to the new warc file)
            stream = StringIO(open("intermediate.int", 'rb').read())
            stream.seek(0, os.SEEK_END)
            streamlength = stream.tell()
            stream.seek(0)
            record.payload = warc.utils.FilePart(fileobj=stream,
                                                 length=streamlength)
            # can't close the stream yet for some reason. This might
            # introduce leaks of some sort, so keep an eye on it.
            # The relevant error: "IO Operation on a closed file."
            # I suspect this operation occurs somewhere in the warc library,
            # and I'm hoping that the stream object just falls out of scope
            # at some point other than when the entire pipeline shuts down.
            # stream.close()

        # Adjust the warcinfo record to note that we also utilized ffmpeg
        elif record['WARC-Type'] == "warcinfo":
            # grab the record-id for later use in resource records
            warcinfo_record_ID = record['warc-record-id']

            # gotta add another "software" key to the content-block of the
            # warcinfo record that indicates the use of ffmpeg.
            warcinfo_stream = StringIO()
            for line in record.payload:
                warcinfo_stream.write(line)
            # trailing \r\n\r\n is already present in the payload; just seek
            # back two bytes (yes, the second \r\n will get clobbered;
            # potential unicode byte-length issues here) and then tack on the
            # additional lines you need to like so:
            warcinfo_stream.seek(-2, os.SEEK_END)
            warcinfo_stream.write("software: ffmpeg/2.3.1\r\n\r\n")
            warcinfo_stream.seek(0, os.SEEK_END)
            warcinfo_stream_len = warcinfo_stream.tell()
            warcinfo_stream.seek(0)
            record.payload = warc.utils.FilePart(
                fileobj=warcinfo_stream, length=warcinfo_stream_len)

        # Get the metadata record's warc-record-id for later resource
        # records.
        elif record['WARC-Type'] == "metadata":
            metadata_record_ID = record['warc-record-id']

        # End of conditionals. Proceed to write the new record to the
        # post-processed warcfile.

        # ------------------------ Copy Record --------------------------#
        # COPY HEADER
        # Should we add defaults=False ? It seems that some additional
        # headers are added in WARCHeader as well as WARCRecord. However,
        # they don't seem harmful: digests and timestamps.
        new_header = warc.WARCHeader(record.header)

        # COPY PAYLOAD
        # if the current record gets truncated, then set the content-length
        # to the new, truncated length as per spec.
        truncated_flag = None

        # SHORT record payloads
        if long(record['content-length']) < 500000:
            new_payload = StringIO()
            for line in record.payload:
                new_payload.write(line)
            # if we don't seek back to 0, new_payload.read() is empty
            new_payload.seek(0)
        # LONG record payloads (the one that probably has video data)
        else:
            new_payload = StringIO()
            # grab the first 25 chunks, then stop: that's the truncation
            decrement = 25
            for line in record.payload:
                new_payload.write(line)
                decrement -= 1
                if decrement == 0:
                    break
            # be kind: rewind
            new_payload.seek(0)
            truncated_flag = True

        # CREATE RECORD FROM HEADER AND PAYLOAD
        new_rec = warc.WARCRecord(payload=new_payload.read(),
                                  headers=new_header,
                                  defaults=False)

        # if this record happened to be one that got truncated, then we
        # need to adjust its content-length header.
        if truncated_flag:
            # From page 9 of the ISO WARC Standard:
            #
            # "The WARC-Truncated field may be used on any WARC record. The
            # WARC field Content-Length shall still report the actual
            # truncated size of the record block."
            #
            # Get the length of the truncated content-block and set the
            # Content-Length header appropriately
            new_payload.seek(0, os.SEEK_END)
            thelength = new_payload.tell()
            new_rec['content-length'] = str(thelength)
            new_payload.seek(0)

        # WRITE THE NEW RECORD OUT TO THE NEW WARCFILE
        # (the warc library handles the gz-compression and putting each
        # record in a separate gz "member" transparently; no need to muck
        # with the gzip library ourselves)
        new_warc_file.write_record(new_rec)
    # --------------------- END OF MAIN FOR LOOP ------------------------#

    # at this point, we have a new warcfile with copied and truncated
    # records; now, we need to sample the content and add these "conversion"
    # records to the warc file.

    # Should probably delete the old warc at this point, since the new
    # warcfile has all of the old records, and we've already got another copy
    # of the main payload. If we proceed to write out the full new file with
    # the shrunken payload before deleting the old warc, we'll basically be
    # using nearly 3x the interim diskspace rather than 2x. (Don't get me
    # wrong, I'd love to have more of a generator-like setup that negates the
    # need to use twice the disk space, but it's beyond the scope of my
    # abilities at the moment and I don't think I'd be able to get up to
    # speed before the deadline for this project drops (August 27 2014).
    # Update: LOL Twitch is already deleting things on August 26; oh well, I
    # suppose this code could come in handy if the IA suddenly needs to
    # compress lots of material)

    # Now, we need to convert the flv, and add conversion records.
    # Our "intermediate.int" is not quite an flv yet; the payload still
    # includes the HTTP response headers. We need to find the first CRLFCRLF
    # and chop off everything up to and including it, leaving nothing but the
    # flv file for ffmpeg to work with.
    thefile = open("intermediate.int", 'rb').read()  # NOT A FILE; just a "str"
    # split on the *first* CRLFCRLF only; the pattern can recur in binary data
    theflv = thefile.split('\r\n\r\n', 1)[1]
    writetheflv = open("samplethis.flv", "wb")
    writetheflv.write(theflv)
    writetheflv.close()

    # Get snapshots
    SnapShot()

    # Get the shrunken video
    ShrinkRay()

    # Clean up
    print(
        "********************* \n\n Removing temporary files; cleaning up \n\n*********************"
    )
    # remove the intermediates: "intermediate.int" and "samplethis.flv"
    rmargs = shlex.split("rm intermediate.int samplethis.flv")
    call(rmargs)

    # And we're done!
    new_warc_file.close()
    os.chdir(original_path)

wait=wait, iterable=download_links, threads=threads)

hostname = subprocess.check_output("hostname -f",
                                   shell=True).decode('utf-8').strip()
user = subprocess.check_output("echo $USER", shell=True).decode('utf-8').strip()
# WARC fields are CRLF-separated; the original relied on the trailing
# newlines from check_output, which produced bare LFs between fields
body = ("robots: classic\r\nhostname: " + hostname +
        "\r\nsoftware: page_downloader.py\r\nisPartOf: Cs_media\r\n")
body += ("operator: " + user +
         "\r\ndescription: Downloading pages\r\n"
         "publisher: KNOT (https://knot.fit.vutbr.cz/)\r\n")
body += ("format: WARC File Format 1.0\r\n"
         "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n")
warc_header = warc.WARCHeader(
    {
        "WARC-Type": "warcinfo",
        "WARC-Filename": settings['output'][0]
    },
    defaults=True)
warc_record = warc.WARCRecord(warc_header, body.encode())
warc_record.write_to(out)

for page in generator:
    warc_header = warc.WARCHeader(
        {
            "WARC-Type": "response",
            "WARC-Target-URI": page['url']
        },
        defaults=True)
    response = page['response']
    if not response.endswith('\r\n\r\n'):
        response += '\r\n\r\n'
    # assumed continuation (the fragment broke off here), mirroring the
    # warcinfo write above
    warc_record = warc.WARCRecord(warc_header, response.encode())
    warc_record.write_to(out)
