def getHeaders(warcf, warcr):
    """Read a WARC record's payload and parse its leading HTTP header block.

    The payload is pulled chunk by chunk through ``WBloc`` (asking for
    ``BLOCKSIZE`` bytes at a time). Everything before the first blank line
    (CRLF CRLF) is treated as the header block; the body is discarded.

    Args:
        warcf: WARC file handle, handed to ``WBloc`` unchanged.
        warcr: WARC record handle, handed to ``WBloc`` unchanged.

    Returns:
        dict: header name (lower-cased) -> raw value, i.e. the text after
        the first ``:`` with its leading whitespace left in place. A line
        without ``:`` that starts with ``HTTP`` additionally sets
        ``'protocol'``, ``'status'`` and ``'code'`` from its first, second
        and third space-separated tokens (as many as are present). The dict
        is empty when the record yields no data.
    """
    headers = {}
    # NOTE(review): the False flag presumably asks WBloc to keep the HTTP
    # headers in the returned payload -- confirm against the WBloc API.
    b = WBloc(warcf, warcr, False, BLOCKSIZE)
    try:
        # Collect chunks in a list; repeated string += is quadratic.
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
        dat = ''.join(chunks)

        if dat:
            head = dat.split('\r\n\r\n', 1)[0]
            for line in head.split('\r\n'):
                line = line.strip()
                if not line:
                    # A blank line ends the header block.
                    break
                if ':' in line:
                    # Split on the first colon only, so values that contain
                    # colons themselves (dates, URLs) are kept intact.
                    name, value = line.split(':', 1)
                    headers[name.lower()] = value
                elif line.startswith('HTTP'):
                    # zip() stops at the shorter sequence, so a short status
                    # line fills only the keys it has tokens for.
                    tokens = line.split(' ')
                    for key, token in zip(('protocol', 'status', 'code'),
                                          tokens):
                        headers[key] = token
    finally:
        # Release the block reader even if reading or parsing failed.
        b.destroy()
    return headers
def getHeaders(warcf, warcr):
    """Read a WARC record's payload and parse its leading HTTP header block.

    The payload is pulled chunk by chunk through ``WBloc`` (asking for
    ``BLOCKSIZE`` bytes at a time). Everything before the first blank line
    (CRLF CRLF) is treated as the header block; the body is discarded.

    Args:
        warcf: WARC file handle, handed to ``WBloc`` unchanged.
        warcr: WARC record handle, handed to ``WBloc`` unchanged.

    Returns:
        dict: header name (lower-cased) -> raw value, i.e. the text after
        the first ``:`` with its leading whitespace left in place. A line
        without ``:`` that starts with ``HTTP`` additionally sets
        ``'protocol'``, ``'status'`` and ``'code'`` from its first, second
        and third space-separated tokens (as many as are present). The dict
        is empty when the record yields no data.
    """
    headers = {}
    # NOTE(review): the False flag presumably asks WBloc to keep the HTTP
    # headers in the returned payload -- confirm against the WBloc API.
    b = WBloc(warcf, warcr, False, BLOCKSIZE)
    try:
        # Collect chunks in a list; repeated string += is quadratic.
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
        dat = ''.join(chunks)

        if dat:
            head = dat.split('\r\n\r\n', 1)[0]
            for line in head.split('\r\n'):
                line = line.strip()
                if not line:
                    # A blank line ends the header block.
                    break
                if ':' in line:
                    # Split on the first colon only, so values that contain
                    # colons themselves (dates, URLs) are kept intact.
                    name, value = line.split(':', 1)
                    headers[name.lower()] = value
                elif line.startswith('HTTP'):
                    # zip() stops at the shorter sequence, so a short status
                    # line fills only the keys it has tokens for.
                    tokens = line.split(' ')
                    for key, token in zip(('protocol', 'status', 'code'),
                                          tokens):
                        headers[key] = token
    finally:
        # Release the block reader even if reading or parsing failed.
        b.destroy()
    return headers
def indexWarc(warcFileName):
    """Walk every record of a WARC file and post its OPDS catalogs to Solr.

    For each record whose content type is exactly ``application/atom+xml``
    the payload is converted with ``OpdsToCatalog`` / ``CatalogToSolr``,
    written to ``solr_import.xml`` inside a fresh temporary directory and
    submitted with ``/solr/example/exampledocs/post.sh``. Records of any
    other content type are read and skipped.

    Processing stops at the first record that ``nextRecord()`` returns as
    ``None`` ("bad record.. bailing!").

    Args:
        warcFileName: path of the WARC file, handed to ``WFile``.

    Returns:
        None.

    Raises:
        AssertionError: if the WARC file handle is falsy or ``post.sh``
            exits with a non-zero status.
    """
    tempdir = tempfile.mkdtemp(prefix='opds-crawler-')
    print('created tempdir ' + tempdir)

    w = WFile(warcFileName, config['max_warc_size'], warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    assert w

    try:
        while w.hasMoreRecords():
            r = w.nextRecord()
            if r is None:
                print("bad record.. bailing!")
                # Leave through the common cleanup path so the temporary
                # directory is removed here as well.
                break

            b = None
            try:
                url = r.getTargetUri()
                print('processing ' + url)

                b = WBloc(w, r, False, 64 * 1024)
                # Collect chunks in a list; repeated string += is quadratic.
                chunks = []
                while True:
                    buf = b.getNext()
                    if not buf:
                        break
                    chunks.append(buf)
                content = ''.join(chunks)

                if 'application/atom+xml' == r.getContentType():
                    ingestor = bookserver.catalog.ingest.OpdsToCatalog(
                        content, url)
                    c = ingestor.getCatalog()
                    provider = getProvider(url)
                    renderer = bookserver.catalog.output.CatalogToSolr(
                        c, provider)
                    solr_xml = renderer.toString()

                    solr_import_xml = os.path.join(tempdir, "solr_import.xml")
                    f = open(solr_import_xml, 'w')
                    try:
                        f.write(solr_xml)
                    finally:
                        f.close()

                    command = """/solr/example/exampledocs/post.sh '%s'""" % (
                        solr_import_xml)
                    (ret, out) = commands.getstatusoutput(command)
                    # Show the raw response whenever it does not contain the
                    # status-0 marker.
                    if -1 == out.find('<int name="status">0</int>'):
                        print(out)
                    assert 0 == ret
                    os.unlink(solr_import_xml)
            finally:
                # Per-record handles are released even when ingestion fails.
                if b is not None:
                    b.destroy()
                r.destroy()
    finally:
        w.destroy()

    # Reached on normal completion and on the bad-record bail-out. After an
    # exception the directory (and any unsent import file) is left in place.
    os.rmdir(tempdir)
def indexWarc(warcFileName):
    """Walk every record of a WARC file and post its OPDS catalogs to Solr.

    For each record whose content type is exactly ``application/atom+xml``
    the payload is converted with ``OpdsToCatalog`` / ``CatalogToSolr``,
    written to ``solr_import.xml`` inside a fresh temporary directory and
    submitted with ``/solr/example/exampledocs/post.sh``. Records of any
    other content type are read and skipped.

    Processing stops at the first record that ``nextRecord()`` returns as
    ``None`` ("bad record.. bailing!").

    Args:
        warcFileName: path of the WARC file, handed to ``WFile``.

    Returns:
        None.

    Raises:
        AssertionError: if the WARC file handle is falsy or ``post.sh``
            exits with a non-zero status.
    """
    tempdir = tempfile.mkdtemp(prefix='opds-crawler-')
    print('created tempdir ' + tempdir)

    w = WFile(warcFileName, config['max_warc_size'], warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    assert w

    try:
        while w.hasMoreRecords():
            r = w.nextRecord()
            if r is None:
                print("bad record.. bailing!")
                # Leave through the common cleanup path so the temporary
                # directory is removed here as well.
                break

            b = None
            try:
                url = r.getTargetUri()
                print('processing ' + url)

                b = WBloc(w, r, False, 64 * 1024)
                # Collect chunks in a list; repeated string += is quadratic.
                chunks = []
                while True:
                    buf = b.getNext()
                    if not buf:
                        break
                    chunks.append(buf)
                content = ''.join(chunks)

                if 'application/atom+xml' == r.getContentType():
                    ingestor = bookserver.catalog.ingest.OpdsToCatalog(
                        content, url)
                    c = ingestor.getCatalog()
                    provider = getProvider(url)
                    renderer = bookserver.catalog.output.CatalogToSolr(
                        c, provider)
                    solr_xml = renderer.toString()

                    solr_import_xml = os.path.join(tempdir, "solr_import.xml")
                    f = open(solr_import_xml, 'w')
                    try:
                        f.write(solr_xml)
                    finally:
                        f.close()

                    command = """/solr/example/exampledocs/post.sh '%s'""" % (
                        solr_import_xml)
                    (ret, out) = commands.getstatusoutput(command)
                    # Show the raw response whenever it does not contain the
                    # status-0 marker.
                    if -1 == out.find('<int name="status">0</int>'):
                        print(out)
                    assert 0 == ret
                    os.unlink(solr_import_xml)
            finally:
                # Per-record handles are released even when ingestion fails.
                if b is not None:
                    b.destroy()
                r.destroy()
    finally:
        w.destroy()

    # Reached on normal completion and on the bad-record bail-out. After an
    # exception the directory (and any unsent import file) is left in place.
    os.rmdir(tempdir)
def getRecord(warcname, offset, tempdir='.'):
    """Fetch the record at ``offset`` in a WARC file as (headers, content).

    Opens the file with ``WFile``, seeks to ``int(offset)``, reads the next
    record's payload through ``WBloc`` and splits it at the first blank line
    (CRLF CRLF) into an HTTP header block and a body.

    Args:
        warcname: path of the WARC file, handed to ``WFile``.
        offset: record offset; converted with ``int()`` before seeking.
        tempdir: working directory handed to ``WFile`` (default ``'.'``).

    Returns:
        tuple: ``(headers, content)``. ``headers`` maps each header name
        (case preserved) to its raw value -- the text after the first ``:``,
        leading whitespace included. A line without ``:`` that starts with
        ``HTTP`` additionally sets ``'protocol'``, ``'status'`` and
        ``'code'`` from its first three space-separated tokens. ``content``
        is everything after the first blank line. When the record yields no
        data the result is ``({}, '')``.
    """
    headers = {}
    # Pre-bound so an empty payload returns '' instead of hitting an
    # unbound local at the return statement.
    content = ''

    w = WFile(warcname, CONSTANT, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    rec = None
    b = None
    try:
        w.seek(int(offset))
        rec = w.nextRecord()
        b = WBloc(w, rec, False, BLOCKSIZE)

        # Collect chunks in a list; repeated string += is quadratic.
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
        dat = ''.join(chunks)

        if dat:
            # partition() keeps every later blank line inside the body.
            head, _sep, content = dat.partition('\r\n\r\n')
            for line in head.split('\r\n'):
                line = line.strip()
                if not line:
                    # A blank line ends the header block.
                    break
                if ':' in line:
                    # Split on the first colon only, so values that contain
                    # colons themselves (dates, URLs) are kept intact.
                    name, value = line.split(':', 1)
                    headers[name] = value
                elif line.startswith('HTTP'):
                    # zip() stops at the shorter sequence, so a short status
                    # line fills only the keys it has tokens for.
                    tokens = line.split(' ')
                    for key, token in zip(('protocol', 'status', 'code'),
                                          tokens):
                        headers[key] = token
    finally:
        # Release handles in reverse order of acquisition, also on errors.
        if b is not None:
            b.destroy()
        if rec is not None:
            rec.destroy()
        w.destroy()

    return (headers, content)
def getRecord(warcname, offset, tempdir='.'):
    """Fetch the record at ``offset`` in a WARC file as (headers, content).

    Opens the file with ``WFile``, seeks to ``int(offset)``, reads the next
    record's payload through ``WBloc`` and splits it at the first blank line
    (CRLF CRLF) into an HTTP header block and a body.

    Args:
        warcname: path of the WARC file, handed to ``WFile``.
        offset: record offset; converted with ``int()`` before seeking.
        tempdir: working directory handed to ``WFile`` (default ``'.'``).

    Returns:
        tuple: ``(headers, content)``. ``headers`` maps each header name
        (case preserved) to its raw value -- the text after the first ``:``,
        leading whitespace included. A line without ``:`` that starts with
        ``HTTP`` additionally sets ``'protocol'``, ``'status'`` and
        ``'code'`` from its first three space-separated tokens. ``content``
        is everything after the first blank line. When the record yields no
        data the result is ``({}, '')``.
    """
    headers = {}
    # Pre-bound so an empty payload returns '' instead of hitting an
    # unbound local at the return statement.
    content = ''

    w = WFile(warcname, CONSTANT, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
    rec = None
    b = None
    try:
        w.seek(int(offset))
        rec = w.nextRecord()
        b = WBloc(w, rec, False, BLOCKSIZE)

        # Collect chunks in a list; repeated string += is quadratic.
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
        dat = ''.join(chunks)

        if dat:
            # partition() keeps every later blank line inside the body.
            head, _sep, content = dat.partition('\r\n\r\n')
            for line in head.split('\r\n'):
                line = line.strip()
                if not line:
                    # A blank line ends the header block.
                    break
                if ':' in line:
                    # Split on the first colon only, so values that contain
                    # colons themselves (dates, URLs) are kept intact.
                    name, value = line.split(':', 1)
                    headers[name] = value
                elif line.startswith('HTTP'):
                    # zip() stops at the shorter sequence, so a short status
                    # line fills only the keys it has tokens for.
                    tokens = line.split(' ')
                    for key, token in zip(('protocol', 'status', 'code'),
                                          tokens):
                        headers[key] = token
    finally:
        # Release handles in reverse order of acquisition, also on errors.
        if b is not None:
            b.destroy()
        if rec is not None:
            rec.destroy()
        w.destroy()

    return (headers, content)
def main():
    """Command-line entry point: write one WARC record's payload to stdout.

    Options:
        -f / --file     WARC file name (required)
        -o / --offset   integer record offset (required)
        -e / --headers  stores False in ``options.headers`` (default True);
                        that value is passed as the third ``WBloc`` argument
        -t / --tempdir  temporary working directory (default ".")

    Exits with status 2 (via ``parser.error``) on invalid arguments, status 1
    when the WARC file handle is falsy, and status 0 when there is no record
    to read at the requested offset.
    """
    usage = ("./app/python/wgetbloc.py <-f file.warc> <-o offset> [-e] [-t <working_dir>]\n "
             "\t-f : valid WARC file name\n"
             "\t-o : record offset\n"
             "\t[-e] : print HTTP response headers (default 'no')\n"
             "\t[-t] : temporary working directory (default './')\n"
             "./app/python/wgetbloc.py -f foo.warc.gz -o 7")

    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-o", "--offset", dest="offset",
                      help="record offset", type="int")
    parser.add_option("-e", "--headers", action="store_false",
                      default=True, dest="headers")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")

    (options, args) = parser.parse_args()

    if args:
        parser.error(" Incorrect arguments")
    if not options.filename:
        parser.error(" You must give WARC file name")
    # Offset 0 is valid, so test for None rather than truthiness.
    if options.offset is None:
        parser.error(" You must provide a valid record offset")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)
    if not w:
        print("WARC file not found ")
        # Stop here; there is no usable handle to seek on.
        sys.exit(1)

    try:
        # go to the specified offset
        w.seek(options.offset)

        if not w.hasMoreRecords():
            print("End of file reached, or no record at this offset %s"
                  % (options.offset,))
            sys.exit(0)

        r = w.nextRecord()
        try:
            # choose your buffer size (ex. 64K = 64 * 1024) to read the
            # payload (with the HTTP headers or not, use the boolean flag)
            # chunk by chunk
            b = WBloc(w, r, options.headers, 64 * 1024)
            try:
                while True:
                    buff = b.getNext()
                    if not buff:
                        # no more data to read: end of record reached
                        break
                    sys.stdout.write(buff)
            finally:
                b.destroy()
        finally:
            r.destroy()
    finally:
        # Runs on the sys.exit(0) path too, so the file handle never leaks.
        w.destroy()
def main():
    """Command-line entry point: write one WARC record's payload to stdout.

    Options:
        -f / --file     WARC file name (required)
        -o / --offset   integer record offset (required)
        -e / --headers  stores False in ``options.headers`` (default True);
                        that value is passed as the third ``WBloc`` argument
        -t / --tempdir  temporary working directory (default ".")

    Exits with status 2 (via ``parser.error``) on invalid arguments, status 1
    when the WARC file handle is falsy, and status 0 when there is no record
    to read at the requested offset.
    """
    usage = ("./app/python/wgetbloc.py <-f file.warc> <-o offset> [-e] [-t <working_dir>]\n "
             "\t-f : valid WARC file name\n"
             "\t-o : record offset\n"
             "\t[-e] : print HTTP response headers (default 'no')\n"
             "\t[-t] : temporary working directory (default './')\n"
             "./app/python/wgetbloc.py -f foo.warc.gz -o 7")

    parser = OptionParser(usage)
    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")
    parser.add_option("-o", "--offset", dest="offset",
                      help="record offset", type="int")
    parser.add_option("-e", "--headers", action="store_false",
                      default=True, dest="headers")
    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")

    (options, args) = parser.parse_args()

    if args:
        parser.error(" Incorrect arguments")
    if not options.filename:
        parser.error(" You must give WARC file name")
    # Offset 0 is valid, so test for None rather than truthiness.
    if options.offset is None:
        parser.error(" You must provide a valid record offset")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)
    if not w:
        print("WARC file not found ")
        # Stop here; there is no usable handle to seek on.
        sys.exit(1)

    try:
        # go to the specified offset
        w.seek(options.offset)

        if not w.hasMoreRecords():
            print("End of file reached, or no record at this offset %s"
                  % (options.offset,))
            sys.exit(0)

        r = w.nextRecord()
        try:
            # choose your buffer size (ex. 64K = 64 * 1024) to read the
            # payload (with the HTTP headers or not, use the boolean flag)
            # chunk by chunk
            b = WBloc(w, r, options.headers, 64 * 1024)
            try:
                while True:
                    buff = b.getNext()
                    if not buff:
                        # no more data to read: end of record reached
                        break
                    sys.stdout.write(buff)
            finally:
                b.destroy()
        finally:
            r.destroy()
    finally:
        # Runs on the sys.exit(0) path too, so the file handle never leaks.
        w.destroy()