Пример #1
0
def getHeaders(warcf, warcr):
    """Parse the HTTP header block at the start of a WARC record's payload.

    The payload is read chunk by chunk through a ``WBloc`` built over
    ``warcf``/``warcr``; everything before the first blank line (CRLF CRLF)
    is treated as the header block.

    Args:
        warcf: WARC file object, handed to ``WBloc`` unchanged.
        warcr: WARC record object, handed to ``WBloc`` unchanged.

    Returns:
        dict: header names (lower-cased) mapped to the text that follows the
        first ``:`` on the line; whitespace directly after the colon is kept.
        A colon-free line starting with ``HTTP`` fills ``'protocol'``,
        ``'status'`` and ``'code'`` from its first three space-separated
        fields, as many as are present. Empty if the record yields no data.
    """
    headers = {}
    b = WBloc(warcf, warcr, False, BLOCKSIZE)
    try:
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
        dat = ''.join(chunks)

        # Only the part before the first blank line holds headers.
        head = dat.split('\r\n\r\n', 1)[0]
        for line in head.split('\r\n'):
            line = line.strip()
            if not line:
                break
            if ':' in line:
                # Split on the first colon only, so colons inside the value
                # (dates, URLs, host:port) are preserved.
                name, value = line.split(':', 1)
                headers[name.lower()] = value
            elif line.startswith('HTTP'):
                # NOTE(review): for "HTTP/1.1 200 OK" this stores "200" under
                # 'status' and "OK" under 'code'; the key names are kept
                # because callers may rely on them.
                # zip() stops at the shorter sequence, so a truncated status
                # line fills only the fields that exist.
                fields = line.split(' ')
                for key, field in zip(('protocol', 'status', 'code'), fields):
                    headers[key] = field
    finally:
        # Release the block reader even if reading or parsing fails.
        b.destroy()
    return headers
Пример #2
0
def getHeaders(warcf, warcr):
    """Parse the HTTP header block at the start of a WARC record's payload.

    The payload is read chunk by chunk through a ``WBloc`` built over
    ``warcf``/``warcr``; everything before the first blank line (CRLF CRLF)
    is treated as the header block.

    Args:
        warcf: WARC file object, handed to ``WBloc`` unchanged.
        warcr: WARC record object, handed to ``WBloc`` unchanged.

    Returns:
        dict: header names (lower-cased) mapped to the text that follows the
        first ``:`` on the line; whitespace directly after the colon is kept.
        A colon-free line starting with ``HTTP`` fills ``'protocol'``,
        ``'status'`` and ``'code'`` from its first three space-separated
        fields, as many as are present. Empty if the record yields no data.
    """
    headers = {}
    b = WBloc(warcf, warcr, False, BLOCKSIZE)
    try:
        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
        dat = ''.join(chunks)

        # Only the part before the first blank line holds headers.
        head = dat.split('\r\n\r\n', 1)[0]
        for line in head.split('\r\n'):
            line = line.strip()
            if not line:
                break
            if ':' in line:
                # Split on the first colon only, so colons inside the value
                # (dates, URLs, host:port) are preserved.
                name, value = line.split(':', 1)
                headers[name.lower()] = value
            elif line.startswith('HTTP'):
                # NOTE(review): for "HTTP/1.1 200 OK" this stores "200" under
                # 'status' and "OK" under 'code'; the key names are kept
                # because callers may rely on them.
                # zip() stops at the shorter sequence, so a truncated status
                # line fills only the fields that exist.
                fields = line.split(' ')
                for key, field in zip(('protocol', 'status', 'code'), fields):
                    headers[key] = field
    finally:
        # Release the block reader even if reading or parsing fails.
        b.destroy()
    return headers
Пример #3
0
def _postAtomToSolr(content, url, tempdir):
    """Convert one Atom/OPDS document to Solr XML and post it with post.sh.

    Args:
        content: Record payload, passed to ``OpdsToCatalog``.
        url: Target URI of the record; also used to look up the provider.
        tempdir: Existing directory in which ``solr_import.xml`` is written.

    Raises:
        AssertionError: if ``post.sh`` exits with a non-zero status
            (the check is an ``assert``, so it is skipped under ``-O``).
    """
    ingestor = bookserver.catalog.ingest.OpdsToCatalog(content, url)
    catalog = ingestor.getCatalog()
    provider = getProvider(url)
    renderer = bookserver.catalog.output.CatalogToSolr(catalog, provider)
    solr_xml = renderer.toString()

    solr_import_xml = tempdir + "/solr_import.xml"
    try:
        with open(solr_import_xml, 'w') as f:
            f.write(solr_xml)

        command = """/solr/example/exampledocs/post.sh '%s'""" % (
            solr_import_xml)

        (ret, out) = commands.getstatusoutput(command)
        # Show the tool output whenever Solr did not report status 0.
        if '<int name="status">0</int>' not in out:
            print(out)
        assert 0 == ret
    finally:
        # Always remove the import file so the temp dir can be deleted.
        if os.path.exists(solr_import_xml):
            os.unlink(solr_import_xml)


def indexWarc(warcFileName):
    """Post every Atom record of a WARC file to Solr.

    A scratch directory is created with ``tempfile.mkdtemp``, the WARC file
    is opened through ``WFile`` and each record's payload is read with a
    ``WBloc``. Records whose content type is ``application/atom+xml`` are
    converted and posted; all other records are read and skipped. Iteration
    stops early (after printing a message) when ``nextRecord()`` returns
    ``None``.

    Args:
        warcFileName: Path of the WARC file, passed to ``WFile``.

    Raises:
        AssertionError: if the WARC file object is falsy or a post fails.
    """
    tempdir = tempfile.mkdtemp(prefix='opds-crawler-')
    print('created tempdir ' + tempdir)

    w = None
    try:
        w = WFile(warcFileName, config['max_warc_size'],
                  warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        assert w

        while w.hasMoreRecords():
            r = w.nextRecord()
            if r is None:
                print("bad record.. bailing!")
                return

            b = None
            try:
                url = r.getTargetUri()
                print('processing ' + url)
                b = WBloc(w, r, False, 64 * 1024)

                chunks = []
                while True:
                    buf = b.getNext()
                    if not buf:
                        break
                    chunks.append(buf)
                content = ''.join(chunks)

                if 'application/atom+xml' == r.getContentType():
                    _postAtomToSolr(content, url, tempdir)
            finally:
                # Release per-record handles even when processing fails.
                if b is not None:
                    b.destroy()
                r.destroy()
    finally:
        # Runs on normal completion, on the bad-record bail-out and on
        # errors, so neither the reader nor the scratch directory leaks.
        if w is not None:
            w.destroy()
        os.rmdir(tempdir)
Пример #4
0
def _postAtomToSolr(content, url, tempdir):
    """Convert one Atom/OPDS document to Solr XML and post it with post.sh.

    Args:
        content: Record payload, passed to ``OpdsToCatalog``.
        url: Target URI of the record; also used to look up the provider.
        tempdir: Existing directory in which ``solr_import.xml`` is written.

    Raises:
        AssertionError: if ``post.sh`` exits with a non-zero status
            (the check is an ``assert``, so it is skipped under ``-O``).
    """
    ingestor = bookserver.catalog.ingest.OpdsToCatalog(content, url)
    catalog = ingestor.getCatalog()
    provider = getProvider(url)
    renderer = bookserver.catalog.output.CatalogToSolr(catalog, provider)
    solr_xml = renderer.toString()

    solr_import_xml = tempdir + "/solr_import.xml"
    try:
        with open(solr_import_xml, 'w') as f:
            f.write(solr_xml)

        command = """/solr/example/exampledocs/post.sh '%s'""" % (
            solr_import_xml)

        (ret, out) = commands.getstatusoutput(command)
        # Show the tool output whenever Solr did not report status 0.
        if '<int name="status">0</int>' not in out:
            print(out)
        assert 0 == ret
    finally:
        # Always remove the import file so the temp dir can be deleted.
        if os.path.exists(solr_import_xml):
            os.unlink(solr_import_xml)


def indexWarc(warcFileName):
    """Post every Atom record of a WARC file to Solr.

    A scratch directory is created with ``tempfile.mkdtemp``, the WARC file
    is opened through ``WFile`` and each record's payload is read with a
    ``WBloc``. Records whose content type is ``application/atom+xml`` are
    converted and posted; all other records are read and skipped. Iteration
    stops early (after printing a message) when ``nextRecord()`` returns
    ``None``.

    Args:
        warcFileName: Path of the WARC file, passed to ``WFile``.

    Raises:
        AssertionError: if the WARC file object is falsy or a post fails.
    """
    tempdir = tempfile.mkdtemp(prefix='opds-crawler-')
    print('created tempdir ' + tempdir)

    w = None
    try:
        w = WFile(warcFileName, config['max_warc_size'],
                  warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        assert w

        while w.hasMoreRecords():
            r = w.nextRecord()
            if r is None:
                print("bad record.. bailing!")
                return

            b = None
            try:
                url = r.getTargetUri()
                print('processing ' + url)
                b = WBloc(w, r, False, 64 * 1024)

                chunks = []
                while True:
                    buf = b.getNext()
                    if not buf:
                        break
                    chunks.append(buf)
                content = ''.join(chunks)

                if 'application/atom+xml' == r.getContentType():
                    _postAtomToSolr(content, url, tempdir)
            finally:
                # Release per-record handles even when processing fails.
                if b is not None:
                    b.destroy()
                r.destroy()
    finally:
        # Runs on normal completion, on the bad-record bail-out and on
        # errors, so neither the reader nor the scratch directory leaks.
        if w is not None:
            w.destroy()
        os.rmdir(tempdir)
Пример #5
0
def getRecord(warcname, offset, tempdir='.'):
    """Read the record at ``offset`` and split it into headers and body.

    The WARC file is opened through ``WFile``, positioned with ``seek`` and
    the next record's payload is read chunk by chunk through a ``WBloc``.
    The payload is split at the first blank line (CRLF CRLF): the part
    before it is parsed as headers, the rest is returned as content.

    Args:
        warcname: Path of the WARC file, passed to ``WFile``.
        offset: Record offset; converted with ``int()`` before seeking.
        tempdir: Working directory passed to ``WFile`` (default ``'.'``).

    Returns:
        tuple: ``(headers, content)``. ``headers`` maps header names (case
        preserved) to the text after the first ``:`` on the line; a
        colon-free line starting with ``HTTP`` fills ``'protocol'``,
        ``'status'`` and ``'code'`` from its first three space-separated
        fields, as many as are present. ``content`` is everything after the
        first blank line, or ``''`` when there is none or the payload is
        empty.
    """
    headers = {}
    # Default for records without payload, so the return is always defined.
    content = ''
    w = rec = b = None
    try:
        w = WFile(warcname, CONSTANT, warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        w.seek(int(offset))
        rec = w.nextRecord()
        b = WBloc(w, rec, False, BLOCKSIZE)

        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
        dat = ''.join(chunks)

        head, _, content = dat.partition('\r\n\r\n')
        for line in head.split('\r\n'):
            line = line.strip()
            if not line:
                break
            if ':' in line:
                # Split on the first colon only, so colons inside the value
                # (dates, URLs, host:port) are preserved.
                name, value = line.split(':', 1)
                headers[name] = value
            elif line.startswith('HTTP'):
                # NOTE(review): for "HTTP/1.1 200 OK" this stores "200" under
                # 'status' and "OK" under 'code'; the key names are kept
                # because callers may rely on them.
                fields = line.split(' ')
                for key, field in zip(('protocol', 'status', 'code'), fields):
                    headers[key] = field
    finally:
        # Release whatever was acquired, in the order block, record, file,
        # even if reading or parsing fails.
        for handle in (b, rec, w):
            if handle is not None:
                handle.destroy()
    return (headers, content)
Пример #6
0
def getRecord(warcname, offset, tempdir='.'):
    """Read the record at ``offset`` and split it into headers and body.

    The WARC file is opened through ``WFile``, positioned with ``seek`` and
    the next record's payload is read chunk by chunk through a ``WBloc``.
    The payload is split at the first blank line (CRLF CRLF): the part
    before it is parsed as headers, the rest is returned as content.

    Args:
        warcname: Path of the WARC file, passed to ``WFile``.
        offset: Record offset; converted with ``int()`` before seeking.
        tempdir: Working directory passed to ``WFile`` (default ``'.'``).

    Returns:
        tuple: ``(headers, content)``. ``headers`` maps header names (case
        preserved) to the text after the first ``:`` on the line; a
        colon-free line starting with ``HTTP`` fills ``'protocol'``,
        ``'status'`` and ``'code'`` from its first three space-separated
        fields, as many as are present. ``content`` is everything after the
        first blank line, or ``''`` when there is none or the payload is
        empty.
    """
    headers = {}
    # Default for records without payload, so the return is always defined.
    content = ''
    w = rec = b = None
    try:
        w = WFile(warcname, CONSTANT, warc.WARC_FILE_READER,
                  warc.WARC_FILE_DETECT_COMPRESSION, tempdir)
        w.seek(int(offset))
        rec = w.nextRecord()
        b = WBloc(w, rec, False, BLOCKSIZE)

        chunks = []
        while True:
            nd = b.getNext()
            if not nd:
                break
            chunks.append(nd)
        dat = ''.join(chunks)

        head, _, content = dat.partition('\r\n\r\n')
        for line in head.split('\r\n'):
            line = line.strip()
            if not line:
                break
            if ':' in line:
                # Split on the first colon only, so colons inside the value
                # (dates, URLs, host:port) are preserved.
                name, value = line.split(':', 1)
                headers[name] = value
            elif line.startswith('HTTP'):
                # NOTE(review): for "HTTP/1.1 200 OK" this stores "200" under
                # 'status' and "OK" under 'code'; the key names are kept
                # because callers may rely on them.
                fields = line.split(' ')
                for key, field in zip(('protocol', 'status', 'code'), fields):
                    headers[key] = field
    finally:
        # Release whatever was acquired, in the order block, record, file,
        # even if reading or parsing fails.
        for handle in (b, rec, w):
            if handle is not None:
                handle.destroy()
    return (headers, content)
Пример #7
0
def main():
    """Command-line entry point: write one WARC record's payload to stdout.

    Parses ``-f`` (WARC file), ``-o`` (record offset), ``-e`` and ``-t``
    (working directory), seeks to the offset and streams the record through
    a ``WBloc`` in 64 KiB chunks.

    Exits through ``parser.error`` (status 2) on invalid arguments, with
    status 1 when the WARC file object is falsy, and with status 0 when no
    record is available at the offset.
    """
    usage = ("./app/python/wgetbloc.py <-f file.warc> <-o offset> [-e] [-t <working_dir>]\n "
             "\t-f    : valid WARC file name\n"
             "\t-o    : record offset\n"
             "\t[-e]  : print HTTP response headers (default 'no')\n"
             "\t[-t]  : temporary working directory (default './')\n"
             "./app/python/wgetbloc.py -f foo.warc.gz -o 7")

    parser = OptionParser(usage)

    parser.add_option("-f", "--file", dest="filename",
                      help="read data from FILENAME")

    parser.add_option("-o", "--offset", dest="offset",
                      help="record offset", type="int")

    # The flag is stored inverted: the default True is handed to WBloc as
    # is, and -e switches it to False.
    parser.add_option("-e", "--headers",
                      action="store_false", default=True, dest="headers",
                      help="print HTTP response headers")

    parser.add_option("-t", "--tempdir", dest="tmpdir",
                      help="Temporary working directory", default=".")

    (options, args) = parser.parse_args()

    if args:
        parser.error(" Incorrect arguments")

    if not options.filename:
        parser.error(" You must give WARC file name")

    if options.offset is None:
        parser.error(" You must provide a valid record offset")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)

    if not w:
        print("WARC file  not found ")
        # Stop here: there is no usable file object to seek on.
        sys.exit(1)

    r = None
    b = None
    try:
        # go to the specified offset
        w.seek(options.offset)
        if not w.hasMoreRecords():
            print("End of file reached, or no record at this offset %s"
                  % options.offset)
            sys.exit(0)
        r = w.nextRecord()

        # choose your buffer size (ex. 64K = 64 * 1024) to read the payload
        # (with the HTTP headers or not, use the boolean flag) chunk by chunk
        b = WBloc(w, r, options.headers, 64 * 1024)
        while True:
            buff = b.getNext()
            if not buff:
                # no more data to read: end of record reached
                break
            # the chunk size is returned by calling "b.getLastChunkSize()"
            sys.stdout.write(buff)
    finally:
        # Release whatever was acquired (block, record, file), also on the
        # early exit above and on errors.
        for handle in (b, r, w):
            if handle is not None:
                handle.destroy()
Пример #8
0
def main():
    """Command-line entry point: write one WARC record's payload to stdout.

    Parses ``-f`` (WARC file), ``-o`` (record offset), ``-e`` and ``-t``
    (working directory), seeks to the offset and streams the record through
    a ``WBloc`` in 64 KiB chunks.

    Exits through ``parser.error`` (status 2) on invalid arguments, with
    status 1 when the WARC file object is falsy, and with status 0 when no
    record is available at the offset.
    """
    usage = ("./app/python/wgetbloc.py <-f file.warc> <-o offset> [-e] [-t <working_dir>]\n "
             "\t-f    : valid WARC file name\n"
             "\t-o    : record offset\n"
             "\t[-e]  : print HTTP response headers (default 'no')\n"
             "\t[-t]  : temporary working directory (default './')\n"
             "./app/python/wgetbloc.py -f foo.warc.gz -o 7")

    parser = OptionParser(usage)

    parser.add_option("-f",
                      "--file",
                      dest="filename",
                      help="read data from FILENAME")

    parser.add_option("-o",
                      "--offset",
                      dest="offset",
                      help="record offset",
                      type="int")

    # The flag is stored inverted: the default True is handed to WBloc as
    # is, and -e switches it to False.
    parser.add_option("-e",
                      "--headers",
                      action="store_false",
                      default=True,
                      dest="headers",
                      help="print HTTP response headers")

    parser.add_option("-t",
                      "--tempdir",
                      dest="tmpdir",
                      help="Temporary working directory",
                      default=".")

    (options, args) = parser.parse_args()

    if args:
        parser.error(" Incorrect arguments")

    if not options.filename:
        parser.error(" You must give WARC file name")

    if options.offset is None:
        parser.error(" You must provide a valid record offset")

    w = WFile(options.filename, 600 * 1024 * 1024, warc.WARC_FILE_READER,
              warc.WARC_FILE_DETECT_COMPRESSION, options.tmpdir)

    if not w:
        print("WARC file  not found ")
        # Stop here: there is no usable file object to seek on.
        sys.exit(1)

    r = None
    b = None
    try:
        # go to the specified offset
        w.seek(options.offset)
        if not w.hasMoreRecords():
            print("End of file reached, or no record at this offset %s"
                  % options.offset)
            sys.exit(0)
        r = w.nextRecord()

        # choose your buffer size (ex. 64K = 64 * 1024) to read the payload
        # (with the HTTP headers or not, use the boolean flag) chunk by chunk
        b = WBloc(w, r, options.headers, 64 * 1024)
        while True:
            buff = b.getNext()
            if not buff:
                # no more data to read: end of record reached
                break
            # the chunk size is returned by calling "b.getLastChunkSize()"
            sys.stdout.write(buff)
    finally:
        # Release whatever was acquired (block, record, file), also on the
        # early exit above and on errors.
        for handle in (b, r, w):
            if handle is not None:
                handle.destroy()