Example #1
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    out = sys.stdout
    if len(input_files) < 1:
        parser.error("no imput warc file(s)")

    total = 0
    #    print '#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length'
    for name in expand_files(input_files):
        fh = WarcRecord.open_archive(name, gzip="auto")

        for (offset, record, errors) in fh.read_records(limit=None):
            if record:
                # one index line per record: file name, byte offset and the main WARC headers
                print name, offset, record.type, record.url, record.id, record.content_type, record.content_length
                total += record.content_length
            elif errors:
                pass
                # ignore
            else:
                pass
                # no errors at tail

        fh.close()
    print total

    return 0
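The snippets reference names that the excerpts themselves never define: a module-level option parser plus WarcRecord and expand_files. A minimal sketch of the assumed setup, guessing an optparse-based parser and the hanzo warctools package as the source of the two helpers (the usage string and entry point are illustrative, not part of the original):

import sys
from optparse import OptionParser

# assumption: WarcRecord and expand_files come from the hanzo warctools package
from hanzo.warctools import WarcRecord, expand_files

# hypothetical module-level parser assumed by main(); the real scripts add their own options
parser = OptionParser(usage="%prog [options] warc warc warc")

if __name__ == "__main__":
    sys.exit(main(sys.argv))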
Example #2
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    try: # python3
        out = sys.stdout.buffer
    except AttributeError: # python2
        out = sys.stdout

    if len(input_files) < 1:
        # no input files given: read the WARC stream from stdin instead
        fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)

        for record in fh:
            process(record, out, options)
    else:
        for name in expand_files(input_files):
            fh = WarcRecord.open_archive(name, gzip="auto")
            for record in fh:
                process(record, out, options)

            fh.close()

    return 0
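The process callback is not part of this excerpt. A minimal, hypothetical stand-in that writes one line per record to out, kept to byte strings since out may be sys.stdout.buffer under Python 3:

def process(record, out, options):
    # hypothetical placeholder: emit "<warc-type> <target-uri>" per record;
    # a real callback would consult options to decide what to print or extract
    out.write((record.type or b'-') + b' ' + (record.url or b'-') + b'\n')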
Example #3
def main(argv):
    (options, input_files) = parser.parse_args(args=argv[1:])

    # prepare regular expressions
    link_ignore_expressions = prepare_link_ignore_re(options.ignore_links)

    print "parsing WARC archives"

    all_urls = []

    for filename in expand_files(input_files):

        print "WARC: "+filename

        link_cache_filename = filename+'.urls'
        if options.persist_links and os.path.exists(link_cache_filename):
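            # reuse the URL list cached by a previous run instead of re-parsing the WARC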
            url_fh = open(link_cache_filename, 'rb')  # binary mode, as pickle expects
            urls = pickle.load(url_fh)
            url_fh.close()
            all_urls += urls
        else:
            urls = []
            fh = WarcRecord.open_archive(filename, gzip="auto")
            for record in fh:

                # record is an ArchiveRecord; keep only response records
                if not record.is_response():
                    continue

                # collect the target URI and content type of every response record
                urls.append({
                    'url': record.url,
                    'content-type': record.content_content_type
                })

            # urls.sort(cmp=url_cmp)
            if options.persist_links:
                url_fh = open(link_cache_filename, 'wb')
                pickle.dump(urls, url_fh)
                url_fh.close()

            fh.close()
            all_urls += urls

    if options.dump_links is not None:
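        # write the filtered URL list to the file named by options.dump_links, one URL per line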

        f = open(options.dump_links, 'w+')
        all_urls.sort()
        for url in all_urls:
            # skip ignorable links
            skip_addition = False
            for expression in link_ignore_expressions:
                if expression.match(url['url']):
                    skip_addition = True
                    break
            if not skip_addition:
                f.write(url['url'])
                f.write('\n')
        f.close()

    if options.web_start is not False:
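        # build a UrlTree from the filtered URLs and hand it to the webserver module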
        urltree = UrlTree()
        for url in all_urls:
            # skip filtered links via regex
            skip_addition = False
            for expression in link_ignore_expressions:
                if expression.match(url['url']):
                    skip_addition = True
                    break
            # skip links filtered by content_type filter
            if options.content_type:
                if not url['content-type'].startswith(options.content_type):
                    skip_addition = True
            if options.content_type_not:
                if url['content-type'].startswith(options.content_type_not):
                    skip_addition = True

            if not skip_addition:
                urltree.add_url(url['url'])
        print "Total urls: "+str(urltree.childcount)
        webserver.run(urltree)
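prepare_link_ignore_re is also outside the excerpt; given how its result is used (iterated and .match()-ed against each URL), a plausible sketch is simply a list of compiled regular expressions:

import re

def prepare_link_ignore_re(patterns):
    # hypothetical helper: compile each ignore pattern once;
    # a missing or empty option yields no filters
    return [re.compile(p) for p in (patterns or [])]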