Example #1
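Fetches each file listed in a bag's fetch entries, skipping files that are already present locally unless force is set, with optional entry filtering via filter_expr and per-file progress reporting through callback.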
def fetch_bag_files(bag,
                    keychain_file=DEFAULT_KEYCHAIN_FILE,
                    config_file=DEFAULT_CONFIG_FILE,
                    force=False,
                    callback=None,
                    filter_expr=None,
                    **kwargs):

    auth = read_keychain(keychain_file)
    config = read_config(config_file)
    cookies = get_request_cookies(config) if kwargs.get("cookie_scan",
                                                        True) else None

    success = True
    current = 0
    total = 0 if not callback else len(set(bag.files_to_be_fetched()))
    start = datetime.datetime.now()
    for entry in map(FetchEntry._make, bag.fetch_entries()):
        filename = urlunquote(entry.filename)
        if filter_expr:
            if not filter_dict(filter_expr, entry._asdict()):
                continue
        output_path = os.path.normpath(os.path.join(bag.path, filename))
        local_size = os.path.getsize(output_path) if os.path.exists(
            output_path) else None
        try:
            remote_size = int(entry.length)
        except ValueError:
            remote_size = None
        missing = True
        if local_size is not None:
            if local_size == remote_size or remote_size is None:
                missing = False

        if not force and not missing:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("Not fetching already present file: %s" %
                             output_path)
        else:
            result_path = fetch_file(entry.url,
                                     output_path,
                                     auth,
                                     size=entry.length,
                                     config=config,
                                     cookies=cookies,
                                     **kwargs)
            if not result_path:
                success = False

        if callback:
            current += 1
            if not callback(current, total):
                logger.warning("Fetch cancelled by user...")
                success = False
                break
    elapsed = datetime.datetime.now() - start
    logger.info("Fetch complete. Elapsed time: %s" % elapsed)
    cleanup_transports()
    return success
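A minimal usage sketch, not from the source: it assumes bag is the bag object this function expects, and borrows the starts-with filter operator from the expression grammar exercised in Example #2 below.

def progress(current, total):
    # Returning False here cancels the fetch (see the callback check above).
    print("fetched %d of %d" % (current, total))
    return True

# Fetch only entries whose filename starts with "data/".
ok = fetch_bag_files(bag,
                     filter_expr="filename^*data/",
                     callback=progress)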
Example #2
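A unit test that doubles as documentation for the filter-expression grammar accepted by filter_dict: == and != for equality, =* and !* for contains / does-not-contain, ^* for starts-with, $* for ends-with, and >, >=, <, <= for numeric comparison; malformed expressions raise ValueError.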
    def test_filter_dict(self):
        logger.info(self.getTestHeader('test filter function'))

        msg = "evaluating filter expression: %s"
        test_url = "http://example.com/files/examples/README.txt"
        test_length = 250624
        test_filename = "data/examples/README.txt"
        test_entry = {"url": test_url,
                      "length": test_length,
                      "filename": test_filename}
        pos_exprs = ["url==%s" % test_url,
                     "url!=http://foo",
                     "url=*/files/",
                     "filename!*/files/",
                     "filename^*data/",
                     "filename$*.txt",
                     "length>250623",
                     "length>=250624",
                     "length<250625",
                     "length<=250624"]
        neg_exprs = ["url!=%s" % test_url,
                     "url==http://foo",
                     "url=*/fils/",
                     "filename!*/examples/",
                     "filename^*dat/",
                     "filename$*.tx",
                     "length>250624",
                     "length>=250625",
                     "length<250624",
                     "length<=250623",
                     "length<=-"]
        bad_exprs = ["url*=http://foo", "url=http://foo"]
        try:
            for expr in pos_exprs:
                result = filter_dict(expr, test_entry)
                self.assertTrue(result, msg % expr)
            for expr in neg_exprs:
                result = filter_dict(expr, test_entry)
                self.assertFalse(result, msg % expr)
            for expr in bad_exprs:
                self.assertRaises(ValueError, filter_dict, expr, test_entry)
        except Exception as e:
            self.fail(get_typed_exception(e))
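The positive and negative expression lists above translate directly into calls like this short sketch:

entry = {"url": "http://example.com/files/examples/README.txt",
         "length": 250624,
         "filename": "data/examples/README.txt"}
assert filter_dict("filename$*.txt", entry)        # "$*": ends-with
assert filter_dict("length>=250624", entry)        # numeric comparison
assert not filter_dict("url==http://foo", entry)   # exact match fails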
Example #3
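Builds a remote file manifest (RFM) from a CSV/TSV or JSON input file: caller-supplied column mappings select the url, length, filename, and checksum fields, and each hex checksum is also stored base64-encoded.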
def create_rfm_from_file(args):
    if not (args.md5_col or args.sha1_col or args.sha256_col
            or args.sha512_col):
        raise ValueError(
            "At least one checksum algorithm column mapping must be specified."
        )

    with open(args.output_file, 'w') as rfm_file, open(args.input_file,
                                                       'r') as input_file:
        rfm = list()
        if args.input_format != 'json':
            dialect = Sniffer().sniff(input_file.read(4096))
            input_file.seek(0)
            rows = DictReader(input_file, dialect=dialect)
        else:
            rows = json.load(input_file)

        for row in rows:
            if not filter_dict(args.filter, row):
                continue
            rfm_entry = dict()
            rfm_entry["url"] = row[args.url_col]
            rfm_entry["length"] = int(row[args.length_col])
            rfm_entry["filename"] = urlsplit(
                row[args.filename_col]).path.lstrip("/")
            if args.md5_col:
                rfm_entry["md5"] = row[args.md5_col]
                rfm_entry["md5_base64"] = encode_hex_to_base64(
                    rfm_entry["md5"])
            if args.sha1_col:
                rfm_entry["sha1"] = row[args.sha1_col]
                rfm_entry["sha1_base64"] = encode_hex_to_base64(
                    rfm_entry["sha1"])
            if args.sha256_col:
                rfm_entry["sha256"] = row[args.sha256_col]
                rfm_entry["sha256_base64"] = encode_hex_to_base64(
                    rfm_entry["sha256"])
            if args.sha512_col:
                rfm_entry["sha512"] = row[args.sha512_col]
                rfm_entry["sha512_base64"] = encode_hex_to_base64(
                    rfm_entry["sha512"])
            rfm.append(rfm_entry)

        entries = deduplicate_rfm_entries(rfm)
        logger.info("Writing %d entries to remote file manifest" %
                    len(entries))
        rfm_file.write(json.dumps(entries, sort_keys=True, indent=2))
        logger.info("Successfully created remote file manifest: %s" %
                    args.output_file)
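A hypothetical invocation sketch: the Namespace attribute names mirror the accesses above, while the file and column names are purely illustrative.

from argparse import Namespace

args = Namespace(input_file="files.csv", input_format="csv",
                 output_file="remote-file-manifest.json",
                 url_col="url", length_col="size", filename_col="path",
                 md5_col="md5", sha1_col=None,
                 sha256_col=None, sha512_col=None,
                 # The unguarded filter_dict(args.filter, row) call above
                 # implies a falsy expression matches every row.
                 filter=None)
create_rfm_from_file(args)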
Example #4
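Walks a directory tree and emits one manifest entry per regular file, computing content hashes with compute_file_hashes and building each entry's URL through url_format.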
def create_rfm_from_filesystem(args):
    with open(args.output_file, 'w') as rfm_file:
        rfm = list()
        if not os.path.isdir(args.input_path):
            raise ValueError(
                "The following path does not exist or is not a directory: [%s]"
                % args.input_path)
        for dirpath, dirnames, filenames in os.walk(args.input_path):
            subdirs_count = len(dirnames)
            if subdirs_count:
                logger.info(
                    "%s subdirectories found in input directory [%s]: %s" %
                    (subdirs_count, args.input_path, dirnames))
            filenames.sort()
            for fn in filenames:
                rfm_entry = dict()
                input_file = os.path.join(dirpath, fn)
                logger.debug("Processing input file %s" % input_file)
                input_rel_path = input_file.replace(args.input_path, '')
                filepath = args.base_payload_path if args.base_payload_path else ""
                filepath = "".join([filepath, input_rel_path])
                rfm_entry["filename"] = filepath.replace("\\", "/").lstrip("/")
                rfm_entry["url"] = url_format(args.url_formatter,
                                              base_url=args.base_url,
                                              filepath=input_rel_path.replace(
                                                  "\\", "/").lstrip("/"),
                                              filename=fn)
                rfm_entry["length"] = os.path.getsize(input_file)
                rfm_entry.update(compute_file_hashes(input_file,
                                                     args.checksum))

                if not filter_dict(args.filter, rfm_entry):
                    continue

                if args.streaming_json:
                    rfm_file.writelines(''.join(
                        [json.dumps(rfm_entry, sort_keys=True), '\n']))
                else:
                    rfm.append(rfm_entry)
        if not args.streaming_json:
            rfm_file.write(json.dumps(rfm, sort_keys=True, indent=2))
        logger.info("Successfully created remote file manifest: %s" %
                    args.output_file)
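Note the two output modes: with streaming_json each entry is written as soon as it is produced, one JSON object per line, which keeps memory usage flat on large trees; the buffered path accumulates every entry and serializes a single indented JSON array at the end.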
Example #5
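A simpler variant of the fetch_bag_files implementation from Example #1: no cookie scanning or filename unquoting, and identifier resolvers come from the configuration rather than being threaded through **kwargs.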
def fetch_bag_files(bag, keychain_file, force=False, callback=None, config=DEFAULT_CONFIG, filter_expr=None):

    success = True
    auth = read_keychain(keychain_file)
    resolvers = config.get(ID_RESOLVER_TAG, DEFAULT_ID_RESOLVERS) if config else DEFAULT_ID_RESOLVERS
    current = 0
    total = 0 if not callback else len(set(bag.files_to_be_fetched()))
    start = datetime.datetime.now()
    for entry in map(FetchEntry._make, bag.fetch_entries()):
        if filter_expr:
            if not filter_dict(filter_expr, entry._asdict()):
                continue
        output_path = os.path.normpath(os.path.join(bag.path, entry.filename))
        local_size = os.path.getsize(output_path) if os.path.exists(output_path) else None
        try:
            remote_size = int(entry.length)
        except ValueError:
            remote_size = None
        missing = True
        if local_size is not None:
            if local_size == remote_size or remote_size is None:
                missing = False

        if not force and not missing:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("Not fetching already present file: %s" % output_path)
        else:
            # Record a failure without clobbering earlier results; a plain
            # assignment here would let a later success mask a prior failure.
            if not fetch_file(entry.url, entry.length, output_path, auth,
                              resolvers=resolvers):
                success = False
        if callback:
            current += 1
            if not callback(current, total):
                logger.warning("Fetch cancelled by user...")
                break
    elapsed = datetime.datetime.now() - start
    logger.info("Fetch complete. Elapsed time: %s" % elapsed)
    cleanup_transports()
    return success
Example #6
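Builds a remote file manifest from a plain list of URLs by issuing an authenticated HEAD request per URL and harvesting the Content-Length, Content-Type, Content-Disposition, and (optionally renamed) checksum headers.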
def create_rfm_from_url_list(args):
    keychain_file = args.keychain_file if args.keychain_file else DEFAULT_KEYCHAIN_FILE
    auth = read_keychain(keychain_file)
    with open(args.output_file, 'w') as rfm_file, open(args.input_file,
                                                       'r') as input_file:
        rfm = list()
        for url in input_file.readlines():
            rfm_entry = dict()
            url = url.strip()
            logger.debug("Processing input URL %s" % url)
            try:
                headers = head_for_headers(url, auth, raise_for_status=True)
            except Exception as e:
                logger.warning("HEAD request failed for URL [%s]: %s" %
                               (url, gte(e)))
                continue
            logger.debug("Result headers: %s" % headers)
            length = headers.get("Content-Length")
            content_type = headers.get("Content-Type")
            content_disposition = headers.get("Content-Disposition")
            md5_header = args.md5_header if args.md5_header else "Content-MD5"
            md5 = headers.get(md5_header)
            md5 = get_checksum_from_string_list("md5", md5)
            if md5 and not args.disable_hash_decode_base64:
                rfm_entry["md5_base64"] = md5
                md5 = decode_base64_to_hex(md5)
                rfm_entry["md5"] = md5
            sha256_header = args.sha256_header if args.sha256_header else "Content-SHA256"
            sha256 = headers.get(sha256_header)
            sha256 = get_checksum_from_string_list("sha256", sha256)
            if sha256 and not args.disable_hash_decode_base64:
                rfm_entry["sha256_base64"] = sha256
                sha256 = decode_base64_to_hex(sha256)
                rfm_entry["sha256"] = sha256

            # if content length or both hash values are missing, there is a problem
            if not length:
                logger.warning("Could not determine Content-Length for %s" %
                               url)
            if not (md5 or sha256):
                logger.warning(
                    "Could not locate an MD5 or SHA256 hash for %s" % url)

            # try to construct the filename from Content-Disposition if available, else fall back to the URL path
            filepath = urlsplit(url).path
            filename = os.path.basename(filepath).split(":")[0] if not content_disposition else \
                parse_content_disposition(content_disposition)
            subdir = args.base_payload_path if args.base_payload_path else ""
            output_path = ''.join(
                [subdir, os.path.dirname(filepath), "/", filename])

            rfm_entry['url'] = url
            rfm_entry['length'] = length
            rfm_entry['filename'] = output_path.lstrip("/")
            if content_type:
                rfm_entry["content_type"] = content_type

            if not filter_dict(args.filter, rfm_entry):
                continue

            if args.streaming_json:
                rfm_file.writelines(''.join(
                    [json.dumps(rfm_entry, sort_keys=True), '\n']))
            else:
                rfm.append(rfm_entry)
        if not args.streaming_json:
            rfm_file.write(
                json.dumps(deduplicate_rfm_entries(rfm),
                           sort_keys=True,
                           indent=2))
        logger.info("Successfully created remote file manifest: %s" %
                    args.output_file)
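For reference, a single entry written by the buffered paths above looks roughly like the following; the url, length, and filename values are illustrative (borrowed from the fixtures in Example #2) and the digests are placeholders:

{
  "filename": "data/examples/README.txt",
  "length": 250624,
  "md5": "<hex digest>",
  "md5_base64": "<base64 digest>",
  "url": "http://example.com/files/examples/README.txt"
}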