def run(options): ig_list = options.get("inspectors") for inspector, url in URLS.items(): if (not ig_list) or (inspector in ig_list): logging.debug("[%s] Checking..." % inspector) result = utils.scraper.urlopen(url) match = PAGE_NOT_FOUND_STRING_RE.search(result) if not match: print("False negative for %s" % inspector) data_dir = utils.data_dir() for inspector in os.listdir(data_dir): if (not ig_list or inspector in ig_list) and inspector in IGS_WITH_BAD_404: inspector_path = os.path.join(data_dir, inspector) if os.path.isdir(inspector_path): for dirpath, dirnames, filenames in os.walk(inspector_path): for filename in filenames: path = os.path.join(dirpath, filename) try: f = open(path, 'r', encoding='utf-8') for line in f: if PAGE_NOT_FOUND_STRING_RE.search(line): print("Soft 404 found: %s" % path) except UnicodeDecodeError: f = open(path, 'rb') for line in f: if PAGE_NOT_FOUND_BYTES_RE.search(line): print("Soft 404 found: %s" % path)
def run(options):
    data_dir = utils.data_dir()
    ig_list = options.get("inspectors")
    report_id_history = {}
    for inspector in os.listdir(data_dir):
        logging.debug("[%s] Checking..." % inspector)
        if not ig_list or inspector in ig_list:
            inspector_path = os.path.join(data_dir, inspector)
            if os.path.isdir(inspector_path):
                for year in os.listdir(inspector_path):
                    year_path = os.path.join(inspector_path, year)
                    if os.path.isdir(year_path):
                        for report in os.listdir(year_path):
                            report_path = os.path.join(year_path, report)
                            if os.path.isdir(report_path):
                                json_path = os.path.join(report_path, "report.json")
                                if os.path.isfile(json_path):
                                    with open(json_path, "r", encoding="utf-8") as json_file:
                                        report_data = json.load(json_file)
                                    report_id = report_data["report_id"]
                                    if report_id in report_id_history:
                                        report_id_history[report_id].append(json_path)
                                        print("Duplicate report_id %s in %s" %
                                              (repr(report_id),
                                               ", ".join(report_id_history[report_id])))
                                    else:
                                        report_id_history[report_id] = [json_path]
        # Report IDs only have to be unique within a single IG unless the
        # "global" option asks for cross-IG uniqueness, so reset between IGs.
        if "global" not in options:
            report_id_history = {}

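# The nested listdir loops above mirror the standard data layout,
# data/<inspector>/<year>/<report>/report.json. For reference, a compact
# equivalent enumeration using the stdlib (a sketch, not the script's own
# helper; report_json_paths is a hypothetical name):
import glob
import os

def report_json_paths(data_dir):
    """Yield every report.json under data/<inspector>/<year>/<report>/."""
    pattern = os.path.join(data_dir, "*", "*", "*", "report.json")
    for json_path in sorted(glob.glob(pattern)):
        yield json_path
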
def run(options): ig_list = options.get("inspectors") for inspector, url in URLS.items(): if (not ig_list) or (inspector in ig_list): logging.debug("[%s] Checking..." % inspector) result = None status_code_rewritten = False while True: try: verify_options = utils.domain_verify_options(url) response = utils.scraper.get(url, verify=verify_options) result = response.text break except scrapelib.HTTPError as e: if e.response.status_code == 404: status_code_rewritten = True if 'location' in e.response.headers: url = e.response.headers['location'] continue result = e.body break if not status_code_rewritten: print( "False negative for %s (handler did not rewrite error code)" % inspector) match = PAGE_NOT_FOUND_STRING_RE.search(result) if not match: print( "False negative for %s (regular expression did not match error " "page contents)" % inspector) data_dir = utils.data_dir() for inspector in os.listdir(data_dir): if (not ig_list or inspector in ig_list) and inspector in IGS_WITH_BAD_404: inspector_path = os.path.join(data_dir, inspector) if os.path.isdir(inspector_path): for dirpath, dirnames, filenames in os.walk(inspector_path): for filename in filenames: path = os.path.join(dirpath, filename) try: f = open(path, 'r', encoding='utf-8') for line in f: if PAGE_NOT_FOUND_STRING_RE.search(line): print("Soft 404 found: %s" % path) except UnicodeDecodeError: f = open(path, 'rb') for line in f: if PAGE_NOT_FOUND_BYTES_RE.search(line): print("Soft 404 found: %s" % path)
def run(options): ig_list = options.get("inspectors") data_dir = utils.data_dir() for inspector in os.listdir(data_dir): if not ig_list or inspector in ig_list: logging.debug("[%s] Checking..." % inspector) inspector_path = os.path.join(data_dir, inspector) if os.path.isdir(inspector_path): for dirpath, dirnames, filenames in os.walk(inspector_path): for filename in filenames: _, extension = os.path.splitext(filename.lower()) if extension == ".pdf": try: original = os.path.join(dirpath, filename) decrypted_file, decrypted_path = tempfile.mkstemp( suffix=".pdf") os.close(decrypted_file) decrypted_file = None logging.debug("Decrypting %s to %s" % (original, decrypted_path)) subprocess.check_call([ "qpdf", "--decrypt", original, decrypted_path ]) try: extract_dir = tempfile.mkdtemp() logging.debug( "Extracting %s to %s" % (decrypted_path, extract_dir)) subprocess.check_call([ "pdftk", decrypted_path, "unpack_files" ], cwd=extract_dir) attachments = os.listdir(extract_dir) if attachments: print( "%s has the following attachments: %s" % (original, ', '.join(attachments))) finally: shutil.rmtree(extract_dir) except subprocess.CalledProcessError as e: print(e) finally: try: if decrypted_file: os.close(decrypted_file) decrypted_file = None finally: os.remove(decrypted_path)
def run(options): ig_list = options.get("inspectors") for inspector, url in URLS.items(): if (not ig_list) or (inspector in ig_list): logging.debug("[%s] Checking..." % inspector) result = None status_code_rewritten = False while True: try: verify_options = utils.domain_verify_options(url) response = utils.scraper.get(url, verify=verify_options) result = response.text break except scrapelib.HTTPError as e: if e.response.status_code == 404: status_code_rewritten = True if 'location' in e.response.headers: url = e.response.headers['location'] continue result = e.body break if not status_code_rewritten: print("False negative for %s (handler did not rewrite error code)" % inspector) match = PAGE_NOT_FOUND_STRING_RE.search(result) if not match: print("False negative for %s (regular expression did not match error " "page contents)" % inspector) data_dir = utils.data_dir() for inspector in os.listdir(data_dir): if (not ig_list or inspector in ig_list) and inspector in IGS_WITH_BAD_404: inspector_path = os.path.join(data_dir, inspector) if os.path.isdir(inspector_path): for dirpath, dirnames, filenames in os.walk(inspector_path): for filename in filenames: path = os.path.join(dirpath, filename) try: f = open(path, 'r', encoding='utf-8') for line in f: if PAGE_NOT_FOUND_STRING_RE.search(line): print("Soft 404 found: %s" % path) except UnicodeDecodeError: f = open(path, 'rb') for line in f: if PAGE_NOT_FOUND_BYTES_RE.search(line): print("Soft 404 found: %s" % path)
def run(options): ig_list = options.get("inspectors") dedup = Deduplicator() data_dir = utils.data_dir() for inspector in os.listdir(data_dir): if not ig_list or inspector in ig_list: logging.debug("[%s] Checking..." % inspector) inspector_path = os.path.join(data_dir, inspector) if os.path.isdir(inspector_path): for dirpath, dirnames, filenames in os.walk(inspector_path): for filename in filenames: result = dedup.add_and_check_file(os.path.join(dirpath, filename)) if result: print("Duplicate files: " + ", ".join(result))
def run(options): ig_list = options.get("inspectors") dedup = Deduplicator() data_dir = utils.data_dir() for inspector in os.listdir(data_dir): if not ig_list or inspector in ig_list: logging.debug("[%s] Checking..." % inspector) inspector_path = os.path.join(data_dir, inspector) if os.path.isdir(inspector_path): for dirpath, dirnames, filenames in os.walk(inspector_path): for filename in filenames: result = dedup.add_and_check_file( os.path.join(dirpath, filename)) if result: print("Duplicate files: " + ", ".join(result))
def run(options): ig_list = options.get("inspectors") data_dir = utils.data_dir() for inspector in os.listdir(data_dir): if not ig_list or inspector in ig_list: logging.debug("[%s] Checking..." % inspector) inspector_path = os.path.join(data_dir, inspector) if os.path.isdir(inspector_path): for dirpath, dirnames, filenames in os.walk(inspector_path): for filename in filenames: _, extension = os.path.splitext(filename.lower()) if extension == ".pdf": try: original = os.path.join(dirpath, filename) decrypted_file, decrypted_path = tempfile.mkstemp(suffix=".pdf") os.close(decrypted_file) decrypted_file = None logging.debug("Decrypting %s to %s" % (original, decrypted_path)) subprocess.check_call(["qpdf", "--decrypt", original, decrypted_path]) try: extract_dir = tempfile.mkdtemp() logging.debug("Extracting %s to %s" % (decrypted_path, extract_dir)) subprocess.check_call(["pdftk", decrypted_path, "unpack_files"], cwd=extract_dir) attachments = os.listdir(extract_dir) if attachments: print("%s has the following attachments: %s" % (original, ', '.join(attachments))) finally: shutil.rmtree(extract_dir) except subprocess.CalledProcessError as e: print(e) finally: try: if decrypted_file: os.close(decrypted_file) decrypted_file = None finally: os.remove(decrypted_path)