def reprocess_errors(db_root, sources):
    """Re-run processing for every stored batch that has recorded errors."""
    for source in sources:
        provider_db = Provider(db_root, source.SOURCE_NAME)
        for date_string in provider_db.get_all_days():
            errors_by_batch = provider_db.get_errors2_per_batch(date_string)
            for (batch_time, errors) in errors_by_batch:
                if errors:
                    print source.SOURCE_NAME, date_string, batch_time, "found {0} errors".format(len(errors))
                    batch_directory = os.path.join(db_root, source.SOURCE_NAME, date_string, batch_time)
                    articles, deleted_articles, errors, raw_data = reprocess_batch_errors(source, date_string, batch_time, errors)
                    save_reprocessed_data(batch_directory, articles, deleted_articles, raw_data)
                    update_errors_file(batch_directory, errors)
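# Usage sketch (illustrative, not part of the original module): reprocess_errors expects
# source *modules* (objects exposing SOURCE_NAME), not plain names. Assuming the
# NAME_TO_SOURCE_MODULE_MAPPING used by list_errors maps provider names to those modules,
# a call might look like:
#
#   source_names = get_all_provider_names(db_root)
#   sources = [NAME_TO_SOURCE_MODULE_MAPPING[name] for name in source_names]
#   reprocess_errors(db_root, sources)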
def list_errors(db_root, outfile, source_list):
    """Print every stored error per source and dump them all to a JSON file."""
    res = dict()
    all_errors = dict()
    if not source_list:
        source_names = get_all_provider_names(db_root)
    else:
        source_names = source_list.split(",")

    for source_name in source_names:
        provider_db = Provider(db_root, source_name)
        error_count = 0
        all_errors[source_name] = list()
        for date_string in provider_db.get_all_days():
            errors_by_batch = provider_db.get_errors2_per_batch(date_string)
            for (batch_time, errors) in errors_by_batch:
                # Each batch stores nested error lists: flatten them, then drop duplicates.
                errors = it.chain(*errors)
                errors = filter_identical_ErrorLogEntries(errors)
                error_count += len(errors)
                if errors:
                    for e in errors:
                        new_item = (u"{0}/{1}".format(date_string, batch_time),
                                    (e.url, e.title, e.stacktrace))
                        print u"+++ [{0}] {1} ({2})".format(new_item[0], new_item[1][1], new_item[1][0])
                        all_errors[source_name].append(new_item)
        # Look up the parser module for this source; raises KeyError on an unknown name.
        source_parser = NAME_TO_SOURCE_MODULE_MAPPING[source_name]
        res[source_name] = error_count

    print "\n" * 4
    for name, error_count in res.items():
        print "{0}: Had {1} errors".format(name, error_count)

    with open(outfile, 'w') as f:
        json.dump(all_errors, f, indent=2)
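# Minimal command-line sketch (an assumption, not part of the original module; the real
# entry point, if any, lives elsewhere). It only shows how the helpers above could be
# wired to argparse; every flag name here is illustrative.
#
#   import argparse
#
#   if __name__ == "__main__":
#       parser = argparse.ArgumentParser(description="List or reprocess stored batch errors")
#       parser.add_argument("db_root", help="root directory of the provider database")
#       parser.add_argument("--outfile", default="errors.json", help="JSON report written by list_errors")
#       parser.add_argument("--sources", default="", help="comma-separated source names (empty = all providers)")
#       args = parser.parse_args()
#       list_errors(args.db_root, args.outfile, args.sources)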