def process_report(self, filename, sanitised_streams, raw_streams):
    """Sanitise one report and append its entries to the shared streams.

    For every ``(sanitised_entry, raw_entry)`` pair yielded by
    ``Report.entries()``:

    * the sanitised entry is written as one JSON line to
      ``sanitised_streams``;
    * the raw entry is written as one JSON line to ``raw_streams``;
    * the sanitised entry is dumped with ``yaml_dump`` into a per-report
      target located at
      ``<dst_public>/reports-sanitised/yaml/<YYYY-MM-DD>/<basename>.gz``.

    Args:
        filename: Path of the report to read (opened through
            ``get_luigi_target``).
        sanitised_streams: Open, writable file-like object that receives
            the sanitised entries.
        raw_streams: Open, writable file-like object that receives the raw
            entries.

    A failure while writing a single entry is logged and processing moves on
    to the next entry. Any other exception (opening the report, building the
    ``Report``, iterating its entries) propagates to the caller, but the
    per-report YAML handle is closed first.
    """
    target = get_luigi_target(filename)

    # The YAML target name always carries a ".gz" suffix; it is appended
    # when the source file name does not already end with it.
    sanitised_yaml_filename = os.path.basename(filename)
    if not sanitised_yaml_filename.endswith(".gz"):
        sanitised_yaml_filename = sanitised_yaml_filename + ".gz"

    sanitised_yaml = get_luigi_target(os.path.join(
        self.dst_public, "reports-sanitised", "yaml",
        self.date.strftime("%Y-%m-%d"), sanitised_yaml_filename
    )).open('w')

    # try/finally guarantees the YAML handle is released even when reading
    # the report fails part-way; previously it was only closed on the
    # success path and leaked otherwise.
    try:
        logger.info("Sanitising %s" % filename)
        with target.open('r') as in_file:
            report = Report(in_file, self.bridge_db, target.path)
            for sanitised_entry, raw_entry in report.entries():
                try:
                    logger.debug("writing sanitised entry to stream")
                    sanitised_streams.write(json_dumps(sanitised_entry))
                    sanitised_streams.write("\n")
                    logger.debug("writing raw entry to stream")
                    raw_streams.write(json_dumps(raw_entry))
                    raw_streams.write("\n")
                    logger.debug("writing sanitised yaml file")
                    yaml_dump(sanitised_entry, sanitised_yaml)
                except Exception:
                    # Best effort: one bad entry must not abort the report.
                    logger.error("error in dumping %s" % filename)
                    logger.error(traceback.format_exc())
    finally:
        sanitised_yaml.close()
def output(self):
    """Return a dict mapping each source report path to its destination.

    Keys are the entries of ``self.report_files``; each value is the target
    built by ``get_luigi_target`` for a file of the same base name inside
    ``self.dst_private``.
    """
    def destination_for(source_path):
        # Same file name as the source, relocated under dst_private.
        private_path = os.path.join(self.dst_private,
                                    os.path.basename(source_path))
        return get_luigi_target(
            private_path,
            ssh_key_file=config.core.ssh_private_key_file,
            no_host_key_check=True)

    return {
        source_path: destination_for(source_path)
        for source_path in self.report_files
    }
def output(self):
    """Return the target for this task's "interesting" JSON file.

    The file lives directly under ``self.dst`` and is named
    ``<software_name>-<test_name>-interesting-<date>.json``.
    """
    name_template = "{software_name}-{test_name}-interesting-{date}.json"
    file_name = name_template.format(
        date=self.date,
        test_name=self.test_name,
        software_name=self.software_name)
    return get_luigi_target(os.path.join(self.dst, file_name))
def output(self):
    """Return the two per-day stream targets produced by this task.

    Returns:
        dict with the keys ``"raw_streams"`` (under ``self.dst_private``)
        and ``"sanitised_streams"`` (under ``self.dst_public``); both
        point at a file named ``<YYYY-MM-DD>.json``.
    """
    # Both targets share the same date-stamped file name.
    day_file = self.date.strftime("%Y-%m-%d.json")

    sanitised_path = os.path.join(
        self.dst_public, "reports-sanitised", "streams", day_file)
    sanitised_target = get_luigi_target(sanitised_path)

    raw_path = os.path.join(
        self.dst_private, "reports-raw", "streams", day_file)
    raw_target = get_luigi_target(raw_path)

    return {
        "raw_streams": raw_target,
        "sanitised_streams": sanitised_target,
    }
def run(self):
    """Move every file in ``self.report_files`` to its output target.

    Each source is copied into the matching target from ``self.output()``
    and then removed, so the operation is a move rather than a copy.
    """
    destinations = self.output()
    for source_path in self.report_files:
        destination = destinations[source_path]
        logger.info("Copying %s to %s" % (source_path, destination.path))

        source = get_luigi_target(
            source_path,
            ssh_key_file=config.core.ssh_private_key_file,
            no_host_key_check=True)

        with source.open('r') as reader:
            # NOTE(review): the writer is not closed if the copy raises;
            # whether that is deliberate (e.g. to avoid finalising a
            # partial copy) depends on the target implementation - confirm.
            writer = destination.open('w')
            shutil.copyfileobj(reader, writer)
            writer.close()

        # Only reached once the copy above completed without raising.
        source.remove()
def run(self):
    """Sanitise every report found for ``self.date``.

    Loads the bridge database from ``config.ooni.bridge_db_path`` into
    ``self.bridge_db``, opens both output streams for writing, and feeds
    each file listed under ``<src>/<YYYY-MM-DD>`` to ``process_report``.
    A report that fails is logged and skipped; the remaining reports are
    still processed.
    """
    bridge_db_target = get_luigi_target(config.ooni.bridge_db_path)
    with bridge_db_target.open('r') as bridge_db_file:
        self.bridge_db = json.load(bridge_db_file)

    targets = self.output()
    raw_streams = targets["raw_streams"].open('w')
    sanitised_streams = targets["sanitised_streams"].open('w')

    reports_path = os.path.join(self.src, self.date.strftime("%Y-%m-%d"))
    logger.debug("listing path %s" % reports_path)

    report_filenames = list_report_files(
        reports_path,
        config.aws.access_key_id,
        config.aws.secret_access_key)
    for report_filename in report_filenames:
        logger.debug("got filename %s" % report_filename)
        try:
            self.process_report(report_filename, sanitised_streams,
                                raw_streams)
        except Exception:
            # Best effort: a broken report must not stop the whole day.
            logger.error("error in processing %s" % report_filename)
            logger.error(traceback.format_exc())

    raw_streams.close()
    sanitised_streams.close()
def input(self):
    """Return the target for ``<src>/<date>.json``."""
    dated_name = "%s.json" % self.date
    return get_luigi_target(os.path.join(self.src, dated_name))