import datetime
import logging
import optparse
import sys

# emr and notify are project-local helpers (assumed layout): emr wraps
# the elastic-mapreduce CLI, notify sends email and HipChat messages.
import emr
import notify

DATE_FORMAT = "%Y-%m-%d"

# Assumed: a module-level logger, configured elsewhere in the real scripts.
g_logger = logging.getLogger(__name__)


def main():
    parser = optparse.OptionParser()
    parser.add_option("-b", "--begindate", help="In format YYYY-MM-DD.")
    parser.add_option("-e", "--enddate", help="In format YYYY-MM-DD.")
    options, dummy = parser.parse_args()

    today = datetime.datetime.combine(datetime.date.today(), datetime.time())
    yesterday = today - datetime.timedelta(days=1)

    if options.begindate and options.enddate:
        start_date = options.begindate
        end_date = options.enddate
    else:
        # Default to the 24-hour window ending at midnight today.
        start_date = yesterday.strftime(DATE_FORMAT)
        end_date = today.strftime(DATE_FORMAT)

    jobflow, status = run_hive_jobs(start_date, end_date)
    print "Jobflow %s ended with status %s." % (jobflow, status)
    if status != "COMPLETED":
        # If cronned, this will get sent as email.
        print >> sys.stderr, emr.list_steps(jobflow)
        sys.exit(1)

    # Jobflow was successful, so transfer the data to the Mongo
    # reporting db used by dashboards.
    run_report_importer("daily_exercise_stats", "daily_exercise_stats")
A second, very similar driver, evidently from a different report script: it additionally threads the earliest date ever populated through to the Hive jobs, and imports two report tables instead of one.

def main():
    parser = optparse.OptionParser()
    parser.add_option("-b", "--begindate", help="In format YYYY-MM-DD.")
    parser.add_option("-e", "--enddate", help="In format YYYY-MM-DD.")
    options, dummy = parser.parse_args()

    today = datetime.datetime.combine(datetime.date.today(), datetime.time())
    yesterday = today - datetime.timedelta(days=1)

    if options.begindate and options.enddate:
        start_date = options.begindate
        end_date = options.enddate
    else:
        start_date = yesterday.strftime("%Y-%m-%d")
        end_date = today.strftime("%Y-%m-%d")
    earliest_date = "2011-01-01"  # farthest back we've ever populated

    jobflow, status = run_hive_jobs(start_date, end_date, earliest_date)
    print "Jobflow %s ended with status %s." % (jobflow, status)
    if status != "COMPLETED":
        # If cronned, this will get sent as email.
        print >> sys.stderr, emr.list_steps(jobflow)
        sys.exit(1)

    # Jobflow was successful, so transfer the data to the Mongo
    # reporting db used by dashboards.
    run_report_importer("user_growth", "user_growth")
    run_report_importer("company_metrics", "company_metrics")
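When the date flags are omitted, both drivers fall back to the 24-hour window ending at midnight today. A minimal, self-contained sketch of that defaulting logic:

import datetime

DATE_FORMAT = "%Y-%m-%d"

# combine() with an empty time() yields midnight today as a full
# datetime, so the subtraction below is plain whole-day arithmetic.
today = datetime.datetime.combine(datetime.date.today(), datetime.time())
yesterday = today - datetime.timedelta(days=1)

# Run on 2012-06-15, this prints: 2012-06-14 2012-06-15
print yesterday.strftime(DATE_FORMAT), today.strftime(DATE_FORMAT)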
def run_hive_jobs(jobname, steps, num_instances):
    """Run hive steps.

    Arguments:
      jobname: Name for the Amazon EMR job.
      steps: A sequence of dictionaries describing the job steps to
        add. Each step may specify the keys "hive_script" and
        "hive_args". If "hive_script" is missing, no job step will be
        added. These steps usually come directly from a configuration
        file.
      num_instances: The number of instances to run this job on.
        Equivalent to the EMR CLI option --num-instances.

    Calls sys.exit() when a job does not complete successfully.
    """
    jobflow = emr.create_hive_cluster(
        jobname, {"num_instances": num_instances})

    for step in steps:
        # It's possible to leave out hive_script and hive_args, for
        # when the step just wants to move data from hive into mongo,
        # and not run any hive script.
        if "hive_script" not in step:
            continue
        emr.add_hive_step(jobflow, {},
                          hive_script=step["hive_script"],
                          # "hive_args" is optional per the docstring,
                          # so default to no script arguments.
                          script_args=step.get("hive_args", {}))

    status = emr.wait_for_completion(jobflow, logger=g_logger)
    listing = emr.list_steps(jobflow)
    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    if any(s in listing for s in failures):
        subject = "Reporting jobflow FAILED: %s" % jobname
        notify.send_email(subject, listing)
        notify.send_hipchat(subject)
    else:
        subject = "Reporting jobflow SUCCEEDED: %s" % jobname
        notify.send_email(subject, listing)

    if status != "COMPLETED":
        g_logger.fatal("Hive jobs failed")
        g_logger.fatal(emr.list_steps(jobflow))
        sys.exit(1)
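For a concrete picture of what run_hive_jobs() consumes, here is a hypothetical steps list of the shape the docstring describes. The S3 path, the substitution-variable names, and the extra "importer" key are illustrative assumptions, not values from the real configuration file:

# Hypothetical configuration; only "hive_script" and "hive_args" are
# meaningful to run_hive_jobs() itself.
DAILY_REPORT_STEPS = [
    {
        # A normal Hive step: run this script with these arguments.
        "hive_script": "s3://example-bucket/hive/daily_exercise_stats.q",
        "hive_args": {"start_dt": "2012-06-14", "end_dt": "2012-06-15"},
    },
    {
        # No "hive_script" key, so run_hive_jobs() skips this entry;
        # it would only drive a later Hive-to-Mongo import.
        "importer": "daily_exercise_stats",
    },
]

run_hive_jobs("daily_exercise_stats", DAILY_REPORT_STEPS, num_instances=4)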
def monitor_jobflow(jobflow_id):
    """Wait for a running jobflow to finish, then report its status."""
    status = emr.wait_for_completion(jobflow_id)

    listing = emr.list_steps(jobflow_id)
    jobname = jobflow_id
    heading = listing.split("\n")[0]
    # There just happens to be a fixed number of characters (85) in the
    # output of the 'elastic-mapreduce --list' command before the
    # jobname.
    if len(heading) > 85:
        jobname += ": " + heading[85:]

    subject = "Jobflow status = %s (%s)" % (status, jobname)
    failures = ["FAILED", "CANCELLED", "TERMINATED"]
    if any(s in listing for s in failures):
        subject = "STEP FAILED: " + subject
        notify.send_hipchat(subject)

    # Until we get more confident, always send email, even on success.
    notify.send_email(subject, listing)
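A hypothetical caller only needs the jobflow id, since monitor_jobflow() pulls everything else from the EMR listing:

# Watch a jobflow started elsewhere, e.g. by one of the drivers above.
# The id format is EMR's own, but this particular id is made up.
monitor_jobflow("j-1K48XXXXXXHCB")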