def main(file_details, server_details, logger_file=None, run_filter=None,
         run_date='', version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
                    All the other config information not in ``server_details``.

    server_details: Named tuple.
                    Config information specifically related to the remote
                    server for FTP uploading.

    logger_file: String.
                    Path to a log file. Defaults to ``None`` and opens a
                    ``PHOX_pipeline.log`` file in the current working
                    directory.

    run_filter: String.
                    Whether to run the ``oneaday_formatter``. Takes True or
                    False (strings) as values.

    run_date: String.
                    Date of the format YYYYMMDD. The pipeline will run using
                    this date. If not specified the pipeline will run with
                    ``current_date`` minus one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        # Command to write output to a file directly from PETR
        # petrarch.run_pipeline(formatted,
        #                       '{}{}.txt'.format(file_details.fullfile_stem,
        #                                         date_string), parsed=True)
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    else:
        print("""Can't run with the options you've specified. You need to fix
              something.""")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        formatted_results = oneaday_filter.main(petr_results)
    else:
        logger.info("Running result_formatter.py")
        print("Running result_formatter.py")
        formatted_results = result_formatter.main(petr_results)

    logger.info("Running postprocess.py")
    print("Running postprocess.py")
    if version:
        postprocess.main(formatted_results, date_string, version,
                         file_details, server_details)
    else:
        print("Please specify a data version number. Program ending.")

    logger.info("Running phox_uploader.py")
    print("Running phox_uploader.py")
    try:
        uploader.main(date_string, server_details, file_details)
    except Exception as e:
        logger.warning("Error on the upload portion. {}".format(e))
        print("""Error on the uploader. This step isn't absolutely necessary.
              Valid events should still be generated.""")

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
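
# --- Illustrative invocation sketch (not part of the pipeline) --------------
# Shows how the main() above might be called for a specific day. The namedtuple
# field names below are placeholders only; in the project the file/server config
# tuples come from its config parser, and only ``fullfile_stem`` is referenced
# directly by the code above.
import collections

_FileDetails = collections.namedtuple('FileDetails', ['fullfile_stem'])
_ServerDetails = collections.namedtuple('ServerDetails',
                                        ['username', 'password',
                                         'server_name', 'server_dir'])


def _example_invocation():
    """Call main() with hypothetical config values for a single day's run."""
    file_details = _FileDetails(fullfile_stem='eventrecords.full.')
    server_details = _ServerDetails(username='user', password='secret',
                                    server_name='ftp.example.org',
                                    server_dir='uploads/')
    main(file_details, server_details,
         logger_file='PHOX_pipeline.log',
         run_filter='True',      # string flag, per the docstring
         run_date='20150301',    # YYYYMMDD; omit to default to "yesterday"
         version='v0.1.0')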
def main(file_details, geo_details, server_details, petrarch_version,
         logger_file=None, run_filter=None, run_date='', version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
                    All the other config information not in ``server_details``.

    geo_details: Named tuple.
                    Settings for geocoding.

    server_details: Named tuple.
                    Config information specifically related to the remote
                    server for FTP uploading.

    petrarch_version: String.
                    Which version of Petrarch to use. Must be '1' or '2'.

    logger_file: String.
                    Path to a log file. Defaults to ``None`` and opens a
                    ``PHOX_pipeline.log`` file in the current working
                    directory.

    run_filter: String.
                    Whether to run the ``oneaday_formatter``. Takes True or
                    False (strings) as values.

    run_date: String.
                    Date of the format YYYYMMDD. The pipeline will run using
                    this date. If not specified the pipeline will run with
                    ``current_date`` minus one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    if petrarch_version == '1':
        from petrarch import petrarch
        logger.info("Using original Petrarch version")
    elif petrarch_version == '2':
        from petrarch2 import petrarch2 as petrarch
        logger.info("Using Petrarch2")
    else:
        logger.error("Invalid Petrarch version. Argument must be '1' or '2'")

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if geo_details.geo_service == "Mordecai":
        dest = "{0}:{1}/places".format(geo_details.mordecai_host,
                                       geo_details.mordecai_port)
        try:
            out = requests.get(dest)
            assert out.status_code == 200
        except (AssertionError, requests.exceptions.ConnectionError):
            print("Mordecai geolocation service not responding. Continuing anyway...")
    elif geo_details.geo_service == "CLIFF":
        print("CLIFF")
    else:
        print("Invalid geo service name. Must be 'CLIFF' or 'Mordecai'. Continuing...")

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        # Command to write output to a file directly from PETR
        # petrarch.run_pipeline(formatted,
        #                       '{}{}.txt'.format(file_details.fullfile_stem,
        #                                         date_string), parsed=True)
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    else:
        print("""Can't run with the options you've specified. You need to fix
              something.""")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        formatted_results = oneaday_filter.main(petr_results)
    else:
        logger.info("Running result_formatter.py")
        print("Running result_formatter.py")
        formatted_results = result_formatter.main(petr_results)

    logger.info("Running postprocess.py")
    print("Running postprocess.py")
    if version:
        postprocess.main(formatted_results, date_string, version,
                         file_details, server_details, geo_details)
    else:
        print("Please specify a data version number. Program ending.")

    logger.info("Running phox_uploader.py")
    print("Running phox_uploader.py")
    try:
        uploader.main(date_string, server_details, file_details)
    except Exception as e:
        logger.warning("Error on the upload portion. {}".format(e))
        print("""Error on the uploader. This step isn't absolutely necessary.
              Valid events should still be generated.""")

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
def main(file_details, server_details, logger_file=None, run_filter=None,
         run_date=None):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
                    All the other config information not in ``server_details``.

    server_details: Named tuple.
                    Config information specifically related to the remote
                    server for FTP uploading.

    logger_file: String.
                    Path to a log file. Defaults to ``None`` and opens a
                    ``PHOX_pipeline.log`` file in the current working
                    directory.

    run_filter: String.
                    Whether to run the ``oneaday_formatter``. Takes True or
                    False (strings) as values.

    run_date: String.
                    Date of the format YYYYMMDD. The pipeline will run using
                    this date. If not specified the pipeline will run with
                    ``current_date`` minus one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        # Command to write output to a file directly from PETR
        # petrarch.run_pipeline(formatted,
        #                       '{}{}.txt'.format(file_details.fullfile_stem,
        #                                         date_string), parsed=True)
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    else:
        print("""Can't run with the options you've specified. You need to fix
              something.""")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        formatted_results = oneaday_filter.main(petr_results)
    else:
        logger.info("Running result_formatter.py")
        print("Running result_formatter.py")
        formatted_results = result_formatter.main(petr_results)

    logger.info("Running postprocess.py")
    print("Running postprocess.py")
    postprocess.main(formatted_results, date_string, file_details)

    logger.info("Running phox_uploader.py")
    print("Running phox_uploader.py")
    uploader.main(date_string, server_details, file_details)

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
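
# --- Hedged sketch: the date handling shared by the versions above ----------
# Illustrative refactor only; it mirrors the run_date/"yesterday" branch and the
# YYYYMMDD formatting repeated in each main() in this file.
import datetime

import dateutil.parser


def resolve_process_date(run_date=''):
    """Return (process_date, 'YYYYMMDD' string) from run_date or UTC yesterday."""
    if run_date:
        process_date = dateutil.parser.parse(run_date)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
    date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                              process_date.month,
                                              process_date.day)
    return process_date, date_string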
phox_utilities.parse_config('PHOX_config.ini')  # initialize the various phox_utilities globals

print '\nPHOX.pipeline run:', datetime.datetime.now()

if len(sys.argv) > 1:
    date_string = sys.argv[1]
    logger.info('Date string: ' + date_string + '\n')
    print 'Date string:', date_string
else:
    logger.info('Error: No date string in PHOX.pipeline')
    sys.exit()

# this is actually generated inside Mongo.formatter.py
# also we could just shift and use the config.ini info to get this
scraperfilename = scraper_connection.main()
subprocess.call("cp " + scraperfilename + " " + phox_utilities.Scraper_Stem +
                date_string + ".txt", shell=True)
logger.info("Scraper file name: " + scraperfilename)
print "Scraper file name:", scraperfilename

logger.info("Running Mongo.formatter.py \n ")
print "Running Mongo.formatter.py"
mongo_formatter.main(date_string)

logger.info("Running TABARI")
print "Running TABARI"
subprocess.call(
    "./TABARI.0.8.4b1 -ad PHOX.pipeline.project -t " +
    phox_utilities.Recordfile_Stem + date_string +
def main(file_details, geo_details, server_details, petrarch_version, run_date,
         mongo_details, logger_file=None, run_filter=None, version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
                    All the other config information not in ``server_details``.

    geo_details: Named tuple.
                    Settings for geocoding.

    server_details: Named tuple.
                    Config information specifically related to the remote
                    server for FTP uploading.

    petrarch_version: String.
                    Which version of Petrarch to use. Must be '1' or '2'.

    logger_file: String.
                    Path to a log file. Defaults to ``None`` and opens a
                    ``PHOX_pipeline.log`` file in the current working
                    directory.

    run_filter: String.
                    Whether to run the ``oneaday_formatter``. Takes True or
                    False (strings) as values.

    run_date: String.
                    Date of the format YYYYMMDD. The pipeline will run using
                    this date. If not specified the pipeline will run with
                    ``current_date`` minus one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    if petrarch_version == '1':
        from petrarch import petrarch
        logger.info("Using original Petrarch version")
    elif petrarch_version == '2':
        from petrarch2 import petrarch2 as petrarch
        logger.info("Using Petrarch2")
    else:
        logger.error("Invalid Petrarch version. Argument must be '1' or '2'")

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if geo_details.geo_service == "Mordecai":
        dest = "{0}:{1}/places".format(geo_details.mordecai_host,
                                       geo_details.mordecai_port)
        try:
            out = requests.get(dest)
            assert out.status_code == 200
        except (AssertionError, requests.exceptions.ConnectionError):
            print("Mordecai geolocation service not responding. Continuing anyway...")
    elif geo_details.geo_service == "CLIFF":
        print("CLIFF")
    else:
        print("Invalid geo service name. Must be 'CLIFF' or 'Mordecai'. Continuing...")

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string

    print('Running PETRARCH in Null Actors Mode.')
    # DGM Run this in actor-gen mode
    ACTOR_OUTPUT_FILE = "events_null_actors_mode_" + run_date + ".txt"
    petrarch.run_pipeline(formatted, out_file=ACTOR_OUTPUT_FILE,
                          config="petr_config.ini", write_output=True,
                          parsed=True)
    print("See events results in: " + ACTOR_OUTPUT_FILE)
    print("See actors list in: nullactors." + ACTOR_OUTPUT_FILE)

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
def main(file_details, geo_details, server_details, petrarch_version, run_date,
         mongo_details, logger_file=None, run_filter=None, version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
                    All the other config information not in ``server_details``.

    geo_details: Named tuple.
                    Settings for geocoding.

    server_details: Named tuple.
                    Config information specifically related to the remote
                    server for FTP uploading.

    petrarch_version: String.
                    Which version of Petrarch to use. Must be '1' or '2'.

    logger_file: String.
                    Path to a log file. Defaults to ``None`` and opens a
                    ``PHOX_pipeline.log`` file in the current working
                    directory.

    run_filter: String.
                    Whether to run the ``oneaday_formatter``. Takes True or
                    False (strings) as values.

    run_date: String.
                    Date of the format YYYYMMDD. The pipeline will run using
                    this date. If not specified the pipeline will run with
                    ``current_date`` minus one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    if petrarch_version == '1':
        from petrarch import petrarch
        logger.info("Using original Petrarch version")
    elif petrarch_version == '2':
        from petrarch2 import petrarch2 as petrarch
        logger.info("Using Petrarch2")
    else:
        logger.error("Invalid Petrarch version. Argument must be '1' or '2'")

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    # Get the stories for the desired date from the DB
    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if geo_details.geo_service == "Mordecai":
        dest = "{0}:{1}/places".format(geo_details.mordecai_host,
                                       geo_details.mordecai_port)
        try:
            out = requests.get(dest)
            assert out.status_code == 200
        except (AssertionError, requests.exceptions.ConnectionError):
            print("Mordecai geolocation service not responding. Continuing anyway...")
    elif geo_details.geo_service == "CLIFF":
        print("CLIFF")
    else:
        print("Invalid geo service name. Must be 'CLIFF' or 'Mordecai'. Continuing...")

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        # Command to write output to a file directly from PETR
        # petrarch.run_pipeline(formatted,
        #                       '{}{}.txt'.format(file_details.fullfile_stem,
        #                                         date_string), parsed=True)
        petr_results = petrarch.run_pipeline(formatted,
                                             config="petr_config.ini",
                                             write_output=False,
                                             parsed=True)
        # DGM Test
        # petrarch.run_pipeline(formatted, out_file="TESTOUT.txt",
        #                       config="petr_config.ini", write_output=True,
        #                       parsed=True)
        # sys.exit()
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted,
                                             config="petr_config.ini",
                                             write_output=False,
                                             parsed=True)
    else:
        print("""Can't run with the options you've specified. You need to fix
              something.""")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        formatted_results = oneaday_filter.main(petr_results)
    else:
        logger.info("Running result_formatter.py")
        print("Running result_formatter.py")
        formatted_results = result_formatter.main(petr_results)

    logger.info("Running postprocess.py")
    print("Running postprocess.py")
    if version:
        postprocess.main(formatted_results, date_string, version,
                         file_details, server_details, geo_details)
    else:
        print("Please specify a data version number. Program ending.")

    # logger.info("Running phox_uploader.py")
    # print("Running phox_uploader.py")
    # try:
    #     uploader.main(date_string, server_details, file_details)
    # except Exception as e:
    #     logger.warning("Error on the upload portion. {}".format(e))
    #     print("""Error on the uploader. This step isn't absolutely necessary.
    #           Valid events should still be generated.""")

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
def main(file_details, server_details, logger_file=None, run_filter=None):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
                    All the other config information not in ``server_details``.

    server_details: Named tuple.
                    Config information specifically related to the remote
                    server for FTP uploading.

    logger_file: String.
                    Path to a log file. Defaults to ``None`` and opens a
                    ``PHOX_pipeline.log`` file in the current working
                    directory.

    run_filter: String.
                    Whether to run the ``oneaday_formatter``. Takes True or
                    False (strings) as values.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if len(sys.argv) > 1:
        date_string = sys.argv[1]
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        petrarch.run_pipeline(formatted,
                              '{}{}.txt'.format(file_details.fullfile_stem,
                                                date_string), parsed=True)
        results = ''
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    else:
        print("Can't run with the options you've specified. You need to fix something.")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        oneaday_formatter.main(petr_results, date_string, server_details,
                               file_details)

    logger.info("Running phox_uploader.py")
    print("Running phox_uploader.py")
    uploader.main(date_string, server_details, file_details)

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())