def __init__(self, port, host, request_handler, parameters):
    # Register configuration settings, request handler, and logger
    self.config_dict = utilities.load_config(CONFIG_FILE)
    self.logger = logging.getLogger()
    utilities.init_logger(self.logger, self.config_dict)
    check_config(self.config_dict, self.logger)
    self.request_handler = request_handler
    self.parameters = parameters
    self.servSock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # setsockopt (not getsockopt) is required to actually set SO_REUSEADDR
    self.servSock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    self.servSock.bind((host, port))
    self.servSock.listen(self.config_dict['listen_connections'])
    self.servSock.setblocking(0)
    if self.config_dict['tcp_nagle']:
        # TCP_NODELAY disables Nagle's algorithm; again, setsockopt, not getsockopt
        self.servSock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

    # Initializing client dicts
    self.connections = {}
    self.responses = {}

    # Creating epoll for future read events; the listening socket is
    # registered edge-triggered
    self.epoll = select.epoll()
    self.epoll.register(self.servSock.fileno(), select.EPOLLIN | select.EPOLLET)
    self.logger.info('[%s:%d] started' % (host, port))
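# A minimal sketch (not from the source) of the accept half of the event loop
# that pairs with the registration above. Because the listening socket is
# registered edge-triggered (EPOLLET), accept() must be called in a loop until
# the non-blocking socket raises, or a readiness edge could be missed.
def _accept_ready_sketch(self):
    for fileno, event in self.epoll.poll(1):
        if fileno == self.servSock.fileno():
            try:
                while True:
                    conn, _addr = self.servSock.accept()
                    conn.setblocking(0)
                    # Watch the new client socket for reads, edge-triggered
                    self.epoll.register(conn.fileno(),
                                        select.EPOLLIN | select.EPOLLET)
                    self.connections[conn.fileno()] = conn
            except socket.error:
                pass  # EAGAIN: the pending-connection queue is drained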
def run_pipeline(data, out_file=None, config=None, write_output=True,
                 parsed=False):
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')
    if config:
        print('Using user-specified config: {}'.format(config))
        logger.info('Using user-specified config: {}'.format(config))
        PETRreader.parse_Config(config)
    else:
        logger.info('Using default config file.')
        logger.info('Config path: {}'.format(
            utilities._get_data('data/config/', 'PETR_config.ini')))
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    read_dictionaries()

    logger.info('Hitting read events...')
    events = PETRreader.read_pipeline_input(data)
    if parsed:
        logger.info('Hitting do_coding')
        updated_events = do_coding(events, None)
    else:
        events = utilities.stanford_parse(events)
        updated_events = do_coding(events, None)

    if not write_output:
        output_events = PETRwriter.pipe_output(updated_events)
        return output_events
    elif write_output and not out_file:
        print('Please specify an output file...')
        logger.warning('Need an output file. ¯\\_(ツ)_/¯')
        sys.exit()
    elif write_output and out_file:
        PETRwriter.write_events(updated_events, out_file)
def run_pipeline(data, out_file=None, config=None, write_output=True,
                 parsed=False):
    # this is called externally
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')
    if config:
        print('Using user-specified config: {}'.format(config))
        logger.info('Using user-specified config: {}'.format(config))
        PETRreader.parse_Config(config)
    else:
        logger.info('Using default config file.')
        logger.info('Config path: {}'.format(
            utilities._get_data('data/config/', 'PETR_config.ini')))
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    read_dictionaries()

    logger.info('Hitting read events...')
    events = PETRreader.read_pipeline_input(data)
    if parsed:
        logger.info('Hitting do_coding')
        updated_events = do_coding(events, None)
    # else:
    #     events = utilities.stanford_parse(events)
    #     updated_events = do_coding(events, None)
    # NOTE: with the else branch commented out, callers must pass parsed=True,
    # otherwise updated_events is never bound below.

    if not write_output:
        output_events = PETRwriter.pipe_output(updated_events)
        return output_events
    elif write_output and not out_file:
        print('Please specify an output file...')
        logger.warning('Need an output file. ¯\\_(ツ)_/¯')
        sys.exit()
    elif write_output and out_file:
        PETRwriter.write_events(updated_events, out_file)
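# A hedged usage sketch (not from the source): calling run_pipeline() as an
# in-process API. With parsed=True the Stanford parse step is skipped, and
# with write_output=False the coded events come back as a return value.
# `parsed_stories` is a placeholder for the dict that
# PETRreader.read_pipeline_input() expects; its exact shape is defined
# elsewhere in the project.
#
# events = run_pipeline(parsed_stories, write_output=False, parsed=True)
# run_pipeline(parsed_stories, out_file='events.txt', write_output=True,
#              parsed=True)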
def process_target(queue, cli_args, multi_log_lock):
    # Log that the child process has started
    write_multiprocess_log(
        multi_log_lock,
        '{}Process {}: {}'.format(u'', os.getpid(), u'started.'))
    # The child process first loads everything it needs to run
    utilities.init_logger()
    logger = logging.getLogger('petr_log')
    PETRglobals.RunTimeString = time.asctime()

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but are
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    print('\n\n')

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    # Create a session for talking to the database
    session = Session()

    while True:
        if queue.qsize() > 0:  # qsize is a method; the call was missing
            # Fetch one task from the queue
            task = queue.get()
            # Log that a task was received
            write_multiprocess_log(
                multi_log_lock,
                '{}Process {} get one task: {}'.format(u'', os.getpid(), task))
            # Run the task
            process_task(task, out, multi_log_lock, session)
        else:
            time.sleep(0.5 * random.random())
            continue
def main():
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log', cli_args.debug)
    logger = logging.getLogger('petr_log')
    PETRglobals.RunTimeString = time.asctime()

    if cli_args.command_name == 'parse' or cli_args.command_name == 'batch':
        if cli_args.config:
            print('Using user-specified config: {}'.format(cli_args.config))
            logger.info(
                'Using user-specified config: {}'.format(cli_args.config))
            PETRglobals.ConfigFileName = cli_args.config
            PETRreader.parse_Config(cli_args.config)
        else:
            logger.info('Using default config file.')
            PETRglobals.ConfigFileName = 'PETR_config.ini'
            PETRreader.parse_Config(utilities._get_data('data/config/',
                                                        'PETR_config.ini'))

        read_dictionaries()
        start_time = time.time()
        print('\n\n')

        paths = PETRglobals.TextFileList
        if cli_args.inputs:
            if os.path.isdir(cli_args.inputs):
                if cli_args.inputs[-1] != '/':
                    paths = glob.glob(cli_args.inputs + '/*.xml')
                else:
                    paths = glob.glob(cli_args.inputs + '*.xml')
            elif os.path.isfile(cli_args.inputs):
                paths = [cli_args.inputs]
            else:
                print('\nFatal runtime error:\n"' + cli_args.inputs +
                      '" could not be located\nPlease enter a valid '
                      'directory or file of source texts.')
                sys.exit()

        out = ""  # PETRglobals.EventFileName
        if cli_args.outputs:
            out = cli_args.outputs

        if cli_args.command_name == 'parse':
            run(paths, out, cli_args.parsed)
        else:
            run(paths, out, True)  # <===

        print("Coding time:", time.time() - start_time)

    print("Finished")
def main():
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')
    PETRglobals.RunTimeString = time.asctime()

    if cli_args.command_name == 'parse' or cli_args.command_name == 'batch':
        if cli_args.config:
            print('Using user-specified config: {}'.format(cli_args.config))
            logger.info(
                'Using user-specified config: {}'.format(cli_args.config))
            PETRreader.parse_Config(cli_args.config)
        else:
            logger.info('Using default config file.')
            PETRreader.parse_Config(utilities._get_data('data/config/',
                                                        'PETR_config.ini'))

        read_dictionaries()
        start_time = time.time()
        print('\n\n')

        paths = PETRglobals.TextFileList
        if cli_args.inputs:
            if os.path.isdir(cli_args.inputs):
                if cli_args.inputs[-1] != '/':
                    paths = glob.glob(cli_args.inputs + '/*.xml')
                else:
                    paths = glob.glob(cli_args.inputs + '*.xml')
            elif os.path.isfile(cli_args.inputs):
                paths = [cli_args.inputs]
            else:
                print('\nFatal runtime error:\n"' + cli_args.inputs +
                      '" could not be located\nPlease enter a valid '
                      'directory or file of source texts.')
                sys.exit()

        out = ""  # PETRglobals.EventFileName
        if cli_args.outputs:
            out = cli_args.outputs

        if cli_args.command_name == 'parse':
            run(paths, out, cli_args.parsed)
        else:
            run(paths, out, True)

        print("Coding time:", time.time() - start_time)

    print("Finished")
def __init__(self, config_folder='data/config/',
             config_file='PETR_config.ini'):
    # cli_args = petrarch2.parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')
    PETRglobals.RunTimeString = time.asctime()
    logger.info('Using Config file: ' + config_file)
    PETRreader.parse_Config(utilities._get_data(config_folder, config_file))
    petrarch2.read_dictionaries()
def __init__(self, petrGlobal={}, config_folder='data/config/',
             config_file='PETR_config.ini'):
    # cli_args = petrarch2.parse_cli_args()
    if not petrGlobal:
        utilities.init_logger('PETRARCH.log', debug=False)
        logger = logging.getLogger('petr_log')
        PETRglobals.RunTimeString = time.asctime()
        logger.info('Using Config file: ' + config_file)
        PETRreader.parse_Config(utilities._get_data(config_folder,
                                                    config_file))
        petrarch_ud.read_dictionaries()
        print("SUCCESSFUL ON LOADING DICTIONARIES")
    else:
        print("LOADING FROM MAP")
        self.load(petrGlobal)
def main(message, logger_file=None, run_date='', version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    message: Dict.
        Story to process. ``entry_id`` and ``cleaned_text`` are read
        directly; the whole dict is also passed to the date formatter.
    logger_file: String.
        Path to a log file. Defaults to ``None`` and opens a
        ``PHOX_pipeline.log`` file in the current working directory.
    run_date: String.
        Date of the format YYYYMMDD. The pipeline will run using this date.
        If not specified the pipeline will run with ``current_date`` minus
        one day.
    version: String.
        Data version number passed through to postprocessing.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    process_date = datetime.datetime.utcnow()
    date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                              process_date.month,
                                              process_date.day)
    logger.info('Date string: {}'.format(date_string))
    print('Date string:', date_string)
    server_details = ''

    logger.info("Extracting date.")
    print("Extracting date.")
    date = formatter.get_date(message, process_date)

    logger.info("Sending to Hypnos.")
    story_id = message['entry_id']
    print(story_id)
    text = message['cleaned_text']
    headers = {'Content-Type': 'application/json'}
    payload = {'text': text, 'id': story_id, 'date': date}
    data = json.dumps(payload)
    hypnos_ip = os.environ['HYPNOS_PORT_5002_TCP_ADDR']
    hypnos_url = 'http://{}:5002/hypnos/extract'.format(hypnos_ip)
    r = requests.get(hypnos_url, data=data, headers=headers)
    print(r.status_code)

    if r.status_code == 200:
        logger.info("Running postprocess.py")
        print("Running postprocess.py")
        hypnos_res = r.json()
        print(hypnos_res)
        events = []
        for k, v in hypnos_res[story_id]['sents'].iteritems():
            if 'events' in v:
                sent = hypnos_res[story_id]['sents'][k]
                for event in v['events']:
                    event_tup = (date, event[0], event[1], event[2])
                    formatted, actors = postprocess.main(
                        event_tup, sent, version, server_details)
                    logger.info(formatted)
                    logger.info(actors)
                    print(formatted, actors)

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
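# A hedged sketch (not from the source) of the smallest message main() above
# can consume: only 'entry_id' and 'cleaned_text' are read directly, and the
# whole dict is handed to formatter.get_date(). All values here are invented.
#
# message = {'entry_id': 'story_123',
#            'cleaned_text': 'Some cleaned story text.'}
# main(message, version='v0.0.0')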
def main(cli_args=None):
    if not cli_args:
        cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')
    PETRglobals.RunTimeString = time.asctime()
    print(cli_args)

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but are
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    start_time = time.time()
    print('\n\n')

    paths = PETRglobals.TextFileList
    if cli_args.inputs:
        if os.path.isdir(cli_args.inputs):
            if cli_args.inputs[-1] != '/':
                paths = glob.glob(cli_args.inputs + '/*.xml')
            else:
                paths = glob.glob(cli_args.inputs + '*.xml')
        elif os.path.isfile(cli_args.inputs):
            paths = [cli_args.inputs]
        else:
            print('\nFatal runtime error:\n"' + cli_args.inputs +
                  '" could not be located\nPlease enter a valid directory '
                  'or file of source texts.')
            sys.exit()

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    if cli_args.command_name == 'parse':
        events = run(paths, out, cli_args.parsed)
    else:
        events = run(paths, out, True)  # <===

    print("Coding time:", time.time() - start_time)
    print("Finished")
    return events
def process_target_bak(q, l, first_task, cli_args, multi_log_lock):
    # The child process first loads everything it needs to run
    utilities.init_logger()
    logger = logging.getLogger('petr_log')
    PETRglobals.RunTimeString = time.asctime()

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but are
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    print('\n\n')

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    # Create a session for talking to the database
    session = Session()

    # The child process completes its first task before entering the loop
    write_multiprocess_log(
        multi_log_lock,
        '{}Process {}: {}'.format(u'', os.getpid(), first_task))
    process_task(first_task, out, multi_log_lock, session)

    while l.acquire():
        # The queue may be non-empty; empty() is unreliable, so use qsize()
        if q.qsize() != 0:
            # Fetch the next task from the queue
            task = q.get()
            # Release the lock once the task has been fetched
            l.release()
            # Run the fetched task
            write_multiprocess_log(
                multi_log_lock,
                '{}Process {}: {}'.format(u'', os.getpid(), task))
            process_task(task, out, multi_log_lock, session)
        else:
            # Queue is empty: release the lock and leave the loop
            l.release()
            break

    write_multiprocess_log(
        multi_log_lock,
        '{}Process {}: {}'.format(u'', os.getpid(), u'exited...'))
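# A minimal wiring sketch (not from the source) for the workers above, using
# only standard-library multiprocessing. `tasks` and `cli_args` are
# placeholders; the worker signature is taken from process_target() as shown.
#
# import multiprocessing
#
# queue = multiprocessing.Queue()
# multi_log_lock = multiprocessing.Lock()
# for task in tasks:
#     queue.put(task)
# workers = [multiprocessing.Process(target=process_target,
#                                    args=(queue, cli_args, multi_log_lock))
#            for _ in range(4)]
# for w in workers:
#     w.daemon = True  # process_target() loops forever; don't block exit
#     w.start()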
def main(file_details, geo_details, server_details, petrarch_version,
         logger_file=None, run_filter=None, run_date='', version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
        All the other config information not in ``server_details``.
    geo_details: Named tuple.
        Settings for geocoding.
    server_details: Named tuple.
        Config information specifically related to the remote server for
        FTP uploading.
    petrarch_version: String.
        Which version of Petrarch to use. Must be '1' or '2'.
    logger_file: String.
        Path to a log file. Defaults to ``None`` and opens a
        ``PHOX_pipeline.log`` file in the current working directory.
    run_filter: String.
        Whether to run the ``oneaday_formatter``. Takes True or False
        (strings) as values.
    run_date: String.
        Date of the format YYYYMMDD. The pipeline will run using this date.
        If not specified the pipeline will run with ``current_date`` minus
        one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    if petrarch_version == '1':
        from petrarch import petrarch
        logger.info("Using original Petrarch version")
    elif petrarch_version == '2':
        from petrarch2 import petrarch2 as petrarch
        logger.info("Using Petrarch2")
    else:
        logger.error("Invalid Petrarch version. Argument must be '1' or '2'")

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if geo_details.geo_service == "Mordecai":
        dest = "{0}:{1}/places".format(geo_details.mordecai_host,
                                       geo_details.mordecai_port)
        try:
            out = requests.get(dest)
            assert out.status_code == 200
        except (AssertionError, requests.exceptions.ConnectionError):
            print("Mordecai geolocation service not responding. Continuing anyway...")
    elif geo_details.geo_service == "CLIFF":
        print("CLIFF")
    else:
        print("Invalid geo service name. Must be 'CLIFF' or 'Mordecai'. Continuing...")

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string  # NOTE: no-op; result is discarded
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        # Command to write output to a file directly from PETR
        # petrarch.run_pipeline(formatted,
        #                       '{}{}.txt'.format(file_details.fullfile_stem,
        #                       date_string), parsed=True)
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    else:
        print("""Can't run with the options you've specified. You need to fix
something.""")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        formatted_results = oneaday_filter.main(petr_results)
    else:
        logger.info("Running result_formatter.py")
        print("Running result_formatter.py")
        formatted_results = result_formatter.main(petr_results)

    logger.info("Running postprocess.py")
    print("Running postprocess.py")
    if version:
        postprocess.main(formatted_results, date_string, version,
                         file_details, server_details, geo_details)
    else:
        print("Please specify a data version number. Program ending.")

    logger.info("Running phox_uploader.py")
    print("Running phox_uploader.py")
    try:
        uploader.main(date_string, server_details, file_details)
    except Exception as e:
        logger.warning("Error on the upload portion. {}".format(e))
        print("""Error on the uploader. This step isn't absolutely necessary.
Valid events should still be generated.""")

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
def main(file_details, server_details, logger_file=None, run_filter=None):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
        All the other config information not in ``server_details``.
    server_details: Named tuple.
        Config information specifically related to the remote server for
        FTP uploading.
    logger_file: String.
        Path to a log file. Defaults to ``None`` and opens a
        ``PHOX_pipeline.log`` file in the current working directory.
    run_filter: String.
        Whether to run the ``oneaday_formatter``. Takes True or False
        (strings) as values.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if len(sys.argv) > 1:
        date_string = sys.argv[1]
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string  # NOTE: no-op; result is discarded
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        petrarch.run_pipeline(formatted,
                              '{}{}.txt'.format(file_details.fullfile_stem,
                                                date_string), parsed=True)
        results = ''
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    else:
        print("Can't run with the options you've specified. You need to fix something.")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        oneaday_formatter.main(petr_results, date_string, server_details,
                               file_details)

    logger.info("Running phox_uploader.py")
    print("Running phox_uploader.py")
    uploader.main(date_string, server_details, file_details)

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
def main(): cli_args = parse_cli_args() """print(cli_args) sys.exit()""" utilities.init_logger('PETRARCH.log') logger = logging.getLogger('petr_log') PETRglobals.RunTimeString = time.asctime() if cli_args.command_name == 'parse' or cli_args.command_name == 'batch': # 16.06.27: no longer needed, right? print(cli_args) if cli_args.config: print('Using user-specified config: {}'.format(cli_args.config)) logger.info( 'Using user-specified config: {}'.format(cli_args.config)) PETRreader.parse_Config(cli_args.config) else: logger.info('Using default config file.') PETRreader.parse_Config(utilities._get_data('data/config/', 'PETR_config.ini')) if cli_args.nullverbs: print('Coding in null verbs mode; no events will be generated') logger.info('Coding in null verbs mode; no events will be generated') PETRglobals.NullVerbs = True # Only get verb phrases that are not in the dictionary but are associated with coded noun phrases elif cli_args.nullactors: print('Coding in null actors mode; no events will be generated') logger.info('Coding in null verbs mode; no events will be generated') PETRglobals.NullActors = True # Only get actor phrases that are not in the dictionary but associated with coded verb phrases PETRglobals.NewActorLength = int(cli_args.nullactors) read_dictionaries() start_time = time.time() print('\n\n') paths = PETRglobals.TextFileList if cli_args.inputs: if os.path.isdir(cli_args.inputs): if cli_args.inputs[-1] != '/': paths = glob.glob(cli_args.inputs + '/*.xml') else: paths = glob.glob(cli_args.inputs + '*.xml') elif os.path.isfile(cli_args.inputs): paths = [cli_args.inputs] else: print( '\nFatal runtime error:\n"' + cli_args.inputs + '" could not be located\nPlease enter a valid directory or file of source texts.') sys.exit() out = "" #PETRglobals.EventFileName if cli_args.outputs: out = cli_args.outputs if cli_args.command_name == 'parse': run(paths, out, cli_args.parsed) else: run(paths, out , True) ## <=== print("Coding time:", time.time() - start_time) print("Finished")
def main(file_details, server_details, logger_file=None, run_filter=None,
         run_date='', version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
        All the other config information not in ``server_details``.
    server_details: Named tuple.
        Config information specifically related to the remote server for
        FTP uploading.
    logger_file: String.
        Path to a log file. Defaults to ``None`` and opens a
        ``PHOX_pipeline.log`` file in the current working directory.
    run_filter: String.
        Whether to run the ``oneaday_formatter``. Takes True or False
        (strings) as values.
    run_date: String.
        Date of the format YYYYMMDD. The pipeline will run using this date.
        If not specified the pipeline will run with ``current_date`` minus
        one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string  # NOTE: no-op; result is discarded
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        # Command to write output to a file directly from PETR
        # petrarch.run_pipeline(formatted,
        #                       '{}{}.txt'.format(file_details.fullfile_stem,
        #                       date_string), parsed=True)
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    else:
        print("""Can't run with the options you've specified. You need to fix
something.""")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        formatted_results = oneaday_filter.main(petr_results)
    else:
        logger.info("Running result_formatter.py")
        print("Running result_formatter.py")
        formatted_results = result_formatter.main(petr_results)

    logger.info("Running postprocess.py")
    print("Running postprocess.py")
    if version:
        postprocess.main(formatted_results, date_string, version,
                         file_details, server_details)
    else:
        print("Please specify a data version number. Program ending.")

    logger.info("Running phox_uploader.py")
    print("Running phox_uploader.py")
    try:
        uploader.main(date_string, server_details, file_details)
    except Exception as e:
        logger.warning("Error on the upload portion. {}".format(e))
        print("""Error on the uploader. This step isn't absolutely necessary.
Valid events should still be generated.""")

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
def main(file_details, server_details, logger_file=None, run_filter=None,
         run_date=None):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
        All the other config information not in ``server_details``.
    server_details: Named tuple.
        Config information specifically related to the remote server for
        FTP uploading.
    logger_file: String.
        Path to a log file. Defaults to ``None`` and opens a
        ``PHOX_pipeline.log`` file in the current working directory.
    run_filter: String.
        Whether to run the ``oneaday_formatter``. Takes True or False
        (strings) as values.
    run_date: String.
        Date of the format YYYYMMDD. The pipeline will run using this date.
        If not specified the pipeline will run with ``current_date`` minus
        one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string  # NOTE: no-op; result is discarded
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        # Command to write output to a file directly from PETR
        # petrarch.run_pipeline(formatted,
        #                       '{}{}.txt'.format(file_details.fullfile_stem,
        #                       date_string), parsed=True)
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted, write_output=False,
                                             parsed=True)
    else:
        print("""Can't run with the options you've specified. You need to fix
something.""")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        formatted_results = oneaday_filter.main(petr_results)
    else:
        logger.info("Running result_formatter.py")
        print("Running result_formatter.py")
        formatted_results = result_formatter.main(petr_results)

    logger.info("Running postprocess.py")
    print("Running postprocess.py")
    postprocess.main(formatted_results, date_string, file_details)

    logger.info("Running phox_uploader.py")
    print("Running phox_uploader.py")
    uploader.main(date_string, server_details, file_details)

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
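# A hedged invocation sketch (not from the source). file_details and
# server_details are named tuples built from the pipeline config elsewhere in
# the project; the single field and all values below are invented for
# illustration only.
#
# from collections import namedtuple
# FileDetails = namedtuple('FileDetails', ['fullfile_stem'])
# main(FileDetails(fullfile_stem='/tmp/events_'), server_details,
#      run_filter='True', run_date='20150101')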
def main():
    cli_args = parse_cli_args()
    # miaoweixin added begin
    # Run as a background program in an infinite loop
    if cli_args.command_name == 'background':
        try:
            # infinite loop
            run_in_background(cli_args)
        except KeyboardInterrupt:
            print("Program exited due to keyboard interrupt.\n")
            return None
    # miaoweixin added end

    utilities.init_logger()
    logger = logging.getLogger('petr_log')
    PETRglobals.RunTimeString = time.asctime()
    print(cli_args)

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but are
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    start_time = time.time()
    print('\n\n')

    paths = PETRglobals.TextFileList
    if cli_args.inputs:
        if os.path.isdir(cli_args.inputs):
            if cli_args.inputs[-1] != '/':
                paths = glob.glob(cli_args.inputs + '/*.xml')
            else:
                paths = glob.glob(cli_args.inputs + '*.xml')
        elif os.path.isfile(cli_args.inputs):
            paths = [cli_args.inputs]
        else:
            print('\nFatal runtime error:\n"' + cli_args.inputs +
                  '" could not be located\nPlease enter a valid directory '
                  'or file of source texts.')
            sys.exit()
    elif cli_args.command_name == 'javainfo':  # add else to java info 0904
        paths = 'javainfo'

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    if cli_args.command_name == 'parse':
        run(paths, out, cli_args.parsed, cli_args)
    else:
        run(paths, out, True, cli_args)  # <===

    print("Coding time:", time.time() - start_time)
    print("Finished")
def main(file_details, geo_details, server_details, petrarch_version,
         run_date, mongo_details, logger_file=None, run_filter=None,
         version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
        All the other config information not in ``server_details``.
    geo_details: Named tuple.
        Settings for geocoding.
    server_details: Named tuple.
        Config information specifically related to the remote server for
        FTP uploading.
    petrarch_version: String.
        Which version of Petrarch to use. Must be '1' or '2'.
    logger_file: String.
        Path to a log file. Defaults to ``None`` and opens a
        ``PHOX_pipeline.log`` file in the current working directory.
    run_filter: String.
        Whether to run the ``oneaday_formatter``. Takes True or False
        (strings) as values.
    run_date: String.
        Date of the format YYYYMMDD. The pipeline will run using this date.
        If not specified the pipeline will run with ``current_date`` minus
        one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    if petrarch_version == '1':
        from petrarch import petrarch
        logger.info("Using original Petrarch version")
    elif petrarch_version == '2':
        from petrarch2 import petrarch2 as petrarch
        logger.info("Using Petrarch2")
    else:
        logger.error("Invalid Petrarch version. Argument must be '1' or '2'")

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    # Get the stories for the desired date from the DB
    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if geo_details.geo_service == "Mordecai":
        dest = "{0}:{1}/places".format(geo_details.mordecai_host,
                                       geo_details.mordecai_port)
        try:
            out = requests.get(dest)
            assert out.status_code == 200
        except (AssertionError, requests.exceptions.ConnectionError):
            print("Mordecai geolocation service not responding. Continuing anyway...")
    elif geo_details.geo_service == "CLIFF":
        print("CLIFF")
    else:
        print("Invalid geo service name. Must be 'CLIFF' or 'Mordecai'. Continuing...")

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string  # NOTE: no-op; result is discarded
    if run_filter == 'False':
        print('Running PETRARCH and writing to a file. No one-a-day.')
        logger.info('Running PETRARCH and writing to a file. No one-a-day.')
        # Command to write output to a file directly from PETR
        # petrarch.run_pipeline(formatted,
        #                       '{}{}.txt'.format(file_details.fullfile_stem,
        #                       date_string), parsed=True)
        petr_results = petrarch.run_pipeline(formatted,
                                             config="petr_config.ini",
                                             write_output=False,
                                             parsed=True)
        # DGM TEst
        # petrarch.run_pipeline(formatted, out_file="TESTOUT.txt",
        #                       config="petr_config.ini", write_output=True,
        #                       parsed=True)
        # sys.exit()
    elif run_filter == 'True':
        print('Running PETRARCH and returning output.')
        logger.info('Running PETRARCH and returning output.')
        petr_results = petrarch.run_pipeline(formatted,
                                             config="petr_config.ini",
                                             write_output=False,
                                             parsed=True)
    else:
        print("""Can't run with the options you've specified. You need to fix
something.""")
        logger.warning("Can't run with the options you've specified. Exiting.")
        sys.exit()

    if run_filter == 'True':
        logger.info("Running oneaday_formatter.py")
        print("Running oneaday_formatter.py")
        formatted_results = oneaday_filter.main(petr_results)
    else:
        logger.info("Running result_formatter.py")
        print("Running result_formatter.py")
        formatted_results = result_formatter.main(petr_results)

    logger.info("Running postprocess.py")
    print("Running postprocess.py")
    if version:
        postprocess.main(formatted_results, date_string, version,
                         file_details, server_details, geo_details)
    else:
        print("Please specify a data version number. Program ending.")

    # logger.info("Running phox_uploader.py")
    # print("Running phox_uploader.py")
    # try:
    #     uploader.main(date_string, server_details, file_details)
    # except Exception as e:
    #     logger.warning("Error on the upload portion. {}".format(e))
    #     print("""Error on the uploader. This step isn't absolutely necessary.
    #     Valid events should still be generated.""")

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
if not (sport and fport and log and host):
    logger.error('%s is not correctly configured' % CONFIG_FILE)
    sys.exit(-1)


def proxy_request_handler(epoll_context, parameters):
    startTime = time.time()
    request, host, port = epoll_context


def server_request_handler(epoll_context, parameters):
    startTime = time.time()
    request, host, port = epoll_context


def receive_signal(signum, stack):
    print('Received:', signum)


if __name__ == '__main__':
    opts = argparse.ArgumentParser(description=__description__)
    parse_config(opts)
    args = opts.parse_args()
    config_dict = utilities.load_config(args.config)
    utilities.init_logger(logger, config_dict)
    check_config(config_dict, logger)
    signal.signal(signal.SIGUSR1, receive_signal)
    # server = epoll.Server(int(args.port), args.host, request_handler, [])
    # thisserver.run()
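# Illustrative only (not from the source): with the SIGUSR1 handler installed
# above, a running server can be poked from a shell on a POSIX system, e.g.
#
#   kill -USR1 <pid>
#
# or from Python:
#
#   os.kill(os.getpid(), signal.SIGUSR1)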
def main(file_details, geo_details, server_details, petrarch_version,
         run_date, mongo_details, logger_file=None, run_filter=None,
         version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    file_details: Named tuple.
        All the other config information not in ``server_details``.
    geo_details: Named tuple.
        Settings for geocoding.
    server_details: Named tuple.
        Config information specifically related to the remote server for
        FTP uploading.
    petrarch_version: String.
        Which version of Petrarch to use. Must be '1' or '2'.
    logger_file: String.
        Path to a log file. Defaults to ``None`` and opens a
        ``PHOX_pipeline.log`` file in the current working directory.
    run_filter: String.
        Whether to run the ``oneaday_formatter``. Takes True or False
        (strings) as values.
    run_date: String.
        Date of the format YYYYMMDD. The pipeline will run using this date.
        If not specified the pipeline will run with ``current_date`` minus
        one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    if petrarch_version == '1':
        from petrarch import petrarch
        logger.info("Using original Petrarch version")
    elif petrarch_version == '2':
        from petrarch2 import petrarch2 as petrarch
        logger.info("Using Petrarch2")
    else:
        logger.error("Invalid Petrarch version. Argument must be '1' or '2'")

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    if run_date:
        process_date = dateutil.parser.parse(run_date)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)
    else:
        process_date = datetime.datetime.utcnow() - datetime.timedelta(days=1)
        date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                                  process_date.month,
                                                  process_date.day)
        logger.info('Date string: {}'.format(date_string))
        print('Date string:', date_string)

    results, scraperfilename = scraper_connection.main(process_date,
                                                       file_details)

    if geo_details.geo_service == "Mordecai":
        dest = "{0}:{1}/places".format(geo_details.mordecai_host,
                                       geo_details.mordecai_port)
        try:
            out = requests.get(dest)
            assert out.status_code == 200
        except (AssertionError, requests.exceptions.ConnectionError):
            print("Mordecai geolocation service not responding. Continuing anyway...")
    elif geo_details.geo_service == "CLIFF":
        print("CLIFF")
    else:
        print("Invalid geo service name. Must be 'CLIFF' or 'Mordecai'. Continuing...")

    if scraperfilename:
        logger.info("Scraper file name: " + scraperfilename)
        print("Scraper file name:", scraperfilename)

    logger.info("Running Mongo.formatter.py")
    print("Running Mongo.formatter.py")
    formatted = formatter.main(results, file_details, process_date,
                               date_string)

    logger.info("Running PETRARCH")
    file_details.fullfile_stem + date_string  # NOTE: no-op; result is discarded

    print('Running PETRARCH in Null Actors Mode.')
    # DGM Run this in actor-gen mode
    ACTOR_OUTPUT_FILE = "events_null_actors_mode_" + run_date + ".txt"
    petrarch.run_pipeline(formatted, out_file=ACTOR_OUTPUT_FILE,
                          config="petr_config.ini", write_output=True,
                          parsed=True)
    print("See events results in: " + ACTOR_OUTPUT_FILE)
    print("See actors list in: nullactors." + ACTOR_OUTPUT_FILE)

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
def main(message, logger_file=None, run_date='', version=''):
    """
    Main function to run all the things.

    Parameters
    ----------
    logger_file: String.
        Path to a log file. Defaults to ``None`` and opens a
        ``PHOX_pipeline.log`` file in the current working directory.
    run_date: String.
        Date of the format YYYYMMDD. The pipeline will run using this date.
        If not specified the pipeline will run with ``current_date`` minus
        one day.
    """
    if logger_file:
        utilities.init_logger(logger_file)
    else:
        utilities.init_logger('PHOX_pipeline.log')
    # get a local copy for the pipeline
    logger = logging.getLogger('pipeline_log')

    print('\nPHOX.pipeline run:', datetime.datetime.utcnow())

    process_date = datetime.datetime.utcnow()
    date_string = '{:02d}{:02d}{:02d}'.format(process_date.year,
                                              process_date.month,
                                              process_date.day)
    logger.info('Date string: {}'.format(date_string))
    print('Date string:', date_string)
    server_details = ''

    logger.info("Extracting date.")
    print("Extracting date.")
    date = formatter.get_date(message, process_date)

    logger.info("Sending to Hypnos.")
    story_id = message['entry_id']
    print(story_id)
    text = message['cleaned_text']
    headers = {'Content-Type': 'application/json'}
    payload = {'text': text, 'id': story_id, 'date': date}
    data = json.dumps(payload)
    hypnos_ip = os.environ['HYPNOS_PORT_5002_TCP_ADDR']
    hypnos_url = 'http://{}:5002/hypnos/extract'.format(hypnos_ip)
    r = requests.get(hypnos_url, data=data, headers=headers)
    print(r.status_code)

    if r.status_code == 200:
        logger.info("Running postprocess.py")
        print("Running postprocess.py")
        hypnos_res = r.json()
        print(hypnos_res)
        events = []
        for k, v in hypnos_res[story_id]['sents'].iteritems():
            if 'events' in v:
                sent = hypnos_res[story_id]['sents'][k]
                for event in v['events']:
                    event_tup = (date, event[0], event[1], event[2])
                    formatted, actors = postprocess.main(event_tup, sent,
                                                         version,
                                                         server_details)
                    logger.info(formatted)
                    logger.info(actors)
                    print(formatted, actors)

    logger.info('PHOX.pipeline end')
    print('PHOX.pipeline end:', datetime.datetime.utcnow())
import os
import sys
import time
from shutil import copyfile

import numpy as np
import tensorflow as tf
from keras.models import load_model
from keras.utils import to_categorical

import utilities
from alagent import ALAgent
from tagger import CRFTagger

args = utilities.get_args()
logger = utilities.init_logger()

max_len = args.max_seq_length
VOCABULARY = args.vocab_size
EPISODES = args.episodes
BUDGET = args.annotation_budget
k = args.k
rootdir = args.root_dir
train_file = args.train_file
dev_file = args.dev_file
test_file = args.test_file
emb_file = args.word_vec_file
DATASET_NAME = args.dataset_name
policy_path = args.policy_path
        source = triple[0] if isinstance(
            triple[0], basestring) else triple[0].text
        target = triple[1] if isinstance(
            triple[1], basestring) else triple[1].text
        others = ""
        for other in triple[3]:
            others = others + other.text + ","
        tuples = (tuples + "source: " + source + "\ttarget: " + target +
                  "\tverb: " + triple[2].text + "\tother_noun: " + others +
                  "\n")
    ET.SubElement(sentence, "Triplets").text = tuples
    tree = ET.ElementTree(root)
    tree.write(outputfile, 'UTF-8')


utilities.init_logger('PETRARCH.log', True)
config = utilities._get_data('data/config/', 'PETR_config.ini')
print("reading config")
sys.stdout.write('Mk1\n')
PETRreader.parse_Config(config)
print("reading dicts")
petrarch_ud.read_dictionaries()

inputFile = sys.argv[1]
# inputFile = sys.argv[1].replace(".xml", "") + "_parsed.xml"
outputFile = inputFile.replace("_parsed.xml", "") + "_phrase.xml"
events = read_xml_input([inputFile], True)
'''
print(len(events))
for key in events.keys():
    print(len(events[key]['sents']))
    for subkey,v in events[key]['sents'].items():