def process(in_file, in_list, out_file, log_file, tika_server_url):
    # Log input parameters
    logger = LogUtil('tika-parser', log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    tika_parser = TikaParser(tika_server_url)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    # Line-buffered text output; json.dumps() returns str, which binary
    # mode ('wb') would reject.
    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            tika_dict = tika_parser.parse(f)

            out_f.write(json.dumps(tika_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('TIKA parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
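# NOTE: every process() entry point in this section expands in_list via a
# read_lines() helper. The sketch below shows the assumed behavior
# (returning stripped, non-empty lines); the project's actual helper may
# differ.
def read_lines(file_path):
    # Read a text file and return its non-empty lines without surrounding
    # whitespace.
    with open(file_path) as in_f:
        return [line.strip() for line in in_f if line.strip()]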
def __init__(self, ent_dict, tri_dict, arg_dict):
    constraint_file = './data_files/argrole_dict.txt'
    self.constraint_list = []  # [(ent_type, tri_type, arg_type)]
    for line in read_lines(constraint_file):
        line = str(line).lower()
        arr = line.split()
        arg_type = arr[0]
        for pair in arr[1:]:
            pair_arr = pair.split(',')
            tri_type = pair_arr[0]
            ent_type = pair_arr[1]
            ent_type = self._replace_ent(ent_type)
            self.constraint_list.append((ent_type, tri_type, arg_type))
    print('Event constraint size:', len(self.constraint_list))

    # {(ent_type, tri_type): (arg_type1, ...)}
    self.ent_tri_to_arg_hash = {}
    for cons in self.constraint_list:
        ent_id = ent_dict[cons[0]]
        tri_id = tri_dict[cons[1]]
        arg_id = arg_dict[cons[2]]
        if (ent_id, tri_id) not in self.ent_tri_to_arg_hash:
            self.ent_tri_to_arg_hash[(ent_id, tri_id)] = set()
        self.ent_tri_to_arg_hash[(ent_id, tri_id)].add(arg_id)
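# Hypothetical illustration of the argrole_dict.txt line format consumed by
# __init__() above: the first token is an argument role, and every following
# token is a <trigger_type>,<entity_type> pair. The labels here are invented
# for demonstration only.
line = 'victim attack,per die,per'
arr = line.split()
arg_type = arr[0]  # 'victim'
for pair in arr[1:]:
    tri_type, ent_type = pair.split(',')
    print((ent_type, tri_type, arg_type))
    # prints ('per', 'attack', 'victim') then ('per', 'die', 'victim'),
    # i.e., the tuples appended to constraint_list (before _replace_ent()).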
def process(in_file, in_list, out_file, log_file, tika_server_url, ads_url,
            ads_token):
    # Log input parameters
    logger = LogUtil('ads-parser', log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        logger.info('Processing %s' % os.path.basename(f))
        try:
            ads_dict = ads_parser.parse(f)

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('ADS parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
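# Each process() in this section writes one JSON document per line (JSON
# Lines). A minimal sketch for reading that output back; it assumes only the
# line-per-record layout, nothing about the record contents.
import json

def read_output(jsonl_file):
    # Yield one parsed record per non-empty line.
    with open(jsonl_file) as in_f:
        for line in in_f:
            line = line.strip()
            if line:
                yield json.loads(line)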
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, relation_types,
            jsre_root, jsre_models, jsre_tmp_dir, ads_url, ads_token):
    # Log input parameters
    logger = LogUtil(log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('relation_types: %s' % json.dumps(relation_types))
    logger.info('jsre_root: %s' % os.path.abspath(jsre_root))
    logger.info('jsre_models: %s' % json.dumps(jsre_models))
    logger.info('jsre_tmp_dir: %s' % os.path.abspath(jsre_tmp_dir))
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    if len(relation_types) != len(jsre_models):
        print('[ERROR] There should be a one-to-one mapping for relation '
              'types and jSRE models.')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    jsre_parser = JsreParser(corenlp_server_url, ner_model, gazette_file,
                             relation_types, jsre_root, jsre_models,
                             jsre_tmp_dir)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            ads_dict = ads_parser.parse(f)
            jsre_dict = jsre_parser.parse(ads_dict['content'])

            ads_dict['metadata']['ner'] = jsre_dict['ner']
            ads_dict['metadata']['rel'] = jsre_dict['relation']
            ads_dict['metadata']['sentences'] = jsre_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'].append(
                jsre_dict['X-Parsed-By'])

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('JSRE parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
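# The length check in process() above implies a positional, one-to-one
# pairing between relation types and jSRE models. The labels and paths below
# are hypothetical, purely to illustrate that pairing.
relation_types = ['contains', 'hasProperty']
jsre_models = ['models/contains.model', 'models/hasProperty.model']
assert len(relation_types) == len(jsre_models)
for rel_type, model_path in zip(relation_types, jsre_models):
    print('%s -> %s' % (rel_type, model_path))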
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, relation_type,
            jsre_root, jsre_model, jsre_tmp_dir, ads_url, ads_token):
    # Log input parameters
    logger = LogUtil('jgr-parser', log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('relation_type: %s' % relation_type)
    logger.info('jsre_root: %s' % os.path.abspath(jsre_root))
    logger.info('jsre_model: %s' % os.path.abspath(jsre_model))
    logger.info('jsre_tmp_dir: %s' % os.path.abspath(jsre_tmp_dir))
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    jgr_parser = JgrParser()
    jsre_parser = JsreParser(corenlp_server_url, ner_model, gazette_file,
                             relation_type, jsre_root, jsre_model,
                             jsre_tmp_dir)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            ads_dict = ads_parser.parse(f)
            journal_dict = jgr_parser.parse(ads_dict['content'],
                                            ads_dict['metadata'])
            jsre_dict = jsre_parser.parse(journal_dict['cleaned_content'])

            ads_dict['content_ann_s'] = journal_dict['cleaned_content']
            ads_dict['references'] = journal_dict['references']
            ads_dict['metadata']['ner'] = jsre_dict['ner']
            ads_dict['metadata']['rel'] = jsre_dict['relation']
            ads_dict['metadata']['sentences'] = jsre_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'] = jsre_dict['X-Parsed-By']

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('JGR parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, ads_url, ads_token):
    # Log input parameters
    logger = LogUtil('corenlp-parser', log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    corenlp_parser = CoreNLPParser(corenlp_server_url, ner_model,
                                   gazette_file)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            ads_dict = ads_parser.parse(f)
            corenlp_dict = corenlp_parser.parse(ads_dict['content'])

            ads_dict['metadata']['ner'] = corenlp_dict['ner']
            ads_dict['metadata']['X-Parsed-By'].append(
                corenlp_dict['X-Parsed-By'])
            ads_dict['metadata']['sentences'] = corenlp_dict['sentences']

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('CoreNLP parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
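# Sketch of a small consumer for the records produced above, counting NER
# labels per document. The exact shape of each metadata['ner'] entry is an
# assumption here (a dict carrying a 'label' key); adjust to the actual
# CoreNLPParser output.
from collections import Counter

def ner_label_counts(ads_dict):
    # ads_dict is one JSON record emitted by process() above.
    return Counter(ent.get('label') for ent in ads_dict['metadata']['ner'])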
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, relation_types,
            jsre_root, jsre_models, jsre_tmp_dir, ads_url, ads_token):
    # Log input parameters
    logger = LogUtil(log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('relation_types: %s' % json.dumps(relation_types))
    logger.info('jsre_root: %s' % os.path.abspath(jsre_root))
    logger.info('jsre_models: %s' % json.dumps(jsre_models))
    logger.info('jsre_tmp_dir: %s' % os.path.abspath(jsre_tmp_dir))
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    paper_parser = PaperParser()
    jsre_parser = JsreParser(corenlp_server_url, ner_model, gazette_file,
                             relation_types, jsre_root, jsre_models,
                             jsre_tmp_dir)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        logger.info('Processing %s' % os.path.basename(f))
        try:
            ads_dict = ads_parser.parse(f)

            if 'grobid:header_Title' in ads_dict['metadata']:
                logger.info('Document title: %s' %
                            ads_dict['metadata']['grobid:header_Title'])

            paper_dict = paper_parser.parse(ads_dict['content'],
                                            ads_dict['metadata'])
            jsre_dict = jsre_parser.parse(paper_dict['cleaned_content'])

            ads_dict['content_ann_s'] = paper_dict['cleaned_content']
            ads_dict['metadata']['ner'] = jsre_dict['ner']
            ads_dict['metadata']['rel'] = jsre_dict['relation']
            ads_dict['metadata']['sentences'] = jsre_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'] = jsre_dict['X-Parsed-By']

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('Paper parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
def process(in_file, in_list, out_file, log_file, tika_server_url, ads_url,
            ads_token, corenlp_server_url, ner_model, gazette_file,
            containee_model_file, container_model_file,
            entity_linking_method, gpu_id, batch_size):
    # Log input parameters
    logger = LogUtil(log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('log_file: %s' % log_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('container_model_file: %s' %
                os.path.abspath(container_model_file))
    logger.info('containee_model_file: %s' %
                os.path.abspath(containee_model_file))
    logger.info('entity_linking_method: %s' % entity_linking_method)
    logger.info('gpu_id: %s' % str(gpu_id))
    logger.info('batch_size: %s' % str(batch_size))

    if in_file and in_list:
        raise ValueError(
            '[ERROR] in_file and in_list cannot be provided simultaneously')

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    unary_parser = UnaryParser(corenlp_server_url, ner_model, gazette_file,
                               containee_model_file, container_model_file,
                               gpu_id=gpu_id)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            ads_dict = ads_parser.parse(f)
            unary_dict = unary_parser.parse(
                ads_dict['content'],
                batch_size=batch_size,
                entity_linking_method=entity_linking_method)

            ads_dict['metadata']['ner'] = unary_dict['ner']
            ads_dict['metadata']['rel'] = unary_dict['relation']
            ads_dict['metadata']['sentences'] = unary_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'].append(
                unary_dict['X-Parsed-By'])

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('Unary parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, relation_types,
            jsre_root, jsre_models, jsre_tmp_dir, containee_model_file,
            container_model_file, entity_linking_method, gpu_id, batch_size,
            ads_url, ads_token):
    # Log input parameters
    logger = LogUtil(log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        logger.error('[ERROR] in_file and in_list cannot be provided '
                     'simultaneously')
        sys.exit(1)

    # Only meaningful when jSRE models are supplied; in unary-parser mode
    # jsre_models may be empty or None.
    if jsre_models and len(relation_types) != len(jsre_models):
        logger.error('[ERROR] There should be a one-to-one mapping for '
                     'relation types and jSRE models.')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    lpsc_parser = LpscParser()

    # Note: this is a temporary solution, as it requires users to provide
    # inputs to the script carefully. A better solution would restrict users
    # to inputs that are mutually exclusive (e.g., if jsre_model is provided,
    # then the unary parser's options should be disallowed).
    # Steven Lu, September 2, 2021
    jsre_parser = None
    unary_parser = None
    if jsre_models:
        logger.info('relation_types: %s' % json.dumps(relation_types))
        logger.info('jsre_root: %s' % os.path.abspath(jsre_root))
        logger.info('jsre_models: %s' % json.dumps(jsre_models))
        logger.info('jsre_tmp_dir: %s' % os.path.abspath(jsre_tmp_dir))
        jsre_parser = JsreParser(corenlp_server_url, ner_model, gazette_file,
                                 relation_types, jsre_root, jsre_models,
                                 jsre_tmp_dir)
    elif (container_model_file and containee_model_file
            and entity_linking_method):
        logger.info('container_model_file: %s' %
                    os.path.abspath(container_model_file))
        logger.info('containee_model_file: %s' %
                    os.path.abspath(containee_model_file))
        logger.info('entity_linking_method: %s' % entity_linking_method)
        logger.info('gpu_id: %s' % str(gpu_id))
        unary_parser = UnaryParser(corenlp_server_url, ner_model,
                                   gazette_file, containee_model_file,
                                   container_model_file, gpu_id=gpu_id)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            base_name = os.path.basename(f)
            logger.info('Processing %s' % base_name)

            # LPSC file names are expected to follow the pattern
            # <year>_<abstract_number>.<ext>
            base_name = base_name.split('.')[0]
            year, abs_num = base_name.split('_')
            query_dict = {'lpsc_query_strategy': {
                'year': year,
                'abstract_number': abs_num
            }}

            ads_dict = ads_parser.parse(f, query_dict)
            lpsc_dict = lpsc_parser.parse(ads_dict['content'],
                                          ads_dict['metadata'])

            if jsre_parser is not None:
                rel_dict = jsre_parser.parse(lpsc_dict['cleaned_content'])
            else:
                rel_dict = unary_parser.parse(
                    lpsc_dict['cleaned_content'],
                    batch_size=batch_size,
                    entity_linking_method=entity_linking_method)

            ads_dict['content_ann_s'] = lpsc_dict['cleaned_content']
            ads_dict['references'] = lpsc_dict['references']
            ads_dict['metadata']['ner'] = rel_dict['ner']
            ads_dict['metadata']['rel'] = rel_dict['relation']
            ads_dict['metadata']['sentences'] = rel_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'] = rel_dict['X-Parsed-By']

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('LPSC parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
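# The note in process() above asks for CLI-level enforcement of mutually
# exclusive inputs. A minimal argparse sketch of that idea; the flag names
# are assumptions, not the script's actual interface.
import argparse

arg_parser = argparse.ArgumentParser(description='LPSC parser pipeline')
group = arg_parser.add_mutually_exclusive_group(required=True)
group.add_argument('-i', '--in_file', help='path to a single input file')
group.add_argument('-li', '--in_list', help='path to a file listing inputs')
args = arg_parser.parse_args()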