Example #1
def process(in_file, in_list, out_file, log_file, tika_server_url):
    # Log input parameters
    logger = LogUtil('lpsc-parser', log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    tika_parser = TikaParser(tika_server_url)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)  # line-buffered text mode for JSON-lines output
    for f in tqdm(files):
        try:
            tika_dict = tika_parser.parse(f)

            out_f.write(json.dumps(tika_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('TIKA parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
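
All of these examples rely on a read_lines helper to expand an input list file into file paths. A minimal sketch, assuming the helper simply returns the non-empty, stripped lines of a text file (the real implementation in the source project may differ):

def read_lines(list_file):
    # Hypothetical helper: return one path per non-blank line, with
    # surrounding whitespace and the trailing newline removed.
    with open(list_file, 'r') as f:
        return [line.strip() for line in f if line.strip()]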
Example #2
    def __init__(self, ent_dict, tri_dict, arg_dict):
        constraint_file = './data_files/argrole_dict.txt'

        self.constraint_list = []  # [(ent_type, tri_type, arg_type)]
        for line in read_lines(constraint_file):
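            # Each line: an argument type followed by space-separated
            # "tri_type,ent_type" pairs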
            line = str(line).lower()
            arr = line.split()
            arg_type = arr[0]
            for pair in arr[1:]:
                pair_arr = pair.split(',')
                tri_type = pair_arr[0]
                ent_type = pair_arr[1]
                ent_type = self._replace_ent(ent_type)
                self.constraint_list.append((ent_type, tri_type, arg_type))

        print('Event constraint size:', len(self.constraint_list))
        # { (ent_type, tri_type) : (arg_type1, ...)}
        self.ent_tri_to_arg_hash = {}
        for cons in self.constraint_list:
            ent_id = ent_dict[cons[0]]
            tri_id = tri_dict[cons[1]]
            arg_id = arg_dict[cons[2]]
            # ent_id = cons[0]
            # tri_id = cons[1]
            # arg_id = cons[2]
            if (ent_id, tri_id) not in self.ent_tri_to_arg_hash:
                self.ent_tri_to_arg_hash[(ent_id, tri_id)] = set()

            self.ent_tri_to_arg_hash[(ent_id, tri_id)].add(arg_id)
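
The nested loop above implies each line of argrole_dict.txt has the form "arg_type tri_type1,ent_type1 tri_type2,ent_type2 ...". Once built, ent_tri_to_arg_hash answers the question "which argument roles are legal for this (entity, trigger) pair?". A self-contained sketch of that lookup (the hash contents here are made up for illustration):

# Suppose the constructor above produced this hash from ID-mapped
# (ent_type, tri_type, arg_type) constraints:
ent_tri_to_arg_hash = {(0, 3): {1, 4}, (2, 3): {4}}

def allowed_arg_ids(ent_id, tri_id):
    # An empty set means no argument role may link this entity to this trigger.
    return ent_tri_to_arg_hash.get((ent_id, tri_id), set())

print(allowed_arg_ids(0, 3))  # {1, 4}
print(allowed_arg_ids(9, 9))  # set()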
Example #3
def process(in_file, in_list, out_file, log_file, tika_server_url, ads_url,
            ads_token):
    # Log input parameters
    logger = LogUtil('ads-parser', log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)
    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        logger.info('Processing %s' % os.path.basename(f))
        try:
            ads_dict = ads_parser.parse(f)

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('ADS parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
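
These process functions read like the bodies of command-line scripts. A hypothetical argparse wrapper for this example shows how such a function is typically wired up (flag names and defaults are assumptions, not taken from the source project):

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wiring; the real script may name its flags differently.
    p = argparse.ArgumentParser(description='Parse documents with the ADS parser')
    p.add_argument('-i', '--in_file', help='a single input file')
    p.add_argument('-li', '--in_list', help='a file listing one input path per line')
    p.add_argument('-o', '--out_file', required=True, help='output JSON-lines file')
    p.add_argument('-l', '--log_file', required=True, help='log file location')
    p.add_argument('-p', '--tika_server_url', default='http://localhost:9998')
    p.add_argument('-a', '--ads_url', required=True)
    p.add_argument('-t', '--ads_token', required=True)
    args = p.parse_args()
    process(**vars(args))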
Example #4
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, relation_types,
            jsre_root, jsre_models, jsre_tmp_dir, ads_url, ads_token):
    # Log input parameters
    logger = LogUtil(log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('relation_types: %s' % json.dumps(relation_types))
    logger.info('jsre_root: %s' % os.path.abspath(jsre_root))
    logger.info('jsre_models: %s' % json.dumps(jsre_models))
    logger.info('jsre_tmp_dir: %s' % os.path.abspath(jsre_tmp_dir))
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    if len(relation_types) != len(jsre_models):
        print(
            '[ERROR] There should be a one-to-one mapping for relation types '
            'and jSRE models.')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    jsre_parser = JsreParser(corenlp_server_url, ner_model, gazette_file,
                             relation_types, jsre_root, jsre_models,
                             jsre_tmp_dir)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            ads_dict = ads_parser.parse(f)
            jsre_dict = jsre_parser.parse(ads_dict['content'])

            ads_dict['metadata']['ner'] = jsre_dict['ner']
            ads_dict['metadata']['rel'] = jsre_dict['relation']
            ads_dict['metadata']['sentences'] = jsre_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'].append(
                jsre_dict['X-Parsed-By'])

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('JSRE parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
Example #5
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, relation_type,
            jsre_root, jsre_model, jsre_tmp_dir, ads_url, ads_token):
    # Log input parameters
    logger = LogUtil('jgr-parser', log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('relation_type: %s' % relation_type)
    logger.info('jsre_root: %s' % os.path.abspath(jsre_root))
    logger.info('jsre_model: %s' % os.path.abspath(jsre_model))
    logger.info('jsre_tmp_dir: %s' % os.path.abspath(jsre_tmp_dir))
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    jgr_parser = JgrParser()
    jsre_parser = JsreParser(corenlp_server_url, ner_model, gazette_file,
                             relation_type, jsre_root, jsre_model, jsre_tmp_dir)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            ads_dict = ads_parser.parse(f)
            journal_dict = jgr_parser.parse(ads_dict['content'],
                                            ads_dict['metadata'])
            jsre_dict = jsre_parser.parse(journal_dict['cleaned_content'])

            ads_dict['content_ann_s'] = journal_dict['cleaned_content']
            ads_dict['references'] = journal_dict['references']
            ads_dict['metadata']['ner'] = jsre_dict['ner']
            ads_dict['metadata']['rel'] = jsre_dict['relation']
            ads_dict['metadata']['sentences'] = jsre_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'] = jsre_dict['X-Parsed-By']

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('JGR parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
Example #6
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, ads_url, ads_token):
    # Log input parameters
    logger = LogUtil('corenlp-parser', log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    corenlp_parser = CoreNLPParser(corenlp_server_url, ner_model, gazette_file)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            ads_dict = ads_parser.parse(f)
            corenlp_dict = corenlp_parser.parse(ads_dict['content'])

            ads_dict['metadata']['ner'] = corenlp_dict['ner']
            ads_dict['metadata']['X-Parsed-By'].append(
                corenlp_dict['X-Parsed-By'])
            ads_dict['metadata']['sentences'] = corenlp_dict['sentences']

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('CoreNLP parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
Example #7
File: paper_parser.py  Project: wkiri/MTE
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, relation_types,
            jsre_root, jsre_models, jsre_tmp_dir, ads_url, ads_token):
    # Log input parameters
    logger = LogUtil(log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('relation_types: %s' % json.dumps(relation_types))
    logger.info('jsre_root: %s' % os.path.abspath(jsre_root))
    logger.info('jsre_models: %s' % json.dumps(jsre_models))
    logger.info('jsre_tmp_dir: %s' % os.path.abspath(jsre_tmp_dir))
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        print('[ERROR] in_file and in_list cannot be provided simultaneously')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    paper_parser = PaperParser()
    jsre_parser = JsreParser(corenlp_server_url, ner_model, gazette_file,
                             relation_types, jsre_root, jsre_models,
                             jsre_tmp_dir)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        logger.info('Processing %s' % os.path.basename(f))
        try:
            ads_dict = ads_parser.parse(f)

            if 'grobid:header_Title' in ads_dict['metadata']:
                logger.info('Document title: %s' %
                            ads_dict['metadata']['grobid:header_Title'])

            paper_dict = paper_parser.parse(ads_dict['content'],
                                            ads_dict['metadata'])
            jsre_dict = jsre_parser.parse(paper_dict['cleaned_content'])

            ads_dict['content_ann_s'] = paper_dict['cleaned_content']
            ads_dict['metadata']['ner'] = jsre_dict['ner']
            ads_dict['metadata']['rel'] = jsre_dict['relation']
            ads_dict['metadata']['sentences'] = jsre_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'] = jsre_dict['X-Parsed-By']

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('Paper parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
Example #8
def process(in_file, in_list, out_file, log_file, tika_server_url, ads_url,
            ads_token, corenlp_server_url, ner_model, gazette_file,
            containee_model_file, container_model_file, entity_linking_method,
            gpu_id, batch_size):
    # Log input parameters
    logger = LogUtil(log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('log_file: %s' % log_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('container_model_file: %s' %
                os.path.abspath(container_model_file))
    logger.info('containee_model_file: %s' %
                os.path.abspath(containee_model_file))
    logger.info('entity_linking_method: %s' % entity_linking_method)
    logger.info('gpu_id: %s' % str(gpu_id))

    if in_file and in_list:
        raise ValueError(
            '[ERROR] in_file and in_list cannot be provided simultaneously')

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)

    unary_parser = UnaryParser(corenlp_server_url,
                               ner_model,
                               gazette_file,
                               containee_model_file,
                               container_model_file,
                               gpu_id=gpu_id)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            ads_dict = ads_parser.parse(f)

            unary_dict = unary_parser.parse(
                ads_dict['content'],
                batch_size=batch_size,
                entity_linking_method=entity_linking_method)

            ads_dict['metadata']['ner'] = unary_dict['ner']
            ads_dict['metadata']['rel'] = unary_dict['relation']
            ads_dict['metadata']['sentences'] = unary_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'].append(
                unary_dict['X-Parsed-By'])

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('Unary parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
Example #9
def process(in_file, in_list, out_file, log_file, tika_server_url,
            corenlp_server_url, ner_model, gazette_file, relation_types,
            jsre_root, jsre_models, jsre_tmp_dir, containee_model_file,
            container_model_file, entity_linking_method, gpu_id, batch_size,
            ads_url, ads_token):
    # Log input parameters
    logger = LogUtil(log_file)
    logger.info('Input parameters')
    logger.info('in_file: %s' % in_file)
    logger.info('in_list: %s' % in_list)
    logger.info('out_file: %s' % out_file)
    logger.info('tika_server_url: %s' % tika_server_url)
    logger.info('corenlp_server_url: %s' % corenlp_server_url)
    logger.info('ner_model: %s' % os.path.abspath(ner_model))
    logger.info('gazette_file: %s' % gazette_file)
    logger.info('ads_url: %s' % ads_url)
    logger.info('ads_token: %s' % ads_token)

    if in_file and in_list:
        logger.error('in_file and in_list cannot be provided '
                     'simultaneously')
        sys.exit(1)

    if jsre_models and len(relation_types) != len(jsre_models):
        print('[ERROR] There should be a one-to-one mapping for relation types '
              'and jSRE models.')
        sys.exit(1)

    ads_parser = AdsParser(ads_token, ads_url, tika_server_url)
    lpsc_parser = LpscParser()

    # Note: this is a temporary solution, as it requires users to carefully
    # provide inputs to the script. A better solution would be to restrict
    # users to providing inputs that are mutually exclusive (e.g., if
    # jsre_model is provided, then the unary parser's options should be
    # disallowed).
    # Steven Lu, September 2, 2021
    jsre_parser = None
    unary_parser = None
    if jsre_models:
        logger.info('relation_types: %s' % json.dumps(relation_types))
        logger.info('jsre_root: %s' % os.path.abspath(jsre_root))
        logger.info('jsre_models: %s' % json.dumps(jsre_models))
        logger.info('jsre_tmp_dir: %s' % os.path.abspath(jsre_tmp_dir))

        jsre_parser = JsreParser(corenlp_server_url, ner_model, gazette_file,
                                 relation_types, jsre_root, jsre_models,
                                 jsre_tmp_dir)
    elif container_model_file and containee_model_file and entity_linking_method:
        logger.info('container_model_file: %s' %
                    os.path.abspath(container_model_file))
        logger.info('containee_model_file: %s' %
                    os.path.abspath(containee_model_file))
        logger.info('entity_linking_method: %s' % entity_linking_method)
        logger.info('gpu_id: %s' % str(gpu_id))

        unary_parser = UnaryParser(corenlp_server_url, ner_model, gazette_file,
                                   containee_model_file, container_model_file,
                                   gpu_id=gpu_id)

    if in_file:
        files = [in_file]
    else:
        files = read_lines(in_list)

    out_f = open(out_file, 'w', 1)
    for f in tqdm(files):
        try:
            base_name = os.path.basename(f)
            logger.info('Processing %s' % base_name)
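            # LPSC file names are expected to look like <year>_<abstract_number>.<ext>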
            base_name = base_name.split('.')[0]
            year, abs_num = base_name.split('_')
            query_dict = {'lpsc_query_strategy': {
                'year': year,
                'abstract_number': abs_num
            }}

            ads_dict = ads_parser.parse(f, query_dict)
            lpsc_dict = lpsc_parser.parse(ads_dict['content'],
                                          ads_dict['metadata'])

            if jsre_parser is not None:
                rel_dict = jsre_parser.parse(lpsc_dict['cleaned_content'])
            else:
                rel_dict = unary_parser.parse(
                    lpsc_dict['cleaned_content'], batch_size=batch_size,
                    entity_linking_method=entity_linking_method)

            ads_dict['content_ann_s'] = lpsc_dict['cleaned_content']
            ads_dict['references'] = lpsc_dict['references']
            ads_dict['metadata']['ner'] = rel_dict['ner']
            ads_dict['metadata']['rel'] = rel_dict['relation']
            ads_dict['metadata']['sentences'] = rel_dict['sentences']
            ads_dict['metadata']['X-Parsed-By'] = rel_dict['X-Parsed-By']

            out_f.write(json.dumps(ads_dict))
            out_f.write('\n')
        except Exception as e:
            logger.info('LPSC parser failed: %s' % os.path.abspath(f))
            logger.error(e)

    out_f.close()
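
Every variant above writes one JSON document per line to out_file, so downstream code can stream the results back without loading the whole file. A minimal reader sketch (the file name here is illustrative):

import json

def read_results(jsonl_file):
    # Stream parsed documents out of the JSON-lines output, skipping blank lines.
    with open(jsonl_file, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

for doc in read_results('parsed_docs.jsonl'):
    print(doc['metadata'].get('ner', []))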