예제 #1
0
def read_dictionaries(validation=False):
    """Read every dictionary file named in ``PETRglobals``.

    In order: the verb dictionary, each actor dictionary, the agent
    dictionary, the discard list and, unless ``IssueFileName`` is the empty
    string, the issues list.  Each file name is printed and then resolved
    with ``utilities._get_data('data/dictionaries', name)`` before being
    handed to the matching ``PETRreader`` function.

    ``validation`` is accepted but not used in this function.
    """

    def _locate(file_name):
        # Resolve a dictionary file name to the path given to the readers.
        return utilities._get_data('data/dictionaries', file_name)

    print('Verb dictionary:', PETRglobals.VerbFileName)
    PETRreader.read_verb_dictionary(_locate(PETRglobals.VerbFileName))

    print('Actor dictionaries:', PETRglobals.ActorFileList)
    for actor_file in PETRglobals.ActorFileList:
        PETRreader.read_actor_dictionary(_locate(actor_file))

    print('Agent dictionary:', PETRglobals.AgentFileName)
    PETRreader.read_agent_dictionary(_locate(PETRglobals.AgentFileName))

    print('Discard dictionary:', PETRglobals.DiscardFileName)
    PETRreader.read_discard_list(_locate(PETRglobals.DiscardFileName))

    # The issues list is optional: an empty file name means "skip it".
    if PETRglobals.IssueFileName == "":
        return
    print('Issues dictionary:', PETRglobals.IssueFileName)
    PETRreader.read_issue_list(_locate(PETRglobals.IssueFileName))
예제 #2
0
def run_pipeline(data, out_file=None, config=None, write_output=True,
                 parsed=False):
    """Configure PETRARCH, code the events in ``data`` and emit the result.

    Parameters
    ----------
    data
        Input passed unchanged to ``PETRreader.read_pipeline_input``.
    out_file
        Destination handed to ``PETRwriter.write_events``; required when
        ``write_output`` is true.
    config
        Path of a user-supplied config file.  When falsy, the packaged
        ``data/config/PETR_config.ini`` is used instead.
    write_output
        When false, the coded events are converted with
        ``PETRwriter.pipe_output`` and returned instead of being written.
    parsed
        When false, the events are first run through
        ``utilities.stanford_parse`` before coding.

    Returns
    -------
    The value of ``PETRwriter.pipe_output`` when ``write_output`` is false,
    otherwise ``None``.

    Exits the interpreter via ``sys.exit()`` when output should be written
    but no ``out_file`` was given.
    """
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')
    if config:
        print('Using user-specified config: {}'.format(config))
        logger.info('Using user-specified config: {}'.format(config))
        PETRreader.parse_Config(config)
    else:
        logger.info('Using default config file.')
        # Resolve the packaged config once and reuse it for logging/parsing.
        default_config = utilities._get_data('data/config/',
                                             'PETR_config.ini')
        logger.info('Config path: {}'.format(default_config))
        PETRreader.parse_Config(default_config)

    read_dictionaries()

    logger.info('Hitting read events...')
    events = PETRreader.read_pipeline_input(data)
    if parsed:
        logger.info('Hitting do_coding')
    else:
        events = utilities.stanford_parse(events)
    updated_events = do_coding(events, None)

    if not write_output:
        return PETRwriter.pipe_output(updated_events)
    if not out_file:
        print('Please specify an output file...')
        # The backslash is escaped explicitly: '\_' is an invalid escape
        # sequence, although it yields the same runtime text.
        logger.warning('Need an output file. ¯\\_(ツ)_/¯')
        sys.exit()
    PETRwriter.write_events(updated_events, out_file)
예제 #3
0
def read_dictionaries(validation=False):
    """Read every dictionary file named in ``PETRglobals``.

    The verb dictionary is resolved with
    ``utilities._get_dict_data('dictionary', name)``; the actor, agent,
    discard and (optional) issues files are resolved with
    ``utilities._get_data('data/dictionaries', name)``.  Each file name is
    printed before the file is read.

    ``validation`` is accepted but not used in this function.
    """

    def _locate(file_name):
        # Resolve a dictionary file name to the path given to the readers.
        return utilities._get_data('data/dictionaries', file_name)

    print('Verb dictionary:', PETRglobals.VerbFileName)
    verb_location = utilities._get_dict_data('dictionary',
                                             PETRglobals.VerbFileName)
    PETRreader.read_verb_dictionary(verb_location)

    print('Actor dictionaries:', PETRglobals.ActorFileList)
    for actor_file in PETRglobals.ActorFileList:
        PETRreader.read_actor_dictionary(_locate(actor_file))

    print('Agent dictionary:', PETRglobals.AgentFileName)
    PETRreader.read_agent_dictionary(_locate(PETRglobals.AgentFileName))

    print('Discard dictionary:', PETRglobals.DiscardFileName)
    PETRreader.read_discard_list(_locate(PETRglobals.DiscardFileName))

    # The issues list is optional: an empty file name means "skip it".
    if PETRglobals.IssueFileName == "":
        return
    print('Issues dictionary:', PETRglobals.IssueFileName)
    PETRreader.read_issue_list(_locate(PETRglobals.IssueFileName))
예제 #4
0
def run_pipeline(data, out_file=None, config=None, write_output=True,
                 parsed=False):
    """Configure PETRARCH, code the events in ``data`` and emit the result.

    This function is called externally.

    Parameters
    ----------
    data
        Input passed unchanged to ``PETRreader.read_pipeline_input``.
    out_file
        Destination handed to ``PETRwriter.write_events``; required when
        ``write_output`` is true.
    config
        Path of a user-supplied config file.  When falsy, the packaged
        ``data/config/PETR_config.ini`` is used instead.
    write_output
        When false, the coded events are converted with
        ``PETRwriter.pipe_output`` and returned instead of being written.
    parsed
        Must be true.  The Stanford-parse step for unparsed input is
        disabled in this version of the function.

    Returns
    -------
    The value of ``PETRwriter.pipe_output`` when ``write_output`` is false,
    otherwise ``None``.

    Raises
    ------
    NotImplementedError
        If ``parsed`` is false.  Previously this path failed later with an
        ``UnboundLocalError`` because no events were ever coded.

    Exits the interpreter via ``sys.exit()`` when output should be written
    but no ``out_file`` was given.
    """
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')
    if config:
        print('Using user-specified config: {}'.format(config))
        logger.info('Using user-specified config: {}'.format(config))
        PETRreader.parse_Config(config)
    else:
        logger.info('Using default config file.')
        # Resolve the packaged config once and reuse it for logging/parsing.
        default_config = utilities._get_data('data/config/',
                                             'PETR_config.ini')
        logger.info('Config path: {}'.format(default_config))
        PETRreader.parse_Config(default_config)

    read_dictionaries()

    logger.info('Hitting read events...')
    events = PETRreader.read_pipeline_input(data)
    if not parsed:
        # The branch that called utilities.stanford_parse() was disabled, so
        # there is nothing that can turn unparsed input into coded events.
        # Fail with an explicit message instead of an UnboundLocalError.
        logger.error('run_pipeline called with parsed=False, but the '
                     'Stanford parse step is disabled.')
        raise NotImplementedError(
            'run_pipeline() only supports pre-parsed input; '
            'call it with parsed=True.')

    logger.info('Hitting do_coding')
    updated_events = do_coding(events, None)

    if not write_output:
        return PETRwriter.pipe_output(updated_events)
    if not out_file:
        print('Please specify an output file...')
        # The backslash is escaped explicitly: '\_' is an invalid escape
        # sequence, although it yields the same runtime text.
        logger.warning('Need an output file. ¯\\_(ツ)_/¯')
        sys.exit()
    PETRwriter.write_events(updated_events, out_file)
예제 #5
0
def process_target(queue, cli_args, multi_log_lock):
    """Worker-process body: configure PETRARCH, then consume tasks forever.

    Parameters
    ----------
    queue
        Task queue; must provide ``qsize()`` and ``get()``.
    cli_args
        Parsed command-line arguments.  The attributes read here are
        ``config``, ``nullverbs``, ``nullactors`` and ``outputs``.
    multi_log_lock
        Lock forwarded to ``write_multiprocess_log`` and ``process_task``.

    The function never returns: after set-up it polls ``queue`` in an
    endless loop, sleeping for a random interval of up to half a second
    whenever the queue is empty.
    """
    # Announce that this child process has started.
    write_multiprocess_log(
        multi_log_lock, '{}Process {}: {}'.format(u'', os.getpid(),
                                                  u'started.'))

    # Each child process loads the settings it needs before doing any work.
    utilities.init_logger()
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # Fixed: this used to log the "null verbs" message (copy-paste slip).
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    print('\n\n')

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    # Create a session for talking to the database.
    session = Session()

    while True:
        # Fixed: qsize must be *called*.  Comparing the bound method itself
        # with 0 never inspects the queue (and is a TypeError on Python 3).
        if queue.qsize() > 0:
            # Take one task off the queue.
            task = queue.get()
            # Log that a task has been picked up.
            write_multiprocess_log(
                multi_log_lock,
                '{}Process {} get one task: {}'.format(u'', os.getpid(), task))
            # Run the task.
            process_task(task, out, multi_log_lock, session)
        else:
            # Nothing queued: back off for a short random interval.
            time.sleep(0.5 * random.random())
            continue
예제 #6
0
def main():
    """Command-line entry point for the ``parse`` and ``batch`` commands.

    Initialises logging, loads the configuration (user-specified or the
    packaged default) and the dictionaries, works out the list of input
    files, and calls ``run``.  For any other command only "Finished" is
    printed.
    """
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log', cli_args.debug)
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    command = cli_args.command_name
    if command not in ('parse', 'batch'):
        print("Finished")
        return

    user_config = cli_args.config
    if not user_config:
        logger.info('Using default config file.')
        PETRglobals.ConfigFileName = 'PETR_config.ini'
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))
    else:
        print('Using user-specified config: {}'.format(user_config))
        logger.info('Using user-specified config: {}'.format(user_config))
        PETRglobals.ConfigFileName = user_config
        PETRreader.parse_Config(user_config)

    read_dictionaries()

    start_time = time.time()
    print('\n\n')

    # Inputs given on the command line override the configured file list.
    paths = PETRglobals.TextFileList
    source = cli_args.inputs
    if source:
        if os.path.isdir(source):
            # Add the separator only when the directory name lacks one.
            pattern = '*.xml' if source[-1] == '/' else '/*.xml'
            paths = glob.glob(source + pattern)
        elif os.path.isfile(source):
            paths = [source]
        else:
            print(
                '\nFatal runtime error:\n"' +
                source +
                '" could not be located\nPlease enter a valid directory or file of source texts.')
            sys.exit()

    out = cli_args.outputs if cli_args.outputs else ""

    # 'batch' always treats its input as already parsed.
    already_parsed = cli_args.parsed if command == 'parse' else True
    run(paths, out, already_parsed)

    print("Coding time:", time.time() - start_time)
    print("Finished")
예제 #7
0
def main():
    """Command-line entry point for the ``parse`` and ``batch`` commands.

    Initialises logging, loads the configuration (user-specified or the
    packaged default) and the dictionaries, works out the list of input
    files, and calls ``run``.  For any other command only "Finished" is
    printed.
    """
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    command = cli_args.command_name
    if command not in ('parse', 'batch'):
        print("Finished")
        return

    user_config = cli_args.config
    if not user_config:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))
    else:
        print('Using user-specified config: {}'.format(user_config))
        logger.info('Using user-specified config: {}'.format(user_config))
        PETRreader.parse_Config(user_config)

    read_dictionaries()
    start_time = time.time()
    print('\n\n')

    # Inputs given on the command line override the configured file list.
    paths = PETRglobals.TextFileList
    source = cli_args.inputs
    if source:
        if os.path.isdir(source):
            # Add the separator only when the directory name lacks one.
            pattern = '*.xml' if source[-1] == '/' else '/*.xml'
            paths = glob.glob(source + pattern)
        elif os.path.isfile(source):
            paths = [source]
        else:
            print(
                '\nFatal runtime error:\n"' +
                source +
                '" could not be located\nPlease enter a valid directory or file of source texts.')
            sys.exit()

    out = cli_args.outputs if cli_args.outputs else ""

    # 'batch' always treats its input as already parsed.
    already_parsed = cli_args.parsed if command == 'parse' else True
    run(paths, out, already_parsed)

    print("Coding time:", time.time() - start_time)
    print("Finished")
예제 #8
0
 def __init__(self,
              config_folder='data/config/',
              config_file='PETR_config.ini'):
     """Initialise logging, then load the config file and dictionaries.

     ``config_folder`` and ``config_file`` are passed to
     ``utilities._get_data`` to locate the file given to
     ``PETRreader.parse_Config``.
     """
     utilities.init_logger('PETRARCH.log')
     PETRglobals.RunTimeString = time.asctime()
     logging.getLogger('petr_log').info('Using Config file: ' + config_file)
     config_path = utilities._get_data(config_folder, config_file)
     PETRreader.parse_Config(config_path)
     petrarch2.read_dictionaries()
예제 #9
0
 def __init__(self, petrGlobal=None, config_folder='data/config/', config_file='PETR_config.ini'):
     """Set up PETRARCH state from config files or from a saved mapping.

     petrGlobal: when truthy, it is handed to ``self.load`` and no files
         are read.  When omitted or empty, logging is initialised and the
         config file and dictionaries are read from disk.  The default is
         ``None`` rather than ``{}`` so that no mutable object is shared
         between calls; both select the same branch.
     config_folder, config_file: passed to ``utilities._get_data`` to
         locate the file given to ``PETRreader.parse_Config``.
     """
     #cli_args = petrarch2.parse_cli_args()
     if not petrGlobal:
         utilities.init_logger('PETRARCH.log', debug=False)
         logger = logging.getLogger('petr_log')
         PETRglobals.RunTimeString = time.asctime()
         logger.info('Using Config file: '+config_file)
         PETRreader.parse_Config(utilities._get_data(config_folder, config_file))
         petrarch_ud.read_dictionaries()
         print("SUCCESSFULL ON LOADING DICTIONARIES")
     else:
         print ("LOADING FROM MAP")
         self.load(petrGlobal)
예제 #10
0
def read_dictionaries(validation=False):
    """Read the coding ontology and every dictionary named in ``PETRglobals``.

    In order: the internal coding ontology, the verb dictionary, the
    Petrarch 1 verb dictionary (only when ``CodeWithPetrarch1`` is truthy),
    each actor dictionary, each agent dictionary, the discard list and,
    unless ``IssueFileName`` is the empty string, the issues list.  Each
    file name is printed and then resolved with
    ``utilities._get_data('data/dictionaries', name)``.

    ``validation`` is accepted but not used in this function.
    """

    def _locate(file_name):
        # Resolve a dictionary file name to the path given to the readers.
        return utilities._get_data('data/dictionaries', file_name)

    print('Internal Coding Ontology:', PETRglobals.InternalCodingOntologyFileName)
    PETRreader.read_internal_coding_ontology(
        _locate(PETRglobals.InternalCodingOntologyFileName))

    print('Verb dictionary:', PETRglobals.VerbFileName)
    PETRreader.read_verb_dictionary(_locate(PETRglobals.VerbFileName))

    if PETRglobals.CodeWithPetrarch1:
        print('Petrarch 1 Verb dictionary:', PETRglobals.P1VerbFileName)
        PETRreader.read_petrarch1_verb_dictionary(
            _locate(PETRglobals.P1VerbFileName))

    print('Actor dictionaries:', PETRglobals.ActorFileList)
    for actor_file in PETRglobals.ActorFileList:
        PETRreader.read_actor_dictionary(_locate(actor_file))

    print('Agent dictionary:', PETRglobals.AgentFileList)
    for agent_file in PETRglobals.AgentFileList:
        PETRreader.read_agent_dictionary(_locate(agent_file))

    print('Discard dictionary:', PETRglobals.DiscardFileName)
    PETRreader.read_discard_list(_locate(PETRglobals.DiscardFileName))

    # The issues list is optional: an empty file name means "skip it".
    if PETRglobals.IssueFileName == "":
        return
    print('Issues dictionary:', PETRglobals.IssueFileName)
    PETRreader.read_issue_list(_locate(PETRglobals.IssueFileName))
예제 #11
0
def main(cli_args=None):
    """Run the coder from the command line or programmatically.

    Parameters
    ----------
    cli_args
        Pre-parsed arguments; when falsy they are obtained from
        ``parse_cli_args()``.  The attributes read here are ``config``,
        ``nullverbs``, ``nullactors``, ``inputs``, ``outputs``,
        ``command_name`` and ``parsed``.

    Returns
    -------
    Whatever ``run`` returns for the selected input paths.

    Exits the interpreter via ``sys.exit()`` when ``cli_args.inputs`` is
    neither an existing directory nor an existing file.
    """
    if not cli_args:
        cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    print(cli_args)
    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # Fixed: this used to log the "null verbs" message (copy-paste slip).
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    start_time = time.time()
    print('\n\n')

    # Inputs given on the command line override the configured file list.
    paths = PETRglobals.TextFileList
    if cli_args.inputs:
        if os.path.isdir(cli_args.inputs):
            # Add the separator only when the directory name lacks one.
            if cli_args.inputs[-1] != '/':
                paths = glob.glob(cli_args.inputs + '/*.xml')
            else:
                paths = glob.glob(cli_args.inputs + '*.xml')
        elif os.path.isfile(cli_args.inputs):
            paths = [cli_args.inputs]
        else:
            print(
                '\nFatal runtime error:\n"' + cli_args.inputs +
                '" could not be located\nPlease enter a valid directory or file of source texts.'
            )
            sys.exit()

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    if cli_args.command_name == 'parse':
        events = run(paths, out, cli_args.parsed)
    else:
        # Every other command treats its input as already parsed.
        events = run(paths, out, True)  # <===
    print("Coding time:", time.time() - start_time)
    print("Finished")
    return events
예제 #12
0
def main():
    """Command-line entry point for the ``parse`` and ``batch`` commands.

    Initialises logging, loads the configuration and dictionaries, applies
    the null-verbs / null-actors switches, works out the list of input
    files, and calls ``run``.  For any other command only "Finished" is
    printed.

    Exits the interpreter via ``sys.exit()`` when ``cli_args.inputs`` is
    neither an existing directory nor an existing file.
    """
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    if cli_args.command_name == 'parse' or cli_args.command_name == 'batch':  # 16.06.27: no longer needed, right?

        print(cli_args)
        if cli_args.config:
            print('Using user-specified config: {}'.format(cli_args.config))
            logger.info(
                'Using user-specified config: {}'.format(cli_args.config))
            PETRreader.parse_Config(cli_args.config)
        else:
            logger.info('Using default config file.')
            PETRreader.parse_Config(utilities._get_data('data/config/',
                                                        'PETR_config.ini'))

        if cli_args.nullverbs:
            print('Coding in null verbs mode; no events will be generated')
            logger.info('Coding in null verbs mode; no events will be generated')
            # Only get verb phrases that are not in the dictionary but are
            # associated with coded noun phrases
            PETRglobals.NullVerbs = True
        elif cli_args.nullactors:
            print('Coding in null actors mode; no events will be generated')
            # Fixed: this used to log the "null verbs" message (copy-paste
            # slip).
            logger.info('Coding in null actors mode; no events will be generated')
            # Only get actor phrases that are not in the dictionary but
            # associated with coded verb phrases
            PETRglobals.NullActors = True
            PETRglobals.NewActorLength = int(cli_args.nullactors)

        read_dictionaries()
        start_time = time.time()
        print('\n\n')

        # Inputs given on the command line override the configured list.
        paths = PETRglobals.TextFileList
        if cli_args.inputs:
            if os.path.isdir(cli_args.inputs):
                # Add the separator only when the directory name lacks one.
                if cli_args.inputs[-1] != '/':
                    paths = glob.glob(cli_args.inputs + '/*.xml')
                else:
                    paths = glob.glob(cli_args.inputs + '*.xml')
            elif os.path.isfile(cli_args.inputs):
                paths = [cli_args.inputs]
            else:
                print(
                    '\nFatal runtime error:\n"' +
                    cli_args.inputs +
                    '" could not be located\nPlease enter a valid directory or file of source texts.')
                sys.exit()

        out = ""  # PETRglobals.EventFileName
        if cli_args.outputs:
            out = cli_args.outputs

        if cli_args.command_name == 'parse':
            run(paths, out, cli_args.parsed)
        else:
            # 'batch' always treats its input as already parsed.
            run(paths, out, True)  ## <===

        print("Coding time:", time.time() - start_time)

    print("Finished")
예제 #13
0
                    triple[0], basestring) else triple[0].text
                target = triple[1] if isinstance(
                    triple[1], basestring) else triple[1].text
                others = ""
                for other in triple[3]:
                    others = others + other.text + ","
                tuples = tuples + "source: " + source + "\ttarget: " + target + "\tverb: " + triple[
                    2].text + "\tother_noun: " + others + "\n"
            ET.SubElement(sentence, "Triplets").text = tuples

    tree = ET.ElementTree(root)
    tree.write(outputfile, 'UTF-8')


# Module-level script: initialise logging, load the packaged config and the
# dictionaries, then read the XML file named by the first command-line
# argument.
# NOTE(review): the second argument to init_logger looks like a debug flag --
# confirm against utilities.init_logger.
utilities.init_logger('PETRARCH.log', True)
config = utilities._get_data('data/config/', 'PETR_config.ini')
print("reading config")
sys.stdout.write('Mk1\n')
PETRreader.parse_Config(config)
print("reading dicts")
petrarch_ud.read_dictionaries()
inputFile = sys.argv[1]
#inputFile=sys.argv[1].replace(".xml","")+"_parsed.xml"
# Output name: remove "_parsed.xml" from the input name and append
# "_phrase.xml".
outputFile = inputFile.replace("_parsed.xml", "") + "_phrase.xml"
events = read_xml_input([inputFile], True)
'''
print(len(events))
for key in events.keys():
	print(len(events[key]['sents']))
	for subkey,v in events[key]['sents'].items():
		print(subkey)
예제 #14
0
def process_target_bak(q, l, first_task, cli_args, multi_log_lock):
    """Worker-process body: run ``first_task``, then drain ``q`` and exit.

    Parameters
    ----------
    q
        Task queue; must provide ``qsize()`` and ``get()``.
    l
        Lock held while the queue is inspected and a task is taken from it.
    first_task
        Task processed before the queue is consulted.
    cli_args
        Parsed command-line arguments.  The attributes read here are
        ``config``, ``nullverbs``, ``nullactors`` and ``outputs``.
    multi_log_lock
        Lock forwarded to ``write_multiprocess_log`` and ``process_task``.
    """

    # Each child process loads the settings it needs before doing any work.
    utilities.init_logger()
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # Fixed: this used to log the "null verbs" message (copy-paste slip).
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    print('\n\n')

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    # Create a session for talking to the database.
    session = Session()

    # The child process handles its first task before polling the queue.
    write_multiprocess_log(
        multi_log_lock, '{}Process {}: {}'.format(u'', os.getpid(),
                                                  first_task))
    process_task(first_task, out, multi_log_lock, session)

    while l.acquire():
        # Queue is not empty; empty() is unreliable, so qsize() is used.
        if q.qsize() != 0:
            # Take the next task off the queue.
            task = q.get()
            # Release the lock as soon as the task has been fetched.
            l.release()
            # Process the fetched task.
            write_multiprocess_log(
                multi_log_lock,
                '{}Process {}: {}'.format(u'', os.getpid(), task))
            process_task(task, out, multi_log_lock, session)
        # Queue is empty.
        else:
            # Release the lock.
            l.release()
            # Leave the loop.
            break

    write_multiprocess_log(
        multi_log_lock, '{}Process {}: {}'.format(u'', os.getpid(),
                                                  u'exited...'))
예제 #15
0
def main():
    """Command-line entry point.

    For the ``background`` command, ``run_in_background`` is called and a
    ``KeyboardInterrupt`` ends the program.  For every other command the
    function initialises logging, loads the configuration and dictionaries,
    applies the null-verbs / null-actors switches, works out the input
    paths and calls ``run``.

    Exits the interpreter via ``sys.exit()`` when ``cli_args.inputs`` is
    neither an existing directory nor an existing file.
    """
    cli_args = parse_cli_args()

    # miaoweixin added begin
    # Run as a background program in an endless loop.
    if cli_args.command_name == 'background':
        try:
            # infinite loop
            # NOTE(review): if run_in_background ever returns normally,
            # execution falls through to the regular flow below -- confirm
            # that this is intended.
            run_in_background(cli_args)
        except KeyboardInterrupt:
            print("Program exited due to keyboard interrupt.\n")
            return None
    # miaoweixin added end

    utilities.init_logger()
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    print(cli_args)
    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # Fixed: this used to log the "null verbs" message (copy-paste slip).
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    start_time = time.time()
    print('\n\n')

    # Inputs given on the command line override the configured file list.
    paths = PETRglobals.TextFileList
    if cli_args.inputs:
        if os.path.isdir(cli_args.inputs):
            # Add the separator only when the directory name lacks one.
            if cli_args.inputs[-1] != '/':
                paths = glob.glob(cli_args.inputs + '/*.xml')
            else:
                paths = glob.glob(cli_args.inputs + '*.xml')
        elif os.path.isfile(cli_args.inputs):
            paths = [cli_args.inputs]
        else:
            print(
                '\nFatal runtime error:\n"' + cli_args.inputs +
                '" could not be located\nPlease enter a valid directory or file of source texts.'
            )
            sys.exit()
    elif cli_args.command_name == 'javainfo':
        # add else to java info 0904
        # With no inputs, 'javainfo' passes this marker string to run()
        # in place of a path list.
        paths = 'javainfo'

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    if cli_args.command_name == 'parse':
        run(paths, out, cli_args.parsed, cli_args)
    else:
        # Every other command treats its input as already parsed.
        run(paths, out, True, cli_args)  # <===

    print("Coding time:", time.time() - start_time)

    print("Finished")