Пример #1
0
def read_dictionaries(validation=False):
    """
    Load every dictionary named in PETRglobals through PETRreader.

    The verb, actor, agent and discard dictionaries are always read, in that
    order; the issues dictionary is read only when
    ``PETRglobals.IssueFileName`` is not the empty string. Each file is
    located with ``utilities._get_data('data/dictionaries', <name>)`` and its
    name is echoed to stdout before loading.

    Parameters
    ----------
    validation: bool
                Accepted for interface compatibility; not used by this body.
    """
    dict_dir = 'data/dictionaries'

    def load(label, filename, reader):
        # Announce the dictionary, resolve its path, hand it to the reader.
        print(label, filename)
        reader(utilities._get_data(dict_dir, filename))

    load('Verb dictionary:', PETRglobals.VerbFileName,
         PETRreader.read_verb_dictionary)

    # Actor dictionaries are announced once as a list, then read one by one.
    print('Actor dictionaries:', PETRglobals.ActorFileList)
    for actor_file in PETRglobals.ActorFileList:
        PETRreader.read_actor_dictionary(
            utilities._get_data(dict_dir, actor_file))

    load('Agent dictionary:', PETRglobals.AgentFileName,
         PETRreader.read_agent_dictionary)
    load('Discard dictionary:', PETRglobals.DiscardFileName,
         PETRreader.read_discard_list)

    if PETRglobals.IssueFileName != "":
        load('Issues dictionary:', PETRglobals.IssueFileName,
             PETRreader.read_issue_list)
Пример #2
0
def run_pipeline(data, out_file=None, config=None, write_output=True,
                 parsed=False):
    """
    Code a batch of pipeline input and either return or write the events.

    Parameters
    ----------
    data: Input handed unchanged to ``PETRreader.read_pipeline_input``.
    out_file: Destination passed to ``PETRwriter.write_events``; required
              whenever ``write_output`` is true.
    config: Optional path of a user config file. When falsy, the packaged
            ``data/config/PETR_config.ini`` is used.
    write_output: When false, the coded events are returned through
                  ``PETRwriter.pipe_output`` instead of being written.
    parsed: When false, the events are first run through
            ``utilities.stanford_parse``.

    Returns
    -------
    The value of ``PETRwriter.pipe_output(...)`` when ``write_output`` is
    false, otherwise None.

    Raises
    ------
    SystemExit: with status 1 when output is requested but ``out_file`` is
                missing (previously the exit status was 0).
    """
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')
    if config:
        print('Using user-specified config: {}'.format(config))
        logger.info('Using user-specified config: {}'.format(config))
        PETRreader.parse_Config(config)
    else:
        logger.info('Using default config file.')
        # Resolve the packaged config once and reuse it for log and parse.
        default_config = utilities._get_data('data/config/',
                                             'PETR_config.ini')
        logger.info('Config path: {}'.format(default_config))
        PETRreader.parse_Config(default_config)

    read_dictionaries()

    logger.info('Hitting read events...')
    events = PETRreader.read_pipeline_input(data)
    if parsed:
        logger.info('Hitting do_coding')
    else:
        events = utilities.stanford_parse(events)
    updated_events = do_coding(events, None)

    if not write_output:
        return PETRwriter.pipe_output(updated_events)

    if not out_file:
        print('Please specify an output file...')
        # Backslash is escaped explicitly: '\_' is an invalid escape sequence.
        logger.warning('Need an output file. ¯\\_(ツ)_/¯')
        sys.exit(1)

    PETRwriter.write_events(updated_events, out_file)
Пример #3
0
def main():
    """
    Command-line entry point: configure PETRARCH, load the dictionaries and
    code the requested input files.

    For the 'parse' and 'batch' commands this reads the config (user-specified
    or the packaged default), loads the dictionaries, resolves the input paths
    and calls ``run``. 'batch' always passes ``True`` as the parsed flag;
    'parse' passes ``cli_args.parsed``.

    Raises
    ------
    SystemExit: with status 1 when ``cli_args.inputs`` is neither an existing
                directory nor an existing file (previously status 0).
    """
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    if cli_args.command_name in ('parse', 'batch'):

        if cli_args.config:
            print('Using user-specified config: {}'.format(cli_args.config))
            logger.info(
                'Using user-specified config: {}'.format(cli_args.config))
            PETRreader.parse_Config(cli_args.config)
        else:
            logger.info('Using default config file.')
            PETRreader.parse_Config(utilities._get_data('data/config/',
                                                        'PETR_config.ini'))

        read_dictionaries()
        start_time = time.time()
        print('\n\n')

        # Default to the files named in the config; --inputs overrides them.
        paths = PETRglobals.TextFileList
        if cli_args.inputs:
            if os.path.isdir(cli_args.inputs):
                # os.path.join copes with a present or absent trailing slash.
                paths = glob.glob(os.path.join(cli_args.inputs, '*.xml'))
            elif os.path.isfile(cli_args.inputs):
                paths = [cli_args.inputs]
            else:
                print(
                    '\nFatal runtime error:\n"' +
                    cli_args.inputs +
                    '" could not be located\nPlease enter a valid directory or file of source texts.')
                # A fatal error must not report success to the shell.
                sys.exit(1)

        out = cli_args.outputs if cli_args.outputs else ""

        if cli_args.command_name == 'parse':
            run(paths, out, cli_args.parsed)
        else:
            run(paths, out, True)

        print("Coding time:", time.time() - start_time)

    print("Finished")
Пример #4
0
def test_date_check():
    """
    Check that the meaning of the same noun phrase depends on the sentence
    date: one parse is coded at three dates and compared with the expected
    actor code for each.
    """
    parse = "(S (NP (NNP CARL ) (NN XVI ) (NNP GUSTAF ) ) )"
    # (sentence date, expected meaning of the first child of the tree)
    cases = (
        ("20150813", ["SWEGOV"]),
        ("19720813", ["SWEELI"]),
        ("19010813", ["SWEELI"]),
    )

    for date_string, expected in cases:
        sentence = ptree.Sentence(parse, "Carl XVI Gustaf",
                                  PETRreader.dstr_to_ordate(date_string))
        noun_phrase = sentence.tree.children[0]
        assert noun_phrase.get_meaning() == expected
Пример #5
0
    def check_date(self, match):
        """
        Resolve date restrictions on actor codes.

        Parameters
        -----------
        match: list
               Entries from the dictionary. Each entry is indexed as
               ``entry[0]`` (the code) and ``entry[1]`` (a sequence of date
               strings; a string may start with '<' or '>').

        Returns
        -------
        code: string
              The first truthy code whose restriction is satisfied by
              ``self.date``. None when ``match`` yields no entries, otherwise
              the last falsy value assigned (normally "").

        Any exception raised while scanning is swallowed and the code found so
        far is returned.
        NOTE(review): the broad except looks like a deliberate best-effort
        trap -- confirm before narrowing it.
        """
        chosen = None
        try:
            for entry in match:
                restrictions = entry[1]
                bounds = []
                chosen = ""
                for stamp in restrictions:
                    # Keep a leading '<' / '>' marker, convert the rest.
                    if stamp[0] in "<>":
                        marker, digits = stamp[0], stamp[1:]
                    else:
                        marker, digits = "", stamp
                    bounds.append(
                        marker + str(PETRreader.dstr_to_ordate(digits)))

                today = self.date
                if not bounds:
                    # No restriction: the code always applies.
                    applies = True
                elif len(bounds) == 1:
                    only = bounds[0]
                    if only[0] == "<":
                        applies = today < int(only[1:])
                    else:
                        applies = today >= int(only[1:])
                else:
                    # Two dates: half-open interval [bounds[0], bounds[1]).
                    applies = today < int(bounds[1]) and today >= int(bounds[0])

                if applies:
                    chosen = entry[0]
                if chosen:
                    return chosen
        except Exception:
            return chosen

        return chosen
Пример #6
0
def gen_cameo_event(jsonString):
    """
    Read events from a JSON string and code them.

    Returns the result of ``do_coding`` for the events read by
    ``PETRreader.read_json``, or an empty dict when nothing was read.
    """
    parsed_events = PETRreader.read_json(jsonString)
    if not parsed_events:
        return {}
    return do_coding(parsed_events, None)
Пример #7
0
def run(filepaths, out_file, s_parsed):
    """
    Routine called from main(): read the XML input, code it and write the
    events to ``'evts.' + out_file``.

    ``s_parsed`` is only forwarded to ``PETRreader.read_xml_input``; this
    variant does not run the Stanford parser on unparsed input.
    """
    story_dict = PETRreader.read_xml_input(filepaths, s_parsed)
    coded = do_coding(story_dict, out_file)
    event_path = 'evts.' + out_file
    PETRwriter.write_events(coded, event_path)
Пример #8
0
def run(filepaths, out_file, s_parsed):
    """
    Routine called from main(): read the XML input, parse it if needed, code
    it and write the result.

    The writer and file-name prefix depend on the active mode:
    ``PETRglobals.NullVerbs`` -> 'nullverbs.', else
    ``PETRglobals.NullActors`` -> 'nullactors.', else 'evts.'.
    """
    story_dict = PETRreader.read_xml_input(filepaths, s_parsed)
    if not s_parsed:
        story_dict = utilities.stanford_parse(story_dict)
    coded = do_coding(story_dict, out_file)

    # Null-verbs mode takes precedence over null-actors mode.
    if PETRglobals.NullVerbs:
        writer, prefix = PETRwriter.write_nullverbs, 'nullverbs.'
    elif PETRglobals.NullActors:
        writer, prefix = PETRwriter.write_nullactors, 'nullactors.'
    else:
        writer, prefix = PETRwriter.write_events, 'evts.'
    writer(coded, prefix + out_file)
Пример #9
0
def main():
    """
    Command-line entry point: configure PETRARCH, select the coding mode,
    load the dictionaries and code the requested input files.

    For the 'parse' and 'batch' commands this reads the config (user-specified
    or the packaged default), switches on null-verbs or null-actors mode when
    requested, loads the dictionaries, resolves the input paths and calls
    ``run``. 'batch' always passes ``True`` as the parsed flag; 'parse' passes
    ``cli_args.parsed``.

    Raises
    ------
    SystemExit: with status 1 when ``cli_args.inputs`` is neither an existing
                directory nor an existing file (previously status 0).
    """
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    # 16.06.27: no longer needed, right?
    if cli_args.command_name in ('parse', 'batch'):

        print(cli_args)
        if cli_args.config:
            print('Using user-specified config: {}'.format(cli_args.config))
            logger.info(
                'Using user-specified config: {}'.format(cli_args.config))
            PETRreader.parse_Config(cli_args.config)
        else:
            logger.info('Using default config file.')
            PETRreader.parse_Config(utilities._get_data('data/config/',
                                                        'PETR_config.ini'))

        if cli_args.nullverbs:
            print('Coding in null verbs mode; no events will be generated')
            logger.info('Coding in null verbs mode; no events will be generated')
            # Only get verb phrases that are not in the dictionary but are
            # associated with coded noun phrases
            PETRglobals.NullVerbs = True
        elif cli_args.nullactors:
            print('Coding in null actors mode; no events will be generated')
            # The log line used to say "null verbs" here (copy-paste slip).
            logger.info('Coding in null actors mode; no events will be generated')
            # Only get actor phrases that are not in the dictionary but
            # associated with coded verb phrases
            PETRglobals.NullActors = True
            PETRglobals.NewActorLength = int(cli_args.nullactors)

        read_dictionaries()
        start_time = time.time()
        print('\n\n')

        # Default to the files named in the config; --inputs overrides them.
        paths = PETRglobals.TextFileList
        if cli_args.inputs:
            if os.path.isdir(cli_args.inputs):
                # os.path.join copes with a present or absent trailing slash.
                paths = glob.glob(os.path.join(cli_args.inputs, '*.xml'))
            elif os.path.isfile(cli_args.inputs):
                paths = [cli_args.inputs]
            else:
                print(
                    '\nFatal runtime error:\n"' +
                    cli_args.inputs +
                    '" could not be located\nPlease enter a valid directory or file of source texts.')
                # A fatal error must not report success to the shell.
                sys.exit(1)

        out = cli_args.outputs if cli_args.outputs else ""

        if cli_args.command_name == 'parse':
            run(paths, out, cli_args.parsed)
        else:
            run(paths, out, True)

        print("Coding time:", time.time() - start_time)

    print("Finished")
Пример #10
0
def test_reflexive():
    """
    Check the coded meaning of the reflexive pronoun phrase ("himself") in
    an upper-cased parse of "Obama asked himself why Biden was tired".
    """
    tree_text = "(S (NP (NNP Obama ) )  (VP (VBD asked ) (NP (PRP himself ) )  (SBAR (WHADVP (WRB why ) ) (S (NP (NNP Biden ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) )".upper()
    coding_date = PETRreader.dstr_to_ordate("20150813")

    sentence = ptree.Sentence(
        tree_text, "Obama asked himself why Biden was tired", coding_date)
    # Second child of the second top-level child of the tree.
    verb_phrase = sentence.tree.children[1]
    reflexive = verb_phrase.children[1]
    assert reflexive.get_meaning() == ["USAGOV"]
Пример #11
0
def main():
    """
    Command-line entry point: configure PETRARCH, select the coding mode,
    load the dictionaries and code the requested input files.

    For the 'parse' and 'batch' commands this reads the config (user-specified
    or the packaged default), switches on null-verbs or null-actors mode when
    requested, loads the dictionaries, resolves the input paths and calls
    ``run``. 'batch' always passes ``True`` as the parsed flag; 'parse' passes
    ``cli_args.parsed``.

    Raises
    ------
    SystemExit: with status 1 when ``cli_args.inputs`` is neither an existing
                directory nor an existing file (previously status 0).
    """
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    # 16.06.27: no longer needed, right?
    if cli_args.command_name in ('parse', 'batch'):

        print(cli_args)
        if cli_args.config:
            print('Using user-specified config: {}'.format(cli_args.config))
            logger.info('Using user-specified config: {}'.format(
                cli_args.config))
            PETRreader.parse_Config(cli_args.config)
        else:
            logger.info('Using default config file.')
            PETRreader.parse_Config(
                utilities._get_data('data/config/', 'PETR_config.ini'))

        if cli_args.nullverbs:
            print('Coding in null verbs mode; no events will be generated')
            logger.info(
                'Coding in null verbs mode; no events will be generated')
            # Only get verb phrases that are not in the dictionary but are
            # associated with coded noun phrases
            PETRglobals.NullVerbs = True
        elif cli_args.nullactors:
            print('Coding in null actors mode; no events will be generated')
            # The log line used to say "null verbs" here (copy-paste slip).
            logger.info(
                'Coding in null actors mode; no events will be generated')
            # Only get actor phrases that are not in the dictionary but
            # associated with coded verb phrases
            PETRglobals.NullActors = True
            PETRglobals.NewActorLength = int(cli_args.nullactors)

        read_dictionaries()
        start_time = time.time()
        print('\n\n')

        # Default to the files named in the config; --inputs overrides them.
        paths = PETRglobals.TextFileList
        if cli_args.inputs:
            if os.path.isdir(cli_args.inputs):
                # os.path.join copes with a present or absent trailing slash.
                paths = glob.glob(os.path.join(cli_args.inputs, '*.xml'))
            elif os.path.isfile(cli_args.inputs):
                paths = [cli_args.inputs]
            else:
                print(
                    '\nFatal runtime error:\n"' + cli_args.inputs +
                    '" could not be located\nPlease enter a valid directory or file of source texts.'
                )
                # A fatal error must not report success to the shell.
                sys.exit(1)

        out = cli_args.outputs if cli_args.outputs else ""

        if cli_args.command_name == 'parse':
            run(paths, out, cli_args.parsed)
        else:
            run(paths, out, True)

        print("Coding time:", time.time() - start_time)

    print("Finished")
Пример #12
0
def do_coding(event_dict, out_file):
    """
    Main coding loop: run every parsed sentence of every story through
    PETRtree and store the results back into ``event_dict``.

    Note that entering any character other than 'Enter' at the
    prompt will stop the program: this is deliberate.
    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Parameters
    ----------
    event_dict: dict
                Stories keyed by story id. Each story is read as
                ``story['meta']['date']`` and ``story['sents']``, a mapping of
                sentence id to a dict with 'content' and, optionally,
                'parsed', 'date' and 'config'.
    out_file: Unused; the TeX output it once controlled is disabled
              (<16.06.18 pas>).

    Returns
    -------
    event_dict, modified in place. Stories discarded as a whole have their
    'sents' entry set to None.
    """
    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0
    for key, val in sorted(event_dict.items()):
        NStory += 1

        SkipStory = False
        print('\n\nProcessing story {}'.format(key))
        StoryDate = val['meta']['date']
        for sent in val['sents']:
            NSent += 1
            sent_data = val['sents'][sent]
            # Built before the parse check so the "no parse" log line below
            # always names the current sentence (it used to hit an unbound
            # or stale SentenceID).
            SentenceID = '{}_{}'.format(key, sent)
            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            if 'config' in sent_data:
                for config in sent_data['config'].values():
                    change_Config_Options(config)

            SentenceText = sent_data['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_data['date'] if 'date' in sent_data else StoryDate
            Date = PETRreader.dstr_to_ordate(SentenceDate)

            print("\n", SentenceID)
            treestr = sent_data['parsed']
            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                else:
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    SkipStory = True
                    NDiscardStory += 1
                    break

            t1 = time.time()
            sentence = PETRtree.Sentence(treestr, SentenceText, Date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRtree
            coded_events, meta = sentence.get_events()
            code_time = time.time() - t1
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                # Null modes replace the story-level 'meta' with the sentence
                # meta and keep the sentence text at story level.
                # NOTE(review): a separate `elif PETRglobals.NullActors`
                # branch (store events at story level, set coded_events to
                # None) sat below this test and could never run; it was
                # dropped. Confirm whether this test was meant to be
                # NullVerbs only.
                val['meta'] = meta
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- here and per sentence
                # below -- and this is potentially confusing, so it probably
                # would be useful to change one of those
                val['meta']['verbs'] = meta

            del sentence
            times += code_time
            sents += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta
                if PETRglobals.WriteActorText or PETRglobals.WriteEventText or PETRglobals.WriteActorRoot:
                    text_dict = utilities.extract_phrases(sent_data, SentenceID)
                    if text_dict:
                        sent_meta = sent_data['meta']
                        sent_meta['actortext'] = {}
                        sent_meta['eventtext'] = {}
                        sent_meta['actorroot'] = {}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                sent_meta['actortext'][evt] = text_dict[evt][:2]
                                sent_meta['eventtext'][evt] = text_dict[evt][2]
                                sent_meta['actorroot'][evt] = text_dict[evt][3:5]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            # Same counts as len()-based bookkeeping for any container, but
            # tolerant of a None result.
            if coded_events:
                NEvents += len(coded_events)
            else:
                NEmpty += 1

        if SkipStory:
            val['sents'] = None

    print("\nSummary:")
    print(
        "Stories read:",
        NStory,
        "   Sentences coded:",
        NSent,
        "  Events generated:",
        NEvents)
    print(
        "Discards:  Sentence",
        NDiscardSent,
        "  Story",
        NDiscardStory,
        "  Sentences without events:",
        NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
Пример #13
0
    def check_date(self, match):
        """
        Method for resolving date restrictions on actor codes.

        Parameters
        -----------
        match: list
               Dates and codes from the dictionary. Each entry is indexed as
               ``entry[0]`` (the code) and ``entry[1]`` (a sequence of date
               strings; a string may start with '<' or '>').

        Returns
        -------
        code: string
              The first truthy code whose restriction is satisfied by
              ``self.date``. None when ``match`` yields no entries, otherwise
              the last falsy value assigned (normally "").


        Note <16.06.10 pas>
        -------------------
        In a very small set of cases involving a reflexive PRP inside a PP, the system can get into an infinite
        recursion where it first backs up a couple levels from the (PP, then this call to child.get_meaning() drops
        back down to the same point via the two child invocations in NounPhrase.get_meaning()

                    elif child.label == "PP":
                        m = self.resolve_codes(child.get_meaning())

        and in PrepPhrase.get_meaning()

                    self.meaning = self.children[1].get_meaning() if isinstance(self.children[1],NounPhrase) else ""

        which takes one back to the same point at one deeper level of recursion. These structures occurred about five times
        in a 20M sentence corpus, and I couldn't find any fix that didn't break something else, so I just trapped it
        here.

        There are a bunch of commented-out debugging prints remaining from this futile pursuit that could presumably be
        removed at some point.

        NOTE(review): this body contains no try/except, so exceptions raised
        here propagate to the caller -- confirm where the trap described
        above now lives.

        The full record for one of the offending cases is:

        <Sentence date = "20150824" id ="e35ef55a-fa30-4c34-baae-965dea33d8d8_3" source = "ANOTHER INFINITE RECURSION" sentence = "True">
        <Text>
        He started out at the bottom of the Hollywood rung, directed his own movie and managed to get noticed by Steven
        Spielberg himself to nab a tiny role in 1998s Saving Private Ryan .
        </Text>
        <Parse>
        (ROOT (S (S (NP (PRP He))
        (VP (VBD started) (PRT (RP out))
        (PP (IN at)
        (NP (NP (DT the) (NN bottom))
        (PP (IN of) (NP (DT the) (NNP Hollywood) ))))))
        (VP (VBD rung))
        (, ,)
        (S (VP
        (VP (VBD directed) (NP (PRP$ his) (JJ own) (NN movie))) (CC and)
        (VP (VBD managed) (S
        (VP (TO to)
        (VP (VB get)
            (VP (VBN noticed)
            (PP (IN by)
                (NP (NNP Steven) (NNP Spielberg) (PRP himself))
            )
            (S  (VP (TO to)  (VP (VB nab)
                    (NP (NP (DT a) (JJ tiny) (NN role))
                    (PP (IN in)
                        (NP (NP (NNS 1998s))  (VP (VBG Saving)  (NP (JJ Private) (NNP Ryan))
                    ))))))))))))))
        (. .)))
        </Parse>
        </Sentence>

        """
        chosen = None
        for entry in match:
            restrictions = entry[1]
            bounds = []
            chosen = ""
            for stamp in restrictions:
                # Keep a leading '<' / '>' marker, convert the rest.
                if stamp[0] in '<>':
                    marker, digits = stamp[0], stamp[1:]
                else:
                    marker, digits = "", stamp
                bounds.append(marker + str(PETRreader.dstr_to_ordate(digits)))

            today = self.date

            if not bounds:
                # No restriction: the code always applies.
                applies = True
            elif len(bounds) == 1:
                only = bounds[0]
                if only[0] == '<':
                    applies = today < int(only[1:])
                else:
                    applies = today >= int(only[1:])
            else:
                # Two dates: half-open interval [bounds[0], bounds[1]).
                applies = today < int(bounds[1]) and today >= int(bounds[0])

            if applies:
                chosen = entry[0]
            if chosen:
                return chosen

        return chosen
Пример #14
0
def do_coding(event_dict):
    """
    Main coding loop: run every parsed sentence of every story through
    PETRtree and store the results back into ``event_dict``.

    Note that entering any character other than 'Enter' at the
    prompt will stop the program: this is deliberate.
    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Parameters
    ----------
    event_dict: dict
                Stories keyed by story id. Each story is read as
                ``story['meta']['date']`` and ``story['sents']``, a mapping of
                sentence id to a dict with 'content' and, optionally,
                'parsed', 'date' and 'config'.

    Returns
    -------
    event_dict, modified in place. Stories discarded as a whole have their
    'sents' entry set to None.
    """
    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0
    for key, val in sorted(event_dict.items()):
        NStory += 1

        SkipStory = False
        print('\n\nProcessing story {}'.format(key))
        StoryDate = val['meta']['date']
        for sent in val['sents']:
            NSent += 1
            sent_data = val['sents'][sent]
            # Built before the parse check so the "no parse" log line below
            # always names the current sentence (it used to hit an unbound
            # or stale SentenceID).
            SentenceID = '{}_{}'.format(key, sent)
            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            if 'config' in sent_data:
                for config in sent_data['config'].values():
                    change_Config_Options(config)

            SentenceText = sent_data['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_data['date'] if 'date' in sent_data else StoryDate
            Date = PETRreader.dstr_to_ordate(SentenceDate)

            print("\n", SentenceID)
            treestr = sent_data['parsed']
            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                else:
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    SkipStory = True
                    NDiscardStory += 1
                    break

            t1 = time.time()
            sentence = PETRtree.Sentence(treestr, SentenceText, Date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRtree
            coded_events, meta = sentence.get_events()
            code_time = time.time() - t1
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                # Null modes replace the story-level 'meta' with the sentence
                # meta and keep the sentence text at story level.
                # NOTE(review): a separate `elif PETRglobals.NullActors`
                # branch (store events at story level, set coded_events to
                # None) sat below this test and could never run; it was
                # dropped. Confirm whether this test was meant to be
                # NullVerbs only.
                val['meta'] = meta
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- here and per sentence
                # below -- and this is potentially confusing, so it probably
                # would be useful to change one of those
                val['meta']['verbs'] = meta

            del sentence
            times += code_time
            sents += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta
                if PETRglobals.WriteActorText or PETRglobals.WriteEventText or PETRglobals.WriteActorRoot:
                    text_dict = utilities.extract_phrases(sent_data, SentenceID)
                    if text_dict:
                        sent_meta = sent_data['meta']
                        sent_meta['actortext'] = {}
                        sent_meta['eventtext'] = {}
                        sent_meta['actorroot'] = {}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                sent_meta['actortext'][evt] = text_dict[evt][:2]
                                sent_meta['eventtext'][evt] = text_dict[evt][2]
                                sent_meta['actorroot'][evt] = text_dict[evt][3:5]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            # Same counts as len()-based bookkeeping for any container, but
            # tolerant of a None result.
            if coded_events:
                NEvents += len(coded_events)
            else:
                NEmpty += 1

        if SkipStory:
            val['sents'] = None

    print("\nSummary:")
    print("Stories read:", NStory, "   Sentences coded:", NSent,
          "  Events generated:", NEvents)
    print("Discards:  Sentence", NDiscardSent, "  Story", NDiscardStory,
          "  Sentences without events:", NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
Пример #15
0
def process_target_bak(q, l, first_task, cli_args, multi_log_lock):
    """
    Worker body: set up PETRARCH in this process, run ``first_task``, then
    keep taking tasks from ``q`` until it is empty.

    Parameters
    ----------
    q: Task queue; only ``qsize()`` and ``get()`` are used.
    l: Lock guarding the check-then-get on ``q``; ``acquire()`` and
       ``release()`` are used.
    first_task: Task processed before the queue is consulted.
    cli_args: Parsed command-line arguments (``config``, ``nullverbs``,
              ``nullactors`` and ``outputs`` are read).
    multi_log_lock: Passed through to ``write_multiprocess_log`` and
                    ``process_task``.
    """
    # The child process first loads everything it needs in order to run.
    utilities.init_logger()
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # The log line used to say "null verbs" here (copy-paste slip).
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    print('\n\n')

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    # Create a session for talking to the database.
    session = Session()

    # The child process completes its first task before polling the queue.
    write_multiprocess_log(
        multi_log_lock, '{}Process {}: {}'.format(u'', os.getpid(),
                                                  first_task))
    process_task(first_task, out, multi_log_lock, session)

    while l.acquire():
        try:
            # empty() is unreliable, so qsize() decides whether work remains.
            if q.qsize() == 0:
                # Queue drained: leave the loop (the lock is released below).
                break
            # Take the next task while still holding the lock.
            task = q.get()
        finally:
            # Always release the lock, even if q.get() raises.
            l.release()
        # Process the task outside the lock.
        write_multiprocess_log(
            multi_log_lock,
            '{}Process {}: {}'.format(u'', os.getpid(), task))
        process_task(task, out, multi_log_lock, session)

    write_multiprocess_log(
        multi_log_lock, '{}Process {}: {}'.format(u'', os.getpid(),
                                                  u'exited...'))
Пример #16
0
def main():
    """Command-line entry point: configure PETRARCH, load dictionaries, run coding.

    Steps, in order:
      1. Parse the command line. For the ``background`` command, hand control
         to ``run_in_background`` (a KeyboardInterrupt there ends the program).
      2. Initialise logging and read the user-specified or default config.
      3. Switch on null-verbs / null-actors mode when requested.
      4. Load the dictionaries and resolve the input files.
      5. Call ``run`` and print the elapsed coding time.

    Returns None. Exits the process via ``sys.exit()`` when ``--inputs`` names
    a path that is neither a directory nor a file.
    """
    cli_args = parse_cli_args()

    # miaoweixin added begin
    # Run as a background program in an endless loop.
    if cli_args.command_name == 'background':
        try:
            # infinite loop
            run_in_background(cli_args)
        except KeyboardInterrupt:
            print("Program exited due to keyboard interrupt.\n")
            return None
    # miaoweixin added end
    # NOTE(review): if run_in_background() ever returns normally, execution
    # falls through to the one-shot path below -- confirm this is intended.

    utilities.init_logger()
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    print(cli_args)
    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # Fixed: this branch used to log the "null verbs" message.
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    start_time = time.time()
    print('\n\n')

    # Default to the file list from the config; --inputs overrides it.
    paths = PETRglobals.TextFileList
    if cli_args.inputs:
        if os.path.isdir(cli_args.inputs):
            # Directory: take every *.xml directly inside it.
            if cli_args.inputs[-1] != '/':
                paths = glob.glob(cli_args.inputs + '/*.xml')
            else:
                paths = glob.glob(cli_args.inputs + '*.xml')
        elif os.path.isfile(cli_args.inputs):
            paths = [cli_args.inputs]
        else:
            print(
                '\nFatal runtime error:\n"' + cli_args.inputs +
                '" could not be located\nPlease enter a valid directory or file of source texts.'
            )
            sys.exit()
    elif cli_args.command_name == 'javainfo':
        # add else to java info 0904
        # The literal string 'javainfo' is passed to run() in place of a
        # path list for this command.
        paths = 'javainfo'

    out = ""  # PETRglobals.EventFileName
    if cli_args.outputs:
        out = cli_args.outputs

    if cli_args.command_name == 'parse':
        run(paths, out, cli_args.parsed, cli_args)
    else:
        # Every other command passes True as the third (parsed) argument.
        run(paths, out, True, cli_args)  # <===

    print("Coding time:", time.time() - start_time)

    print("Finished")
Пример #17
0
def _load_config(cli_args, logger):
    """Parse the user-specified config (or the packaged default) and record its name."""
    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(
            cli_args.config))
        PETRglobals.ConfigFileName = cli_args.config

        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRglobals.ConfigFileName = 'PETR_config.ini'
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))


def _resolve_input_paths(cli_args):
    """Return the input files: --inputs (directory or file) or the configured list."""
    paths = PETRglobals.TextFileList
    if cli_args.inputs:
        if os.path.isdir(cli_args.inputs):
            # Directory: take every *.xml directly inside it.
            if cli_args.inputs[-1] != '/':
                paths = glob.glob(cli_args.inputs + '/*.xml')
            else:
                paths = glob.glob(cli_args.inputs + '*.xml')
        elif os.path.isfile(cli_args.inputs):
            paths = [cli_args.inputs]
        else:
            print(
                '\nFatal runtime error:\n"' + cli_args.inputs +
                '" could not be located\nPlease enter a valid directory or file of source texts.'
            )
            sys.exit()
    return paths


def main():
    """Command-line entry point for the parse, batch and preprocess commands.

    ``parse`` / ``batch``: load the config and dictionaries, resolve the input
    files and call ``run`` (``batch`` always passes True as the parsed flag).
    ``preprocess``: load the config, resolve the input files and call
    ``preprocess``; dictionaries are not loaded on this path.
    Any other command only prints "Finished".

    Exits the process via ``sys.exit()`` when ``--inputs`` names a path that
    is neither a directory nor a file.
    """
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log', cli_args.debug)
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    if cli_args.command_name == 'parse' or cli_args.command_name == 'batch':
        _load_config(cli_args, logger)
        read_dictionaries()

        # Timing starts after the dictionaries have been loaded.
        start_time = time.time()
        print('\n\n')

        paths = _resolve_input_paths(cli_args)

        out = ""  #PETRglobals.EventFileName
        if cli_args.outputs:
            out = cli_args.outputs

        if cli_args.command_name == 'parse':
            run(paths, out, cli_args.parsed)
        else:
            run(paths, out, True)  ## <===

        print("Coding time:", time.time() - start_time)

    elif cli_args.command_name == 'preprocess':
        _load_config(cli_args, logger)

        start_time = time.time()
        print('\n\n')

        preprocess(_resolve_input_paths(cli_args))

        print("Preprocessing time:", time.time() - start_time)

    print("Finished")
Пример #18
0
def read_dictionaries(validation=False):
    """Load every dictionary named in ``PETRglobals`` through ``PETRreader``.

    Load order: internal coding ontology, verb dictionary, the Petrarch 1
    verb dictionary (only when ``PETRglobals.CodeWithPetrarch1`` is set),
    each actor dictionary, each agent dictionary, the discard list and,
    when ``PETRglobals.IssueFileName`` is not the empty string, the issue
    list. Each file name is announced on stdout before it is read.

    ``validation`` is accepted but not used by this function.
    """
    dict_dir = 'data/dictionaries'

    ontology_name = PETRglobals.InternalCodingOntologyFileName
    print('Internal Coding Ontology:', ontology_name)
    PETRreader.read_internal_coding_ontology(
        utilities._get_data(dict_dir, ontology_name))

    print('Verb dictionary:', PETRglobals.VerbFileName)
    PETRreader.read_verb_dictionary(
        utilities._get_data(dict_dir, PETRglobals.VerbFileName))

    if PETRglobals.CodeWithPetrarch1:
        print('Petrarch 1 Verb dictionary:', PETRglobals.P1VerbFileName)
        PETRreader.read_petrarch1_verb_dictionary(
            utilities._get_data(dict_dir, PETRglobals.P1VerbFileName))

    print('Actor dictionaries:', PETRglobals.ActorFileList)
    for actor_file in PETRglobals.ActorFileList:
        PETRreader.read_actor_dictionary(
            utilities._get_data(dict_dir, actor_file))

    print('Agent dictionary:', PETRglobals.AgentFileList)
    for agent_file in PETRglobals.AgentFileList:
        PETRreader.read_agent_dictionary(
            utilities._get_data(dict_dir, agent_file))

    print('Discard dictionary:', PETRglobals.DiscardFileName)
    PETRreader.read_discard_list(
        utilities._get_data(dict_dir, PETRglobals.DiscardFileName))

    # The issue list is optional: an empty file name means "none configured".
    if PETRglobals.IssueFileName == "":
        return
    print('Issues dictionary:', PETRglobals.IssueFileName)
    PETRreader.read_issue_list(
        utilities._get_data(dict_dir, PETRglobals.IssueFileName))
Пример #19
0
def test_personal1():
    """The pronoun phrase in "Obama said he was tired" must code as USAGOV."""
    tree_text = "(S (NP (NNP Obama ) ) (VP (VBD said ) (SBAR (S (NP (PRP he ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) ) ".upper()
    coding_date = PETRreader.dstr_to_ordate("20150813")

    sentence = ptree.Sentence(tree_text, "Obama said he was tired", coding_date)

    # Descend by child index; per the bracketing above this is presumably
    # S -> VP -> SBAR -> S -> NP ("he").
    node = sentence.tree
    for child_index in (1, 1, 0, 0):
        node = node.children[child_index]

    assert node.get_meaning() == ["USAGOV"]
Пример #20
0
def preprocess(filepaths):
    """Run ``PETRreader.depparse_xml_input`` over ``filepaths``.

    This is the routine called from main(). The parsed result is bound
    locally but not returned, so the function returns None.
    """
    petr_logger = logging.getLogger('petr_log')

    parsed_events = PETRreader.depparse_xml_input(filepaths)
Пример #21
0
def _record_petrarch1_events(sent_data, p1_coded_events, coded_events):
    """Store Petrarch 1 pattern events on the sentence under 'p1_<n>' ids."""
    events = sent_data.setdefault('events', {})
    triplets = sent_data.setdefault('triplets', {})
    for i, p1_event in enumerate(p1_coded_events):
        event_id = 'p1_' + str(i)
        events[event_id] = [[p1_event[0]], [p1_event[1]], p1_event[2]]
        triplets[event_id] = {
            'matched_txt': p1_event[5],
            # Missing source/target text is written as the "---" placeholder.
            'source_text': p1_event[3] if p1_event[3] is not None else "---",
            'target_text': p1_event[4] if p1_event[4] is not None else "---",
            'verb_text': p1_event[6],
        }
        coded_events[event_id] = events[event_id]


def do_coding(event_dict):
    """
    Main coding loop: code every parsed sentence of every story in place.

    Note that entering any character other than 'Enter' at the
    PauseBySentence prompt will stop the program: this is deliberate.
    Legacy note <14.02.28>: "PETRglobals.PauseByStory actually pauses after
    the first sentence of the *next* story" (PauseByStory is not referenced
    in this function).

    For each story (iterated in sorted key order) and each sentence that has
    a 'parsed' entry:
      * apply any per-sentence 'config' overrides,
      * run the discard check (a sentence discard skips the sentence, a
        story discard stops the story and sets its 'sents' to None),
      * build a PETRgraph.Sentence and store its 'events', 'verbs', 'nouns'
        and 'triplets' (Petrarch 2) and/or 'p1_<n>' events and triplets
        (Petrarch 1) on the sentence entry,
      * attach 'issues' when events were coded and an issue file is set.
    Sentences without 'parsed' are logged and skipped. NullVerbs / NullActors
    handling is not performed by this variant.

    :param event_dict: mapping story key -> {'meta': {'date': ...},
        'sents': {sentence key -> sentence dict}}.
    :return: the same ``event_dict`` object, updated in place.
    """
    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0

    for key, val in sorted(event_dict.items()):
        NStory += 1
        SkipStory = False

        StoryDate = val['meta']['date']
        for sent in val['sents']:
            NSent += 1
            SentenceID = '{}_{}'.format(key, sent)
            sent_data = val['sents'][sent]

            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            if 'config' in sent_data:
                for config in sent_data['config'].values():
                    change_Config_Options(config)

            SentenceText = sent_data['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_data.get('date', StoryDate)
            Date = PETRreader.dstr_to_ordate(SentenceDate)

            print("\n", SentenceID)
            treestr = sent_data['parsed']

            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                else:
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    SkipStory = True
                    NDiscardStory += 1
                    break

            t1 = time.time()
            # this is the entry point into the processing in PETRgraph
            sentence = PETRgraph.Sentence(treestr, SentenceText, Date)
            coded_events = {}

            if PETRglobals.CodeWithPetrarch2:
                coded_events.update(sentence.get_events())

                sent_data['events'] = sentence.events
                sent_data['verbs'] = sentence.verbs
                sent_data['nouns'] = sentence.nouns
                sent_data['triplets'] = sentence.triplets

            if PETRglobals.CodeWithPetrarch1:
                _record_petrarch1_events(
                    sent_data,
                    sentence.get_events_from_petrarch1_patterns(),
                    coded_events)

            # Use .get(): when neither coder is enabled the sentence has no
            # 'events' / 'triplets' entries and plain indexing would raise.
            logger.debug("check events of id:" + SentenceID)
            for eventID, event in sent_data.get('events', {}).items():
                logger.debug("event:" + eventID)
                logger.debug(event)

            for tID, triplet in sent_data.get('triplets', {}).items():
                logger.debug("triplet:" + tID)
                logger.debug(triplet['matched_txt'])

            code_time = time.time() - t1

            del sentence
            times += code_time
            sents += 1

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                # Any non-empty reply aborts the run (deliberate).
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            NEvents += len(coded_events)
            if not coded_events:
                NEmpty += 1

        if SkipStory:
            val['sents'] = None

    print("\nSummary:")
    print("Stories read:", NStory, "   Sentences coded:", NSent,
          "  Events generated:", NEvents)
    print("Discards:  Sentence", NDiscardSent, "  Story", NDiscardStory,
          "  Sentences without events:", NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)

    return event_dict
Пример #22
0
def do_coding(event_dict, out_file):
    """
    Main coding loop: code every parsed sentence of every story in place.

    Note that entering any character other than 'Enter' at the
    PauseBySentence prompt will stop the program: this is deliberate.
    Legacy note <14.02.28>: "PETRglobals.PauseByStory actually pauses after
    the first sentence of the *next* story" (PauseByStory is not referenced
    in this function).

    For each story (sorted by key) and each sentence with a 'parsed' entry:
    apply per-sentence 'config' overrides, run the discard check, build a
    PETRtree.Sentence, store the coded events on the sentence and the verb
    metadata under ``event_dict[key]['meta']['verbs']``, and attach 'issues'
    when an issue file is configured. Sentences without 'parsed' are logged
    and skipped. A story discard sets the story's 'sents' to None.

    :param event_dict: mapping story key -> {'meta': {'date': ...},
        'sents': {sentence key -> sentence dict}}.
    :param out_file: when truthy, passed to ``open_tex``; each coded
        sentence's tree is printed to that file, and the file is closed with
        ``close_tex`` even if coding raises.
    :return: the same ``event_dict`` object, updated in place.
    """
    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    if out_file:
        tex_file = open_tex(out_file)

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0
    try:
        for key, val in sorted(event_dict.items()):
            NStory += 1

            SkipStory = False
            print('\n\nProcessing {}'.format(key))
            StoryDate = val['meta']['date']
            for sent in val['sents']:
                NSent += 1
                # Bound before the parse check so the "no parse" log below
                # always names the current sentence (it used to be unbound
                # or stale there).
                SentenceID = '{}_{}'.format(key, sent)
                sent_data = val['sents'][sent]

                if 'parsed' not in sent_data:
                    logger.info(
                        '{} has no parse information. Passing.'.format(
                            SentenceID))
                    continue

                if 'config' in sent_data:
                    for config in sent_data['config'].values():
                        change_Config_Options(config)

                SentenceText = sent_data['content']
                # A sentence-level date overrides the story date.
                SentenceDate = sent_data.get('date', StoryDate)
                Date = PETRreader.dstr_to_ordate(SentenceDate)

                print("\t\t", SentenceID)
                treestr = sent_data['parsed']
                disc = check_discards(SentenceText)
                if disc[0] > 0:
                    if disc[0] == 1:
                        print("Discard sentence:", disc[1])
                        logger.info('\tSentence discard. {}'.format(disc[1]))
                        NDiscardSent += 1
                        continue
                    else:
                        print("Discard story:", disc[1])
                        logger.info('\tStory discard. {}'.format(disc[1]))
                        SkipStory = True
                        NDiscardStory += 1
                        break

                t1 = time.time()
                sentence = PETRtree.Sentence(treestr, SentenceText, Date)
                coded_events, meta = sentence.get_events()
                code_time = time.time() - t1
                val['meta']['verbs'] = meta

                if out_file:
                    sentence.print_to_file(sentence.tree, file=tex_file)

                del sentence
                times += code_time
                sents += 1
                print('\t\t', code_time)

                if coded_events:
                    sent_data['events'] = coded_events
                if coded_events and PETRglobals.IssueFileName != "":
                    event_issues = get_issues(SentenceText)
                    if event_issues:
                        sent_data['issues'] = event_issues

                if PETRglobals.PauseBySentence:
                    # Any non-empty reply aborts the run (deliberate).
                    if len(input("Press Enter to continue...")) > 0:
                        sys.exit()

                NEvents += len(coded_events)
                if len(coded_events) == 0:
                    NEmpty += 1

            if SkipStory:
                val['sents'] = None
    finally:
        # Close the TeX output on every exit path, including errors and the
        # deliberate sys.exit() above.
        if out_file:
            close_tex(tex_file)

    print("\nSummary:")
    print(
        "Stories read:",
        NStory,
        "   Sentences coded:",
        NSent,
        "  Events generated:",
        NEvents)
    print(
        "Discards:  Sentence",
        NDiscardSent,
        "  Story",
        NDiscardStory,
        "  Sentences without events:",
        NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
Пример #23
0
def run(filepaths, out_file, s_parsed):
    """Read the XML input, parse it if needed, code it and write the events.

    When ``s_parsed`` is falsy the stories are first sent through
    ``utilities.stanford_parse``. The coded result is written with
    ``PETRwriter.write_events`` to ``out_file``.
    """
    story_events = PETRreader.read_xml_input(filepaths, s_parsed)
    if s_parsed:
        parsed_events = story_events
    else:
        parsed_events = utilities.stanford_parse(story_events)
    coded_events = do_coding(parsed_events, out_file)
    PETRwriter.write_events(coded_events, out_file)
Пример #24
0
def test_reflexive2():
    """The reflexive "himself" in the embedded clause must code as RUSGOV."""
    tree_text = "(S (NP (NNP Obama ) ) (VP (VBD knew ) (SBAR (IN that ) (S (NP (NNP Putin ) ) (VP (VBD liked ) (NP (PRP himself ) ) ) ) ) )  ) ".upper()
    coding_date = PETRreader.dstr_to_ordate("20150813")

    # NOTE(review): the sentence text below says "Biden liked him" while the
    # parse says "PUTIN LIKED HIMSELF" -- looks like a copy-paste leftover;
    # confirm the text is not used for coding before changing it.
    sentence = ptree.Sentence(tree_text, "Obama knew that Biden liked him", coding_date)

    # Descend by child index; per the bracketing above this is presumably
    # S -> VP -> SBAR -> S -> VP -> NP ("himself").
    node = sentence.tree
    for child_index in (1, 1, 1, 1, 1):
        node = node.children[child_index]

    assert node.get_meaning() == ["RUSGOV"]
Пример #25
0
def run(filepaths, out_file, s_parsed):
    """Read the XML input and run the coding loop over it.

    This is the routine called from main(). The Stanford parsing step is
    disabled in this variant, and ``out_file`` is not used by the visible
    body; the coded events are bound locally but not returned.
    """
    raw_events = PETRreader.read_xml_input(filepaths, s_parsed)
    coded_events = do_coding(raw_events)
Пример #26
0
                others = ""
                for other in triple[3]:
                    others = others + other.text + ","
                tuples = tuples + "source: " + source + "\ttarget: " + target + "\tverb: " + triple[
                    2].text + "\tother_noun: " + others + "\n"
            ET.SubElement(sentence, "Triplets").text = tuples

    tree = ET.ElementTree(root)
    tree.write(outputfile, 'UTF-8')


# Script body (runs at import time): set up logging and configuration, load
# the dictionaries, read the XML file named by the first command-line
# argument, then run extract_actor_code() over the events that were read.
utilities.init_logger('PETRARCH.log', True)
# Always uses the packaged default config file.
config = utilities._get_data('data/config/', 'PETR_config.ini')
print("reading config")
# Progress marker written straight to stdout.
sys.stdout.write('Mk1\n')
PETRreader.parse_Config(config)
print("reading dicts")
petrarch_ud.read_dictionaries()
# Input path comes from argv[1]; raises IndexError when no argument is given.
inputFile = sys.argv[1]
#inputFile=sys.argv[1].replace(".xml","")+"_parsed.xml"
# Output name: remove every "_parsed.xml" occurrence, then append "_phrase.xml".
outputFile = inputFile.replace("_parsed.xml", "") + "_phrase.xml"
# Second argument True -- presumably "input is already parsed"; verify against
# read_xml_input's signature.
events = read_xml_input([inputFile], True)
# The string literal below is disabled debugging code (a no-op expression).
'''
print(len(events))
for key in events.keys():
	print(len(events[key]['sents']))
	for subkey,v in events[key]['sents'].items():
		print(subkey)
		print(v)
'''
updated_events = extract_actor_code(events)
Пример #27
0
def do_coding(event_dict):
    """
    Main coding loop for paragraph-keyed input with release/report times.

    Note that entering any character other than 'Enter' at the
    PauseBySentence prompt will stop the program: this is deliberate.
    Legacy note <14.02.28>: "PETRglobals.PauseByStory actually pauses after
    the first sentence of the *next* story" (PauseByStory is not referenced
    in this function).

    Keys of ``event_dict`` are split on "-" into an article id and a
    paragraph id. For every paragraph whose meta date is not 'NULL':
      * 'realiseTime' and 'reportTime' are stored in the paragraph meta
        (report time falls back to the release time when missing),
      * trace lines are appended to "timeinfo.txt",
      * every sentence with a 'parsed' entry is discard-checked, turned into
        a PETRtree.Sentence, classified, time-stamped and coded; coded
        events, meta and (when the Write* flags ask for it) actor/event
        text are stored on the sentence entry.
    Sentences whose tree construction, classification or event extraction
    raises are logged and skipped.

    :param event_dict: mapping "<article>-<paragraph>" -> {'meta': {...},
        'sents': {sentence key -> sentence dict}}.
    :return: the same ``event_dict`` object, updated in place.
    """
    # Tallies retained for the detailed summary, which is currently disabled.
    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0

    # Get the release (publication) times.
    realiseTimeDic = get_releasetime(event_dict)

    if not realiseTimeDic:
        print(
            "realiseTimeDic have no timeinfo ,please check “get_releasetime” method"
        )
    # Get the report times.
    reporttimeDic = get_reporttime(event_dict, realiseTimeDic)

    for key, val in sorted(event_dict.items()):
        NStory += 1
        SkipStory = False
        print('\n\nProcessing paragraph {}'.format(key))
        StoryDate = val['meta']['date']
        if StoryDate == 'NULL':
            continue

        id_parts = key.split("-")
        articleId = id_parts[0]
        paraghId = id_parts[1]

        # Set release and report time; a missing report time falls back to
        # the release time.
        val["meta"]["realiseTime"] = realiseTimeDic[articleId]
        if articleId in reporttimeDic:
            val["meta"]["reportTime"] = reporttimeDic[articleId]
        else:
            val["meta"]["reportTime"] = realiseTimeDic[articleId]

        # NOTE(review): the .decode("utf-8").encode("utf-8") round-trips in
        # this function only work on Python 2 byte strings (Python 3 str has
        # no .decode) -- confirm the target interpreter before porting.
        if paraghId == "0000":
            # The article's times are written once, with its first paragraph.
            with open("timeinfo.txt", "a") as f:

                f.writelines(("发布时间:" + val["meta"]["realiseTime"]
                              ).decode("utf-8").encode("utf-8") + "\n")
                f.writelines(("报道时间:" + val["meta"]["reportTime"]
                              ).decode("utf-8").encode("utf-8") + "\n")
        with open("timeinfo.txt", "a") as f:
            f.writelines(("文章段落ID:" + articleId + " " + paraghId +
                          "\n").decode("utf-8").encode("utf-8"))

        for sent in sorted(val['sents']):
            print('\n\nProcessing sentence {}'.format(sent))
            NSent += 1
            # Bound before the parse check so every log line below names the
            # current sentence (it used to be unbound or stale in the
            # "no parse" branch).
            SentenceID = '{}_{}'.format(key, sent)
            sent_data = val['sents'][sent]

            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            SentenceText = sent_data['content']
            # A sentence-level date overrides the paragraph date.
            SentenceDate = sent_data.get('date', StoryDate)
            # Keep only the part before the first space and drop dashes
            # before converting to an ordinal date.
            Date = PETRreader.dstr_to_ordate(
                SentenceDate.split(' ')[0].replace('-', ''))
            treestr = sent_data['parsed']
            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                else:
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    # NOTE(review): sibling versions of this loop set
                    # SkipStory = True here; with False the 'sents' = None
                    # reset below never runs -- confirm this is intended.
                    SkipStory = False
                    NDiscardStory += 1
                    break

            t1 = time.time()
            try:
                sentence = PETRtree.Sentence(treestr, SentenceText, Date)
                # Invoke the syntax-tree classifier.
                sentence.classify_tree()
            except Exception as e:
                message = "ERROR IN PETRARCH2 DO_CODING:" + SentenceID + "\n" + SentenceText + str(
                    e) + "\n"
                logging.exception(message)
                continue
            set_nt_textList(sentence)

            set_sentenceTimeByReport(sentence, val["meta"]["reportTime"],
                                     val['sents'], sent)

            with open("timeinfo.txt", "a") as f:
                f.writelines(("     句子ID:" + sent +
                              "\n").decode("utf-8").encode("utf-8"))
                f.write("       " +
                        sentence.txt.decode("utf-8").encode("utf-8") +
                        "\n")
                f.write("       时间词列表: ")
                for text in sentence.ntTextList:
                    f.write(text + ",")
                f.write("\n       句子时间:" +
                        str(sentence.sentenceTime).decode("utf-8").encode(
                            "utf-8") + "\n\n")
            timeText = sentence.ntTextList
            sentenceTime = sentence.sentenceTime
            try:
                coded_events, meta = sentence.get_events()
            except Exception as e:
                message = "ERROR IN PETRARCH2 DO_CODING:" + SentenceID + "\n" + SentenceText + str(
                    e) + "\n"
                logging.exception(message)
                # Skip the sentence: without this, the code below ran with
                # unbound or previous-sentence coded_events / meta.
                continue

            # Original author's note: for now only the last branch is taken.
            code_time = time.time() - t1
            # NOTE(review): the elif below is unreachable because the first
            # condition already covers NullActors -- kept as-is; confirm the
            # intended null-actors behaviour before changing it.
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                event_dict[key]['meta'] = meta
                event_dict[key]['text'] = sentence.txt
            elif PETRglobals.NullActors:
                event_dict[key]['events'] = coded_events
                coded_events = None  # skips additional processing
                event_dict[key]['text'] = sentence.txt
            else:
                # 16.04.30 pas: the key 'meta' is used at two very different
                # levels of event_dict (here and on the sentence entry
                # below); this is potentially confusing, so it probably
                # would be useful to change one of those.
                event_dict[key]['meta']['verbs'] = meta
            del sentence

            times += code_time
            sents += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta
                if PETRglobals.WriteActorText or PETRglobals.WriteEventText or PETRglobals.WriteActorRoot:
                    text_dict = utilities.extract_phrases(sent_data,
                                                          SentenceID)
                    print('DC-td1:', text_dict)  # --
                    if text_dict:
                        sent_meta = sent_data['meta']
                        for field in ('actortext', 'eventtext', 'actorroot',
                                      'eventroot', 'Source', 'Target'):
                            sent_meta[field] = {}
                        sent_meta['timeText'] = timeText
                        # NOTE(review): stored as a one-element set --
                        # confirm a set (not the bare value) is intended.
                        sent_meta['sentenceTime'] = {sentenceTime}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_meta['actortext'][evt] = phrases[:2]
                                sent_meta['eventtext'][evt] = phrases[2]
                                sent_meta['actorroot'][evt] = phrases[3:5]
                                sent_meta['eventroot'][evt] = phrases[5]
                                sent_meta['Source'][evt] = phrases[0]
                                sent_meta['Target'][evt] = phrases[1]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                # Any non-empty reply aborts the run (deliberate).
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            # NEvents += len(coded_events)
            if coded_events is not None and len(coded_events) == 0:
                NEmpty += 1

        if SkipStory:
            event_dict[key]['sents'] = None

    # Only the heading is printed; the detailed counters summary is disabled
    # in this variant.
    print("\nSummary:")
    return event_dict