Пример #1
0
def do_coding(event_dict, out_file):
    """
    Main coding loop: code every parsed sentence and store the results in place.

    Stories are visited in sorted key order. For each sentence that carries a
    'parsed' entry, the sentence text is screened with check_discards(); a
    surviving sentence is wrapped in a PETRtree.Sentence and coded with
    get_events(). Non-empty event results and their metadata are written back
    into that sentence's entry of ``event_dict``.

    Note that entering any character other than 'Enter' at the
    PETRglobals.PauseBySentence prompt will stop the program: this is
    deliberate.

    Args:
        event_dict: mapping of story key -> story dict. Each story dict is
            read for ['meta']['date'] and ['sents'], a mapping of sentence
            key -> dict with 'content' and, optionally, 'parsed' and 'date'.
        out_file: if truthy, passed to open_tex(); every coded sentence tree
            is then written with Sentence.print_to_file() and the handle is
            closed with close_tex() when the loop ends (also on error).

    Returns:
        The same ``event_dict`` object, updated in place. A story that hits a
        story-level discard has its 'sents' entry set to None.
    """
    n_stories = 0
    n_sentences = 0
    n_events = 0
    n_empty = 0
    n_discard_sent = 0
    n_discard_story = 0

    logger = logging.getLogger('petr_log')
    total_code_time = 0
    n_coded = 0

    tex_file = open_tex(out_file) if out_file else None

    # try/finally guarantees the TeX output is closed even when coding raises
    # or the user aborts at the pause prompt (sys.exit raises SystemExit).
    try:
        for key, story in sorted(event_dict.items()):
            n_stories += 1
            skip_story = False
            print('\n\nProcessing story {}'.format(key))
            story_date = story['meta']['date']

            for sent in story['sents']:
                n_sentences += 1
                sent_data = story['sents'][sent]
                # Built before the 'parsed' check so that the "no parse" log
                # message always names the current sentence (it used to be
                # unbound, or stale from the previous sentence).
                sentence_id = '{}_{}'.format(key, sent)

                if 'parsed' not in sent_data:
                    logger.info(
                        '{} has no parse information. Passing.'.format(
                            sentence_id))
                    continue

                sentence_text = sent_data['content']
                # A sentence-level date overrides the story date.
                sentence_date = (sent_data['date']
                                 if 'date' in sent_data else story_date)
                date = PETRreader.dstr_to_ordate(sentence_date)

                print("\n", sentence_id)
                treestr = sent_data['parsed']
                disc = check_discards(sentence_text)
                if disc[0] > 0:
                    if disc[0] == 1:
                        # Code 1: drop this sentence only.
                        print("Discard sentence:", disc[1])
                        logger.info('\tSentence discard. {}'.format(disc[1]))
                        n_discard_sent += 1
                        continue
                    # Any other positive code: drop the whole story.
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    skip_story = True
                    n_discard_story += 1
                    break

                t1 = time.time()
                sentence = PETRtree.Sentence(treestr, sentence_text, date)
                print(sentence.actor)
                print(sentence.agent)
                # This is the entry point into the processing in PETRtree.
                coded_events, meta = sentence.get_events()
                code_time = time.time() - t1
                # 16.04.30 pas: the key 'meta' is used at two very different
                # levels of event_dict (story level here, sentence level
                # below), which is potentially confusing.
                story['meta']['verbs'] = meta

                if out_file:
                    sentence.print_to_file(sentence.tree, file=tex_file)

                del sentence
                total_code_time += code_time
                n_coded += 1

                if coded_events:
                    sent_data['events'] = coded_events
                    sent_data['meta'] = meta
                    if (PETRglobals.WriteActorText
                            or PETRglobals.WriteEventText
                            or PETRglobals.WriteActorRoot):
                        text_dict = utilities.extract_phrases(
                            sent_data, sentence_id)
                        if text_dict:
                            sent_meta = sent_data['meta']
                            sent_meta['actortext'] = {}
                            sent_meta['eventtext'] = {}
                            sent_meta['actorroot'] = {}
                            for evt in coded_events:
                                # 16.04.30 pas: the membership test bypasses
                                # problems with expansion of compounds.
                                if evt in text_dict:
                                    phrases = text_dict[evt]
                                    sent_meta['actortext'][evt] = phrases[:2]
                                    sent_meta['eventtext'][evt] = phrases[2]
                                    sent_meta['actorroot'][evt] = phrases[3:5]

                if coded_events and PETRglobals.IssueFileName != "":
                    event_issues = get_issues(sentence_text)
                    if event_issues:
                        sent_data['issues'] = event_issues

                if PETRglobals.PauseBySentence:
                    if len(input("Press Enter to continue...")) > 0:
                        sys.exit()

                n_events += len(coded_events)
                if len(coded_events) == 0:
                    n_empty += 1

            if skip_story:
                story['sents'] = None
    finally:
        if out_file:
            close_tex(tex_file)

    print("\nSummary:")
    print("Stories read:", n_stories, "   Sentences coded:", n_sentences,
          "  Events generated:", n_events)
    print("Discards:  Sentence", n_discard_sent, "  Story", n_discard_story,
          "  Sentences without events:", n_empty)
    print("Average Coding time = ",
          total_code_time / n_coded if n_coded else 0)
    return event_dict
Пример #2
0
def do_coding(event_dict):
    """
    Main coding loop: code every parsed sentence and store the results in place.

    Before coding, release times are obtained with get_releasetime() and
    report times with get_reporttime(); both are stored in each paragraph's
    'meta' dict ('realiseTime' / 'reportTime'), the report time falling back
    to the release time when the article has none.

    Paragraphs are visited in sorted key order and their sentences in sorted
    sentence-key order. A paragraph whose date is the string 'NULL' is
    skipped. For each sentence carrying a 'parsed' entry the text is screened
    with check_discards(), wrapped in a PETRtree.Sentence, given time
    information via set_nt_textList() / set_sentenceTimeByReport(), and coded
    with get_events(). A sentence whose construction or coding raises is
    logged and skipped.

    Note that entering any character other than 'Enter' at the
    PETRglobals.PauseBySentence prompt will stop the program: this is
    deliberate.

    Args:
        event_dict: mapping of paragraph key -> paragraph dict. The part of
            the key before the first '-' is used as the article id. Each
            paragraph dict is read for ['meta']['date'] and ['sents'], a
            mapping of sentence key -> dict with 'content' and, optionally,
            'parsed' and 'date'.

    Returns:
        The same ``event_dict`` object, updated in place.
    """
    NStory = 0
    NSent = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0

    # Obtain the release (publication) times.
    realiseTimeDic = get_releasetime(event_dict)
    if not realiseTimeDic:
        print(
            "realiseTimeDic have no timeinfo ,please check “get_releasetime” method"
        )
    # Obtain the report times.
    reporttimeDic = get_reporttime(event_dict, realiseTimeDic)

    for key, val in sorted(event_dict.items()):
        NStory += 1
        SkipStory = False
        print('\n\nProcessing paragraph {}'.format(key))
        StoryDate = val['meta']['date']
        if StoryDate == 'NULL':
            continue

        # Only the article part of the key is needed; the paragraph part
        # was extracted by the old code but never used.
        articleId = key.split("-")[0]

        # Set release time and report time; a missing report time falls
        # back to the release time.
        release_time = realiseTimeDic[articleId]
        val["meta"]["realiseTime"] = release_time
        if articleId in reporttimeDic:
            report_time = reporttimeDic[articleId]
        else:
            report_time = release_time
        val["meta"]["reportTime"] = report_time
        # report_time is kept in a local because the NullVerbs/NullActors
        # branch below replaces val['meta'] wholesale, after which
        # val['meta']['reportTime'] may no longer exist.

        for sent in sorted(val['sents']):
            print('\n\nProcessing sentence {}'.format(sent))
            NSent += 1
            sent_data = val['sents'][sent]
            # Built before the 'parsed' check so that the "no parse" log
            # message always names the current sentence (it used to be
            # unbound, or stale from the previous sentence).
            SentenceID = '{}_{}'.format(key, sent)

            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            SentenceText = sent_data['content']
            # A sentence-level date overrides the paragraph date.
            SentenceDate = (sent_data['date']
                            if 'date' in sent_data else StoryDate)
            # Keep only the part before the first space and strip dashes
            # before converting.
            Date = PETRreader.dstr_to_ordate(
                SentenceDate.split(' ')[0].replace('-', ''))
            treestr = sent_data['parsed']
            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                # NOTE(review): this stays False, so the discarded
                # paragraph's 'sents' are never set to None below. Kept
                # as-is because it may be deliberate -- confirm.
                SkipStory = False
                NDiscardStory += 1
                break

            t1 = time.time()
            try:
                sentence = PETRtree.Sentence(treestr, SentenceText, Date)
            except Exception as e:
                message = ("ERROR IN PETRARCH2 DO_CODING:" + SentenceID +
                           "\n" + SentenceText + str(e) + "\n")
                logging.exception(message)
                continue

            set_nt_textList(sentence)
            set_sentenceTimeByReport(sentence, report_time, val['sents'],
                                     sent)

            timeText = sentence.ntTextList
            sentenceTime = sentence.sentenceTime
            try:
                coded_events, meta = sentence.get_events()
            except Exception as e:
                message = ("ERROR IN PETRARCH2 DO_CODING:" + SentenceID +
                           "\n" + SentenceText + str(e) + "\n")
                logging.exception(message)
                # Skip this sentence: without this, coded_events/meta would
                # be unbound (first sentence) or left over from the previous
                # sentence and attached to the wrong one.
                continue

            # For now only the final (else) branch is exercised.
            code_time = time.time() - t1
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                val['meta'] = meta
                val['text'] = sentence.txt
            elif PETRglobals.NullActors:
                # NOTE(review): unreachable -- NullActors is already caught
                # by the condition above. Kept unchanged; confirm the
                # intended condition before altering it.
                val['events'] = coded_events
                coded_events = None  # skips additional processing
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: the key 'meta' is used at two very different
                # levels of event_dict (paragraph level here, sentence level
                # below), which is potentially confusing.
                val['meta']['verbs'] = meta
            del sentence

            times += code_time
            sents += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta
                if (PETRglobals.WriteActorText or PETRglobals.WriteEventText
                        or PETRglobals.WriteActorRoot):
                    text_dict = utilities.extract_phrases(
                        sent_data, SentenceID)
                    print('DC-td1:', text_dict)  # --
                    if text_dict:
                        sent_meta = sent_data['meta']
                        sent_meta['actortext'] = {}
                        sent_meta['eventtext'] = {}
                        sent_meta['actorroot'] = {}
                        sent_meta['eventroot'] = {}
                        sent_meta['Source'] = {}
                        sent_meta['Target'] = {}
                        sent_meta['timeText'] = timeText
                        # NOTE(review): this is a one-element set literal,
                        # not a dict -- confirm that is what consumers expect.
                        sent_meta['sentenceTime'] = {sentenceTime}
                        for evt in coded_events:
                            # 16.04.30 pas: the membership test bypasses
                            # problems with expansion of compounds.
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_meta['actortext'][evt] = phrases[:2]
                                sent_meta['eventtext'][evt] = phrases[2]
                                sent_meta['actorroot'][evt] = phrases[3:5]
                                sent_meta['eventroot'][evt] = phrases[5]
                                sent_meta['Source'][evt] = phrases[0]
                                sent_meta['Target'][evt] = phrases[1]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            if coded_events is not None and len(coded_events) == 0:
                NEmpty += 1

        if SkipStory:
            val['sents'] = None

    # Only the header is printed; the detailed counters (NStory, NSent,
    # NDiscardSent, NDiscardStory, NEmpty, times / sents) are still
    # maintained above but their output is currently disabled.
    print("\nSummary:")
    return event_dict
Пример #3
0
def do_coding(event_dict, out_file):
    """
    Main coding loop: code every parsed sentence and store the results in place.

    Stories are visited in sorted key order. For each sentence that carries a
    'parsed' entry, any per-sentence 'config' values are applied with
    change_Config_Options(), the text is screened with check_discards(), and
    a surviving sentence is wrapped in a PETRtree.Sentence and coded with
    get_events(). Non-empty event results and their metadata are written back
    into that sentence's entry of ``event_dict``.

    Note that entering any character other than 'Enter' at the
    PETRglobals.PauseBySentence prompt will stop the program: this is
    deliberate.

    Args:
        event_dict: mapping of story key -> story dict. Each story dict is
            read for ['meta']['date'] and ['sents'], a mapping of sentence
            key -> dict with 'content' and, optionally, 'parsed', 'date'
            and 'config'.
        out_file: currently unused; the TeX output it controlled was
            disabled (16.06.18 pas). Kept so existing callers still work.

    Returns:
        The same ``event_dict`` object, updated in place. A story that hits a
        story-level discard has its 'sents' entry set to None.
    """
    n_stories = 0
    n_sentences = 0
    n_events = 0
    n_empty = 0
    n_discard_sent = 0
    n_discard_story = 0

    # <16.06.18 pas> TeX output (open_tex / print_to_file / close_tex) is
    # disabled for now.

    logger = logging.getLogger('petr_log')
    total_code_time = 0
    n_coded = 0

    for key, story in sorted(event_dict.items()):
        n_stories += 1
        skip_story = False
        print('\n\nProcessing story {}'.format(key))
        story_date = story['meta']['date']

        for sent in story['sents']:
            n_sentences += 1
            sent_data = story['sents'][sent]
            # Built before the 'parsed' check so that the "no parse" log
            # message always names the current sentence (it used to be
            # unbound, or stale from the previous sentence).
            sentence_id = '{}_{}'.format(key, sent)

            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(
                        sentence_id))
                continue

            if 'config' in sent_data:
                # Only the option values are needed; the keys are ignored.
                for config in sent_data['config'].values():
                    change_Config_Options(config)

            sentence_text = sent_data['content']
            # A sentence-level date overrides the story date.
            sentence_date = (sent_data['date']
                             if 'date' in sent_data else story_date)
            date = PETRreader.dstr_to_ordate(sentence_date)

            print("\n", sentence_id)
            treestr = sent_data['parsed']
            disc = check_discards(sentence_text)
            if disc[0] > 0:
                if disc[0] == 1:
                    # Code 1: drop this sentence only.
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    n_discard_sent += 1
                    continue
                # Any other positive code: drop the whole story.
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                skip_story = True
                n_discard_story += 1
                break

            t1 = time.time()
            sentence = PETRtree.Sentence(treestr, sentence_text, date)
            print(sentence.txt)
            # This is the entry point into the processing in PETRtree.
            coded_events, meta = sentence.get_events()
            code_time = time.time() - t1
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                story['meta'] = meta
                story['text'] = sentence.txt
            elif PETRglobals.NullActors:
                # NOTE(review): unreachable -- NullActors is already caught
                # by the condition above. Kept unchanged; confirm the
                # intended condition before altering it.
                story['events'] = coded_events
                coded_events = None  # skips additional processing
                story['text'] = sentence.txt
            else:
                # 16.04.30 pas: the key 'meta' is used at two very different
                # levels of event_dict (story level here, sentence level
                # below), which is potentially confusing.
                story['meta']['verbs'] = meta

            del sentence
            total_code_time += code_time
            n_coded += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta
                if (PETRglobals.WriteActorText or PETRglobals.WriteEventText
                        or PETRglobals.WriteActorRoot):
                    text_dict = utilities.extract_phrases(
                        sent_data, sentence_id)
                    if text_dict:
                        sent_meta = sent_data['meta']
                        sent_meta['actortext'] = {}
                        sent_meta['eventtext'] = {}
                        sent_meta['actorroot'] = {}
                        for evt in coded_events:
                            # 16.04.30 pas: the membership test bypasses
                            # problems with expansion of compounds.
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_meta['actortext'][evt] = phrases[:2]
                                sent_meta['eventtext'][evt] = phrases[2]
                                sent_meta['actorroot'][evt] = phrases[3:5]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(sentence_text)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            n_events += len(coded_events)
            if len(coded_events) == 0:
                n_empty += 1

        if skip_story:
            story['sents'] = None

    print("\nSummary:")
    print(
        "Stories read:",
        n_stories,
        "   Sentences coded:",
        n_sentences,
        "  Events generated:",
        n_events)
    print(
        "Discards:  Sentence",
        n_discard_sent,
        "  Story",
        n_discard_story,
        "  Sentences without events:",
        n_empty)
    print("Average Coding time = ",
          total_code_time / n_coded if n_coded else 0)
    return event_dict