def do_coding(event_dict, out_file):
    """
    Main coding loop: code every parsed sentence of every story.

    Note that entering any character other than 'Enter' at the prompt will
    stop the program: this is deliberate.

    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Args:
        event_dict: mapping of story key -> story record. Each record is read
            as ``record['meta']['date']`` and ``record['sents']``, a mapping of
            sentence key -> sentence record holding 'content' and, optionally,
            'parsed' and 'date'.
        out_file: if truthy, it is handed to ``open_tex`` and every coded
            sentence tree is written through ``sentence.print_to_file``.

    Returns:
        The same ``event_dict`` object, updated in place: coded sentences gain
        'events' / 'meta' (and possibly 'issues') entries, and a story that
        triggers a story-level discard has its 'sents' entry set to None.
    """
    n_story = 0
    n_sent = 0
    n_events = 0
    n_empty = 0
    n_discard_sent = 0
    n_discard_story = 0
    times = 0
    sents = 0

    logger = logging.getLogger('petr_log')
    tex_file = open_tex(out_file) if out_file else None

    # try/finally so the TeX output is closed even when coding raises or the
    # user aborts at the pause prompt (sys.exit raises SystemExit).
    try:
        for key, val in sorted(event_dict.items()):
            n_story += 1
            skip_story = False
            print('\n\nProcessing story {}'.format(key))
            story_date = val['meta']['date']

            for sent in val['sents']:
                n_sent += 1
                sent_data = val['sents'][sent]
                # Built before the 'parsed' test so the "no parse" log line
                # below always names the sentence actually being skipped.
                sent_id = '{}_{}'.format(key, sent)

                if 'parsed' not in sent_data:
                    logger.info(
                        '{} has no parse information. Passing.'.format(
                            sent_id))
                    continue

                sent_text = sent_data['content']
                # A sentence-level date overrides the story date.
                sent_date = sent_data.get('date', story_date)
                date = PETRreader.dstr_to_ordate(sent_date)

                print("\n", sent_id)

                disc = check_discards(sent_text)
                if disc[0] > 0:
                    if disc[0] == 1:
                        print("Discard sentence:", disc[1])
                        logger.info(
                            '\tSentence discard. {}'.format(disc[1]))
                        n_discard_sent += 1
                        continue
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    skip_story = True
                    n_discard_story += 1
                    break

                t1 = time.time()
                sentence = PETRtree.Sentence(
                    sent_data['parsed'], sent_text, date)
                print(sentence.actor)
                print(sentence.agent)
                # this is the entry point into the processing in PETRtree
                coded_events, meta = sentence.get_events()
                code_time = time.time() - t1

                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- story level here and
                # sentence level below -- and this is potentially confusing,
                # so it probably would be useful to change one of those
                val['meta']['verbs'] = meta

                if out_file:
                    sentence.print_to_file(sentence.tree, file=tex_file)

                del sentence
                times += code_time
                sents += 1

                if coded_events:
                    sent_data['events'] = coded_events
                    sent_data['meta'] = meta

                    if (PETRglobals.WriteActorText
                            or PETRglobals.WriteEventText
                            or PETRglobals.WriteActorRoot):
                        text_dict = utilities.extract_phrases(
                            sent_data, sent_id)
                        if text_dict:
                            sent_meta = sent_data['meta']
                            sent_meta['actortext'] = {}
                            sent_meta['eventtext'] = {}
                            sent_meta['actorroot'] = {}
                            for evt in coded_events:
                                # 16.04.30 pas bypasses problems with
                                # expansion of compounds
                                if evt in text_dict:
                                    phrases = text_dict[evt]
                                    sent_meta['actortext'][evt] = phrases[:2]
                                    sent_meta['eventtext'][evt] = phrases[2]
                                    sent_meta['actorroot'][evt] = phrases[3:5]

                if coded_events and PETRglobals.IssueFileName != "":
                    event_issues = get_issues(sent_text)
                    if event_issues:
                        sent_data['issues'] = event_issues

                if PETRglobals.PauseBySentence:
                    if len(input("Press Enter to continue...")) > 0:
                        sys.exit()

                n_events += len(coded_events)
                if not coded_events:
                    n_empty += 1

            if skip_story:
                val['sents'] = None
    finally:
        if out_file:
            close_tex(tex_file)

    print("\nSummary:")
    print("Stories read:", n_story, " Sentences coded:", n_sent,
          " Events generated:", n_events)
    print("Discards: Sentence", n_discard_sent, " Story", n_discard_story,
          " Sentences without events:", n_empty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
def do_coding(event_dict):
    """
    Main coding loop: code every parsed sentence of every paragraph.

    Note that entering any character other than 'Enter' at the prompt will
    stop the program: this is deliberate.

    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Args:
        event_dict: mapping of paragraph key -> record. The part of the key
            before the first "-" is used as the article id when looking up
            release and report times. Each record is read as
            ``record['meta']['date']`` and ``record['sents']``, a mapping of
            sentence key -> sentence record holding 'content' and, optionally,
            'parsed' and 'date'.

    Returns:
        The same ``event_dict`` object, updated in place: every record whose
        date is not 'NULL' gains 'realiseTime' and 'reportTime' in its 'meta';
        coded sentences gain 'events' / 'meta' (and possibly 'issues').
    """
    n_story = 0
    n_sent = 0
    n_empty = 0
    n_discard_sent = 0
    n_discard_story = 0
    times = 0
    sents = 0

    logger = logging.getLogger('petr_log')

    # Get the release (publication) times.
    release_times = get_releasetime(event_dict)
    if not release_times:
        print(
            "realiseTimeDic have no timeinfo ,please check “get_releasetime” method"
        )
    # Get the report times.
    report_times = get_reporttime(event_dict, release_times)

    for key, val in sorted(event_dict.items()):
        n_story += 1
        skip_story = False
        print('\n\nProcessing paragraph {}'.format(key))
        story_date = val['meta']['date']
        if story_date == 'NULL':
            continue

        article_id = key.split("-")[0]

        # Set the release time and the report time; a missing report time
        # falls back to the release time.
        release_time = release_times[article_id]
        if article_id in report_times:
            report_time = report_times[article_id]
        else:
            report_time = release_time
        val["meta"]["realiseTime"] = release_time
        val["meta"]["reportTime"] = report_time

        for sent in sorted(val['sents']):
            print('\n\nProcessing sentence {}'.format(sent))
            n_sent += 1
            sent_data = val['sents'][sent]
            # Built before the 'parsed' test so the "no parse" log line below
            # always names the sentence actually being skipped.
            sent_id = '{}_{}'.format(key, sent)

            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(sent_id))
                continue

            sent_text = sent_data['content']
            # A sentence-level date overrides the paragraph date.
            sent_date = sent_data.get('date', story_date)
            # Keep only the part before the first space and drop the dashes
            # before converting.
            date = PETRreader.dstr_to_ordate(
                sent_date.split(' ')[0].replace('-', ''))

            disc = check_discards(sent_text)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    n_discard_sent += 1
                    continue
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                # NOTE(review): the flag is left False here, so the
                # "sents = None" reset after this loop never runs and the
                # sentences already coded for this paragraph are kept.
                # Confirm this is intended before changing it.
                skip_story = False
                n_discard_story += 1
                break

            t1 = time.time()
            try:
                sentence = PETRtree.Sentence(
                    sent_data['parsed'], sent_text, date)
            except Exception as e:
                message = ("ERROR IN PETRARCH2 DO_CODING:" + sent_id + "\n"
                           + sent_text + str(e) + "\n")
                logging.exception(message)
                continue

            set_nt_textList(sentence)
            # Use the local report time: val['meta'] may have been replaced
            # by an earlier sentence in the Null* modes below.
            set_sentenceTimeByReport(sentence, report_time, val['sents'],
                                     sent)
            time_text = sentence.ntTextList
            sentence_time = sentence.sentenceTime

            try:
                coded_events, meta = sentence.get_events()
            except Exception as e:
                message = ("ERROR IN PETRARCH2 DO_CODING:" + sent_id + "\n"
                           + sent_text + str(e) + "\n")
                logging.exception(message)
                # Skip the sentence: without this, the code below would use
                # unbound names (first sentence) or the previous sentence's
                # events and meta.
                continue

            code_time = time.time() - t1

            # For now only the last branch is actually taken.
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                val['meta'] = meta
                val['text'] = sentence.txt
            elif PETRglobals.NullActors:
                # NOTE(review): unreachable as written, because the condition
                # above already covers NullActors. Left untouched pending
                # confirmation of the intended behavior.
                val['events'] = coded_events
                coded_events = None  # skips additional processing
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- paragraph level here and
                # sentence level below -- and this is potentially confusing,
                # so it probably would be useful to change one of those
                val['meta']['verbs'] = meta

            del sentence
            times += code_time
            sents += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta

                if (PETRglobals.WriteActorText
                        or PETRglobals.WriteEventText
                        or PETRglobals.WriteActorRoot):
                    text_dict = utilities.extract_phrases(sent_data, sent_id)
                    print('DC-td1:', text_dict)  # --
                    if text_dict:
                        sent_meta = sent_data['meta']
                        sent_meta['actortext'] = {}
                        sent_meta['eventtext'] = {}
                        sent_meta['actorroot'] = {}
                        sent_meta['eventroot'] = {}
                        sent_meta['Source'] = {}
                        sent_meta['Target'] = {}
                        sent_meta['timeText'] = time_text
                        # NOTE(review): this stores a one-element set, not
                        # the bare value -- confirm consumers expect a set.
                        sent_meta['sentenceTime'] = {sentence_time}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_meta['actortext'][evt] = phrases[:2]
                                sent_meta['eventtext'][evt] = phrases[2]
                                sent_meta['actorroot'][evt] = phrases[3:5]
                                sent_meta['eventroot'][evt] = phrases[5]
                                sent_meta['Source'][evt] = phrases[0]
                                sent_meta['Target'][evt] = phrases[1]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(sent_text)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            if coded_events is not None and len(coded_events) == 0:
                n_empty += 1

        if skip_story:
            val['sents'] = None

    print("\nSummary:")
    # The detailed summary is currently disabled:
    # print("Stories read:", n_story, " Sentences coded:", n_sent)
    # print("Discards: Sentence", n_discard_sent, " Story", n_discard_story,
    #       " Sentences without events:", n_empty)
    # print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
def do_coding(event_dict, out_file):
    """
    Main coding loop: code every parsed sentence of every story.

    Note that entering any character other than 'Enter' at the prompt will
    stop the program: this is deliberate.

    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Args:
        event_dict: mapping of story key -> story record. Each record is read
            as ``record['meta']['date']`` and ``record['sents']``, a mapping of
            sentence key -> sentence record holding 'content' and, optionally,
            'parsed', 'date' and 'config'.
        out_file: currently unused; the TeX output it controlled has been
            disabled (<16.06.18 pas>). Kept so existing callers still work.

    Returns:
        The same ``event_dict`` object, updated in place: coded sentences gain
        'events' / 'meta' (and possibly 'issues') entries, and a story that
        triggers a story-level discard has its 'sents' entry set to None.
    """
    n_story = 0
    n_sent = 0
    n_events = 0
    n_empty = 0
    n_discard_sent = 0
    n_discard_story = 0
    times = 0
    sents = 0

    # <16.06.18 pas> TeX output (open_tex / print_to_file / close_tex) is
    # disabled for now: it wasn't doing anything useful.
    logger = logging.getLogger('petr_log')

    for key, val in sorted(event_dict.items()):
        n_story += 1
        skip_story = False
        print('\n\nProcessing story {}'.format(key))
        story_date = val['meta']['date']

        for sent in val['sents']:
            n_sent += 1
            sent_data = val['sents'][sent]
            # Built before the 'parsed' test so the "no parse" log line below
            # always names the sentence actually being skipped.
            sent_id = '{}_{}'.format(key, sent)

            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(sent_id))
                continue

            # Per-sentence configuration overrides, applied before coding.
            if 'config' in sent_data:
                for config in sent_data['config'].values():
                    change_Config_Options(config)

            sent_text = sent_data['content']
            # A sentence-level date overrides the story date.
            sent_date = sent_data.get('date', story_date)
            date = PETRreader.dstr_to_ordate(sent_date)

            print("\n", sent_id)

            disc = check_discards(sent_text)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    n_discard_sent += 1
                    continue
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                skip_story = True
                n_discard_story += 1
                break

            t1 = time.time()
            sentence = PETRtree.Sentence(sent_data['parsed'], sent_text, date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRtree
            coded_events, meta = sentence.get_events()
            code_time = time.time() - t1

            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                val['meta'] = meta
                val['text'] = sentence.txt
            elif PETRglobals.NullActors:
                # NOTE(review): unreachable as written, because the condition
                # above already covers NullActors. Left untouched pending
                # confirmation of the intended behavior.
                val['events'] = coded_events
                coded_events = None  # skips additional processing
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- story level here and
                # sentence level below -- and this is potentially confusing,
                # so it probably would be useful to change one of those
                val['meta']['verbs'] = meta

            del sentence
            times += code_time
            sents += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta

                if (PETRglobals.WriteActorText
                        or PETRglobals.WriteEventText
                        or PETRglobals.WriteActorRoot):
                    text_dict = utilities.extract_phrases(sent_data, sent_id)
                    if text_dict:
                        sent_meta = sent_data['meta']
                        sent_meta['actortext'] = {}
                        sent_meta['eventtext'] = {}
                        sent_meta['actorroot'] = {}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_meta['actortext'][evt] = phrases[:2]
                                sent_meta['eventtext'][evt] = phrases[2]
                                sent_meta['actorroot'][evt] = phrases[3:5]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(sent_text)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            # coded_events is None when additional processing was skipped;
            # such sentences count neither as events nor as empty.
            if coded_events is not None:
                n_events += len(coded_events)
                if len(coded_events) == 0:
                    n_empty += 1

        if skip_story:
            val['sents'] = None

    print("\nSummary:")
    print(
        "Stories read:",
        n_story,
        " Sentences coded:",
        n_sent,
        " Events generated:",
        n_events)
    print(
        "Discards: Sentence",
        n_discard_sent,
        " Story",
        n_discard_story,
        " Sentences without events:",
        n_empty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict