def parliament_arc_pipeline():
    # to avoid most computation, each step only runs if the desired attributes
    # don't already exist
    return ConvokitPipeline([
        ('parser', TextParser(
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('censor_nouns', CensorNouns('parsed_censored',
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs', TextToArcs('arc_arr', input_field='parsed_censored', root_only=True,
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('question_sentence_filter', QuestionSentences('q_arc_arr', input_field='arc_arr',
            input_filter=lambda utt, aux: utt.get_info('q_arcs') is None)),
        ('join_arcs', TextProcessor(output_field='arcs', input_field='arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_q_arcs', TextProcessor(output_field='q_arcs', input_field='q_arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('q_arcs') is None))
    ])
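# Usage sketch (not part of the original source): the arc pipelines defined in
# this module are ConvoKit Transformers, so a corpus is annotated by calling
# transform() on the assembled pipeline. The same pattern applies to
# scotus_arc_pipeline() and wiki_arc_pipeline() below; the helper name and the
# corpus path are illustrative.
def demo_parliament_arcs(corpus_path):
    from convokit import Corpus
    corpus = Corpus(corpus_path)
    corpus = parliament_arc_pipeline().transform(corpus)
    # processed utterances now carry newline-joined 'arcs' and 'q_arcs' fields
    return corpus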
def __init__(self, output_field='prompt_types', n_types=8, use_prompt_motifs=True,
             root_only=True, questions_only=True, enforce_caps=True,
             recompute_all=False, min_support=100, min_df=100,
             svd__n_components=25, max_df=.1, max_dist=.9,
             random_state=None, verbosity=10000):
    self.use_motifs = use_prompt_motifs
    self.random_state = random_state
    # base pipeline: parse, censor nouns, convert to dependency arcs; each step
    # is skipped if its output already exists, unless recompute_all is set
    pipe = [
        ('parser', TextParser(verbosity=verbosity,
            input_filter=lambda utt, aux: recompute_all or (utt.get_info('parsed') is None))),
        ('censor_nouns', CensorNouns('parsed_censored',
            input_filter=lambda utt, aux: recompute_all or (utt.get_info('parsed_censored') is None),
            verbosity=verbosity)),
        ('shallow_arcs', TextToArcs('arcs', input_field='parsed_censored',
            input_filter=lambda utt, aux: recompute_all or (utt.get_info('arcs') is None),
            root_only=root_only, verbosity=verbosity))
    ]
    if questions_only:
        pipe.append(
            ('question_sentence_filter', QuestionSentences('question_arcs',
                input_field='arcs',
                input_filter=lambda utt, aux: recompute_all or utt.meta['is_question'],
                use_caps=enforce_caps, verbosity=verbosity))
        )
        prompt_input_field = 'question_arcs'
        self.prompt_selector = lambda utt: utt.meta['is_question']
        self.reference_selector = lambda utt: (not utt.meta['is_question']) and (utt.reply_to is not None)
    else:
        prompt_input_field = 'arcs'
        self.prompt_selector = lambda utt: True
        self.reference_selector = lambda utt: True
    if use_prompt_motifs:
        pipe.append(
            ('pm_model', PhrasingMotifs('motifs', prompt_input_field,
                min_support=min_support, fit_filter=self.prompt_selector,
                verbosity=verbosity))
        )
        prompt_field = 'motifs'
        prompt_transform_field = 'motifs__sink'
    else:
        prompt_field = 'arcs'
        prompt_transform_field = 'arcs'
    pipe.append(
        ('pt_model', PromptTypes(prompt_field=prompt_field, reference_field='arcs',
            prompt_transform_field=prompt_transform_field,
            output_field=output_field, n_types=n_types,
            svd__n_components=svd__n_components,
            prompt__tfidf_min_df=min_df, prompt__tfidf_max_df=max_df,
            reference__tfidf_min_df=min_df, reference__tfidf_max_df=max_df,
            max_dist=max_dist, random_state=random_state, verbosity=verbosity))
    )
    self.pipe = ConvokitPipeline(pipe)
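# Usage sketch (not part of the original source): in ConvoKit this constructor
# belongs to PromptTypeWrapper, whose fit() and transform() delegate to
# self.pipe. Assuming that interface, and utterances carrying an 'is_question'
# metadata flag (required when questions_only=True), typical use is:
def demo_prompt_types(corpus):
    from convokit import PromptTypeWrapper
    wrapper = PromptTypeWrapper(n_types=8, min_support=100, min_df=100)
    wrapper.fit(corpus)               # fits the motif and prompt-type models
    return wrapper.transform(corpus)  # writes the 'prompt_types' output fields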
def scotus_arc_pipeline():
    return ConvokitPipeline([
        ('parser', TextParser(
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs', TextToArcs('arc_arr', input_field='parsed', root_only=False,
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs', TextProcessor(output_field='arcs', input_field='arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])
def wiki_arc_pipeline():
    return ConvokitPipeline([
        ('parser', TextParser(
            input_filter=lambda utt, aux: (utt.get_info('arcs') is None)
                and (utt.get_info('parsed') is None))),
        ('censor_nouns', CensorNouns('parsed_censored',
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs', TextToArcs('arc_arr', input_field='parsed_censored', root_only=False,
            input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs', TextProcessor(output_field='arcs', input_field='arc_arr',
            proc_fn=lambda x: '\n'.join(x),
            input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])
else:
    # strip stored parses from utterances.jsonl and remove stale artifacts
    utterances = []
    with open(os.path.join(filename, 'utterances.jsonl')) as f:
        for line in f:
            utterances.append(json.loads(line))
    for utt in utterances:
        try:
            del utt['meta']['parsed']
        except KeyError:
            continue
    with open(os.path.join(filename, 'utterances.jsonl'), 'w') as f:
        for utt in utterances:
            json.dump(utt, f)
            f.write('\n')
    if os.path.exists(os.path.join(filename, 'utterances.json')):
        os.remove(os.path.join(filename, 'utterances.json'))
    if os.path.exists(os.path.join(filename, 'parsed-bin.p')):
        os.remove(os.path.join(filename, 'parsed-bin.p'))

if PARSE:
    print('loading corpus')
    corpus = Corpus(filename)
    print('parsing corpus')
    textparser = TextParser(verbosity=VERBOSITY)
    corpus = textparser.transform(corpus)
    print('dumping parses')
    corpus.dump_info('utterance', ['parsed'])

os.remove(os.path.join(ROOT_DIR, corpus_name + '.zip'))
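# Usage sketch (not part of the original source): dump_info() stores the parses
# in a separate per-field info file in the corpus directory rather than inline
# in utterances.jsonl; they can be re-attached to a freshly loaded corpus with
# the matching load_info() call. The helper name is illustrative.
def demo_reload_parses(filename):
    corpus = Corpus(filename)
    corpus.load_info('utterance', ['parsed'])  # re-attach the dumped parses
    return corpus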
def dump_kwic(self, corpus, resultfile, separator=","):
    # `corpus` is now an explicit parameter (it was previously read as an
    # undefined name from the enclosing scope)
    self.separator = separator
    self.ft = open(resultfile, "w")
    print("========================================================================")
    print("Started processing")
    print("========================================================================")
    self.corpus = corpus
    print("Creating KWIC")
    textprep = TextProcessor(proc_fn=self.prep_text, output_field='clean_text')
    textparser = TextParser(output_field='parsed', input_field='clean_text', mode='parse')
    getmodals = ModalSentences(input_field='parsed', output_field='ismodal')
    getquestions = QuestionSentences(input_field='parsed', output_field='questions')
    # header row
    self.ft.write(
        "Year" + self.separator + "Sentence ID" + self.separator + "Before" +
        self.separator + "Mod" + self.separator + "After" + self.separator +
        "Main Verb" + self.separator + "Passivized" + self.separator +
        "Passive" + self.separator + "Interrogative" + self.separator +
        "Role" + self.separator + "Speaker\n")
    # assuming the utterance file is sorted by year, iterate; skip all non-modals
    for u in self.corpus.iter_utterances():
        # preprocessing 1: clean up text (ConvoKit built-in functionality)
        u = textprep.transform_utterance(u)
        # preprocessing 2: parse and tag the sentence tree (ConvoKit built-in functionality)
        u = textparser.transform_utterance(u)
        # preprocessing 3: tag modal sentences (custom-built functionality)
        u = getmodals.transform_utterance(u)
        if u.meta["ismodal"] == 1:
            # step 4: tag interrogative sentences (ConvoKit built-in functionality)
            u = getquestions.transform_utterance(u)
            parsedsents = u.meta["parsed"]
            # loop through the modal sentences
            for parsedsent in parsedsents:
                try:
                    # use the first main verb immediately after the modal, with
                    # special handling for passive auxiliaries (e.g. skip "be"
                    # in "may be asked")
                    if len(parsedsent):
                        modalindex = passive = unmatchedmodal = i = 0
                        auxpass = mod = passivized = ""
                        inter = 0 if u.meta["questions"] == [] else 1
                        # iterate over the words of the sentence
                        for tokenized in parsedsent["toks"]:
                            if tokenized["tag"] == "MD":
                                mod = tokenized["tok"]
                                unmatchedmodal = 1
                                modalindex = i
                            # skip the passive auxiliary and record passive terms
                            # (e.g. skip "be" in "may be asked")
                            if tokenized["tag"] == "VB" and tokenized["dep"] == "auxpass":
                                passive = 1
                                auxpass = tokenized["tok"]
                            # find the verb tagged as the main verb after the modal auxiliary
                            if ((tokenized["tag"] == "VB" and passive == 0)
                                    or (passive == 1 and tokenized["tag"] == "VBN")) \
                                    and unmatchedmodal == 1:
                                verb = tokenized["tok"]
                                if passive == 1:
                                    passivized = mod + " " + auxpass + " " + verb
                                # found the main verb; print a row
                                self.printline(parsedsent, u, passive, inter, mod,
                                               modalindex, verb, passivized)
                                auxpass = mod = passivized = ""
                                modalindex = inter = passive = unmatchedmodal = 0
                            i = i + 1
                    # print("Processed line ", u.id)
                except Exception as e:
                    print("Exception on line ", u.id, ":", e)
    self.ft.close()
    print("========================================================================")
    print("Finished processing. Result file saved in convokit/supreme-modal/results folder.")
    print("========================================================================")
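# Usage sketch (not part of the original source): dump_kwic() is a method of an
# unseen KWIC-extractor class, so KwicExtractor below is a placeholder name.
# With `corpus` now an explicit parameter, the intended call pattern is roughly:
#
#     extractor = KwicExtractor()  # placeholder for the enclosing class
#     extractor.dump_kwic(corpus, 'modal_kwic.csv', separator=',')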