# Imports assumed for the examples below (module paths as in the ConvoKit source tree):
from convokit.convokitPipeline import ConvokitPipeline
from convokit.text_processing import TextParser, TextProcessor, TextToArcs
from convokit.phrasing_motifs import CensorNouns, PhrasingMotifs, QuestionSentences
from convokit.prompt_types import PromptTypes


def parliament_arc_pipeline():
    return ConvokitPipeline([
        # to avoid redundant computation, each step runs only if the desired attribute doesn't already exist
        ('parser',
         TextParser(input_filter=lambda utt, aux: utt.get_info('arcs') is None)
         ),
        ('censor_nouns',
         CensorNouns(
             'parsed_censored',
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs',
         TextToArcs(
             'arc_arr',
             input_field='parsed_censored',
             root_only=True,
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('question_sentence_filter',
         QuestionSentences(
             'q_arc_arr',
             input_field='arc_arr',
             input_filter=lambda utt, aux: utt.get_info('q_arcs') is None)),
        ('join_arcs',
         TextProcessor(
             output_field='arcs',
             input_field='arc_arr',
             proc_fn=lambda x: '\n'.join(x),
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_q_arcs',
         TextProcessor(
             output_field='q_arcs',
             input_field='q_arc_arr',
             proc_fn=lambda x: '\n'.join(x),
             input_filter=lambda utt, aux: utt.get_info('q_arcs') is None))
    ])
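# Usage sketch (hedged; not from the original source): ConvokitPipeline is a
# ConvoKit Transformer, so the pipeline above can be applied to a Corpus with
# transform(). The dataset name 'parliament-corpus' is an assumption based on
# ConvoKit's downloadable corpora, and _demo_parliament_arcs is a hypothetical helper.
def _demo_parliament_arcs():
    from convokit import Corpus, download
    corpus = Corpus(filename=download('parliament-corpus'))
    corpus = parliament_arc_pipeline().transform(corpus)
    # every utterance now carries newline-joined 'arcs' and 'q_arcs' fields
    return corpus.random_utterance().get_info('arcs')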
Example #2
def __init__(self, output_field='prompt_types', n_types=8, use_prompt_motifs=True,
             root_only=True, questions_only=True, enforce_caps=True,
             recompute_all=False, min_support=100, min_df=100,
             svd__n_components=25, max_df=.1, max_dist=.9,
             random_state=None, verbosity=10000):
    self.use_motifs = use_prompt_motifs
    self.random_state = random_state
    pipe = [
        ('parser',
         TextParser(verbosity=verbosity,
                    input_filter=lambda utt, aux: recompute_all or (utt.get_info('parsed') is None))),
        ('censor_nouns',
         CensorNouns('parsed_censored',
                     input_filter=lambda utt, aux: recompute_all or (utt.get_info('parsed_censored') is None),
                     verbosity=verbosity)),
        ('shallow_arcs',
         TextToArcs('arcs', input_field='parsed_censored',
                    input_filter=lambda utt, aux: recompute_all or (utt.get_info('arcs') is None),
                    root_only=root_only, verbosity=verbosity))
    ]

    if questions_only:
        pipe.append(
            ('question_sentence_filter',
             QuestionSentences('question_arcs', input_field='arcs',
                               input_filter=lambda utt, aux: recompute_all or utt.meta['is_question'],
                               use_caps=enforce_caps, verbosity=verbosity))
        )
        prompt_input_field = 'question_arcs'
        self.prompt_selector = lambda utt: utt.meta['is_question']
        self.reference_selector = lambda utt: (not utt.meta['is_question']) and (utt.reply_to is not None)
    else:
        prompt_input_field = 'arcs'
        self.prompt_selector = lambda utt: True
        self.reference_selector = lambda utt: True

    if use_prompt_motifs:
        pipe.append(
            ('pm_model',
             PhrasingMotifs('motifs', prompt_input_field, min_support=min_support,
                            fit_filter=self.prompt_selector, verbosity=verbosity))
        )
        prompt_field = 'motifs'
        prompt_transform_field = 'motifs__sink'
    else:
        prompt_field = 'arcs'
        prompt_transform_field = 'arcs'

    pipe.append(
        ('pt_model',
         PromptTypes(prompt_field=prompt_field, reference_field='arcs',
                     prompt_transform_field=prompt_transform_field,
                     output_field=output_field, n_types=n_types,
                     svd__n_components=svd__n_components,
                     prompt__tfidf_min_df=min_df,
                     prompt__tfidf_max_df=max_df,
                     reference__tfidf_min_df=min_df,
                     reference__tfidf_max_df=max_df,
                     max_dist=max_dist,
                     random_state=random_state, verbosity=verbosity))
    )
    self.pipe = ConvokitPipeline(pipe)
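# Hedged note: this __init__ matches ConvoKit's PromptTypeWrapper transformer,
# which delegates fitting and annotation to self.pipe. A minimal usage sketch,
# assuming the standard fit/transform interface and the 'parliament-corpus'
# dataset; _demo_prompt_types is a hypothetical helper:
def _demo_prompt_types():
    from convokit import Corpus, download
    from convokit.prompt_types import PromptTypeWrapper
    corpus = Corpus(filename=download('parliament-corpus'))
    pt = PromptTypeWrapper(n_types=8, random_state=1000)
    pt.fit(corpus)               # trains the parsing, motif, and prompt-type models
    return pt.transform(corpus)  # annotates each utterance with its prompt type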
Example #3
def scotus_arc_pipeline():
    return ConvokitPipeline([
        ('parser',
         TextParser(
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs',
         TextToArcs(
             'arc_arr',
             input_field='parsed',
             root_only=False,
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs',
         TextProcessor(
             output_field='arcs',
             input_field='arc_arr',
             proc_fn=lambda x: '\n'.join(x),
             input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])
Example #4
def wiki_arc_pipeline():
    return ConvokitPipeline([
        ('parser',
         TextParser(input_filter=lambda utt, aux: (utt.get_info(
             'arcs') is None) and (utt.get_info('parsed') is None))),
        ('censor_nouns',
         CensorNouns(
             'parsed_censored',
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs',
         TextToArcs(
             'arc_arr',
             input_field='parsed_censored',
             root_only=False,
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs',
         TextProcessor(
             output_field='arcs',
             input_field='arc_arr',
             proc_fn=lambda x: '\n'.join(x),
             input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])
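# Design note (hedged): the SCOTUS and Wiki pipelines differ from
# parliament_arc_pipeline chiefly in passing root_only=False, which extracts
# dependency arcs for every token rather than only those attached to the
# sentence root (the SCOTUS pipeline also skips noun censoring). A minimal
# sketch of the difference on one ad-hoc utterance, following ConvoKit 2.x
# Utterance/Speaker construction conventions:
def _demo_root_only():
    from convokit import Utterance, Speaker
    utt = Utterance(id='u0', text='The court will hear the case.',
                    speaker=Speaker(id='s0'))
    utt = TextParser().transform_utterance(utt)
    utt = TextToArcs('all_arcs', input_field='parsed',
                     root_only=False).transform_utterance(utt)
    utt = TextToArcs('root_arcs', input_field='parsed',
                     root_only=True).transform_utterance(utt)
    # 'all_arcs' is a superset of 'root_arcs'
    return utt.get_info('all_arcs'), utt.get_info('root_arcs')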
Example #5
    # Excerpt from a corpus-conversion script: `filename` is the corpus directory,
    # and PARSE, VERBOSITY, ROOT_DIR, and corpus_name are module-level constants
    # defined earlier in the (elided) script; `import os, json` and
    # `from convokit import Corpus, TextParser` are assumed.
    if os.path.exists(os.path.join(filename, 'utterances.json')):
        # assumed reconstruction: read the legacy single-file JSON format
        with open(os.path.join(filename, 'utterances.json')) as f:
            utterances = json.load(f)
    else:
        utterances = []
        with open(os.path.join(filename, 'utterances.jsonl')) as f:
            for line in f:
                utterances.append(json.loads(line))
    # drop the bulky parse trees from utterance metadata, if present
    for utt in utterances:
        utt.get('meta', {}).pop('parsed', None)

    with open(os.path.join(filename, 'utterances.jsonl'), 'w') as f:
        for utt in utterances:
            json.dump(utt, f)
            f.write('\n')

    if os.path.exists(os.path.join(filename, 'utterances.json')):
        os.remove(os.path.join(filename, 'utterances.json'))
    if os.path.exists(os.path.join(filename, 'parsed-bin.p')):
        os.remove(os.path.join(filename, 'parsed-bin.p'))

    if PARSE:
        print('loading corpus')
        corpus = Corpus(filename)
        print('parsing corpus')
        textparser = TextParser(verbosity=VERBOSITY)
        corpus = textparser.transform(corpus)

        print('dumping parses')
        corpus.dump_info('utterance', ['parsed'])
    os.remove(os.path.join(ROOT_DIR, corpus_name + '.zip'))
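# Note (hedged): dump_info writes the listed fields to a separate sidecar file
# (info.parsed.jsonl) next to the corpus, which is why the script first strips
# the bulky 'parsed' entries out of utterances.jsonl.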
    def dump_kwic(self, resultfile, separator=","):
        self.separator = separator
        self.ft = open(resultfile, "w")
        print("========================================================================")
        print("Started processing")
        print("========================================================================")
        # self.corpus is assumed to have been set by the enclosing class's __init__
        # (the original line assigned an undefined name `corpus` here)

        print("Creating KWIC")
        textprep = TextProcessor(proc_fn=self.prep_text, output_field='clean_text')
        textparser = TextParser(output_field='parsed', input_field='clean_text', mode='parse')
        getmodals = ModalSentences(input_field='parsed', output_field='ismodal')
        getquestions = QuestionSentences(input_field='parsed', output_field='questions')
        # header row

        self.ft.write(self.separator.join([
            "Year", "Sentence ID", "Before", "Mod", "After", "Main Verb",
            "Passivized", "Passive", "Interrogative", "Role", "Speaker"]) + "\n")
        # Assuming the utterance file is sorted by year, iterate over utterances and skip all non-modal sentences.
        for u in self.corpus.iter_utterances():
            # pre processing 1. Clean up text (convokit inbuilt functionality)
            u = textprep.transform_utterance(u)
            # pre processing 2. Parse and tag sentence tree (convokit inbuilt functionality)
            u = textparser.transform_utterance(u)
            # pre processing 3. Tag modal sentences (custom built functionality)
            u = getmodals.transform_utterance(u)


            if u.meta["ismodal"] == 1:
                # processing 4. Tag interrogative sentences (convokit inbuilt functionality)
                u = getquestions.transform_utterance(u)
                parsedsents = u.meta["parsed"]
                # Loop through modal sentences
                for parsedsent in parsedsents:
                    try:
                        # use the first main verb immediately after the modal, with special
                        # handling for passive auxiliaries (e.g. skip "be" in "may be asked")
                        if len(parsedsent):
                            modalindex = passive = unmatchedmodal = i = 0
                            auxpass = mod = passivized = ""
                            inter = 0 if u.meta["questions"] == [] else 1
                            # sentence
                            for tokenized in parsedsent["toks"]:
                                # word
                                if tokenized["tag"] == "MD":
                                    mod = tokenized["tok"]
                                    unmatchedmodal = 1
                                    modalindex = i

                                # Skip aux passive verb, list passive terms (e.g. skip "be" in "may be asked")
                                if tokenized["tag"] == "VB" and tokenized["dep"] == "auxpass":
                                    passive = 1
                                    auxpass = tokenized["tok"]

                                # Find verb tagged as main verb after modal aux.
                                if ((tokenized["tag"] == "VB" and passive == 0) or
                                    ((passive == 1) and (tokenized["tag"] == "VBN"))) and unmatchedmodal == 1:
                                    verb = tokenized["tok"]
                                    if passive == 1:
                                        passivized = mod + "  " + auxpass + "  " + verb
                                    # Found main verb, print row
                                    self.printline(parsedsent, u, passive, inter, mod, modalindex, verb, passivized)
                                    auxpass = mod = passivized = ""
                                    modalindex = inter = passive = unmatchedmodal = 0
                                i = i + 1
                        # print("Processed line ", u.id)
                    except Exception as e:
                        print("Exception on line ", u.id, ":", e)
        self.ft.close()
        print("========================================================================")
        print("Finished processing. Result file saved in convokit/supreme-modal/results folder.")
        print("========================================================================")