def get_energy(self, hmm, rule_set_list, case_name):
    grammar = Grammar(hmm, RuleSet(rule_set_list))
    self.write_to_dot_to_file(grammar.get_nfa(), "grammar_nfa_" + case_name)
    hypothesis = Hypothesis(grammar, self.data)
    energy = hypothesis.get_energy()
    print("{}: {}".format(case_name, hypothesis.get_recent_energy_signature()))
    return energy
def get_hypothesis_from_log_string(hypothesis_string):
    from grammar import Grammar
    from hypothesis import Hypothesis
    hmm = get_hmm_from_hypothesis_string(hypothesis_string)
    rule_set = get_rule_set_from_hypothesis_string(hypothesis_string)
    grammar = Grammar(hmm, rule_set)
    return Hypothesis(grammar)
def test_crossover(self):
    from simulations import dag_zook_opacity as simulation
    self.initialise_simulation(simulation)
    from copy import deepcopy
    rule_1 = Rule.load([[{'cont': '+'}], [{'coronal': '-'}], [{'coronal': '-'}], [], True])
    rule_2 = Rule.load([[{'cons': '+', 'low': '-'}], [{'voice': '-'}], [{'voice': '-'}], [], True])
    crossover_rule_1 = deepcopy(rule_1)
    crossover_rule_2 = deepcopy(rule_2)
    crossover_rule_1.left_context_feature_bundle_list = rule_2.left_context_feature_bundle_list
    crossover_rule_1.right_context_feature_bundle_list = rule_2.right_context_feature_bundle_list
    crossover_rule_1.change_feature_bundle_list = rule_2.change_feature_bundle_list
    crossover_rule_2.left_context_feature_bundle_list = rule_1.left_context_feature_bundle_list
    crossover_rule_2.right_context_feature_bundle_list = rule_1.right_context_feature_bundle_list
    crossover_rule_2.change_feature_bundle_list = rule_1.change_feature_bundle_list
    rule_set_1 = RuleSet([crossover_rule_1])
    rule_set_2 = RuleSet([crossover_rule_2])
    print(rule_set_1)
    print(rule_set_2)

    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
               'q2': (['qf'], ['zo', 'go', 'do'])})
    grammar_1 = Grammar(hmm, rule_set_1)
    grammar_2 = Grammar(hmm, rule_set_2)

    data = ['kat', 'dot', 'dag', 'kod'] + \
           ['katso', 'dotso', 'dagzo', 'kodzo'] + \
           ['katko', 'dotko', 'daggo', 'kodgo'] + \
           ['katto', 'dotto', 'dagdo', 'koddo']
    self.configurations.simulation_data = data
    hypothesis_1 = Hypothesis(grammar_1)
    hypothesis_2 = Hypothesis(grammar_2)
    print(hypothesis_1.get_energy())
    print(hypothesis_2.get_energy())
def test_morphology_only2(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
    data = [u'tozata', u'tozaso', u'tozakt', u'tozzookata', u'tozzookaso', u'tozzookakt', u'tozzook',
            u'tozdodata', u'tozdodaso', u'tozdodakt', u'tozdod', u'tozgosata', u'tozgosaso', u'tozgosakt',
            u'tozgos', u'toz', u'dagata', u'dagaso', u'dagakt', u'dagzookata', u'dagzookaso', u'dagzookakt',
            u'dagzook', u'dagdodata', u'dagdodaso', u'dagdodakt', u'dagdod', u'daggosata', u'daggosaso',
            u'daggosakt', u'daggos', u'dag', u'gasata', u'gasaso', u'gasakt', u'gaszookata', u'gaszookaso',
            u'gaszookakt', u'gaszook', u'gasdodata', u'gasdodaso', u'gasdodakt', u'gasdod', u'gasgosata',
            u'gasgosaso', u'gasgosakt', u'gasgos', u'gas', u'kodata', u'kodaso', u'kodakt', u'kodzookata',
            u'kodzookaso', u'kodzookakt', u'kodzook', u'koddodata', u'koddodaso', u'koddodakt', u'koddod',
            u'kodgosata', u'kodgosaso', u'kodgosakt', u'kodgos', u'kod', u'katata', u'kataso', u'katakt',
            u'katzookata', u'katzookaso', u'katzookakt', u'katzook', u'katdodata', u'katdodaso', u'katdodakt',
            u'katdod', u'katgosata', u'katgosaso', u'katgosakt', u'katgos', u'kat', u'dotata', u'dotaso',
            u'dotakt', u'dotzookata', u'dotzookaso', u'dotzookakt', u'dotzook', u'dotdodata', u'dotdodaso',
            u'dotdodakt', u'dotdod', u'dotgosata', u'dotgosaso', u'dotgosakt', u'dotgos', u'dot']
    hmm = HMM({'q0': [u'q1'],
               'q1': ([u'q2', u'q3', u'qf'], ['toz', 'dag', 'kat', 'dot', 'kod', 'gas']),
               'q2': ([u'q3', u'qf'], ['zook', 'gos', 'dod']),
               'q3': ([u'qf'], ['aso', 'akt', 'ata'])})
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(Grammar(hmm, []))
def test_abadnese_no_rule(self):
    self.initialise_segment_table("abd_segment_table.txt")
    data = ['bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba', 'babbadba',
            'bbabbad', 'baabbad', 'babbbad', 'bbabadad', 'baabadad', 'babbadad',
            'bbabbab', 'baabbab', 'babbbab', 'bbabadab', 'baabadab', 'babbadab']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
               'q2': (['qf'], ['dba', 'dad', 'dab', 'bba', 'bad', 'bab'])})
    grammar = Grammar(hmm, [])
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    self.assertEqual(int(hypothesis.get_energy()), 243)
def test_plural_english_hypothesis(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.rule_set = self.get_rule_set("plural_english_rule_set.json")
    plural_english_data = 1 * ['kats', 'dogz', 'kat', 'dog']
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    grammar = Grammar(hmm, self.rule_set)
    self.write_to_dot_file(self.rule_set.rules[0].get_transducer(), "plural_english_rule")
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
    self.configurations.simulation_data = plural_english_data
    hypothesis = Hypothesis(grammar)
    self.assertEqual(int(hypothesis.get_energy()), 117)
def test_hypothesis_unique_representation(self):
    from simulations import french_two_rules as simulation
    self.initialise_simulation(simulation)

    hmm_1 = HMM({'q0': ['q1'],
                 'q1': (['q2'], ['klop', 'kylt', 'provok']),
                 'q2': (['qf'], ['kif', 'timid', 'fad', 'mal', 'byvabl', EPSILON])})
    rule_1 = Rule.load([[], [{"center": "+"}], [{"cons": "+", "son": "+"}, {"son": "+", "cons": "+"}],
                        [{"MB": True}, {"cons": "+"}], False])
    rule_set_1 = RuleSet([rule_1])
    grammar_1 = Grammar(hmm_1, rule_set_1)
    hypothesis_1 = Hypothesis(grammar_1)
    repr_1 = repr(hypothesis_1)

    hmm_2 = HMM({'q0': ['q1'],
                 'q2': (['qf'], ['kif', 'timid', 'mal', 'fad', 'byvabl', EPSILON]),
                 'q1': (['q2'], ['provok', 'kylt', 'klop'])})
    rule_2 = Rule.load([[], [{"center": "+"}], [{"son": "+", "cons": "+"}, {"cons": "+", "son": "+"}],
                        [{"MB": True}, {"cons": "+"}], False])
    rule_set_2 = RuleSet([rule_2])
    grammar_2 = Grammar(hmm_2, rule_set_2)
    hypothesis_2 = Hypothesis(grammar_2)
    repr_2 = repr(hypothesis_2)

    print(repr_1)
    print(repr_2)
    assert repr_1 == repr_2
def test_opacity_two_hypotheses(self):
    from simulations import dag_zook_opacity as simulation
    self.initialise_simulation(simulation)
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'q3'], ['daot', 'dkoz', 'dog', 'dok', 'gdaas', 'gkas', 'kaos', 'kat',
                                     'kood', 'ksoag', 'ogtd', 'oktdo', 'skaz', 'tak', 'tso']),
               'q2': (['qf'], ['go', 'kazka', 'soka', 'ta', EPSILON]),
               'q3': (['qf'], ['da', 'saat', 'tsk', 'zoka'])})
    epenthesis_rule = Rule([], [{'low': '+'}], [{'coronal': '+'}], [{'coronal': '+'}], True)
    assimilation_rule = Rule([{'cons': '+'}], [{'voice': '-'}], [{'voice': '-'}], [], True)
    rule_set = RuleSet([assimilation_rule, epenthesis_rule])
    grammar = Grammar(hmm, rule_set)
    hypothesis = Hypothesis(grammar)
    print(hypothesis.get_energy())
def test_incest(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.rule_set = self.get_rule_set("plural_english_rule_set.json")
    plural_english_data = 1 * ['kats', 'dogz', 'kat', 'dog']
    hmm = HMM({INITIAL_STATE: ['q1'],
               'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
               'q2': ([FINAL_STATE], ['z'])})
    grammar = Grammar(hmm, self.rule_set)
    h1 = Hypothesis(grammar, plural_english_data)
    h2 = deepcopy(h1)
    h2.data = ['kakakakkakakaka']
    print(GeneticAlgorithm._is_incest(h1, h2))
def test_abadnese(self):
    self.initialise_segment_table("abd_segment_table.txt")
    data = ['bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba', 'babbadba',
            'bbabbad', 'baabbad', 'babbbad', 'bbabadad', 'baabadad', 'babbadad',
            'bbabbab', 'baabbab', 'babbbab', 'bbabadab', 'baabadab', 'babbadab']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
               'q2': (['qf'], ['dba', 'dad', 'dab'])})
    rule = Rule.load([[{"cons": "+"}], [{"labial": "+"}], [{"labial": "+"}], [], True])
    rule_set = RuleSet([rule])
    grammar = Grammar(hmm, rule_set)
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    hypothesis.get_energy()
    self.assertEqual(ceil(hypothesis.get_energy()), 231)
def test_assimilation_no_rule(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    data = ['kat', 'dot', 'dag', 'kod'] + \
           ['katso', 'dotso', 'dagzo', 'kodzo'] + \
           ['katko', 'dotko', 'daggo', 'kodgo'] + \
           ['katto', 'dotto', 'dagdo', 'koddo']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
               'q2': (['qf'], ['zo', 'go', 'do', 'to', 'so', 'ko'])})
    grammar = Grammar(hmm, [])
    hypothesis = Hypothesis(grammar)
    self.configurations.simulation_data = data
    self.assertEqual(int(hypothesis.get_energy()), 230)
def test_parser(self):
    hmm_multiple_paths = HMM({INITIAL_STATE: ['q1', 'q3'],
                              'q1': (['q2', FINAL_STATE], ['dog', 'kat', 'kats', 'dogz']),
                              'q2': ([FINAL_STATE], ['z']),
                              'q3': (['q3', FINAL_STATE], self.plural_english_segments)})
    grammar = Grammar(hmm_multiple_paths, self.plural_english_rule_set)
    hypothesis = Hypothesis(grammar, ['dogz'])
    nfa = grammar.get_nfa()
    parses, outputs = nfa_parser(nfa, 'dogz')
    print(parses)
    print(outputs)
    nfa = grammar.get_nfa()
    self.write_to_dot_to_file(nfa, "test_parser_nfa")
def test_assimilation2(self):
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.rule_set = self.get_rule_set("plural_english_rule_set.json")
    data = ['kat', 'dot', 'dag', 'kod'] + \
           ['katso', 'dotso', 'dagzo', 'kodzo'] + \
           ['katko', 'dotko', 'daggo', 'kodgo'] + \
           ['katto', 'dotto', 'dagdo', 'koddo']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
               'q2': (['qf'], ['zo', 'go', 'do'])})
    grammar = Grammar(hmm, self.rule_set)
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    for _ in range(10):  # 1.4
        energy = hypothesis.get_energy()
def __init__(self, threadID, dataSet, minStartWeight, maxStartWeight):
    Process.__init__(self)
    self.threadID = threadID
    self.weights = Weights(dataSet.getDataPoint(0).dimension())
    self.weights.generateRandom(minStartWeight, maxStartWeight)
    weight = """*****\nThread {0} initial weights:\n{1}\n*****"""
    print(weight.format(self.threadID, self.weights.vector))
    self.trainingErrors = Errors(dataSet.trainingDataPoints, dataSet.trainingActualValues)
    self.testingErrors = Errors(dataSet.testingDataPoints, dataSet.testingActualValues)
    self.hypothesis = Hypothesis()
    self.trainingErrors.updateToLower(self.weights, self.hypothesis)
    self.iterations = 0
    self.alpha = STARTING_ALPHA
def mergeEntries(self, entriesLst, cube_indx):
    # First process the goal: this will be a (regular/glue) rule
    sf_f_obj = sff.initNew(entriesLst[0].lm_heu)
    score = entriesLst[0].getScoreSansLmHeu()

    # Now process the antecedents
    anteHyps = []
    anteSfFeats = []
    anteItemsStates = []
    for ante_ent in entriesLst[1:]:
        score += ante_ent.getScoreSansLmHeu()
        anteHyps.append(ante_ent.tgt)
        anteSfFeats.append(ante_ent.sf_feat)
        anteItemsStates.append(ante_ent.consItems)

    (tgt_hyp, newConsItems) = lmm.helperConsItem(Lazy.is_last_cell, Lazy.cell_type,
                                                 Lazy.cell_span, entriesLst[0].tgt.split(),
                                                 anteHyps, anteItemsStates)

    if settings.opts.force_decode and not Lazy.candMatchesRef(tgt_hyp):
        return (score, None)  # Hypothesis wouldn't lead to the reference; ignore it

    """ Get the hypothesis status from the classmethod (in Lazy); it can take one of three values:
        -2 : hyp was not seen earlier; create a new entry
        -1 : hyp was seen earlier but the current one has a better score; create a new entry to replace the existing one
         0 : hyp was seen earlier and has a poorer score than the existing one; ignore it
    """
    score_wo_LM = score - sf_f_obj.aggregSFScore(anteSfFeats)
    hyp_status = Lazy.getHypothesisStatus(tgt_hyp, score_wo_LM)

    """ Should we recombine the hypothesis? A new hypothesis is always added; query the LM for
        the lm-score and create a new entry_obj. If an identical hypothesis exists, the current
        hyp is added only under these conditions:
        i)  the use_unique_nbest flag is False (add the new hyp, but use the LM score of the existing one)
        ii) use_unique_nbest is True and the new hyp is better than the existing one.
    """
    if (hyp_status == 0 and settings.opts.use_unique_nbest):
        entry_obj = None
    else:
        score += sf_f_obj.helperScore(newConsItems, Lazy.is_last_cell)
        entry_obj = Hypothesis(score, self.src_side, tgt_hyp, sf_f_obj, self.depth_hier,
                               Lazy.cell_span, entriesLst[0], entriesLst[1:], newConsItems)

    return (score, entry_obj)
def test_abadnese_no_rule(self):
    self.initialise_segment_table("abd_segment_table.txt")
    data = ['bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba', 'babbadba',
            'bbabbad', 'baabbad', 'babbbad', 'bbabadad', 'baabadad', 'babbadad',
            'bbabbab', 'baabbab', 'babbbab', 'bbabadab', 'baabadab', 'babbadab']
    hmm = HMM({'q0': ['q1'],
               'q1': (['q2', 'qf'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
               'q2': (['qf'], ['dba', 'dad', 'dab', 'bba', 'bad', 'bab'])})
    grammar = Grammar(hmm, [])
    hypothesis = Hypothesis(grammar, data)
    self.assertEqual(hypothesis.get_energy(), 252)
def get_energy(self, simulation_case):
    case_name = simulation_case.case_name
    if isinstance(simulation_case.hmm_dict, HMM):
        hmm = simulation_case.hmm_dict
    else:
        hmm = HMM(simulation_case.hmm_dict)

    if isinstance(simulation_case.flat_rule_set_list, RuleSet):
        rule_set = simulation_case.flat_rule_set_list
    else:
        rule_set_list = []
        for flat_rule in simulation_case.flat_rule_set_list:
            rule_set_list.append(Rule(*flat_rule))
        rule_set = RuleSet(rule_set_list)

    grammar = Grammar(hmm, rule_set)
    self.write_to_dot_to_file(hmm, "hmm")
    self.write_to_dot_to_file(grammar.get_nfa(), "grammar_nfa_" + case_name)
    hypothesis = Hypothesis(grammar, self.data)
    energy = hypothesis.get_energy()
    print("{}: {}".format(case_name, hypothesis.get_recent_energy_signature()))
    return energy
def test_abnese(self):
    self.initialise_segment_table("ab_segment_table.txt")
    self.configurations["BRACKET_TRANSDUCER"] = True
    data = ['bab', 'aabab']
    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['bb', 'aabb'])})
    rule = Rule([], [{"cons": "-"}], [{"cons": "+"}], [{"cons": "+"}], False)  # e->a / b_b
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("bb"))
    grammar = Grammar(hmm, rule_set)
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    print(hypothesis.get_energy())
    print(hypothesis.get_recent_energy_signature())
def test_katso_two_rule(self):
    # configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
    self.initialise_segment_table("plural_english_segment_table.txt")
    data = ['kat', 'dot', 'dag', 'kod', 'gas', 'toz'] + \
           ['katso', 'dotso', 'dagzo', 'kodzo', 'gasazo', 'tozazo'] + \
           ['katko', 'dotko', 'daggo', 'kodgo', 'gasko', 'tozgo'] + \
           ['katto', 'dotto', 'dagdo', 'koddo', 'gasto', 'tozdo']
    hmm = {'q0': ['q1'],
           'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']),
           'q2': (['qf'], ['zo', 'go', 'do'])}
    epenthesis_rule = Rule.load([[], [{"cons": "-", "low": "+"}], [{"cons": "+", "cont": "+"}],
                                 [{"cons": "+", "cont": "+"}], True])
    assimilation_rule = Rule.load([[{"cons": "+"}], [{"voice": "-"}], [{"voice": "-"}], [], True])
    rule_set = RuleSet([epenthesis_rule, assimilation_rule])
    hmm = HMM(hmm)
    grammar = Grammar(hmm, rule_set)
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    self.assertEqual(int(hypothesis.get_energy()), 364)
def test_abadnese_for_ezer(self):
    self.initialise_segment_table("abd_segment_table.txt")
    data = ['aabad', 'abad', 'badaabad', 'aba', 'aaba', 'badaa']
    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['aabd', 'abd', 'bdaabd', 'aba', 'aaba', 'bdaa'])})
    rule = Rule([], [{}], [{"cons": "+", "labial": "+"}], [{"cons": "+", "labial": "+"}], obligatory=True)
    rule_set = RuleSet([rule])
    print(rule_set.get_outputs_of_word("abb"))
    grammar = Grammar(hmm, rule_set)
    hypothesis = Hypothesis(grammar, data)
def test_get_parsing_results_infinite(self):
    self.initialise_segment_table("abnese_lengthening_segment_table.txt")
    configurations["MORPHEME_BOUNDARY_FLAG"] = True
    configurations["LENGTHENING_FLAG"] = True
    configurations["HMM_ENCODING_LENGTH_MULTIPLIER"] = 100
    configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 20
    hmm = HMM({'q0': ['q1'],
               'q1': (['qf'], ['ab:a', 'baaa', 'baab', 'baab:a'])})
    rule = Rule([], [{'long': '-'}], [], [{}], obligatory=False)
    rule_set = RuleSet([rule])
    grammar = Grammar(hmm, rule_set)
    data = [u'baba:a', u'babaab:ab', u'ab:a', u'aab:a', u'aab:ab', u'ab:ab']
    hypothesis = Hypothesis(grammar, data)
    simulated_annealing = SimulatedAnnealing(hypothesis, 0)
    print(simulated_annealing._get_parsing_results())
def get_action_infos(query, actions, force_copy=False):
    action_info_list = []
    prediction = Hypothesis()
    for t, action in enumerate(actions):
        action_info = ActionInfo(action)
        action_info.t = t
        if prediction.frontier_node:
            action_info.parent_t = prediction.frontier_node.created_time
            action_info.frontier_prod = prediction.frontier_node.production
            action_info.frontier_field = prediction.frontier_field.field

        if isinstance(action, GenTokenAction):
            try:
                token_source_index = query.index(str(action.token))
                action_info.copy_from_src = True
                action_info.src_token_position = token_source_index
            except ValueError:
                if force_copy:
                    raise ValueError('Can\'t copy input token %s' % action.token)

        prediction.apply_action(action)
        action_info_list.append(action_info)

    return action_info_list
file_log_handler.setFormatter(file_log_formatter)
logger.addHandler(file_log_handler)

feature_tables_dir_path = join(dir_name, "tests/fixtures/feature_tables")
constraint_sets_dir_path = join(dir_name, "tests/fixtures/constraint_sets")
feature_table_file_path = join(feature_tables_dir_path, current_simulation.feature_table_file_name)
feature_table = FeatureTable.load(feature_table_file_path)
constraint_set_file_path = join(constraint_sets_dir_path, current_simulation.constraint_set_file_name)
constraint_set = ConstraintSet.load(constraint_set_file_path)
corpus = Corpus(current_simulation.corpus)
data = corpus.get_words()
max_word_length_in_data = max([len(word) for word in data])
lexicon = Lexicon(data, max_word_length_in_data)
grammar = Grammar(constraint_set, lexicon)
hypothesis = Hypothesis(grammar, data)

if hasattr(current_simulation, "target_energy"):
    target_energy = current_simulation.target_energy
else:
    target_energy = None

simulated_annealing = SimulatedAnnealing(hypothesis, target_energy)
simulated_annealing.run()
import csv, traceback, json
from hypothesis import Hypothesis
from collections import defaultdict

h = Hypothesis(username='******', token='{TOKEN}')


def orderByFields(names):
    fields = 'project_id,report_id,report_title,report_url,report_date,media_content,media_url,report_status,report_author,time_delta_to_first_status,time_delta_to_last_status,time_original_media_publishing,type,contributing_users,tags,notes_count,notes_ugc_count,tasks_count,tasks_resolved_count,task_question_1,task_user_1,task_date_1,task_answer_1,task_note_1,task_question_2,task_user_2,task_date_2,task_answer_2,task_note_2,task_question_3,task_user_3,task_date_3,task_answer_3,task_note_3,task_question_4,task_user_4,task_date_4,task_answer_4,task_note_4,task_question_5,task_user_5,task_date_5,task_answer_5,task_note_5,task_question_6,task_user_6,task_date_6,task_answer_6,task_note_6,task_question_7,task_user_7,task_date_7,task_answer_7,task_note_7,task_question_8,task_user_8,task_date_8,task_answer_8,task_note_8,task_question_9,task_user_9,task_date_9,task_answer_9,task_note_9,task_question_10,task_user_10,task_date_10,task_answer_10,task_note_10,task_question_11,task_user_11,task_date_11,task_answer_11,task_note_11,task_question_12,task_user_12,task_date_12,task_answer_12,task_note_12,task_question_13,task_user_13,task_date_13,task_answer_13,task_note_13,task_question_14,task_user_14,task_date_14,task_answer_14,task_note_14,task_question_15,task_user_15,task_date_15,task_answer_15,task_note_15,task_question_16,task_user_16,task_date_16,task_answer_16,task_note_16,task_question_17,task_user_17,task_date_17,task_answer_17,task_note_17,task_question_18,task_user_18,task_date_18,task_answer_18,task_note_18,task_question_19,task_user_19,task_date_19,task_answer_19,task_note_19,task_question_20,task_user_20,task_date_20,task_answer_20,task_note_20,task_question_21,task_user_21,task_date_21,task_answer_21,task_note_21,task_question_22,task_user_22,task_date_22,task_answer_22,task_note_22,task_question_23,task_user_23,task_date_23,task_answer_23,task_note_23,task_question_24,task_user_24,task_date_24,task_answer_24,task_note_24,note_date_1,note_user_1,note_content_1,task_question_25,task_user_25,task_date_25,task_answer_25,task_note_25'.split(',')
    orderedList = []
    for fieldName in fields:
        if fieldName in names:
            orderedList.append(fieldName)
    return orderedList


headerTemplate = """<p> <i>Checked at {report_url}</i> </p> <hr> """
questionTemplate = """<p>{question}</p>"""
multiAnswerTemplate = """<p><ul>{answers}</ul></p>"""
singleAnswerTemplate = """<li>{answer}</li>"""

targets = [
    'https://www.usatoday.com/story/news/nation-now/2017/06/16/coconut-oil-isnt-healthy-its-never-been-healthy/402719001/',
def parse(self, sentence):
    primitive_vocab = self.vocab.primitive
    processed_sentence = self.process([sentence], self.vocab.source)
    source_encodings, (last_encoder_state, last_encoder_cell) = TranxParser.encode(
        processed_sentence, [len(sentence)])
    source_encodings_attention_linear_layer = nn.Linear(256, 256, bias=False)
    # project the encodings once; the projected tensor is what gets expanded per hypothesis below
    source_encodings_attention_linear = source_encodings_attention_linear_layer(source_encodings)
    decoder_initial_vector = F.tanh(self.decoder_cell_initializer_linear_layer(last_encoder_cell))
    h_t1 = decoder_initial_vector
    hypothesis_scores = Variable(torch.cuda.FloatTensor([0.]), volatile=True)
    source_token_positions_by_token = OrderedDict()
    for token_position, token in enumerate(sentence):
        source_token_positions_by_token.setdefault(token, []).append(token_position)
    t = 0
    hypotheses = [Hypothesis()]
    hypotheses_states = [[]]
    finished_hypotheses = []

    while len(finished_hypotheses) < 15 and t < 100:
        num_of_hypotheses = len(hypotheses)
        expanded_source_encodings = source_encodings.expand(
            num_of_hypotheses, source_encodings.size(1), source_encodings.size(2))
        expanded_source_encodings_attention_linear = source_encodings_attention_linear.expand(
            num_of_hypotheses,
            source_encodings_attention_linear.size(1),
            source_encodings_attention_linear.size(2))
        if t == 0:
            x = Variable(torch.cuda.FloatTensor(1, 128).zero_(), volatile=True)
        else:
            actions = [h.actions[-1] for h in hypotheses]
            action_embeddings = []
            for action in actions:
                if action:
                    if isinstance(action, ApplyRuleAction):
                        action_embedding = self.apply_const_and_reduce_emb.weight[
                            self.grammar.production_to_id[action.production]]
                    elif isinstance(action, ReduceAction):
                        action_embedding = self.apply_const_and_reduce_emb.weight[len(self.grammar)]
                    else:
                        action_embedding = self.primitives_emb.weight[self.vocab.primitive[action.token]]
                    action_embeddings.append(action_embedding)
                else:
                    action_embeddings.append(Variable(torch.cuda.FloatTensor(128).zero_()))
            action_embeddings = torch.stack(action_embeddings)
            encoder_inputs = [action_embeddings]
            encoder_inputs.append(att_t1)

            frontier_fields = [h.frontier_field.field for h in hypotheses]
            frontier_field_embeddings = self.fields_emb(
                Variable(torch.cuda.FloatTensor([self.grammar.field_to_id[f] for f in frontier_fields])))
            encoder_inputs.append(frontier_field_embeddings)

            parent_created_times = [h.frontier_node.created_time for h in hypotheses]
            parent_states = torch.stack([hypotheses_states[h_id][parent_created_time][0]
                                         for h_id, parent_created_time in enumerate(parent_created_times)])
            parent_cells = torch.stack([hypotheses_states[h_id][parent_created_time][1]
                                        for h_id, parent_created_time in enumerate(parent_created_times)])
            encoder_inputs.append(parent_states)

            x = torch.cat(encoder_inputs, dim=-1)

        (h_t, cell), attention = self.step(x, h_t1, expanded_source_encodings,
                                           expanded_source_encodings_attention_linear)

        log_p_of_each_apply_rule_action = F.log_softmax(self.production_prediction(attention), dim=-1)
        p_of_generating_each_primitive_in_vocab = F.softmax(self.primitive_prediction(attention), dim=-1)
        p_of_copying_from_source_sentence = self.ptr_net_lin(
            source_encodings, None, attention.unsqueeze(0).squeeze(0))
        p_of_making_primitive_prediction = F.softmax(self.gen_vs_copy_lin(attention), dim=-1)
        p_of_each_primitive = p_of_making_primitive_prediction[:, 0].unsqueeze(1) * \
            p_of_generating_each_primitive_in_vocab

        hypothesis_ids_for_which_we_gentoken = []
        hypothesis_unknowns_resulting_from_gentoken = []
        hypothesis_ids_for_which_we_applyrule = []
        hypothesis_production_ids_resulting_from_applyrule_actions = []
        hypothesis_scores_resulting_from_applyrule_actions = []

        for hypothesis_id, hypothesis in enumerate(hypotheses):
            action_types = self.transition_sys.get_valid_continuation_types(hypothesis)
            for action_type in action_types:
                if action_type == ApplyRuleAction:
                    productions = self.transition_sys.get_valid_continuating_productions(hypothesis)
                    for production in productions:
                        production_id = self.grammar.production_to_id[production]
                        hypothesis_production_ids_resulting_from_applyrule_actions.append(production_id)
                        production_score = log_p_of_each_apply_rule_action[hypothesis_id, production_id].data[0]
                        new_hypothesis_score = hypothesis.score + production_score
                        hypothesis_scores_resulting_from_applyrule_actions.append(new_hypothesis_score)
                        hypothesis_ids_for_which_we_applyrule.append(hypothesis_id)
                elif action_type == ReduceAction:
                    reduce_score = log_p_of_each_apply_rule_action[hypothesis_id, len(self.grammar)].data[0]
                    new_hypothesis_score = hypothesis.score + reduce_score
                    hypothesis_scores_resulting_from_applyrule_actions.append(new_hypothesis_score)
                    hypothesis_production_ids_resulting_from_applyrule_actions.append(len(self.grammar))
                    hypothesis_ids_for_which_we_applyrule.append(hypothesis_id)
                else:
                    hypothesis_ids_for_which_we_gentoken.append(hypothesis_id)
                    hypothesis_copy_probabilities_by_token = dict()
                    copied_unks_info = []
                    for token, token_positions in source_token_positions_by_token.items():
                        total_copy_prob = torch.gather(
                            p_of_copying_from_source_sentence[hypothesis_id], 0,
                            Variable(torch.cuda.LongTensor(token_positions))).sum()
                        p_of_making_copy = p_of_making_primitive_prediction[hypothesis_id, 1] * total_copy_prob
                        if token in primitive_vocab:
                            token_id = primitive_vocab[token]
                            p_of_each_primitive[hypothesis_id, token_id] = \
                                p_of_each_primitive[hypothesis_id, token_id] + p_of_making_copy
                            hypothesis_copy_probabilities_by_token[token] = (token_positions,
                                                                             p_of_making_copy.data[0])
                        else:
                            copied_unks_info.append({'token': token,
                                                     'token_positions': token_positions,
                                                     'copy_prob': p_of_making_copy.data[0]})
                    if len(copied_unks_info) > 0:
                        copied_unk = np.array([unk['copy_prob'] for unk in copied_unks_info]).argmax()
                        copied_token = copied_unks_info[copied_unk]['token']
                        p_of_each_primitive[hypothesis_id, primitive_vocab.unk_id] = \
                            copied_unks_info[copied_unk]['copy_prob']
                        hypothesis_unknowns_resulting_from_gentoken.append(copied_token)
                        hypothesis_copy_probabilities_by_token[copied_token] = (
                            copied_unks_info[copied_unk]['token_positions'],
                            copied_unks_info[copied_unk]['copy_prob'])

        new_hypothesis_scores = None
        if hypothesis_scores_resulting_from_applyrule_actions:
            new_hypothesis_scores = Variable(
                torch.cuda.FloatTensor(hypothesis_scores_resulting_from_applyrule_actions))
        if hypothesis_ids_for_which_we_gentoken:
            log_p_of_each_primitive = torch.log(p_of_each_primitive)
            gen_token_new_hypothesis_scores = (
                hypothesis_scores[hypothesis_ids_for_which_we_gentoken].unsqueeze(1) +
                log_p_of_each_primitive[hypothesis_ids_for_which_we_gentoken, :]).view(-1)
            if new_hypothesis_scores is None:
                new_hypothesis_scores = gen_token_new_hypothesis_scores
            else:
                new_hypothesis_scores = torch.cat([new_hypothesis_scores, gen_token_new_hypothesis_scores])

        top_new_hypothesis_scores, top_new_hypothesis_positions = torch.topk(
            new_hypothesis_scores,
            k=min(new_hypothesis_scores.size(0), 15 - len(finished_hypotheses)))

        working_hypothesis_ids = []
        new_hypotheses = []
        for new_hypothesis_score, new_hypothesis_position in zip(top_new_hypothesis_scores.data.cpu(),
                                                                 top_new_hypothesis_positions.data.cpu()):
            action_info = ActionInfo()
            if new_hypothesis_position < len(hypothesis_scores_resulting_from_applyrule_actions):
                previous_hypothesis_id = hypothesis_ids_for_which_we_applyrule[new_hypothesis_position]
                previous_hypothesis = hypotheses[previous_hypothesis_id]
                # look up the production id recorded for this beam entry
                production_id = hypothesis_production_ids_resulting_from_applyrule_actions[new_hypothesis_position]
                if production_id < len(self.grammar):
                    apply_production = self.grammar.id_to_production[production_id]
                    action = ApplyRuleAction(apply_production)
                else:
                    action = ReduceAction()
            else:
                token_id = (new_hypothesis_position -
                            len(hypothesis_scores_resulting_from_applyrule_actions)) % p_of_each_primitive.size(1)
                previous_hypothesis_id = hypothesis_ids_for_which_we_gentoken[
                    (new_hypothesis_position -
                     len(hypothesis_scores_resulting_from_applyrule_actions)) // p_of_each_primitive.size(1)]
                previous_hypothesis = hypotheses[previous_hypothesis_id]
                if token_id == primitive_vocab.unk_id:
                    if hypothesis_unknowns_resulting_from_gentoken:
                        token = hypothesis_unknowns_resulting_from_gentoken[
                            (new_hypothesis_position -
                             len(hypothesis_scores_resulting_from_applyrule_actions)) // p_of_each_primitive.size(1)]
                    else:
                        token = primitive_vocab.id_to_word[primitive_vocab.unk_id]
                else:
                    token = primitive_vocab.id_2_word[token_id]
                action = GenTokenAction(token)
                if token in source_token_positions_by_token:
                    action_info.copy_from_src = True
                    action_info.src_token_position = source_token_positions_by_token[token]

            action_info.action = action
            action_info.t = t
            if t > 0:
                action_info.parent_t = previous_hypothesis.frontier_node.created_time
                action_info.frontier_prod = previous_hypothesis.frontier_node.production
                action_info.frontier_field = previous_hypothesis.frontier_field.field

            new_hypothesis = previous_hypothesis.clone_and_apply_action_info(action_info)
            new_hypothesis.score = new_hypothesis_score
            if new_hypothesis.completed:
                finished_hypotheses.append(new_hypothesis)
            else:
                new_hypotheses.append(new_hypothesis)
                working_hypothesis_ids.append(previous_hypothesis_id)

        if working_hypothesis_ids:
            hypotheses_states = [hypotheses_states[i] + [(h_t[i], cell[i])] for i in working_hypothesis_ids]
            h_t1 = (h_t[working_hypothesis_ids], cell[working_hypothesis_ids])
            att_t1 = attention[working_hypothesis_ids]
            hypotheses = new_hypotheses
            hypothesis_scores = Variable(torch.cuda.FloatTensor([hyp.score for hyp in hypotheses]))
            t += 1
        else:
            break

    finished_hypotheses.sort(key=lambda hyp: -hyp.score)
    return finished_hypotheses
target_stems_and_suffixes = get_target_stems_and_suffixes(target_stems, surface_suffixes)
corpus = target_stems + target_stems_and_suffixes
corpus = Corpus(corpus)
data = corpus.get_words()
max_word_length_in_data = max([len(word) for word in data])

# initial hypothesis
initial_lexicon = Lexicon(data, max_word_length_in_data)
initial_grammar = Grammar(initial_constraint_set, initial_lexicon)
initial_hypothesis = Hypothesis(initial_grammar, data)
initial_energy = initial_hypothesis.get_energy()
print(f"initial hypothesis: {initial_hypothesis.get_recent_energy_signature()}")
print(f"initial energy: {initial_energy}")
print_empty_line()

target_hmm_transitions = {INITIAL_STATE: ["q1"], "q1": ["q2", FINAL_STATE], "q2": [FINAL_STATE]}
target_hmm_emissions = {"q1": target_stems, "q2": ["kun"]}
target_hmm_inner_states = ["q1", "q2"]
target_hmm = HMM(target_hmm_transitions, target_hmm_emissions, target_hmm_inner_states)
def empty_hypothesis() -> Hypothesis:
    return Hypothesis([], 0.0, 0.0, None)
def init_target_GD(self, time):
    if len(self.unmatched.keys()) < 2:
        return

    for cid, value in self.unmatched.items():
        _detections = list()
        for detection in value['detections']:
            if get_believe(detection) > self.conf_threshold:
                _detections.append(detection)
        value['detections'] = np.array(_detections)

    H = []
    for idx, (key, value) in enumerate(self.unmatched.items()):
        if idx == 0:
            H = [Hypothesis(value['camera'], detection, self.epi_threshold)
                 for detection in value['detections']]
        else:
            n_hyp = len(H)
            n_det = len(value['detections'])
            C = np.zeros((n_hyp, n_det))
            Mask = np.zeros_like(C).astype('int32')
            for hid, hyp in enumerate(H):
                for pid, detection in enumerate(value['detections']):
                    pose_cost, veto = hyp.calculate_cost(value['camera'], detection)
                    C[hid, pid] = pose_cost
                    if veto:
                        Mask[hid, pid] = 1

            rows, cols = linear_sum_assignment(C)
            handled_pids = set()
            for hid, pid in zip(rows, cols):
                is_masked = Mask[hid, pid] == 1
                handled_pids.add(pid)
                if is_masked:
                    H.append(Hypothesis(value['camera'], value['detections'][pid], self.epi_threshold))
                else:
                    H[hid].merge(value['camera'], value['detections'][pid])

            for pid, detection in enumerate(value['detections']):
                if pid not in handled_pids:
                    H.append(Hypothesis(value['camera'], value['detections'][pid], self.epi_threshold))

    for hid, hyp in enumerate(H):
        if hyp.size() > 1:
            # cameras, poses2d, pose3d, joints_views, succeed = hyp.get_3dpose(self.args.lambda_t)
            cameras, poses2d, pose3d, joints_views, succeed = hyp.get_3dpose_jf(
                self.args.init_threshold, self.args.lambda_t)
            if not succeed:
                continue
            if len(self.tracks_ids) == 0:
                track_id = 0
            else:
                track_id = max(self.tracks_ids) + 1
            self.tracks.append(IterTrack(track_id, time, cameras, poses2d, pose3d,
                                         joints_views, self.args, self.build3D))
            self.tracks_ids.add(track_id)
def get_target_hypo(self):
    target_hmm = deepcopy(self.simulation.target_hmm)
    target_rule_set = RuleSet.load_from_flat_list(self.simulation.target_tuple[1])
    return Hypothesis(Grammar(target_hmm, target_rule_set))
def hypo_from_strings(self, hmm_str, rules_str):
    final_hmm = self.parse_hmm(hmm_str)
    final_rule_set = self.parse_rules(rules_str)
    return Hypothesis(Grammar(final_hmm, final_rule_set))