def process(self, input_interface):
    """Score generated facts against the precomputed TSV file.

    Reads FILENAME (tab-separated, with a header row), keeps only the rows
    whose (subject, predicate, object) triple matches an existing generated
    fact and whose score column (self._index) is non-zero, and appends a new
    GeneratedFact per matching row.
    """
    logging.info("Start the " + self._name + " archit submodule")
    # Triples already present in the input, for O(1) membership tests.
    known_triples = {
        (gf.get_subject().get(), gf.get_predicate().get(),
         gf.get_object().get())
        for gf in input_interface.get_generated_facts()
    }
    new_facts = []
    with open(FILENAME) as tsv_file:
        next(tsv_file, None)  # skip the header row
        for raw_line in tsv_file:
            fields = raw_line.strip().split("\t")
            subj, pred, obj = fields[0], fields[1], fields[2]
            if (subj, pred, obj) not in known_triples:
                continue
            score = float(fields[self._index])
            if score == 0:
                continue
            multi_score = MultipleScore()
            multi_score.add_score(score, self._module_reference, self)
            new_facts.append(
                GeneratedFact(subj, pred, obj, "", 0, multi_score,
                              MultipleSourceOccurrence()))
    return input_interface.add_generated_facts(new_facts)
def test_conceptual_caption(self):
    """The comparator must keep every fact and add its own scores."""
    sc = ConceptualCaptionsComparatorSubmodule(None)
    self.empty_input = Inputs()
    self.dummy_reference = ReferencableInterface("DUMMY")
    dataset = [("elephant", "download", "baby", 0),
               ("elephant", "have", "tusks", 1),
               ("lion", "eat", "gazella", 0),
               ("penguin", "eat", "fish", 0),
               ("gorilla", "eat", "banana", 0),
               ("sky", "hasProperty", "blue", 0),
               ("computer", "is", "working", 1),
               ("raccoon", "hasProperty", "blue", 0)]
    subjects = {
        Subject("elephant"), Subject("penguin"), Subject("lion"),
        Subject("gorilla"), Subject("sky"), Subject("computer"),
        Subject("raccoon")
    }
    gfs = []
    # Alternate the submodule the truth score is attributed to.
    for pos, (subject, predicate, obj, truth) in enumerate(dataset, start=1):
        score = MultipleScore()
        submodule_cls = (GoogleAutocompleteSubmodule if pos % 2 == 0
                         else BingAutocompleteSubmodule)
        score.add_score(truth, self.dummy_reference,
                        submodule_cls(self.dummy_reference))
        gfs.append(
            GeneratedFact(subject, predicate, obj, "", False, score,
                          MultipleSourceOccurrence()))
    # One extra fact with a raw-sentence source occurrence.
    score2 = MultipleScore()
    score2.add_score(1, self.dummy_reference,
                     GoogleAutocompleteSubmodule(self.dummy_reference))
    gfs.append(
        GeneratedFact(
            "elephant", "be", "big", "", False, score2,
            MultipleSourceOccurrence.from_raw("elephants are big", None, 1)))
    inputs = self.empty_input.add_generated_facts(gfs).add_subjects(subjects)
    inputs = sc.process(inputs)
    self.assertEqual(len(dataset) + 1, len(inputs.get_generated_facts()))
    self.assertEqual(
        len(inputs.get_generated_facts()[0].get_score().scores), 2)
    self.assertNotAlmostEqual(
        inputs.get_generated_facts()[1].get_score().scores[1][0],
        0,
        delta=1e-5)
def get_generated_fact_with_score_from_classifier(self, fact, clf):
    """Return *fact* wrapped in a GeneratedFact scored by classifier *clf*."""
    row = self.get_fact_row(fact)
    predicted = clf.predict(fact, row)
    scores = MultipleScore()
    scores.add_score(predicted, self.modules[fact], self.submodules[fact])
    modality = Modality.from_modalities_and_scores(
        self.modalities[fact].items())
    return GeneratedFact(
        fact.get_subject(),
        fact.get_predicate(),
        fact.get_object(),
        modality,
        fact.is_negative(),
        scores,
        self.sentences[fact],
        self.patterns[fact])
def test_save(self):
    """Round-trip Inputs through save/load and compare collection sizes.

    Fixes a copy-paste bug: the subjects assertion previously compared
    ``len(inputs.get_subjects())`` against
    ``len(inputs_read.get_generated_facts())`` instead of the reloaded
    subjects, so a broken subject serialization could go undetected.
    """
    inputs = Inputs()
    subjects = [Subject("baba"), Subject("coko")]
    patterns = [
        PatternGoogle("why are"),
        PatternGoogle("Why are", "hasProperty", True)
    ]
    mmr = MultipleModuleReference(ModuleReferenceInterface("Module0"))
    mmr.add_reference(ModuleReferenceInterface("Module1"))
    msr = MultipleSubmoduleReference(
        SubmoduleReferenceInterface("Submodule0"))
    msr.add_reference(SubmoduleReferenceInterface("Submodule0"))
    ms0 = MultipleScore()
    ms0.add_score(1.0, ModuleReferenceInterface("Module0"),
                  SubmoduleReferenceInterface("Submodule0"))
    ms1 = MultipleScore()
    ms1.add_score(1.0, mmr, msr)
    ms1.add_score(0.5, ModuleReferenceInterface("Module1"),
                  SubmoduleReferenceInterface("Submodule2"))
    mp0 = MultiplePattern()
    mp0.add_pattern(patterns[0])
    mp1 = MultiplePattern()
    mp1.add_pattern(patterns[0])
    mp1.add_pattern(patterns[1])
    gfs = [
        GeneratedFact(
            "baba", "is", "you", "sometimes", False, ms0,
            MultipleSourceOccurrence.from_raw("baba is you", msr, 1), mp0),
        GeneratedFact(
            "coko", "is", "dead", "always", True, ms1,
            MultipleSourceOccurrence.from_raw("toto is always dead", msr, 1),
            mp1)
    ]
    seeds = [
        Fact("baba", "is", "us", None, False),
        Fact("coko", "are", "missing", "coucou", True)
    ]
    objects = [Object("missing"), Object("you")]
    inputs = inputs.replace_seeds(seeds)
    inputs = inputs.replace_patterns(patterns)
    inputs = inputs.replace_subjects(subjects)
    inputs = inputs.replace_generated_facts(gfs)
    inputs = inputs.replace_objects(objects)
    inputs.save("temp.json")
    inputs_read = inputs.load("temp.json")
    self.assertEqual(len(inputs.get_generated_facts()),
                     len(inputs_read.get_generated_facts()))
    # Compare subjects against the reloaded subjects (was a copy-paste
    # comparing against the reloaded generated facts).
    self.assertEqual(len(inputs.get_subjects()),
                     len(inputs_read.get_subjects()))
    self.assertEqual(len(inputs.get_patterns()),
                     len(inputs_read.get_patterns()))
    self.assertEqual(len(inputs.get_seeds()),
                     len(inputs_read.get_seeds()))
    self.assertEqual(len(inputs.get_objects()),
                     len(inputs_read.get_objects()))
def read_score(score):
    """Deserialize a MultipleScore from its JSON dictionary representation.

    Raises UnknownSerializedObject for any other score "type" value.
    """
    if score["type"] != "MultipleScore":
        raise UnknownSerializedObject("Unknown score type:" +
                                      json.dumps(score))
    result = MultipleScore()
    for entry in score["scores"]:
        result.add_score(
            entry["score"],
            read_module_reference(entry["module_from"]),
            read_submodule_reference(entry["submodule_from"]))
    return result
def test_cache(self):
    """A second run over the same fact must use the cache and still score it."""
    wikipedia_cache = SimpleWikipediaCooccurrenceSubmodule(
        None, True, "simple-wikipedia-cache-test")
    # First pass populates the cache, second pass reads from it; the
    # input is rebuilt identically each time.
    for _ in range(2):
        gf = GeneratedFact("lion", "is a", "cat", "", False,
                           MultipleScore(), MultipleSourceOccurrence())
        inputs = self.empty_input.add_generated_facts([gf])
        inputs = wikipedia_cache.process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
    all_scores = inputs.get_generated_facts()[0].get_score()
    wiki_scores = [entry for entry in all_scores.scores
                   if entry[2].get_name() == "Simple Wikipedia Cooccurrence"]
    self.assertEqual(1, len(wiki_scores))
    self.assertTrue(wiki_scores[0][0] != 0)
    wikipedia_cache.cache.delete_cache()
def test_combination_modalities_long(self):
    """Combining facts with different predicates keeps the richer sources."""
    first_score = MultipleScore()
    first_score.add_score(1, None, None)
    second_score = MultipleScore()
    second_score.add_score(0.5, None, None)
    gf_with_modality = GeneratedFact(
        "parent", "go", "to Paris", "TBC[many]", False, first_score,
        MultipleSourceOccurrence.from_raw(
            "parents have many children", None, 1))
    gf_plain = GeneratedFact(
        "parent", "go to", "Paris", "", False, second_score,
        MultipleSourceOccurrence.from_raw("parents have children", None, 1))
    inputs = self.empty_input.add_generated_facts(
        [gf_with_modality, gf_plain])
    inputs = FactCombinor(None).process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
    combined = inputs.get_generated_facts()[0]
    self.assertIn("TBC[many]", combined.get_modality().get())
    source_text = str(combined.get_sentence_source())
    self.assertIn("parents have many children x#x1", source_text)
    self.assertIn("parents have children x#x1", source_text)
    self.assertEqual("go to", combined.get_predicate())
def test_do_nothing(self):
    """A subject absent from the subject list must be left untouched."""
    gf = GeneratedFact("crisis", "is a", "cat", "", False,
                       MultipleScore(), MultipleSourceOccurrence())
    inputs = self.empty_input.add_generated_facts([gf]).add_subjects(
        {Subject("lion")})
    inputs = self.to_lower_case.process(inputs)
    result = inputs.get_generated_facts()
    self.assertEqual(1, len(result))
    self.assertEqual("crisis", result[0].get_subject().get())
def test_lion(self):
    """Simple Wikipedia cooccurrence must add a single non-zero score."""
    gf = GeneratedFact("lion", "is a", "cat", "", False,
                       MultipleScore(), MultipleSourceOccurrence())
    inputs = self.empty_input.add_generated_facts([gf])
    inputs = self.simple_wikipedia_no_cache.process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
    all_scores = inputs.get_generated_facts()[0].get_score()
    wiki_scores = [entry for entry in all_scores.scores
                   if entry[2].get_name() == "Simple Wikipedia Cooccurrence"]
    self.assertEqual(1, len(wiki_scores))
    self.assertTrue(wiki_scores[0][0] != 0)
def test_turn_singular_duplicate(self):
    """Duplicated facts are both kept and both singularized."""
    gf = GeneratedFact("lions", "is a", "cat", "", False,
                       MultipleScore(), MultipleSourceOccurrence())
    # The same fact object is added twice on purpose.
    inputs = self.empty_input.add_generated_facts([gf, gf]).add_subjects(
        {Subject("lion")})
    inputs = self.to_singular.process(inputs)
    result = inputs.get_generated_facts()
    self.assertEqual(2, len(result))
    self.assertEqual("lion", result[0].get_subject().get())
def test_combination_modalities(self):
    """Combining identical triples merges their modalities."""
    first_score = MultipleScore()
    first_score.add_score(1, None, None)
    second_score = MultipleScore()
    second_score.add_score(0.5, None, None)
    gf_some = GeneratedFact(
        "lion", "eat", "zebra", "some", False, first_score,
        MultipleSourceOccurrence.from_raw("lions eat zebras", None, 1))
    gf_often = GeneratedFact(
        "lion", "eat", "zebra", "often", False, second_score,
        MultipleSourceOccurrence.from_raw("lions eat zebras", None, 1))
    inputs = self.empty_input.add_generated_facts([gf_some, gf_often])
    inputs = FactCombinor(None).process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
    merged_modality = inputs.get_generated_facts()[0].get_modality().get()
    self.assertIn("some", merged_modality)
    self.assertIn("often", merged_modality)
def test_beach(self):
    """A fact with several raw source sentences still combines to one fact."""
    score = MultipleScore()
    score.add_score(1, None, None)
    mso = MultipleSourceOccurrence()
    # Several phrasings of the same fact, with occurrence counts.
    for sentence, count in [
        ("beaches have sand", 4),
        ("some beaches have sand", 2),
        ("some beaches have sand and some rocks", 1),
        ("all beaches have sand", 4),
        ("beach have sand", 1),
    ]:
        mso.add_raw(sentence, "Google Autocomplete", count)
    gf = GeneratedFact("beach", "have", "sand",
                       "some[subj/some] x#x3 // some[subj/all] x#x4",
                       False, score, mso)
    inputs = self.empty_input.add_generated_facts([gf])
    inputs = FactCombinor(None).process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
def test_not_remove(self):
    """A TBC modality backed by its source sentence must not be removed."""
    inputs = Inputs()
    mso = MultipleSourceOccurrence()
    mso.add_raw("elephants eat big bananas", None, 2)
    fact = GeneratedFact("elephant", "eat", "bananas", "TBC[big bananas]",
                         0, MultipleScore(), mso)
    inputs = inputs.add_generated_facts([fact])
    inputs = TBCCleaner(None).process(inputs)
    self.assertEqual(len(inputs.get_generated_facts()), 1)
def test_cache(self):
    """A second run over the same fact must use the cache and still score it."""
    google_book_cache = GoogleBookSubmodule(
        None, True, cache_name="google-book-cache-temp")
    # First pass populates the cache, second pass reads from it; the
    # input is rebuilt identically each time.
    for _ in range(2):
        gf = GeneratedFact("lion", "eat", "zebra", "", False,
                           MultipleScore(), MultipleSourceOccurrence())
        inputs = self.empty_input.add_generated_facts([gf])
        inputs = google_book_cache.process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
    all_scores = inputs.get_generated_facts()[0].get_score()
    book_scores = [entry for entry in all_scores.scores
                   if entry[2].get_name() == "Google Book Submodule"]
    self.assertEqual(1, len(book_scores))
    self.assertTrue(book_scores[0][0] != 0)
    google_book_cache.cache.delete_cache()
def test_combination(self):
    """Linear combination must keep every fact in the input."""
    dataset = [("elephant", "download", "baby", 0),
               ("elephant", "climb", "trunk", 0),
               ("elephant", "bear", "baby", 1),
               ("elephant", "download this cute illustration with",
                "baby", 0),
               ("elephant", "be", "ear", 0),
               ("elephant", "fry", "ear", 0),
               ("elephant", "trek", "travel", 0),
               ("elephant", "forbid love in", "water", 0),
               ("elephant", "eat", "bark", 1),
               ("elephant", "have", "tusks", 1)]
    gfs = []
    # Alternate the submodule the truth score is attributed to.
    for pos, (subject, predicate, obj, truth) in enumerate(dataset, start=1):
        score = MultipleScore()
        submodule_cls = (GoogleAutocompleteSubmodule if pos % 2 == 0
                         else BingAutocompleteSubmodule)
        score.add_score(truth, self.dummy_reference,
                        submodule_cls(self.dummy_reference))
        gfs.append(
            GeneratedFact(subject, predicate, obj, "", False, score,
                          MultipleSourceOccurrence()))
    # One extra fact with a raw-sentence source occurrence.
    score2 = MultipleScore()
    score2.add_score(1, self.dummy_reference,
                     GoogleAutocompleteSubmodule(self.dummy_reference))
    gfs.append(
        GeneratedFact(
            "elephant", "be", "big", "", False, score2,
            MultipleSourceOccurrence.from_raw("elephants are big", None, 1)))
    inputs = self.empty_input.add_generated_facts(gfs)
    inputs = self.linear_combination.process(inputs)
    self.assertEqual(len(dataset) + 1, len(inputs.get_generated_facts()))
def test_panda_flickr_cluster(self):
    """The Flickr cluster submodule must score the panda fact exactly once."""
    facts = [
        GeneratedFact("panda", "live", "china", "", False,
                      MultipleScore(), MultipleSourceOccurrence())
    ]
    inputs = self.empty_input.add_generated_facts(facts).add_subjects(
        {Subject("panda")})
    inputs = self.associations_flick_cluster.process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
    all_scores = inputs.get_generated_facts()[0].get_score()
    flickr_scores = [entry for entry in all_scores.scores
                     if entry[2].get_name() == "Flickr"]
    self.assertEqual(1, len(flickr_scores))
def test_lion_eat_code(self):
    """A nonsensical fact must get a zero score from Google Books."""
    gf = GeneratedFact("lion", "eat", "code", "", False,
                       MultipleScore(), MultipleSourceOccurrence())
    inputs = self.empty_input.add_generated_facts([gf])
    inputs = self.google_book_no_cache.process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
    all_scores = inputs.get_generated_facts()[0].get_score()
    book_scores = [entry for entry in all_scores.scores
                   if entry[2].get_name() == "Google Book Submodule"]
    self.assertEqual(1, len(book_scores))
    self.assertTrue(book_scores[0][0] == 0)
def test_panda_imagetag(self):
    """The image-tag submodule must score the panda fact exactly once.

    Consistency fix: pass ``Subject("panda")`` to ``add_subjects`` as every
    sibling test does (e.g. test_panda_flickr_cluster), instead of a raw
    string which would not compare equal to Subject instances.
    """
    new_gfs = [
        GeneratedFact("panda", "climb", "tree", "", False,
                      MultipleScore(), MultipleSourceOccurrence())
    ]
    inputs = self.empty_input.add_generated_facts(new_gfs).add_subjects(
        {Subject("panda")})
    inputs = self.associations.process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
    scores = inputs.get_generated_facts()[0].get_score()
    scores_imagetag = [
        x for x in scores.scores
        if x[2].get_name() == "Image Tag submodule"
    ]
    self.assertEqual(1, len(scores_imagetag))
def test_lion(self):
    """Duplicated facts are all kept; each still gets one Wikipedia score."""
    n_copies = 10
    gfs = [
        GeneratedFact("lion", "is a", "cat", "", False,
                      MultipleScore(), MultipleSourceOccurrence())
        for _ in range(n_copies)
    ]
    inputs = self.empty_input.add_generated_facts(gfs)
    inputs = self.wikipedia_no_cache.process(inputs)
    self.assertEqual(n_copies, len(inputs.get_generated_facts()))
    all_scores = inputs.get_generated_facts()[0].get_score()
    wiki_scores = [entry for entry in all_scores.scores
                   if entry[2].get_name() == "Wikipedia Cooccurrence"]
    self.assertEqual(1, len(wiki_scores))
    self.assertTrue(wiki_scores[0][0] != 0)
def add_facts_to_generated_facts(self, generated_facts, subject, predicate,
                                 obj, modality, negative,
                                 score_based_on_ranking, suggestion):
    """Append a CoreNLP-derived GeneratedFact when the suggestion's subject
    occurs in *subject*; otherwise do nothing.

    The fact carries two scores: a fixed 1.0 attributed to CoreNLP and the
    ranking-based score attributed to this submodule.
    """
    if suggestion[SUBJECT] not in subject:
        return
    scores = MultipleScore()
    scores.add_score(1.0, self._module_reference, reference_corenlp)
    scores.add_score(score_based_on_ranking, self._module_reference, self)
    generated_facts.append(
        GeneratedFact(
            subject, predicate, obj, modality, negative, scores,
            MultipleSourceOccurrence.from_raw(suggestion[0], self, 1),
            suggestion[2]))
def get_fact_from_simple_extraction(self, extraction, score, suggestion):
    """Build a GeneratedFact from a simple (subj, pred, obj, neg) extraction.

    Negativity is taken from the suggestion or from the extraction itself.
    Two scores are attached: the ranking-derived *score* from this submodule
    and a fixed 1.0 attributed to the manual reference.
    """
    negative = get_negativity(suggestion) or extraction[3]
    # For the score, inverse the ranking (higher is better) and add the
    # confidence of the triple.
    scores = MultipleScore()
    scores.add_score(score, self._module_reference, self)
    scores.add_score(1.0, self._module_reference, reference_manual)
    return GeneratedFact(
        extraction[0], extraction[1], extraction[2], None, negative,
        scores,
        MultipleSourceOccurrence.from_raw(suggestion[0], self, 1),
        suggestion[2])
def _openie_from_file(self, suggestions):
    """Extract facts from suggestion sentences with the OpenIE reader.

    Each suggestion is expanded into batch components, parsed by OpenIE,
    and every well-formed triple whose subject matches the suggestion's
    subject becomes a GeneratedFact carrying both the OpenIE confidence
    and the suggestion-ranking score.

    Fixes: the score parsing used a bare ``except:`` that swallowed every
    exception — and the handler itself read ``fact[3]``, so a missing
    fourth element would have re-raised *inside* the handler.  The except
    is narrowed to (ValueError, IndexError) and the log call no longer
    indexes the fact.
    """
    openie_reader = OpenIEReader()
    generated_facts = []
    new_suggestions = []
    for suggestion in suggestions:
        self.transforms_suggestion_into_batch_component(
            suggestion, new_suggestions)
    for suggestion in new_suggestions:
        sentence = suggestion[STATEMENT]
        facts = openie_reader.get_from_sentence(sentence)
        negative = get_negativity(suggestion)
        # Drop malformed triples (empty or single-character components).
        facts = [
            fact for fact in facts
            if len(fact) > 0 and len(fact[0]) > 1 and len(fact[1]) > 1
            and len(fact[2]) > 1
        ]
        score_based_on_ranking = self.get_score_based_on_ranking(
            suggestion)
        facts = self._take_earliest_predicate(sentence, facts)
        for fact in facts:
            if suggestion[SUBJECT] not in fact[0]:
                continue
            try:
                # OpenIE may emit a decimal comma; normalize before parsing.
                score = float(fact[3].replace(",", "."))
            except (ValueError, IndexError):
                logging.info(
                    "Problem in score reading in openie5 reader with %s",
                    fact)
                continue
            multiple_score = MultipleScore()
            multiple_score.add_score(score, self._module_reference,
                                     reference_openie5)
            multiple_score.add_score(score_based_on_ranking,
                                     self._module_reference, self)
            generated_facts.append(
                GeneratedFact(
                    fact[0], fact[1], fact[2], "", negative, multiple_score,
                    MultipleSourceOccurrence.from_raw(sentence, self, 1),
                    suggestion[2]))
    del openie_reader
    return generated_facts
def test_combination(self):
    """Three facts with the same triple merge into one, keeping all scores
    and all source sentences with summed occurrence counts."""
    score_a = MultipleScore()
    score_a.add_score(1, None, None)
    score_b = MultipleScore()
    score_b.add_score(0.5, None, None)
    score_c = MultipleScore()
    score_c.add_score(0.7, None, None)
    fact_a = GeneratedFact(
        "lion", "eat", "zebra", "", False, score_a,
        MultipleSourceOccurrence.from_raw("lions eat zebras", None, 1))
    sources_b = MultipleSourceOccurrence()
    sources_b.add_raw("lions eat zebras", None, 2)
    sources_b.add_raw("lions eat small zebras", None, 1)
    fact_b = GeneratedFact("lion", "eat", "zebra", "", False, score_b,
                           sources_b)
    fact_c = GeneratedFact(
        "lion", "eat", "zebra", "", False, score_c,
        MultipleSourceOccurrence.from_raw("lions eat small zebras", None, 1))
    inputs = self.empty_input.add_generated_facts([fact_a, fact_b, fact_c])
    inputs = FactCombinor(None).process(inputs)
    self.assertEqual(1, len(inputs.get_generated_facts()))
    combined = inputs.get_generated_facts()[0]
    self.assertEqual(3, len(combined.get_score().scores))
    source_text = str(combined.get_sentence_source())
    self.assertIn("lions eat zebras", source_text)
    self.assertIn("lions eat small zebras", source_text)
    # Occurrence counts: 1 + 2 = 3 for the first sentence, 1 + 1 = 2 for
    # the second.
    self.assertIn("x#x3", source_text)
    self.assertIn("x#x2", source_text)