def test_contains_adjective_noun_pair_nominal_modifier(self):
    # The adjective "white" follows the noun "car" only after a relative
    # clause; the test expects the pair to be reported nonetheless.
    tagged_sentence = self.nlp_pipeline(
        "the car that is driving down the street is white"
    ).sentences[0]

    # Result layout: (noun present, adjective present, pair present).
    self.assertEqual(
        (True, True, True),
        contains_adjective_noun_pair(tagged_sentence, {"car"}, {"white"}),
    )
def test_contains_adjective_noun_pair_hyphen_on_noun(self):
    # The noun "car" appears as the second half of the hyphenated token
    # "compact-car"; the test expects it to still be matched and paired
    # with "white".
    tagged_sentence = self.nlp_pipeline(
        "a white compact-car is driving down the street"
    ).sentences[0]

    # Result layout: (noun present, adjective present, pair present).
    self.assertEqual(
        (True, True, True),
        contains_adjective_noun_pair(tagged_sentence, {"car"}, {"white"}),
    )
def test_contains_adjective_noun_pair_plural(self):
    # The caption uses the plural "cars" while the noun set holds the
    # singular "car"; the test expects the pair to be found anyway.
    tagged_sentence = self.nlp_pipeline(
        "two white cars are driving down the street"
    ).sentences[0]

    # Result layout: (noun present, adjective present, pair present).
    self.assertEqual(
        (True, True, True),
        contains_adjective_noun_pair(tagged_sentence, {"car"}, {"white"}),
    )
def test_contains_adjective_noun_pair_wrong_noun_2(self):
    # "young" describes the elephant, not the person: noun and adjective
    # are both expected to be present, but not as a pair.
    tagged_sentence = self.nlp_pipeline(
        "person inside display area with a young elephant"
    ).sentences[0]

    # Result layout: (noun present, adjective present, pair present).
    self.assertEqual(
        (True, True, False),
        contains_adjective_noun_pair(tagged_sentence, {"person"}, {"young"}),
    )
def test_contains_adjective_noun_pair_wrong_noun_3(self):
    # "blue" describes the minivan, not the window: noun and adjective
    # are both expected to be present, but not as a pair.
    tagged_sentence = self.nlp_pipeline(
        "a gray shaggy dog hanging out the driver side window of a blue minivan."
    ).sentences[0]

    # Result layout: (noun present, adjective present, pair present).
    self.assertEqual(
        (True, True, False),
        contains_adjective_noun_pair(tagged_sentence, {"window"}, {"blue"}),
    )
def count_adjective_noun_pairs(nouns_file, adjectives_file, preprocessed_data_folder):
    """Count noun, adjective and adjective-noun pair occurrences per image.

    Loads the noun and adjective lists from JSON, loads the pickled
    POS-tagged captions from ``preprocessed_data_folder``, and checks every
    caption of every image with ``contains_adjective_noun_pair``. The
    per-image counts are written to ``<first adjective>_<first noun>.json``
    in the current working directory, and a summary of how many images reach
    1 to 5 occurrences is printed.

    Args:
        nouns_file: Path to a JSON file holding the nouns (indexed with
            ``[0]``, so a non-empty sequence is required).
        adjectives_file: Path to a JSON file holding the adjectives (same
            requirement as ``nouns_file``).
        preprocessed_data_folder: Folder containing the pickled captions
            file named by ``POS_TAGGED_CAPTIONS_FILENAME``.
    """
    with open(nouns_file, "r") as nouns_handle:
        nouns = json.load(nouns_handle)
    with open(adjectives_file, "r") as adjectives_handle:
        adjectives = json.load(adjectives_handle)

    captions_path = os.path.join(
        preprocessed_data_folder, POS_TAGGED_CAPTIONS_FILENAME
    )
    # NOTE(review): unpickling can execute arbitrary code; this presumably
    # only ever reads files produced by the project's own preprocessing.
    with open(captions_path, "rb") as captions_handle:
        captions = pickle.load(captions_handle)

    # The first entry of each list is used to name the output file.
    head_noun = nouns[0]
    head_adjective = adjectives[0]

    print("Looking for pairs: {} - {}".format(adjectives, nouns))

    per_image = {}
    for coco_id, entry in tqdm(captions.items()):
        counts = {
            PAIR_OCCURENCES: 0,
            ADJECTIVE_OCCURRENCES: 0,
            NOUN_OCCURRENCES: 0,
            DATA_COCO_SPLIT: entry[DATA_COCO_SPLIT],
        }
        per_image[coco_id] = counts

        for sentence in entry["pos_tagged_captions"]:
            has_noun, has_adjective, has_pair = contains_adjective_noun_pair(
                sentence, nouns, adjectives
            )
            if has_pair:
                # Echo each matching caption so matches can be inspected.
                print(" ".join(token.text for token in sentence.tokens))
                counts[PAIR_OCCURENCES] += 1
            if has_adjective:
                counts[ADJECTIVE_OCCURRENCES] += 1
            if has_noun:
                counts[NOUN_OCCURRENCES] += 1

    results = {
        NOUNS: nouns,
        ADJECTIVES: adjectives,
        OCCURRENCE_DATA: per_image,
    }

    output_path = "{}_{}.json".format(head_adjective, head_noun)
    print("\nSaving results to {}".format(output_path))
    with open(output_path, "w") as output_handle:
        json.dump(results, output_handle)

    def images_with_at_least(key, threshold):
        # Number of images whose counter `key` is at least `threshold`.
        return sum(
            1 for image_counts in per_image.values() if image_counts[key] >= threshold
        )

    for n in range(1, 6):
        noun_images = images_with_at_least(NOUN_OCCURRENCES, n)
        adjective_images = images_with_at_least(ADJECTIVE_OCCURRENCES, n)
        pair_images = images_with_at_least(PAIR_OCCURENCES, n)

        print(
            "\nFound {}\timages where the noun occurs at least {} time(s).".format(
                noun_images, n
            )
        )
        print(
            "Found {}\timages where the adjective occurs at least {} time(s).".format(
                adjective_images, n
            )
        )
        print(
            "Found {}\timages where the pair occurs at least {} time(s).".format(
                pair_images, n
            )
        )