def test_contains_adjective_noun_pair_nominal_modifier(self):
        # The adjective "white" comes after the noun "car", separated from it
        # by a relative clause; the pair is still expected to be detected.
        sentence = self.nlp_pipeline(
            "the car that is driving down the street is white"
        ).sentences[0]

        result = contains_adjective_noun_pair(sentence, {"car"}, {"white"})

        # Result flags: (noun present, adjective present, pair present).
        self.assertEqual((True, True, True), result)
    def test_contains_adjective_noun_pair_hyphen_on_noun(self):
        # The noun "car" only occurs as the second part of the hyphenated
        # word "compact-car"; the pair is still expected to be detected.
        sentence = self.nlp_pipeline(
            "a white compact-car is driving down the street"
        ).sentences[0]

        result = contains_adjective_noun_pair(sentence, {"car"}, {"white"})

        # Result flags: (noun present, adjective present, pair present).
        self.assertEqual((True, True, True), result)
    def test_contains_adjective_noun_pair_plural(self):
        # The caption uses the plural "cars" while the noun set holds the
        # singular "car"; the pair is still expected to be detected.
        sentence = self.nlp_pipeline(
            "two white cars are driving down the street"
        ).sentences[0]

        result = contains_adjective_noun_pair(sentence, {"car"}, {"white"})

        # Result flags: (noun present, adjective present, pair present).
        self.assertEqual((True, True, True), result)
    def test_contains_adjective_noun_pair_wrong_noun_2(self):
        # Both "person" and "young" occur in the caption, but "young" directly
        # precedes "elephant", so only the pair flag is expected to be False.
        sentence = self.nlp_pipeline(
            "person inside display area with a young elephant"
        ).sentences[0]

        result = contains_adjective_noun_pair(sentence, {"person"}, {"young"})

        # Result flags: (noun present, adjective present, pair present).
        self.assertEqual((True, True, False), result)
    def test_contains_adjective_noun_pair_wrong_noun_3(self):
        # Both "window" and "blue" occur in the caption, but "blue" directly
        # precedes "minivan", so only the pair flag is expected to be False.
        sentence = self.nlp_pipeline(
            "a gray shaggy dog hanging out the driver side window of a blue minivan."
        ).sentences[0]

        result = contains_adjective_noun_pair(sentence, {"window"}, {"blue"})

        # Result flags: (noun present, adjective present, pair present).
        self.assertEqual((True, True, False), result)
# Example #6
# 0
def count_adjective_noun_pairs(nouns_file, adjectives_file, preprocessed_data_folder):
    """Count noun, adjective and adjective-noun pair occurrences per image.

    Loads the nouns and adjectives from two JSON files and the pickled
    POS-tagged captions from ``preprocessed_data_folder``. For every entry in
    the captions mapping, counts in how many of its captions
    ``contains_adjective_noun_pair`` reports the noun, the adjective and the
    combination. Every caption containing the combination is printed.

    The result (nouns, adjectives and the per-image counts together with each
    image's split) is written as JSON to ``"<adjective>_<noun>.json"`` (a
    relative path), named after the first adjective and the first noun.
    Finally, for each threshold from 1 to 5, the number of images reaching
    that count is printed for the noun, the adjective and the pair.

    Args:
        nouns_file: Path to a JSON file with the nouns; presumably a list of
            word forms (its first element names the output file) -- confirm
            against the caller.
        adjectives_file: Path to a JSON file with the adjectives; same shape
            assumption as ``nouns_file``.
        preprocessed_data_folder: Folder containing the pickle file named
            ``POS_TAGGED_CAPTIONS_FILENAME``.

    Returns:
        None. Results are written to disk and printed.
    """
    with open(nouns_file, "r") as nouns_handle:
        nouns = json.load(nouns_handle)

    with open(adjectives_file, "r") as adjectives_handle:
        adjectives = json.load(adjectives_handle)

    captions_path = os.path.join(
        preprocessed_data_folder, POS_TAGGED_CAPTIONS_FILENAME
    )
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # caption files produced by a trusted preprocessing run.
    with open(captions_path, "rb") as captions_handle:
        captions = pickle.load(captions_handle)

    # Looked up before any work is done so that empty inputs fail early.
    leading_noun, leading_adjective = nouns[0], adjectives[0]

    print("Looking for pairs: {} - {}".format(adjectives, nouns))

    per_image = {}
    for coco_id, tagged_caption in tqdm(captions.items()):
        counts = {
            PAIR_OCCURENCES: 0,
            ADJECTIVE_OCCURRENCES: 0,
            NOUN_OCCURRENCES: 0,
            DATA_COCO_SPLIT: tagged_caption[DATA_COCO_SPLIT],
        }
        per_image[coco_id] = counts

        for caption in tagged_caption["pos_tagged_captions"]:
            has_noun, has_adjective, has_pair = contains_adjective_noun_pair(
                caption, nouns, adjectives
            )
            if has_pair:
                # Echo every matching caption so matches can be inspected.
                print(" ".join(token.text for token in caption.tokens))
                counts[PAIR_OCCURENCES] += 1
            if has_adjective:
                counts[ADJECTIVE_OCCURRENCES] += 1
            if has_noun:
                counts[NOUN_OCCURRENCES] += 1

    data = {
        NOUNS: nouns,
        ADJECTIVES: adjectives,
        OCCURRENCE_DATA: per_image,
    }

    data_path = "{}_{}.json".format(leading_adjective, leading_noun)
    print("\nSaving results to {}".format(data_path))
    with open(data_path, "w") as output_handle:
        json.dump(data, output_handle)

    # One (counter key, message template) row per printed summary line.
    summary_rows = (
        (
            NOUN_OCCURRENCES,
            "\nFound {}\timages where the noun occurs at least {} time(s).",
        ),
        (
            ADJECTIVE_OCCURRENCES,
            "Found {}\timages where the adjective occurs at least {} time(s).",
        ),
        (
            PAIR_OCCURENCES,
            "Found {}\timages where the pair occurs at least {} time(s).",
        ),
    )
    for threshold in range(1, 6):
        for counter_key, template in summary_rows:
            matching_images = sum(
                1
                for image_counts in per_image.values()
                if image_counts[counter_key] >= threshold
            )
            print(template.format(matching_images, threshold))