예제 #1
0
 def test__should_filter_by_duration_with_randomness(self):
     """Randomised ``by_duration`` must not reproduce the in-order selection.

     The same call is issued twice. Both results are printed, the first is
     compared against the first two rows of the input, and the two results
     are compared against each other.

     NOTE(review): the final assertion relies on two random draws differing;
     confirm DataFilter guarantees that, otherwise this test can be flaky.
     """
     # row layout: speaker_id, clipped_utterance_file_name,
     # clipped_utterance_duration, audio_id, snr
     candidates = [
         (1, "file_1.wav", 4, "2010123", 13),
         (2, "file_2.wav", 2, "2010124", 11),
         (3, "file_3.wav", 4, "2010125", 18),
         (4, "file_4.wav", 4, "2010126", 19),
         (4, "file_4.wav", 2, "2010126", 24),
         (4, "file_4.wav", 2, "2010126", 24),
         (5, "file_4.wav", 2, "2010126", 24),
         (5, "file_4.wav", 2, "2010126", 24),
         (6, "file_4.wav", 2, "2010126", 24),
         (6, "file_4.wav", 2, "2010126", 24),
         (6, "file_4.wav", 2, "2010126", 24),
     ]
     # The leading two rows of the input, i.e. what a non-random pick
     # would presumably return for this limit.
     in_order_selection = [
         (1, "file_1.wav", 4, "2010123", 13),
         (2, "file_2.wav", 2, "2010124", 11),
     ]
     duration_limit = 7 / 3600

     picker = DataFilter()
     first_pick = picker.by_duration(candidates, duration_limit, "true", 1)
     second_pick = picker.by_duration(candidates, duration_limit, "true", 1)

     print(first_pick)
     print(second_pick)
     self.assertNotEqual(in_order_selection, first_pick)
     self.assertNotEqual(first_pick, second_pick)
예제 #2
0
    def test__should_apply_filters_with_by_snr_then_by_speaker(self):
        """``apply_filters`` with SNR and per-speaker criteria returns the expected rows.

        The result is checked for type, length and exact content.
        """
        criteria = {
            "by_source": "swayamprabha_chapter_30",
            "by_snr": {"gte": 13, "lte": 26},
            "by_speaker": {
                "lte_per_speaker_duration": 8 / 60,
                "gte_per_speaker_duration": 5 / 60,
                "with_threshold": 0,
            },
            # 'then_by_duration': 35
        }
        candidates = [
            (1, "file_10.wav", 4, "1", 13),
            (1, "file_11.wav", 1, "2", 13),
            (1, "file_12.wav", 2, "3", 13),
            (1, "file_13.wav", 4, "4", 13),
            (2, "file_2.wav", 4, "5", 11),
            (2, "file_2.wav", 4, "5", 18),
            (3, "file_3.wav", 2, "6", 18),
            (3, "file_4.wav", 4, "7", 18),
            (4, "file_5.wav", 4, "8", 19),
            (4, "file_6.wav", 4, "9", 24),
            (4, "file_7.wav", 4, "10", 24),
            (5, "file_50.wav", 5, "10", 25),
            (5, "file_51.wav", 4, "10", 26),
            (5, "file_52.wav", 5, "10", 27),
            (6, "file_53.wav", 5, "10", 25),
            (6, "file_54.wav", 4, "10", 26),
            (6, "file_55.wav", 5, "10", 27),
        ]
        wanted = [
            (1, "file_10.wav", 4, "1", 13),
            (1, "file_11.wav", 1, "2", 13),
            (1, "file_12.wav", 2, "3", 13),
            (3, "file_3.wav", 2, "6", 18),
            (3, "file_4.wav", 4, "7", 18),
            (4, "file_5.wav", 4, "8", 19),
            (4, "file_6.wav", 4, "9", 24),
            (5, "file_50.wav", 5, "10", 25),
            (6, "file_53.wav", 5, "10", 25),
        ]

        outcome = DataFilter().apply_filters(criteria, candidates)

        # Same container type as the expectation (a plain list) ...
        self.assertEqual(list, type(outcome))
        # ... same number of rows ...
        self.assertEqual(len(wanted), len(outcome))
        # ... and the very same rows in the same order.
        self.assertEqual(wanted, outcome)
예제 #3
0
    def test__should_apply_filters_with_by_snr_then_by_duration_randomness(
            self):
        """``apply_filters`` with ``with_randomness`` must not return the in-order rows.

        The filters are applied twice to the same input. Each result must
        differ from ``expected_utterances`` (the rows listed below in input
        order), and the two results must differ from each other.

        NOTE(review): the last assertion depends on two random draws
        differing; confirm DataFilter guarantees that, otherwise this test
        can be flaky.
        """
        utterances = [
            (1, "file_10.wav", 4, "1", 13),
            (1, "file_11.wav", 1, "2", 13),
            (1, "file_12.wav", 2, "3", 13),
            (1, "file_13.wav", 4, "4", 13),
            (2, "file_2.wav", 4, "5", 11),
            (2, "file_2.wav", 4, "5", 18),
            (3, "file_3.wav", 2, "6", 18),
            (3, "file_4.wav", 4, "7", 18),
            (4, "file_5.wav", 4, "8", 19),
            (4, "file_6.wav", 4, "9", 24),
            (4, "file_7.wav", 4, "10", 24),
            (5, "file_50.wav", 5, "10", 25),
            (5, "file_51.wav", 4, "10", 26),
            (5, "file_52.wav", 5, "10", 27),
            (6, "file_53.wav", 5, "10", 25),
            (6, "file_54.wav", 4, "10", 26),
            (6, "file_55.wav", 5, "10", 27),
        ]

        filters = {
            "by_source": "swayamprabha_chapter_30",
            "by_snr": {
                "gte": 13,
                "lte": 26
            },
            "by_duration": 21,
            "with_randomness": "true",
        }

        expected_utterances = [
            (1, "file_10.wav", 4, "1", 13),
            (1, "file_11.wav", 1, "2", 13),
            (1, "file_12.wav", 2, "3", 13),
            (1, "file_13.wav", 4, "4", 13),
            (2, "file_2.wav", 4, "5", 18),
            (3, "file_3.wav", 2, "6", 18),
            (3, "file_4.wav", 4, "7", 18),
        ]
        data_filter = DataFilter()
        filtered1 = data_filter.apply_filters(filters, utterances)
        filtered2 = data_filter.apply_filters(filters, utterances)
        self.assertEqual(type(expected_utterances),
                         type(filtered1))  # check they are the same type
        # Each randomised result is compared against the in-order rows exactly
        # once (the original repeated the filtered1 check twice).
        self.assertNotEqual(expected_utterances, filtered1)
        self.assertNotEqual(expected_utterances, filtered2)
        self.assertNotEqual(filtered1, filtered2)
    def __init__(self, postgres_client, gcs_instance, **kwargs):
        """Wire up the collaborators this processor works with.

        Args:
            postgres_client: kept on the instance and handed to ``CatalogueDao``.
            gcs_instance: kept on the instance and wrapped in ``GCPFileSystem``.
            **kwargs: forwarded untouched to the parent initialiser.
        """
        self.postgres_client = postgres_client
        self.gcs_instance = gcs_instance
        self.data_tagger_config = None  # not set here; starts out as None
        self.data_filter = DataFilter()

        cpu_total = multiprocessing.cpu_count()
        Logger.info(f"Total available cpu count:{cpu_total}")

        # Second argument is the cpu count scaled down by ESTIMATED_CPU_SHARE
        # (true division, so the value is a float).
        file_system = GCPFileSystem(self.gcs_instance)
        self.data_mover = MediaFilesMover(
            file_system,
            cpu_total / ESTIMATED_CPU_SHARE,
        )
        self.catalogue_dao = CatalogueDao(self.postgres_client)

        super().__init__(**kwargs)
예제 #5
0
    def test__should_apply_filters_only_source(self):
        """With an empty filter dict ``apply_filters`` returns the input rows unchanged."""
        candidates = [
            (1, "file_10.wav", 4, "1", 13),
            (1, "file_11.wav", 1, "2", 13),
            (1, "file_12.wav", 2, "3", 13),
            (1, "file_13.wav", 4, "4", 13),
            (2, "file_2.wav", 4, "5", 11),
            (3, "file_3.wav", 2, "6", 18),
            (3, "file_4.wav", 4, "7", 18),
            (4, "file_5.wav", 4, "8", 19),
            (4, "file_6.wav", 4, "9", 24),
            (4, "file_7.wav", 4, "10", 24),
            (5, "file_50.wav", 5, "10", 25),
            (5, "file_51.wav", 4, "10", 26),
            (5, "file_52.wav", 5, "10", 27),
            (6, "file_53.wav", 5, "10", 25),
            (6, "file_54.wav", 4, "10", 26),
            (6, "file_55.wav", 5, "10", 27),
        ]
        # The expectation is the input itself; copy it up front so the
        # comparison does not depend on the list object handed to the filter.
        wanted = list(candidates)
        no_criteria = {}

        outcome = DataFilter().apply_filters(no_criteria, candidates)

        self.assertEqual(list, type(outcome))  # same container type
        self.assertEqual(len(wanted), len(outcome))  # same number of rows
        self.assertEqual(wanted, outcome)
예제 #6
0
 def test__should_filter_by_speaker_duration(self):
     """``by_per_speaker_duration`` returns the expected subset of rows.

     The call result is materialised with ``list`` and then checked for
     type, length and exact content.
     """
     # row layout: speaker_id, clipped_utterance_file_name,
     # clipped_utterance_duration, audio_id, snr
     candidates = [
         (1, "file_10.wav", 40, "1", 13),
         (1, "file_11.wav", 10, "2", 13),
         (1, "file_12.wav", 20, "3", 13),
         (1, "file_13.wav", 4, "4", 13),
         (2, "file_2.wav", 40, "5", 11),
         (3, "file_3.wav", 20, "6", 18),
         (3, "file_4.wav", 40, "7", 18),
         (4, "file_5.wav", 40, "8", 19),
         (4, "file_6.wav", 40, "9", 24),
         (4, "file_7.wav", 40, "10", 24),
         (5, "file_50.wav", 50, "10", 25),
         (5, "file_51.wav", 40, "10", 26),
         (5, "file_52.wav", 5, "10", 27),
         (6, "file_53.wav", 15, "10", 25),
         (6, "file_54.wav", 25, "10", 26),
         (6, "file_55.wav", 40, "10", 27),
     ]
     speaker_limits = {
         "lte_per_speaker_duration": 1,
         "gte_per_speaker_duration": 0,
         "with_threshold": 2 / 60,
     }
     wanted = [
         (1, "file_10.wav", 40, "1", 13),
         (1, "file_11.wav", 10, "2", 13),
         (2, "file_2.wav", 40, "5", 11),
         (3, "file_3.wav", 20, "6", 18),
         (3, "file_4.wav", 40, "7", 18),
         (4, "file_5.wav", 40, "8", 19),
         (5, "file_50.wav", 50, "10", 25),
         (6, "file_53.wav", 15, "10", 25),
         (6, "file_54.wav", 25, "10", 26),
     ]

     selection = DataFilter().by_per_speaker_duration(
         candidates, speaker_limits)
     outcome = list(selection)

     self.assertEqual(list, type(outcome))  # same container type
     self.assertEqual(len(wanted), len(outcome))  # same number of rows
     self.assertEqual(wanted, outcome)
예제 #7
0
 def test__should_filter_by_duration(self):
     """``by_duration`` with a limit of ``7 / 3600`` returns the first two input rows."""
     # row layout: speaker_id, clipped_utterance_file_name,
     # clipped_utterance_duration, audio_id, snr
     candidates = [
         (1, "file_1.wav", 4, "2010123", 13),
         (2, "file_2.wav", 2, "2010124", 11),
         (3, "file_3.wav", 4, "2010125", 18),
         (4, "file_4.wav", 4, "2010126", 19),
         (4, "file_4.wav", 2, "2010126", 24),
     ]
     duration_limit = 7 / 3600

     outcome = DataFilter().by_duration(candidates, duration_limit)

     wanted = [
         (1, "file_1.wav", 4, "2010123", 13),
         (2, "file_2.wav", 2, "2010124", 11),
     ]
     print(wanted)
     self.assertEqual(wanted, outcome)
예제 #8
0
    def test__should_apply_filters_only_source_and_utterance_duration_and_then_by_snr(
            self):
        """``apply_filters`` with utterance-duration and SNR bounds returns the expected rows.

        The result is checked for type and exact content.
        """
        criteria = {
            "by_utterance_duration": {"gte": 2, "lte": 4},
            "by_snr": {"gte": 13, "lte": 19},
        }
        candidates = [
            (1, "file_10.wav", 4, "1", 13),
            (1, "file_11.wav", 1, "2", 13),
            (1, "file_12.wav", 2, "3", 13),
            (1, "file_13.wav", 4, "4", 13),
            (2, "file_2.wav", 4, "5", 11),
            (3, "file_3.wav", 2, "6", 18),
            (3, "file_4.wav", 4, "7", 18),
            (4, "file_5.wav", 4, "8", 19),
            (4, "file_6.wav", 4, "9", 24),
            (4, "file_7.wav", 4, "10", 24),
            (5, "file_50.wav", 5, "10", 25),
            (5, "file_51.wav", 4, "10", 26),
            (5, "file_52.wav", 5, "10", 27),
            (6, "file_53.wav", 5, "10", 25),
            (6, "file_54.wav", 4, "10", 26),
            (6, "file_55.wav", 5, "10", 27),
        ]
        wanted = [
            (1, "file_10.wav", 4, "1", 13),
            (1, "file_12.wav", 2, "3", 13),
            (1, "file_13.wav", 4, "4", 13),
            (3, "file_3.wav", 2, "6", 18),
            (3, "file_4.wav", 4, "7", 18),
            (4, "file_5.wav", 4, "8", 19),
        ]

        outcome = DataFilter().apply_filters(criteria, candidates)

        self.assertEqual(list, type(outcome))  # same container type
        self.assertEqual(wanted, outcome)
예제 #9
0
 def test__should_filter_by_snr(self):
     """``by_snr`` with bounds 15..50 keeps the three rows whose snr is 45, 19 and 21."""
     # row layout: speaker_id, clipped_utterance_file_name,
     # clipped_utterance_duration, audio_id, snr
     audio_id = 202009112003117071  # every row shares this integer id
     candidates = [
         (1, "file_1.wav", 10, audio_id, 13),
         (2, "file_2.wav", 11, audio_id, 11),
         (3, "file_3.wav", 12, audio_id, 45),
         (4, "file_4.wav", 13, audio_id, 19),
         (4, "file_4.wav", 6, audio_id, 21),
         (4, "file_4.wav", 13, audio_id, 100),
     ]
     snr_bounds = {"gte": 15, "lte": 50}

     outcome = list(DataFilter().by_snr(candidates, snr_bounds))

     wanted = [
         (3, "file_3.wav", 12, audio_id, 45),
         (4, "file_4.wav", 13, audio_id, 19),
         (4, "file_4.wav", 6, audio_id, 21),
     ]
     # Prints the type of the audio id field of the first returned row.
     print(type(outcome[0][3]))
     self.assertEqual(wanted, outcome)
예제 #10
0
 def test__should_return_empty_list_if_zero_utterances_pased(self):
     """``apply_filters`` on an empty input yields nothing, even with every filter configured."""
     criteria = {
         "by_source": "swayamprabha_chapter_30",
         "by_snr": {"gte": 13, "lte": 26},
         "by_speaker": {
             "lte_per_speaker_duration": 8 / 60,
             "gte_per_speaker_duration": 5 / 60,
             "with_threshold": 0,
         },
         "by_duration": 21 / 3600,
     }
     no_rows = []

     outcome = list(DataFilter().apply_filters(criteria, no_rows))

     self.assertEqual([], outcome)
예제 #11
0
 def test__should_filter_by_utterance_duration(self):
     """``by_utterance_duration`` with bounds 2..4 drops the rows whose duration is 1 or 5."""
     # row layout: speaker_id, clipped_utterance_file_name,
     # clipped_utterance_duration, audio_id, snr
     candidates = [
         (1, "file_1.wav", 1, "2010123", 13),
         (2, "file_2.wav", 2, "2010124", 11),
         (3, "file_3.wav", 4, "2010125", 18),
         (4, "file_4.wav", 4, "2010126", 19),
         (4, "file_4.wav", 2, "2010126", 24),
         (5, "file_4.wav", 5, "2010126", 24),
     ]
     duration_bounds = {"lte": 4, "gte": 2}

     selection = DataFilter().by_utterance_duration(
         candidates, duration_bounds)
     outcome = list(selection)

     wanted = [
         (2, "file_2.wav", 2, "2010124", 11),
         (3, "file_3.wav", 4, "2010125", 18),
         (4, "file_4.wav", 4, "2010126", 19),
         (4, "file_4.wav", 2, "2010126", 24),
     ]
     self.assertEqual(wanted, outcome)
class DataMarker(BaseProcessor):
    """
    Select utterances of a source and stage them for transcription.

    Steps performed by ``process``:
    1. Load configuration
    2. Filter data based on criteria
    3. Tag/Mark data in the DB
    4. Move marked data
    """
    @staticmethod
    def get_instance(data_processor_instance, gcs_instance, **kwargs):
        """Build a ``DataMarker``.

        ``data_processor_instance`` is passed on as the ``postgres_client``
        constructor argument.
        """
        return DataMarker(data_processor_instance, gcs_instance, **kwargs)

    def __init__(self, postgres_client, gcs_instance, **kwargs):
        """Store the clients and create the filter, mover and DAO helpers.

        Args:
            postgres_client: kept on the instance and handed to ``CatalogueDao``;
                ``process`` also reads ``config_dict`` from it.
            gcs_instance: kept on the instance and wrapped in ``GCPFileSystem``.
            **kwargs: forwarded untouched to the parent initialiser.
        """
        self.postgres_client = postgres_client
        self.gcs_instance = gcs_instance
        self.data_tagger_config = None  # populated at the start of process()
        self.data_filter = DataFilter()
        Logger.info("Total available cpu count:" +
                    str(multiprocessing.cpu_count()))
        # NOTE(review): true division yields a float; confirm that
        # MediaFilesMover accepts a non-integer value here.
        self.data_mover = MediaFilesMover(
            GCPFileSystem(self.gcs_instance),
            multiprocessing.cpu_count() / ESTIMATED_CPU_SHARE,
        )
        self.catalogue_dao = CatalogueDao(self.postgres_client)

        super().__init__(**kwargs)

    def process(self, **kwargs):
        """
        Main function for running all processing that takes places in the data marker

        Reads the tagger configuration, fetches the "Clean" utterances of the
        requested source, applies the configured filters, records the
        surviving utterances through the catalogue DAO and finally hands
        their file paths to the media mover.

        Args:
            **kwargs: must contain ``source``; the filter criteria are read
                by ``get_config``.

        Raises:
            Exception: propagated from ``get_config`` when ``source`` is missing.
        """
        Logger.info("*************Starting data marker****************")
        self.data_tagger_config = self.postgres_client.config_dict.get(
            CONFIG_NAME)
        source, filter_criteria = self.get_config(**kwargs)
        Logger.info("Fetching utterances for source:" + source)
        utterances = self.catalogue_dao.get_utterances_by_source(
            source, "Clean")

        filtered_utterances = self.data_filter.apply_filters(
            filter_criteria, utterances)
        Logger.info("updating utterances that need to be staged, count=" +
                    str(len(filtered_utterances)))

        if len(filtered_utterances) > 0:
            rows_updated = (
                self.catalogue_dao.update_utterances_staged_for_transcription(
                    filtered_utterances, source))
            Logger.info("Rows updated:" + str(rows_updated))
        else:
            Logger.info("No utterances found for filter criteria")
        # The landing directory defaults to the source name unless the
        # criteria override it with "landing_source_dir".
        source_dir = filter_criteria.get("landing_source_dir", source)
        landing_path_with_source = (
            f"{self.data_tagger_config.get(LANDING_BASE_PATH)}/{source_dir}")
        source_path_with_source = (
            f"{self.data_tagger_config.get(SOURCE_BASE_PATH)}/{source}")
        files = self.to_files(filtered_utterances, source_path_with_source)
        Logger.info("Staging utterances to dir:" + landing_path_with_source)
        self.data_mover.move_media_files(files, landing_path_with_source)
        Logger.info("************* Data marker completed ****************")

    def to_files(self, utterances, source_path_with_source):
        """Map utterance rows to their ``clean`` file paths.

        Args:
            utterances: iterable of indexable rows; element 3 and element 1
                of each row are used (presumably audio id and file name --
                verify against the DAO query).
            source_path_with_source: base path the result is rooted at.

        Returns:
            list[str]: ``<base>/<row[3]>/clean/<row[1]>`` for every row, in
            input order.
        """
        # Built exactly once; the original constructed the same list twice
        # and threw the first copy away.
        return [
            f"{source_path_with_source}/{u[3]}/clean/{u[1]}"
            for u in utterances
        ]

    def get_config(self, **kwargs):
        """Extract the source name and the filter criteria from ``kwargs``.

        Returns:
            tuple: ``(source, criteria)`` where ``criteria`` is the value
            stored under ``FILTER_CRITERIA`` *inside* the ``FILTER_CRITERIA``
            entry of ``kwargs``.

        Raises:
            Exception: when ``source`` is absent or ``None``.
        """
        filter_criteria = kwargs.get(FILTER_CRITERIA, {})
        source = kwargs.get("source")

        if source is None:
            raise Exception("filter by source is mandatory")

        # NOTE(review): the nested lookup returns None when the inner key is
        # missing, which process() does not guard against -- confirm the
        # expected shape of the incoming configuration.
        return source, filter_criteria.get(FILTER_CRITERIA)