def change_n_random_words(
        self,
        n: int,
        words: List[str],
        except_idxs: List[int] = None,
        shuffle: bool = True,
    ):
        """Alter one randomly chosen diacritic in each of ``n`` words.

        Words are visited in shuffled order (or in list order when ``shuffle``
        is false) until ``n`` of them have been modified. ``words`` is
        modified in place and is also returned.

        Args:
            n: Number of words that must be altered.
            words: Diacritized words; altered entries are overwritten.
            except_idxs: Word indices that must be left untouched.
            shuffle: Visit the words in random order when true.

        Returns:
            A ``(words, changed_idxs)`` tuple, where ``changed_idxs`` lists
            the indices of the altered words in the order they were changed.

        Raises:
            AssertionError: If fewer than ``n`` words could be altered.
        """
        candidates = list(range(len(words)))
        if shuffle:
            random.shuffle(candidates)

        changed_idxs = []
        for index in candidates:
            if len(changed_idxs) >= n:
                break

            if except_idxs and index in except_idxs:
                continue

            _, word_chars, word_diacritics = extract_haraqat(words[index])

            if len(word_diacritics) == 0:
                # Nothing to alter in this word; random.randint(0, -1) would
                # raise ValueError, so move on to the next candidate.
                continue

            position = random.randint(0, len(word_diacritics) - 1)
            word_diacritics[position] = get_different_haraqah(
                word_diacritics[position])
            words[index] = combine_txt_and_haraqat(word_chars, word_diacritics)

            changed_idxs.append(index)

        # Every altered word is recorded exactly once, so the list length is
        # the number of successful changes.
        assert len(changed_idxs) == n

        return words, changed_idxs
    def change_n_random_core_word(
        self,
        n: int,
        words: List[str],
        except_idxs: List[int] = None,
        shuffle: bool = True,
    ):
        """Alter one non-case-ending diacritic in each of ``n`` words.

        Words are visited in shuffled order (or in list order when ``shuffle``
        is false). A word is skipped when its index is in ``except_idxs`` or
        when ``count_diacritics(word, skip_count_equal=2)`` is below 2.
        ``words`` is modified in place and is also returned.

        Args:
            n: Number of words that must be altered.
            words: Diacritized words; altered entries are overwritten.
            except_idxs: Word indices that must be left untouched.
            shuffle: Visit the words in random order when true.

        Returns:
            A ``(words, changed_idxs)`` tuple, where ``changed_idxs`` lists
            the indices of the altered words in the order they were changed.

        Raises:
            AssertionError: If fewer than ``n`` words could be altered, or if
                an alteration left the word unchanged once its case ending is
                removed.
        """
        candidates = list(range(len(words)))
        if shuffle:
            random.shuffle(candidates)

        changed_idxs = []
        for index in candidates:
            if len(changed_idxs) >= n:
                break

            if except_idxs and index in except_idxs:
                continue

            original_word = words[index]
            if count_diacritics(original_word, skip_count_equal=2) < 2:
                continue

            _, word_chars, word_diacritics = extract_haraqat(original_word)
            case_ending = get_case_ending_indices_from_un_diacritized_txt(
                word_chars)

            if len(case_ending) == 1:
                # Any position except the single case-ending one is eligible.
                eligible = [
                    position for position in range(len(word_diacritics))
                    if position != case_ending[-1]
                ]
                target = random.choice(eligible)
            else:
                # Without a unique case-ending position, only the last slot
                # is excluded.
                target = random.randint(0, len(word_diacritics) - 2)

            # The position chosen above is used as-is. A leftover override
            # previously forced it to 0, which made the selection dead code.
            word_diacritics[target] = get_different_haraqah(
                word_diacritics[target])
            words[index] = combine_txt_and_haraqat(word_chars, word_diacritics)

            # The change must be visible after the case ending is removed.
            assert get_word_without_case_ending(
                words[index]) != get_word_without_case_ending(original_word)

            changed_idxs.append(index)

        assert len(changed_idxs) == n

        return words, changed_idxs
    def test_haraqat_extraction(self):
        """Check that extracting and recombining ``self.content`` round-trips.

        The text is split with ``extract_haraqat`` and rebuilt with
        ``combine_txt_and_haraqat``; the result must equal the original.
        """
        _, text, haraqat = extract_haraqat(self.content)
        combined_text = combine_txt_and_haraqat(text, haraqat)

        # Compare the whole strings. A zip()-based comparison stops at the
        # shorter one, so missing or extra trailing characters would pass.
        self.assertEqual(combined_text, self.content)
    def change_n_random_words_last_char(
        self,
        n: int,
        words: List[str],
        except_idxs: List[int] = None,
        shuffle: bool = True,
    ):
        """Alter the case-ending diacritic of ``n`` words.

        Words are visited in shuffled order (or in list order when ``shuffle``
        is false). A word qualifies only if its index is not in
        ``except_idxs``, ``count_diacritics(word, skip_count_equal=2)`` is at
        least 2, and exactly one case-ending index is reported for it.
        ``words`` is modified in place and is also returned.

        Returns:
            A ``(words, changed_idxs)`` tuple, where ``changed_idxs`` lists
            the indices of the altered words in the order they were changed.

        Raises:
            AssertionError: If fewer than ``n`` words could be altered.
        """
        visit_order = list(range(len(words)))
        if shuffle:
            random.shuffle(visit_order)

        changed_idxs = []
        for index in visit_order:
            if len(changed_idxs) >= n:
                break

            if except_idxs and index in except_idxs:
                continue

            _, word_chars, word_diacritics = extract_haraqat(words[index])

            if count_diacritics(words[index], skip_count_equal=2) < 2:
                continue

            ending_positions = get_case_ending_indices_from_un_diacritized_txt(
                word_chars)
            if len(ending_positions) != 1:
                continue

            target = ending_positions[-1]
            word_diacritics[target] = get_different_haraqah(
                word_diacritics[target])
            words[index] = combine_txt_and_haraqat(word_chars, word_diacritics)

            changed_idxs.append(index)

        # One entry is recorded per altered word.
        assert len(changed_idxs) == n

        return words, changed_idxs
    def test_der_case_and_not_case_ending(self):
        """Check ``calculate_der`` with and without case endings.

        ``self.case_ending_change`` diacritics at case-ending positions and
        ``self.not_case_ending_change`` diacritics at the remaining positions
        are replaced with different ones. The reported rate is then compared
        against ``calculate_rate`` for both ``case_ending`` settings.
        """
        _, text, haraqat = extract_haraqat(self.content)
        assert self.number_of_changes <= len(haraqat)

        case_ending_indices = get_case_ending_indices_from_un_diacritized_txt(
            text)
        case_ending_lookup = set(case_ending_indices)
        not_case_ending_indices = [
            position for position in range(len(haraqat))
            if position not in case_ending_lookup
        ]

        def corrupt(positions, how_many):
            # Shuffle, then replace the diacritic at the first `how_many`
            # shuffled positions with a different one.
            random.shuffle(positions)
            for step in range(how_many):
                target = positions[step]
                haraqat[target] = get_different_haraqah(haraqat[target])

        corrupt(case_ending_indices, self.case_ending_change)
        corrupt(not_case_ending_indices, self.not_case_ending_change)

        predicted_content = combine_txt_and_haraqat(text, haraqat)

        # All positions, case endings included.
        measured = calculate_der(self.content, predicted_content)
        expected = calculate_rate(
            len(haraqat) - self.number_of_changes, self.number_of_changes)
        self.assertEqual(expected, measured)

        # Case endings excluded.
        # NOTE(review): unlike the check above, the first argument here does
        # not subtract the number of changes. Confirm against calculate_rate.
        measured = calculate_der(self.content,
                                 predicted_content,
                                 case_ending=False)
        expected = calculate_rate(
            len(haraqat) - len(case_ending_indices),
            self.not_case_ending_change)
        self.assertEqual(expected, measured)
    def test_case_ending_indices(self):
        """Check the case-ending positions reported for a fixed sample text.

        The characters found at the reported indices must match
        ``expected_out`` exactly, in both number and order.
        """
        txt = (
            "الْمَسْأَلَةُ السَّابِعَةُ : فَضْلُ الْفَاتِحَةِ : لَيْسَ فِي أُمِّ الْقُرْآنِ حَدِيثٌ يَدُلُّ عَلَى "
            "فَضْلِهَا إلَّا حَدِيثَانِ : أَحَدُهُمَا : حَدِيثُ : { قَسَمْتُ الصَّلَاةَ بَيْنِي وَبَيْنَ عَبْدِي "
            "نِصْفَيْنِ . ")
        _, text, _ = extract_haraqat(txt)
        indices = get_case_ending_indices_from_un_diacritized_txt(text)
        expected_out = "ةةلةسيمنثلىااناثتةينين"

        # assertTrue(a, b) treats b as the failure message, so it never
        # compared the two lengths. assertEqual performs the real check.
        self.assertEqual(len(indices), len(expected_out))

        found = [text[index] for index in indices]
        self.assertEqual(found, list(expected_out))
Exemplo n.º 7
0
    def __getitem__(self, index):
        """Build one sample for position ``index``.

        The sample id is looked up in ``self.list_ids``. When
        ``self.config["is_data_preprocessed"]`` is truthy, the row is read via
        ``self.data.iloc`` and its fields 1 and 2 are encoded as inputs and
        targets, with field 0 returned alongside. Otherwise the raw entry is
        cleaned, split with ``util.extract_haraqat`` and encoded.

        Returns:
            A tuple ``(inputs, targets, text)``, where the first two items
            are ``torch.Tensor`` objects built from the encoded sequences.
        """
        sample_id = self.list_ids[index]

        if not self.config["is_data_preprocessed"]:
            # Raw text path: clean it, then split characters and diacritics.
            raw = self.text_encoder.clean(self.data[sample_id])
            text, chars, marks = util.extract_haraqat(raw)
            encoded_inputs = torch.Tensor(
                self.text_encoder.input_to_sequence("".join(chars)))
            encoded_targets = torch.Tensor(
                self.text_encoder.target_to_sequence(marks))
            return encoded_inputs, encoded_targets, text

        # Preprocessed path: field 2 holds the diacritics joined by the
        # configured separator.
        row = self.data.iloc[sample_id]
        encoded_inputs = torch.Tensor(
            self.text_encoder.input_to_sequence(row[1]))
        target_tokens = row[2].split(self.config["diacritics_separator"])
        encoded_targets = torch.Tensor(
            self.text_encoder.target_to_sequence(target_tokens))
        return encoded_inputs, encoded_targets, row[0]