示例#1
0
 def __getitem__(self, key):
     """Index or slice by grapheme clusters rather than code points.

     int keys return a single grapheme; slices return a new instance of
     this class covering the requested grapheme range.  Anything else is
     delegated to the base class.
     """
     if isinstance(key, int):
         if key < 0:
             # Negative indices need the total cluster count, so
             # materialize the full grapheme list.
             g_list = list(grapheme.graphemes(str(self)))
             # Wrap in self.__class__ for a consistent return type
             # (the non-negative path already did this).
             return self.__class__(g_list[key])
         # Non-negative index: grapheme.slice avoids building the whole list.
         return self.__class__(grapheme.slice(str(self), key, key + 1))
     if isinstance(key, slice):
         # grapheme.slice only supports non-negative bounds with unit step;
         # fall back to an explicit cluster list otherwise.  A step of None
         # means the default step of 1, so it must NOT trigger the fallback
         # (the previous `key.step != 1` test wrongly did).
         if ((key.start and key.start < 0) or (key.stop and key.stop < 0)
                 or (key.step is not None and key.step != 1)):
             g_list = list(grapheme.graphemes(str(self)))
             return self.__class__("".join(g_list[key.start:key.stop:key.step]))
         return self.__class__(grapheme.slice(str(self), key.start, key.stop))
     return super().__getitem__(key)
示例#2
0
def mphon_weight(mphon):
    """Compute (and memoize) a weight for a raw morphophoneme."""
    global vowels, consonants, mphon_separator, weight_cache
    # Serve cached answers first (EAFP instead of a membership test).
    try:
        return weight_cache[mphon]
    except KeyError:
        pass
    if mphon_separator == "":
        phons = grapheme.graphemes(mphon)
    else:
        phons = mphon.split(mphon_separator)
    phon_set = set(phons)
    if cfg.verbosity >= 30:
        print("phon_set =", phon_set)
    if phon_set == {"Ø"}:
        # All-zero morphophonemes must be allowed, at a configurable cost.
        weight = cfg.all_zero_weight
    elif len(phon_set) == 1:
        # A single repeated phoneme costs nothing.
        weight = 0.0
    elif phon_set <= consonants:
        weight = cons_set_weight(phon_set)
    elif phon_set <= vowels:
        weight = vowel_set_weight(phon_set)
    else:
        # Mixed vowels and consonants: effectively forbidden.
        weight = 1000000.0
    weight_cache[mphon] = weight
    if cfg.verbosity >= 35:
        print("mphon:", mphon, "weight:", weight)
    return weight
示例#3
0
    def create_from_source(name: str, source: Iterable[str],
                           morpheme_delimiter: str,
                           end_of_morpheme_symbol: str, padding_symbol: str,
                           blacklist_char: str) -> "Alphabet":
        """Collect an Alphabet's symbol set from an iterable of text lines.

        Grapheme clusters are gathered from *source*, excluding whitespace
        (category Z*) and control (category C*) clusters as well as the
        delimiter, end-of-morpheme, and blacklist symbols.  Surviving
        symbols that contain whitespace or control code points are logged.

        Fix: ``unicodedata.category`` accepts a single code point only,
        but ``grapheme.graphemes`` can yield multi-code-point clusters
        (e.g. emoji with modifiers), which previously raised ``TypeError``.
        The filter now categorizes the cluster's first code point; the
        second loop below already inspects interior code points.
        """
        alphabet_set: Set[str] = set()
        for line in source:  # type: str
            for character in grapheme.graphemes(line.strip()):  # type: str
                # Categorize by the first code point — category() rejects
                # multi-character strings.
                category: str = unicodedata.category(character[0])
                if category[0] != "Z" and category[
                        0] != "C" and character != morpheme_delimiter and character != end_of_morpheme_symbol and character != blacklist_char:
                    alphabet_set.add(character)

        # Warn about symbols whose interior code points are whitespace or
        # control characters (e.g. ZWJ inside emoji sequences).
        for symbol in alphabet_set:  # type: str
            for character in symbol:  # type: str
                category: str = unicodedata.category(character)
                if category[0] == "Z":
                    logging.warning(
                        f"WARNING - alphabet contains whitespace character:\t{Alphabet.unicode_info(symbol)}"
                    )

                elif (category[0] == "C" and character != morpheme_delimiter
                      and character != end_of_morpheme_symbol):
                    logging.warning(
                        f"WARNING - alphabet contains control character:\t{Alphabet.unicode_info(symbol)}"
                    )

        return Alphabet(name=name,
                        symbols=alphabet_set,
                        end_of_morpheme_symbol=end_of_morpheme_symbol,
                        padding_symbol=padding_symbol)
示例#4
0
    def predict(self, word: str) -> typing.List[str]:
        """Predict the phoneme sequence for *word* via greedy GRU decoding."""
        # --- encode the grapheme sequence ---
        clusters = list(grapheme.graphemes(word))
        encoded = self._encode(clusters)
        encoded = _gru(
            encoded,
            len(clusters) + 1,
            self.enc_w_ih,
            self.enc_w_hh,
            self.enc_b_ih,
            self.enc_b_hh,
            h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32),
        )
        hidden = encoded[:, -1, :]

        # --- decode greedily, starting from the <s> embedding (index 2) ---
        step_input = np.take(self.dec_emb, [2], axis=0)  # 2: <s>
        indices = []
        for _ in range(self.dec_maxlen):
            hidden = _grucell(step_input, hidden, self.dec_w_ih,
                              self.dec_w_hh, self.dec_b_ih,
                              self.dec_b_hh)  # (b, h)
            scores = np.matmul(hidden, self.fc_w.T) + self.fc_b
            best = scores.argmax()
            if best == self.eos_idx:
                break  # </s> terminates decoding
            indices.append(best)
            # Feed the chosen token's embedding back in as the next input.
            step_input = np.take(self.dec_emb, [best], axis=0)

        return [self.phonemes[i] for i in indices]
示例#5
0
 def test_mixed_text(self):
     """Mixed ASCII, emoji-with-modifier, and CR+LF: splitting, slicing, length."""
     # " <baby+skin tone> ascii \r\n" — the emoji pair and CR+LF are each
     # one grapheme cluster.
     input_str = " \U0001F476\U0001F3FB ascii \u000D\u000A"
     graphemes = [
         " ", "\U0001F476\U0001F3FB", " ", "a", "s", "c", "i", "i", " ",
         input_str[-2:]
     ]
     self.assertEqual(list(grapheme.graphemes(input_str)), graphemes)
     # grapheme_lengths yields each cluster's length in code points.
     self.assertEqual(list(grapheme.grapheme_lengths(input_str)),
                      [len(g) for g in graphemes])
     # slice() counts grapheme clusters, not code points.
     self.assertEqual(grapheme.slice(input_str, 0, 2),
                      " \U0001F476\U0001F3FB")
     self.assertEqual(grapheme.slice(input_str, 0, 3),
                      " \U0001F476\U0001F3FB ")
     self.assertEqual(grapheme.slice(input_str, end=3),
                      " \U0001F476\U0001F3FB ")
     self.assertEqual(grapheme.slice(input_str, 1, 4),
                      "\U0001F476\U0001F3FB a")
     # Omitting `end` slices through to the end of the string.
     self.assertEqual(grapheme.slice(input_str, 2), input_str[3:])
     self.assertEqual(grapheme.slice(input_str, 2, 4), " a")
     # length() counts clusters; `until` caps the scan without overshooting.
     self.assertEqual(grapheme.length(input_str), 10)
     self.assertEqual(grapheme.length(input_str, until=0), 0)
     self.assertEqual(grapheme.length(input_str, until=1), 1)
     self.assertEqual(grapheme.length(input_str, until=4), 4)
     self.assertEqual(grapheme.length(input_str, until=10), 10)
     self.assertEqual(grapheme.length(input_str, until=11), 10)
示例#6
0
def text_to_chars(text):
    """Split Tibetan *text* into a list of individual grapheme clusters."""
    return list(graphemes(text))
示例#7
0
 def helper():
     """Yield emoji graphemes (or any multi-code-point cluster) from messages."""
     for message in messages:
         if isinstance(message, list):
             message = ''.join(message)
         try:
             for cluster in grapheme.graphemes(message):
                 # Multi-code-point clusters count as emoji-like too.
                 if len(cluster) > 1 or \
                         cluster in emoji.UNICODE_EMOJI_ENGLISH:
                     yield cluster
         except:  # pylint: disable=bare-except
             # Best effort: skip any message the libraries cannot handle.
             pass
示例#8
0
文件: fs.py 项目: koskenni/pytwolc
def string_to_fsa(grapheme_string):
    """Return a FSA which accepts the sequence of graphemes in the string"""
    clusters = list(grapheme.graphemes(grapheme_string))
    # Identity pairs: each grapheme maps to itself on both tapes.
    pair_path = tuple(zip(clusters, clusters))
    if cfg.verbosity >= 10:
        print(clusters)
        print(pair_path)
    basic = hfst.HfstBasicTransducer()
    basic.disjunct(pair_path, 0)
    return hfst.HfstTransducer(basic)
示例#9
0
    def text(update, context):
        """Route an incoming message: pure-emoji text vs. scrolling text."""
        text_received = update.message.text

        # Hack: a message is "all emoji" when its emoji count equals its
        # grapheme count.
        cluster_count = len(list(grapheme.graphemes(text_received)))
        if emojis.count(text_received) == cluster_count:
            print("emojis:", text_received)
            update.message.reply_text(f'uwu {text_received}')
            SSEFuckery.sse_broadcast("emojis", text_received)
            return

        print("scrolly-text:", text_received)
        update.message.reply_text(f'auzi cica >{text_received}')
        SSEFuckery.sse_broadcast("scrolly-text", text_received)
示例#10
0
    def test_contains(self):
        """contains() matches whole grapheme clusters, not raw substrings."""
        input_str = " \U0001F476\U0001F3FB ascii \u000D\u000A"

        # Partial clusters are not contained.
        self.assertFalse(grapheme.contains(input_str, " \U0001F476"))
        self.assertFalse(grapheme.contains(input_str, "\u000D"))
        self.assertFalse(grapheme.contains(input_str, "\U0001F3FB"))
        # The empty string is always contained.
        self.assertTrue(grapheme.contains(input_str, ""))

        cluster_list = list(grapheme.graphemes(input_str))
        # Every single cluster is contained...
        for cluster in cluster_list:
            self.assertTrue(grapheme.contains(input_str, cluster))
        # ...and so is every adjacent pair of clusters.
        for i in range(len(cluster_list) - 1):
            pair = "".join(cluster_list[i:i + 2])
            self.assertTrue(grapheme.contains(input_str, pair))
示例#11
0
def mphon_is_valid(mphon):
    """Tests if a raw morphophoneme is all consonants or all vowels"""
    global vowels, consonants, mphon_separator
    if mphon_separator == "":
        phons = grapheme.graphemes(mphon)
    else:
        phons = mphon.split(mphon_separator)
    phon_set = set(phons)
    # Valid iff the phoneme set fits entirely inside one of the two classes.
    return phon_set <= vowels or phon_set <= consonants
示例#12
0
 def render_keys(self):
     """Type self.body key by key; '^X' sends Ctrl+X; '#...' lines are typed fast."""
     if self.body.strip().startswith("#"):
         # Comment lines are typed quickly in one go.
         pa.typewrite(self.body, interval=0.005)
     else:
         pending_ctrl = False  # True right after a '^' marker
         for gr in grapheme.graphemes(self.body):
             if pending_ctrl:
                 # Previous grapheme was '^': emit Ctrl+<this key>.
                 pa.keyDown('ctrl')
                 pa.press(gr)
                 pa.keyUp('ctrl')
                 pending_ctrl = False
                 continue
             if gr == "^":
                 pending_ctrl = True
                 continue
             pa.press(gr)
             # Randomized delay makes the typing look human.
             time.sleep(random.choice(range(1, 50)) / 1000.0)
     if not self.body.strip().endswith("<<"):
         pa.press("enter")
示例#13
0
def reverse_string(string: str) -> str:
    """Reverse *string* by grapheme clusters, mirroring paired punctuation.

    Brackets, quotes and comma/question marks are swapped for their mirrored
    counterparts so the reversed text still reads with correctly oriented
    punctuation.
    """
    # NOTE(review): the original table listed ",", "?", "(", ")", "[", "]"
    # twice each with identical values — later entries silently overwrite
    # earlier ones in a dict literal, so removing the duplicates is
    # behavior-identical.  Presumably fullwidth variants ("，？（）［］")
    # were lost in transcription — TODO confirm and add them if intended.
    trans = str.maketrans(
        {
            ",": "،",
            "?": "¿",
            "(": ")",
            ")": "(",
            "《": "》",
            "》": "《",
            "«": "»",
            "»": "«",
            "/": "\\",
            "\\": "/",
            "“": "”",
            "”": "“",
            ">": "<",
            "<": ">",
            "〔": "〕",
            "〕": "〔",
            "[": "]",
            "]": "[",
            "{": "}",
            "}": "{",
            "「": "」",
            "」": "「",
            "【": "】",
            "】": "【",
        }
    )

    return "".join(reversed(list(grapheme.graphemes(string)))).translate(trans)
示例#14
0
文件: png.py 项目: kristennn/wttr.in
def _fix_graphemes(text):
    """
    Extract long grapheme sequences that can't be handled
    by pyte correctly because of the bug pyte#131.

    Multi-code-point graphemes are omitted from the text, replaced
    with "!" placeholders, and collected into a list.

    Return:
        text_without_graphemes, graphemes
    """
    kept_chars = []
    extracted = []
    for cluster in grapheme.graphemes(text):
        if len(cluster) > 1:
            # Pull the wide cluster out and leave a marker behind.
            extracted.append(cluster)
            kept_chars.append("!")
        else:
            kept_chars.append(cluster)
    return "".join(kept_chars), extracted
示例#15
0
def split_graphemes(text):
    """Split *text* into a tuple of grapheme clusters."""
    from grapheme import graphemes
    clusters = graphemes(text)
    return tuple(clusters)
示例#16
0
 def to_sort_list(q: str):
     """Normalize *q* to NFKD, convert its kept graphemes, and sort them."""
     normalized = unicodedata.normalize('NFKD', q)  # compatibility decomposed form
     # NOTE(review): `filter` and `convert` are module-level helpers here,
     # not the builtins — confirm against the enclosing module.
     kept = [convert(c) for c in grapheme.graphemes(normalized) if filter(c)]
     return sorted(kept)
示例#17
0
 def generate_message(msg, mapping):
     """Yield each grapheme of *msg* wrapped via *mapping*; unmapped ones pass through."""
     for cluster in grapheme.graphemes(msg):
         try:
             mapped = mapping[cluster]
         except KeyError:
             # Not in the mapping: emit the grapheme unchanged.
             yield cluster
         else:
             yield '{l}{m}{r}'.format(m=mapped, l=l_wrap, r=r_wrap)
示例#18
0
def prep_string_Arab(string):
    """Strip whitespace and split Arabic text into graphemes padded with
    zero-width joiners (U+200D) so each piece renders in its joined form.

    The first grapheme gets a trailing joiner, the last a leading joiner,
    and interior graphemes get both.

    Fix: the last element used ``g[1]`` (the second grapheme) instead of
    ``g[-1]`` (the final one) — correct only for 2-grapheme input.  Empty
    and single-grapheme inputs no longer raise IndexError.
    """
    g = list(graphemes(re.sub(r"\s", "", string)))
    if not g:
        return []
    if len(g) == 1:
        # A lone grapheme has no neighbors to join with.
        return [g[0]]
    return ([g[0] + "\u200d"]
            + ["\u200d" + c + "\u200d" for c in g[1:-1]]
            + ["\u200d" + g[-1]])
示例#19
0
 def test_simple(self):
     """Plain ASCII splits into one grapheme per character."""
     expected = list("alvin")
     self.assertEqual(list(grapheme.graphemes("alvin")), expected)
示例#20
0
# STEP 3:
# Compute the zero filled morphs out of the sequences of aligned symbols

aligned_morphs = {}
"""index: (morpheme, morph), value: zero-filled morph
"""

for morpheme, aligned_sym_seq in alignments.items():
    # e.g. "KOTA", ['kkkk', 'oooo', 'tdtd', 'aaØØ']
    if args.verbosity >= 25:
        print("aligned_sym_seq:", aligned_sym_seq)
    if morpheme not in aligned_morphs:
        aligned_morphs[morpheme] = collections.OrderedDict()
    if aligned_sym_seq:
        # Split each aligned symbol string into grapheme clusters so one
        # position corresponds to one aligned example word.
        aligned_vec_seq = [
            tuple(grapheme.graphemes(aligned_sym))
            for aligned_sym in aligned_sym_seq
        ]
        # All aligned symbols are assumed equally long (one slot per
        # example word); take the length from the first.
        l = len(aligned_vec_seq[0])
        # Transpose: reading down position i of every aligned symbol
        # reconstructs the i-th example's zero-filled morph.
        zero_filled_morphs = [
            "".join([x[i] for x in aligned_vec_seq]) for i in range(0, l)
        ]
        # Dropping the zero symbol "Ø" recovers the surface morphs.
        original_morphs = [x.replace("Ø", "")
                           for x in zero_filled_morphs]  ##########
        for origm, zerofm in zip(original_morphs, zero_filled_morphs):
            #if origm:
            #    aligned_morphs[morpheme][origm] = zerofm
            aligned_morphs[morpheme][origm] = zerofm
    else:
        # No alignment available: record an empty morph mapping.
        aligned_morphs[morpheme] = {"": ""}
if args.verbosity >= 20:
def main(
    max_characters: int,
    max_morphemes: int,
    alphabet_file: str,
    end_of_morpheme_symbol: str,
    morpheme_delimiter: str,
    input_file: str,
    output_file: str,
    verbose: int,
    blacklist_char: str,
) -> None:
    """Pickle tensor product representations of every morpheme in a corpus.

    Loads a pickled Alphabet from *alphabet_file*, reads whitespace-separated
    words from *input_file* ("-" selects stdin), skips words containing
    *blacklist_char*, and writes a gzip-compressed pickle mapping each
    successfully processed morpheme to its tensor.

    Fix: the final ``logging.info`` call passed ``file=sys.stderr``, a
    keyword that ``logging`` does not accept and that raised ``TypeError``.
    """

    import pickle

    # A multi-cluster end-of-morpheme symbol could not occupy a single
    # character slot in the tensor representation.
    if grapheme.length(end_of_morpheme_symbol) != 1:
        raise RuntimeError(
            "The end of morpheme symbol must consist of a single grapheme cluster "
            + "(see Unicode Standard Annex #29).")

    with open(alphabet_file, "rb") as f:
        alphabet: Alphabet = pickle.load(f)

    # "-" means read from standard input, per the usual CLI convention.
    with (sys.stdin
          if input_file == "-" else open(input_file)) as input_source:

        with gzip.open(output_file, "wb") as output:

            characters_dimension: Dimension = Dimension(
                "characters", max_characters)
            # NOTE(review): morphemes_dimension is currently unused in this
            # function — confirm before removing.
            morphemes_dimension: Dimension = Dimension("morphemes",
                                                       max_morphemes)

            tpr: TensorProductRepresentation = TensorProductRepresentation(
                alphabet=alphabet, characters_dimension=characters_dimension)

            result: Dict[str, torch.Tensor] = {}
            skipped_morphemes: Set[str] = set()
            for number, line in enumerate(input_source):
                logging.debug(f"Processing line {number}\t{line.strip()}")
                for word in line.strip().split():
                    if blacklist_char in word:
                        logging.info(f"Skipping unanalyzed word {word}")
                    elif word not in result:
                        # Warn about characters the alphabet cannot represent;
                        # processing continues and they fall back to OOV.
                        for character in grapheme.graphemes(word):
                            if character not in alphabet and character != morpheme_delimiter and character != end_of_morpheme_symbol:
                                logging.warning(
                                    f"WARNING - not in alphabet:\t{Alphabet.unicode_info(character)}"
                                )

                        morphemes = word.split(morpheme_delimiter)
                        for morpheme in morphemes:
                            if len(morpheme) == 0:
                                logging.debug(
                                    f"Line {number} - skipping morpheme of length 0 in word {word}"
                                )
                            elif len(morpheme) == max_characters:
                                # Equal length still fails: one slot must remain
                                # for the required end-of-morpheme symbol.
                                logging.warning(
                                    f"Line {number} - skipping morpheme {morpheme} of {word} because its length {len(morpheme)} equals max length {max_characters}, and there is no space to insert the required end of morpheme symbol"
                                )
                            elif len(morpheme) > max_characters:
                                logging.warning(
                                    f"Line {number} - skipping morpheme {morpheme} of {word} because its length {len(morpheme)} exceeds max length {max_characters}"
                                )
                            else:
                                try:
                                    tensor: Tensor = tpr.process_morpheme(
                                        morpheme)
                                    result[morpheme] = tensor.data
                                except IndexError:
                                    logging.warning(
                                        f"Line {number} - unable to process morpheme {morpheme} (length {len(morpheme)}) of {word}"
                                    )
                                    skipped_morphemes.add(morpheme)

            logging.info(
                f"Writing binary file containing {len(result)} morphemes to disk at {output}..."
            )
            pickle.dump(result, output)
            # Fixed: logging.info does not accept a file= keyword argument.
            logging.info(f"...done writing binary file to disk at {output}")

            logging.info(
                f"Failed to process {len(skipped_morphemes)} morphemes:\n" +
                "\n".join(skipped_morphemes))
示例#22
0
def test_default_grapheme_suit(input_string, expected_graphemes, description):
    """Parametrized check: splitting and length agree with the expected clusters."""
    actual = list(grapheme.graphemes(input_string))
    assert actual == expected_graphemes
    assert grapheme.length(input_string) == len(expected_graphemes)
示例#23
0
 def test_cr_lf(self):
     """CR+LF forms a single grapheme cluster per UAX #29."""
     self.assertEqual(list(grapheme.graphemes("\u000D\u000A")), ["\u000D\u000A"])
示例#24
0
 def test_emoji_with_modifier(self):
     """An emoji plus a skin-tone modifier is one grapheme cluster."""
     baby_with_tone = "\U0001F476\U0001F3FB"
     self.assertEqual(list(grapheme.graphemes(baby_with_tone)),
                      [baby_with_tone])
示例#25
0
def main(
    morpheme_delimiter: str,
    end_of_morpheme_symbol: str,
    padding_symbol: str,
    input_file,
    output_file,
    verbose: int,
    blacklist_char: str,
) -> None:
    """Read a corpus and emit an integer-coded alphabet of its graphemes.

    Whitespace and control characters, the morpheme delimiter, and the
    blacklist character are excluded; the reserved padding and
    end-of-morpheme symbols must not occur in the corpus at all.
    Integer codes 0-2 are reserved (OOV, end of morpheme, padding).

    Fixes: four ``logging.debug`` calls used plain strings with ``{...}``
    placeholders (missing ``f`` prefix), two of which referenced an
    undefined ``symbol`` name; they now interpolate ``character``.  Also,
    ``unicodedata.category`` accepts a single code point only, so the
    category test now uses the grapheme cluster's first code point instead
    of raising ``TypeError`` on multi-code-point clusters.
    """

    if grapheme.length(end_of_morpheme_symbol) != 1:
        raise RuntimeError(
            "The end of morpheme symbol must consist of a single grapheme cluster "
            + "(see Unicode Standard Annex #29).")

    alphabet_set: Set[str] = set()
    logging.info(f"Reading alphabet from input file {input_file.name}...")

    for line in input_file:
        for character in grapheme.graphemes(line.strip()):
            # Categorize by the cluster's first code point; category()
            # rejects multi-character strings.
            category = unicodedata.category(character[0])
            if category[0] == "Z":
                logging.debug(
                    f"Input contains whitespace character:\t{unicode_info(character)}. This character will not be included in the alphabet."
                )
            elif category[0] == "C":
                logging.debug(
                    f"Input contains control character:\t{unicode_info(character)}. This character will not be included in the alphabet."
                )
            elif character == morpheme_delimiter:
                logging.debug(
                    f"Not including morpheme delimeter {morpheme_delimiter} in the alphabet."
                )
            elif character == blacklist_char:
                logging.debug(
                    f"Not including character {blacklist_char} in the alphabet."
                )
            elif character == padding_symbol:
                raise RuntimeError(
                    f"Input contains reserved padding character {padding_symbol}, but this character must not occur in the corpus."
                )
            elif character == end_of_morpheme_symbol:
                raise RuntimeError(
                    f"Input contains reserved end of morpheme character {end_of_morpheme_symbol}, but this character must not occur in the corpus."
                )
            else:
                alphabet_set.add(character)

    # Zero is reserved for OOV
    output(
        output_file=output_file,
        int_value=0,
        character="",
        unicode_name="",
        description=
        "Integer value 0 is reserved to represent out-of-vocabulary characters in a tensor product representation"
    )

    # We reserve another character to represent the end of morpheme in a tensor product representation
    output(
        output_file=output_file,
        int_value=1,
        character=escaped_codepoints(end_of_morpheme_symbol),
        unicode_name=unicode_info(end_of_morpheme_symbol),
        description=
        "Integer value 1 is reserved to represent the end of a morpheme in a tensor product representation"
    )

    # We reserve another character to represent the padding after the end of morpheme in a tensor product representation
    output(
        output_file=output_file,
        int_value=2,
        character=escaped_codepoints(padding_symbol),
        unicode_name=unicode_info(padding_symbol),
        description=
        "Integer value 2 is reserved to represent padding beyond the end of a morpheme in a tensor product representation"
    )

    # Remaining actual characters, in deterministic sorted order.
    for i, symbol in enumerate(sorted(alphabet_set), start=3):
        output(output_file=output_file,
               int_value=i,
               character=symbol,
               unicode_name=unicode_info(symbol),
               description="")
示例#26
0
def emojificate(string):
    """Run every grapheme of *string* through convert() and rejoin."""
    converted = (convert(cluster) for cluster in graphemes(string))
    return "".join(converted)
示例#27
0
def convert_emoji(string):
    """Run every grapheme of *string* through convert() and rejoin."""
    pieces = [convert(cluster) for cluster in graphemes(string)]
    return "".join(pieces)
示例#28
0
def prep_string(string):
    """Uppercase, strip all whitespace, and split into grapheme clusters."""
    cleaned = re.sub(r"\s", "", string.upper())
    return list(graphemes(cleaned))
示例#29
0
    for line in sys.stdin:
        line = line.strip()
        # Text after "!" is a comment; split it off the data part.
        lst = line.split("!", maxsplit=1)
        if len(lst) > 1:
            line = lst[0].strip()
            number = lst[1].strip() + " "
        else:
            number = ""
        words = line.split()
        # Carry the original comment through to the aligned output.
        comment = number + " ".join(words)

        best = aligner(words, args.zeros, line)

        #best2 = [re.sub(r"^([a-zšžŋđüõåäöáâ`´])\1\1*$", r"\1", cc)
        #         for cc in best]

        if args.layout == "horizontal":
            # Collapse each morphophoneme to a single label when all of its
            # graphemes are identical; otherwise keep the raw string.
            mphonemic_best = []
            for cc in best:
                grapheme_list = list(grapheme.graphemes(cc))
                lab = grapheme_list[0] if len(set(grapheme_list)) == 1 else cc
                mphonemic_best.append(lab)
            print(" ".join(mphonemic_best).ljust(40), "!", comment)
        elif args.layout == "vertical":
            #print("best:", best) ###
            #print("len(best):", [len(x) for x in best]) ###
            print("\n".join(list_of_aligned_words(best)))
            print()
        elif args.layout == "list":
            print(" ".join(list_of_aligned_words(best)))
示例#30
0
 def test_empty(self):
     """An empty string yields no grapheme clusters."""
     self.assertEqual(list(grapheme.graphemes("")), [])