示例#1
0
文件: segment.py 项目: oya163/gibbs
def split_over_length(word):
    """Return every (head, tail) split of ``word`` taken at grapheme boundaries.

    The cut position runs from 1 up to the grapheme length of ``word``, so
    the head is never empty and the final pair has an empty tail.

    word -- the string to split

    Returns a list of ``(head, tail)`` tuples, one per cut position.
    """
    total = grapheme.length(word)
    return [
        (grapheme.slice(word, 0, cut), grapheme.slice(word, cut, total))
        for cut in range(1, total + 1)
    ]
示例#2
0
文件: segment.py 项目: oya163/gibbs
def geometric_split(word, prob):
    """Split ``word`` at positions sampled from a geometric distribution.

    ``len(word)`` samples are drawn with ``np.random.geometric`` and
    de-duplicated through a set; every distinct sample is used as one
    grapheme cut position.

    word -- the string to split
    prob -- success probability handed to ``np.random.geometric``

    Returns a list of ``(head, tail)`` tuples, one per distinct sample.
    """
    # NOTE(review): the number of samples is len(word) (code points) while
    # the cuts below are counted in graphemes -- confirm this is intended.
    cut_points = set(np.random.geometric(prob, size=len(word)))
    total = grapheme.length(word)
    return [
        (grapheme.slice(word, 0, cut), grapheme.slice(word, cut, total))
        for cut in cut_points
    ]
示例#3
0
 def __getitem__(self, key):
     """Index or slice this string by grapheme cluster rather than code point.

     Parameters:
       key = an int index, a slice object, or any other key (which is
             delegated unchanged to the parent class)
     Output:
       For int and slice keys, a new instance of this class holding the
       selected grapheme cluster(s); otherwise whatever
       super().__getitem__(key) returns.
     """
     text = str(self)

     if isinstance(key, int):
         if key <= 0:
             # Zero and negative indices are resolved on a materialised
             # list of clusters, so an out-of-range index raises IndexError.
             clusters = list(grapheme.graphemes(text))
             return self.__class__(clusters[key])
         # NOTE(review): a positive index past the end produces an empty
         # instance here instead of raising IndexError -- confirm that
         # callers rely on this before tightening it.
         return self.__class__(grapheme.slice(text, key, key + 1))

     if isinstance(key, slice):
         negative_bound = (
             (key.start is not None and key.start < 0)
             or (key.stop is not None and key.stop < 0)
         )
         # A step of None means "1"; comparing with `!= 1` alone sent every
         # ordinary slice such as s[1:3] down the slow list-based path.
         custom_step = key.step not in (None, 1)
         if negative_bound or custom_step:
             clusters = list(grapheme.graphemes(text))
             # Wrap the result so every branch returns the same type.
             return self.__class__("".join(clusters[key]))
         return self.__class__(grapheme.slice(text, key.start, key.stop))

     return super().__getitem__(key)
示例#4
0
 def generate_splits(self, word):
     '''
       Enumerate every stem/suffix split of a word at grapheme boundaries
       Parameter:
         word = Word to be split
       Output:
         splits = List of (stem, suffix) tuples, one per cut position from
                  0 to the grapheme length of the word; an empty stem or
                  suffix is written as '$'
     '''
     cut_count = grapheme.length(word) + 1
     # An empty slice is falsy, so `or` substitutes the '$' placeholder.
     return [
         (grapheme.slice(word, 0, cut) or '$', grapheme.slice(word, cut) or '$')
         for cut in range(cut_count)
     ]
示例#5
0
def preview(html: str, min: int = 50, max: int = 158) -> str:  # NOQA: A002
    """
    Build a plain-text preview of a HTML blob, for use as a description tag.

    Text is taken one block at a time (as returned by ``text_blocks``) so that
    sentences are not truncated where avoidable. Further blocks are appended,
    separated by a single space, while the preview is shorter than ``min``
    graphemes. A preview longer than ``max`` graphemes is cut to ``max - 1``
    graphemes and finished with an ellipsis.

    :param str html: HTML text to generate a preview from
    :param int min: Minimum number of characters in the preview (default 50)
    :param int max: Maximum number of characters in the preview (default 158,
        recommended for Google)
    :returns: The preview text, or an empty string when there are no blocks
    """
    # Grapheme counting stops at this cap, so a large blob is never counted in
    # full when only a short preview is wanted. The parameters shadow the
    # builtins, hence the explicit comparison; `max` may be smaller than `min`
    # when a caller customises only `max`.
    count_cap = (min if min >= max else max) + 1
    blocks = text_blocks(html)
    if not blocks:
        return ''

    text = compress_whitespace(blocks.pop(0))
    length = grapheme.length(text, count_cap)
    while blocks and length < min:
        text = ' '.join((text, compress_whitespace(blocks.pop(0))))
        length = grapheme.length(text, count_cap)

    if length > max:
        return grapheme.slice(text, 0, max - 1) + '…'
    return text
示例#6
0
    def generate_pairs(lines):
        """Yield an ``(emoji, name)`` pair for every data line in ``lines``.

        Blank lines and lines starting with ``#`` are skipped.
        """
        for raw in lines:
            entry = raw.strip()
            if not entry or entry.startswith('#'):
                continue

            # Data lines have the form:
            #     CodePoints ; Status # Emoji EmojiName
            # so only the text after the first '#' is of interest.
            _, trailer = entry.split('#', 1)
            trailer = trailer.strip()

            # The emoji is the first grapheme of the trailer; the name starts
            # at grapheme index 2, i.e. after the emoji and the one grapheme
            # that follows it.
            yield (grapheme.slice(trailer, end=1),
                   grapheme.slice(trailer, start=2))
示例#7
0
def list_of_aligned_words(sym_lst):
    """Transpose a list of symbol strings into a list of aligned words.

    sym_lst -- list of strings; the grapheme length of the first one
        decides how many words are built

    Returns a list whose i-th word joins the i-th grapheme of every item
    in ``sym_lst``; an empty input gives an empty list.
    """
    if not sym_lst:
        return []
    word_count = grapheme.length(sym_lst[0])
    return [
        "".join(grapheme.slice(sym, start=pos, end=pos + 1) for sym in sym_lst)
        for pos in range(word_count)
    ]
    def generate_splits(self, no_of_splits, tokens):
        """Draw random stem/suffix splits for the first 100 tokens.

        For each of those tokens, ``no_of_splits`` cut positions are sampled
        from a geometric distribution with p=0.5 and the token is cut there
        by grapheme. An empty stem or suffix is written as '$'.

        no_of_splits -- number of splits to sample per token
        tokens -- sequence of strings; only ``tokens[:100]`` is used

        Prints the number of splits and the first five of them, then
        returns the list of ``(stem, suffix)`` tuples.
        """
        sampled = []
        for token in tokens[:100]:
            for _ in range(no_of_splits):
                cut = np.random.geometric(p=0.5)
                # An empty slice is falsy, so `or` substitutes the placeholder.
                head = grapheme.slice(token, 0, cut) or '$'
                tail = grapheme.slice(token, cut) or '$'
                sampled.append((head, tail))

        print('Total data:', len(sampled))
        print('Data Sample \n', sampled[:5])

        return sampled
示例#9
0
 def test_mixed_text(self):
     """Check graphemes, slice and length on text mixing an emoji sequence, ASCII and CR LF."""
     input_str = " \U0001F476\U0001F3FB ascii \u000D\u000A"
     # The trailing CR LF pair is expected to come back as a single grapheme.
     expected = (
         [" ", "\U0001F476\U0001F3FB", " "]
         + list("ascii")
         + [" ", input_str[-2:]]
     )
     self.assertEqual(list(grapheme.graphemes(input_str)), expected)
     self.assertEqual(list(grapheme.grapheme_lengths(input_str)),
                      [len(cluster) for cluster in expected])

     # (positional args, keyword args, expected slice)
     slice_cases = [
         ((0, 2), {}, " \U0001F476\U0001F3FB"),
         ((0, 3), {}, " \U0001F476\U0001F3FB "),
         ((), {"end": 3}, " \U0001F476\U0001F3FB "),
         ((1, 4), {}, "\U0001F476\U0001F3FB a"),
         ((2,), {}, input_str[3:]),
         ((2, 4), {}, " a"),
     ]
     for args, kwargs, wanted in slice_cases:
         self.assertEqual(grapheme.slice(input_str, *args, **kwargs), wanted)

     self.assertEqual(grapheme.length(input_str), 10)
     # (until, expected length); counting past the end is capped at 10.
     for until, wanted in [(0, 0), (1, 1), (4, 4), (10, 10), (11, 10)]:
         self.assertEqual(grapheme.length(input_str, until=until), wanted)
示例#10
0
文件: gfold.py 项目: tos-kamiya/gfold
def fold_lines_iter(lines_it: Iterable[str],
                    width: int,
                    max_removal: int = 14,
                    separate_by_spaces: bool = False) -> Iterator[str]:
    """Fold each input line into pieces of roughly ``width`` display columns.

    Every line is right-stripped first; a line that is empty afterwards is
    yielded as ``''``. Other lines are consumed from the left: a piece starts
    at ``width // 2`` units (as counted by ``slice``) and grows until its
    ``wcswidth`` reaches ``width`` or the line is exhausted.

    :param lines_it: the lines to fold
    :param width: target display width of a piece; must be at least 16
    :param max_removal: upper bound on how many trailing characters may be
        given back when looking for a space to break at; must be < ``width``
    :param separate_by_spaces: when true, a piece that is followed by more
        text and does not end in whitespace is shortened to end just after
        the nearest whitespace found within the ``max_removal`` window
    :returns: an iterator over the folded pieces, in order
    """
    # NOTE(review): `slice` is presumably grapheme.slice imported under that
    # name, and `wcswidth` the function from the wcwidth package -- confirm
    # against the file's imports.
    assert width >= 16
    assert max_removal < width

    half_width = width // 2

    for line in lines_it:
        rest = line.rstrip()
        if not rest:
            yield ''
            continue  # for line

        # Work on the unconsumed remainder instead of an index into the line.
        # The previous bounds of `len(L) - 1` silently dropped the final
        # character (a one-character line produced nothing), and the index
        # was advanced in code points while being used as a grapheme offset.
        while rest:
            c = half_width
            s = slice(rest, 0, c)
            sl = wcswidth(s)
            # `s` is always a prefix of `rest`, so a shorter length means
            # there is more text to take.
            while len(s) < len(rest) and sl < width:
                c += ((width - sl) // 2) or 1
                s = slice(rest, 0, c)
                sl = wcswidth(s)

            if (separate_by_spaces and len(s) < len(rest)
                    and not s[-1].isspace()):
                max_rc = min(max_removal, len(s) - 1)
                # rc == 0 can never match because s[-1] is not whitespace.
                for rc in range(1, max_rc):
                    if s[-1 - rc].isspace():
                        s = s[:-rc]
                        break  # for rc

            assert s
            yield s
            rest = rest[len(s):]
示例#11
0
def list_of_aligned_words(mphon_lst):
    """Turn a list of morphophonemes into the list of aligned words.

    mphon_lst -- list of same length morphophonemes, e.g.
        ["lll", "ooo", "vvv", "ieØ"]

    Returns the words formed from the 1st, 2nd, ... symbol of each
    morphophoneme, e.g. ["lll", "ooo", "vvv", "ieØ"] -->
    ["lovi", "love", "lovØ"]. An empty input gives an empty list. The
    number of words is the grapheme length of the first morphophoneme.
    """
    if not mphon_lst:
        return []

    def word_at(pos):
        # Join the symbol at grapheme position `pos` of every morphophoneme.
        return "".join(
            grapheme.slice(mphon, start=pos, end=pos + 1) for mphon in mphon_lst
        )

    return [word_at(pos) for pos in range(grapheme.length(mphon_lst[0]))]
示例#12
0
def print_result(aligned_result, comments, weights, layout="horizontal"):
    """Print the result of an alignment in one of three layouts.

    aligned_result -- tuple of the weight and a list of aligned words; the
        words are sliced by grapheme and joined as strings here (presumably
        zero-filled to equal length -- confirm against the caller)

    comments -- accepted for interface compatibility; not used by this
        function

    weights -- if true, the overall weight is printed after the
        morphophonemes (only the "horizontal" layout uses it)

    layout -- one of "horizontal" (a sequence of morphophonemes on a single
        line), "vertical" (each aligned word on a line of its own, followed
        by a blank line) or "list" (all aligned words on a single line);
        any other value prints nothing
    """
    weight, aligned_words_lst = aligned_result
    if cfg.verbosity >= 10:
        print("aligned_result", aligned_result)

    if layout == "horizontal":
        # An empty word list has no first word to measure; treat it as
        # zero positions instead of raising IndexError, matching the other
        # layouts, which accept an empty list.
        lgth = grapheme.length(aligned_words_lst[0]) if aligned_words_lst else 0
        mphon_lst = []
        for i in range(lgth):
            symbols = [
                grapheme.slice(aligned_word, start=i, end=i + 1)
                for aligned_word in aligned_words_lst
            ]
            if len(set(symbols)) == 1:
                mphon_lst.append(symbols[0])  # abbreviate if all identical
            else:
                mphon_lst.append("".join(symbols))

        mphonemic_str = " ".join(mphon_lst)
        if weights:
            print(mphonemic_str.ljust(40), weight)
        else:
            print(mphonemic_str)
    elif layout == "vertical":
        print("\n".join(aligned_words_lst))
        print()
    elif layout == "list":
        print(" ".join(aligned_words_lst))