Example #1
File: segment.py Project: oya163/gibbs
def split_over_length(word):
    split_list = []
    for n in range(1, grapheme.length(word) + 1):
        # split_list.append((word[:n], word[n:len(word)]))
        split_list.append((grapheme.slice(word, 0, n),
                           grapheme.slice(word, n, grapheme.length(word))))
    return split_list
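A minimal usage sketch (hypothetical input, assuming the function above and the grapheme package): the baby emoji with a skin-tone modifier is two code points but one grapheme cluster, so a naive slice can cut it apart while split_over_length cannot.

import grapheme

word = "ab👶🏻"        # 4 code points, but 3 grapheme clusters
print(word[:3])        # naive slicing splits the emoji in half
print(split_over_length(word))
# [('a', 'b👶🏻'), ('ab', '👶🏻'), ('ab👶🏻', '')]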
Example #2
def preview(html: str, min: int = 50, max: int = 158) -> str:  # NOQA: A002
    """
    Return a preview of an HTML blob as plain text, for use as a description tag.

    This function will attempt to return one HTML paragraph at a time, to avoid
    truncating sentences. Multiple paragraphs will be used while the text is under
    min characters.

    :param str html: HTML text to generate a preview from
    :param int min: Minimum number of characters in the preview (default 50)
    :param int max: Maximum number of characters in the preview (default 158,
        recommended for Google)
    """
    # Get the max length we're interested in, for efficiency in grapheme counts. A large
    # blob of text can impair performance if we're only interested in a small preview.
    # `max` can be < `min` when the caller specifies a custom `max` without `min`
    max_length = (max if max > min else min) + 1
    blocks = text_blocks(html)
    if blocks:
        text = compress_whitespace(blocks.pop(0))
        length = grapheme.length(text, max_length)
        while blocks and length < min:
            text += ' ' + compress_whitespace(blocks.pop(0))
            length = grapheme.length(text, max_length)
        if length > max:
            text = grapheme.slice(text, 0, max - 1) + '…'
        return text
    return ''
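The truncation idiom in isolation, as a hedged sketch (the helper name is hypothetical): the second argument to grapheme.length stops counting early for efficiency, and grapheme.slice guarantees the cut never lands inside a cluster.

import grapheme

def truncate_graphemes(text: str, limit: int) -> str:
    # Count at most limit + 1 clusters; that is enough to know we exceeded limit.
    if grapheme.length(text, limit + 1) > limit:
        return grapheme.slice(text, 0, limit - 1) + '…'
    return text

print(truncate_graphemes('ascii text', 5))  # 'asci…'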
Example #3
File: segment.py Project: oya163/gibbs
    def _fit(self, data, init_data):
        H = len(init_data)
        performance = []
        for i, x in enumerate(data):
            # Remove data point
            cluster = remove_current_data(i, data, init_data)
            x_len = grapheme.length(x)

            # Compress the cluster
            # to prevent from ever increasing cluster id
            keys = sorted(cluster.keys())
            for j in range(0, len(keys)):
                cluster[j] = cluster.pop(keys[j])

            # Parameters
            cluster_prob = []
            final_prob = []

            # Estimate parameters of each cluster
            for k, v in sorted(cluster.items()):
                n = len(v)
                curr_prob = 0.0
                sum_of_grapheme = sum([grapheme.length(d) for d in v])
                if self.method == 'mle':
                    # estimate theta using MLE
                    theta_es = n / sum_of_grapheme
                    curr_prob = g0(theta_es, x_len)
                elif self.method == 'collapsed':
                    curr_prob = self.beta_geometric_posterior(
                        x_len, n, sum_of_grapheme)

                cluster_prob.append(curr_prob)

                # Count of data points in each cluster
                likelihood = n / (H + self.A - 1)
                final_prob.append(likelihood * cluster_prob[-1])

            # Probability of joining new cluster
            final_prob.append(
                (self.A / (H + self.A - 1)) * g0(self.prob_c, x_len))

            # Normalize the probability
            norm_prob = final_prob / np.sum(final_prob)

            # Update cluster assignment based on the calculated probability
            init_data[i] = np.random.choice(len(norm_prob), 1, p=norm_prob)[0]

            # Compute log-likelihood
            performance.append(np.log(np.sum(final_prob)))

        return init_data, performance
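g0 is not shown in this snippet. Assuming the natural reading of the MLE branch (theta_es = n / sum_of_grapheme is the maximum-likelihood estimate of a geometric parameter), a plausible stand-in is the geometric pmf over morpheme length in graphemes; this is a reconstruction, not the project's confirmed definition.

def g0(theta, length):
    # Geometric pmf: P(L = length) = theta * (1 - theta) ** (length - 1)
    return theta * (1.0 - theta) ** (length - 1)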
Example #4
def test_width_chars():
    """
    Test grapheme lengths of individual characters
    """
    chars = {
        # Thai
        'ผ': 1,
        'ผู': 1,
        'ผู้': 1,
        # Chinese
        '国': 1,
        '國': 1,
        # Japanese
        'ア': 1,
        'あ': 1,
        # Korean
        '한': 1,
        '헤': 1,
        '후': 1,
        '훼': 1,
        # Vietnamese
        'ế': 1,
        # Emoji
        "🏳️‍🌈": 1,
    }

    for char, length in chars.items():
        assert length == grapheme.length(char)
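The same point stated against len(), reusing a character from the table above: each entry is one grapheme but possibly several code points.

import grapheme

assert len('ผู้') == 3               # base consonant + vowel + tone mark
assert grapheme.length('ผู้') == 1   # one user-perceived character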
Example #5
 def generate_splits(self, word):
     '''
       Generate all possible splits
       Parameter:
         word = Word to be split
       Output:
         splits = List of all possible splits
     '''
     splits = []
     for s in range(grapheme.length(word) + 1):
         stem = grapheme.slice(word, 0, s)
         stem = stem if (grapheme.length(stem) > 0) else '$'
         suffix = grapheme.slice(word, s)
         suffix = suffix if (grapheme.length(suffix) > 0) else '$'
         splits.append((stem, suffix))
     return splits
Example #6
 def update_parameters(self, data):
     '''
       Update parameters of the Geometric distribution based on given data
       Parameters:
         data = Observed data
     '''
     self.alpha = self.alpha + len(data)
     self.beta = self.beta + np.sum([(grapheme.length(x) - 1) for x in data])
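This is the standard Beta-Geometric conjugate update: each observed morpheme contributes one success and (grapheme length - 1) failures. A numeric sketch with hypothetical values:

alpha, beta = 1.0, 1.0                  # Beta(1, 1) prior
data = ['ab', 'abc', 'a']               # grapheme lengths 2, 3, 1 (ASCII, so len() agrees)
alpha += len(data)                      # one success per morpheme  -> 4.0
beta += sum(len(x) - 1 for x in data)   # (2-1) + (3-1) + (1-1) = 3 -> 4.0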
Example #7
File: segment.py Project: oya163/gibbs
    def evaluate(self, st_cluster, sf_cluster, stem_list, suffix_list,
                 gold_file):
        # Read gold file and collect only words
        hit = 0
        insert = 0
        delete = 0
        with open(gold_file, 'r') as f, open(self.result_filename, 'w') as g:
            reader = csv.reader(f, delimiter='\t')
            for word, morphemes in reader:
                # Do this process for each word
                split_list = split_over_length(word)
                stem_cluster = self.clusterize(st_cluster, stem_list)
                suffix_cluster = self.clusterize(sf_cluster, suffix_list)

                final_prob = []
                for stem, suffix in split_list:
                    p_stem = self.get_posterior_by_sampling(
                        stem_cluster, st_cluster, stem_list, stem)
                    p_suffix = self.get_posterior_by_sampling(
                        suffix_cluster, sf_cluster, suffix_list, suffix)
                    final_prob.append(p_stem * p_suffix)

                best_split = split_list[np.argmax(final_prob)]
                pred_stem, pred_suffix = best_split[0], best_split[1]
                parts = morphemes.split()
                gold_stem, gold_suffix = parts[0], parts[1]
                pred_stem_len = grapheme.length(pred_stem)
                gold_stem_len = grapheme.length(gold_stem)
                if pred_stem_len == gold_stem_len:
                    hit += 1
                elif pred_stem_len < gold_stem_len:
                    insert += 1
                elif pred_stem_len > gold_stem_len:
                    delete += 1

                note = f"{word}\t{morphemes}\t{best_split[0]} {best_split[1]}\n"

                g.write(note)

            # Return prec, recall, f1
            prec = hit / (hit + insert)
            recall = hit / (hit + delete)
            fscore = (2 * hit) / ((2 * hit) + insert + delete)
            return prec, recall, fscore
Example #8
    def generate_splits(self, no_of_splits, tokens):

        splits = []

        for token in tokens[:100]:
            for s in range(no_of_splits):
                # Draw a sample from a Geometric Distribution
                split_point = np.random.geometric(p=0.5)
                stem = grapheme.slice(token, 0, split_point)
                stem = stem if (grapheme.length(stem) > 0) else '$'
                suffix = grapheme.slice(token, split_point)
                suffix = suffix if (grapheme.length(suffix) > 0) else '$'
                splits.append((stem, suffix))

        print('Total data:', len(splits))
        print('Data Sample \n', splits[:5])

        return splits
Example #9
def list_of_aligned_words(sym_lst):
    if not sym_lst:
        return []
    l = grapheme.length(sym_lst[0])
    res = []
    for i in range(l):
        syms = [grapheme.slice(itm, start=i, end=i + 1) for itm in sym_lst]
        res.append("".join(syms))
    return res
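Usage, borrowed from the docstring of the near-identical variant in Example #20: the i-th output word collects the i-th grapheme of every input string.

print(list_of_aligned_words(["lll", "ooo", "vvv", "ieØ"]))
# ['lovi', 'love', 'lovØ']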
Example #10
def adjust_spaces(text):
    fixed_length = len(unicodedata.normalize('NFKD', text))
    unicode_length = grapheme.length(text)
    if unicode_length != fixed_length:
        _text = ' ' * fixed_length
        _text += save_cursor()
        _text += move_cursor_left(fixed_length)
        _text += unicodedata.normalize('NFKD', text)
        _text += restore_cursor()
        return _text
    return text
Example #11
File: segment.py Project: oya163/gibbs
def geometric_split(word, prob):
    split_point = set(np.random.geometric(prob, size=len(word)))
    split_list = []
    for each in split_point:
        split_list.append((grapheme.slice(word, 0, each),
                           grapheme.slice(word, each, grapheme.length(word))))

    # for n in range(1, grapheme.length(word) + 1):
    # split_list.append((word[:n], word[n:len(word)]))
    # split_list.append((grapheme.slice(word, 0, n), grapheme.slice(word, n, grapheme.length(word))))
    return split_list
Example #12
File: segment.py Project: oya163/gibbs
 def get_posterior_by_index(self, cluster, morpheme_assignment,
                            initial_list, morpheme):
     index = initial_list.index(
         morpheme) if morpheme in initial_list else -1
     if index >= 0:
         cluster_id = morpheme_assignment[index]
         n_si = len(cluster[cluster_id])
         return n_si / (len(morpheme_assignment) + self.A)
     else:
         return self.A * g0(self.prob_c, grapheme.length(morpheme)) / (
             len(morpheme_assignment) + self.A)
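This is the Chinese Restaurant Process predictive rule: a morpheme already in the list is scored by its cluster's relative size, an unseen one by the concentration A times the base distribution g0. A sketch with hypothetical counts:

n_si, N, A = 3, 10, 1.0
p_seen = n_si / (N + A)   # 3 / 11, probability mass for a seen morpheme
# an unseen morpheme instead gets A * g0(prob_c, length) / (N + A)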
Example #13
File: fif.py Project: GammaGames/Fif
    def preprocess(self) -> None:
        self.labels = {}
        command: Optional[str] = None

        for index in range(len(self.program)):
            line: str = self.program[index].rstrip("\n")
            length: int = grapheme.length(line)
            command_length = length % 32
            if command_length in self.parse and command is None:
                command = self.parse[command_length](line, length, index)
            elif command in self.parse:
                command = self.parse[command](line, length, index)
Example #14
def graphemecenter(text, width, fillchar=' '):
    """Return text centered in a string of *grapheme* length width (not len()).

	Padding is done using the specified fillchar (default is an ASCII space).
	The original string is returned if width is less than or equal to len(s).
	"""
    length = grapheme.length(text)
    if length > width:
        return text
    a = (width - length) // 2
    b = a + (width - length) % 2
    return (a * fillchar) + text + (b * fillchar)
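A quick check (hypothetical call): a skin-tone emoji is one grapheme, so the padding arithmetic stays balanced where str.center, which counts code points, would pad it lopsidedly.

assert graphemecenter("👶🏻", 5) == "  👶🏻  "   # 1 grapheme + 2 pads each side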
Example #15
 def test_mixed_text(self):
     input_str = " \U0001F476\U0001F3FB ascii \u000D\u000A"
     graphemes = [
         " ", "\U0001F476\U0001F3FB", " ", "a", "s", "c", "i", "i", " ",
         input_str[-2:]
     ]
     self.assertEqual(list(grapheme.graphemes(input_str)), graphemes)
     self.assertEqual(list(grapheme.grapheme_lengths(input_str)),
                      [len(g) for g in graphemes])
     self.assertEqual(grapheme.slice(input_str, 0, 2),
                      " \U0001F476\U0001F3FB")
     self.assertEqual(grapheme.slice(input_str, 0, 3),
                      " \U0001F476\U0001F3FB ")
     self.assertEqual(grapheme.slice(input_str, end=3),
                      " \U0001F476\U0001F3FB ")
     self.assertEqual(grapheme.slice(input_str, 1, 4),
                      "\U0001F476\U0001F3FB a")
     self.assertEqual(grapheme.slice(input_str, 2), input_str[3:])
     self.assertEqual(grapheme.slice(input_str, 2, 4), " a")
     self.assertEqual(grapheme.length(input_str), 10)
     self.assertEqual(grapheme.length(input_str, until=0), 0)
     self.assertEqual(grapheme.length(input_str, until=1), 1)
     self.assertEqual(grapheme.length(input_str, until=4), 4)
     self.assertEqual(grapheme.length(input_str, until=10), 10)
     self.assertEqual(grapheme.length(input_str, until=11), 10)
Example #16
    def calculate_probability(self, data_i, cluster_data):
        '''
          Calculate posterior predictive probability of new data x
          Parameters:
            data_i        = New data
            cluster_data  = Data of cluster k
          Output:
            p = Calculated probability
        '''
        beta_geom = self.base_distribution
        beta_geom.update_parameters(cluster_data)

        x = grapheme.length(data_i)
        p = beta_geom.probability(x)

        return p
Example #17
def word_to_fsa_with_zeros(word, target_len, zero="Ø"):
    """Insert zeros freely to make the word into an FSA of target_len symbols

word -- a string

target_len -- length of the strings accepted by the result FSA

Returns an FSA that accepts all strings of length target_len where some
zeros have been added to make the word target_len long.
"""
    word_len = grapheme.length(word)
    zeros_fst = fs.string_to_fsa(zero)
    zeros_fst.repeat_n(target_len - word_len)
    fst = fs.string_to_fsa(word)
    fst.shuffle(zeros_fst)
    fst.minimize()
    return fst
Example #18
File: fif.py Project: GammaGames/Fif
    def process(self) -> None:
        self.index = 0
        self.stack_key = self.program_name
        self.stacks = {self.stack_key: []}
        self.stack_indexes = {self.stack_key: -1}
        command: Optional[str] = None

        while self.index < len(self.program):
            line = self.program[self.index].rstrip("\n")
            length = grapheme.length(line)
            command_length = length % 32
            if command_length in self.commands and command is None:
                command = self.commands[command_length](line, length,
                                                        self.index)
            elif command in self.commands:
                command = self.commands[command](line, length, self.index)
            self.index += 1
Example #19
File: segment.py Project: oya163/gibbs
    def get_posterior_by_sampling(self, cluster, morpheme_assignment,
                                  initial_list, morpheme):
        initial_list = np.array(initial_list)
        indices = np.where(initial_list == morpheme)[0].tolist()
        L = len(morpheme_assignment)
        prob = []
        if indices:
            for index in indices:
                cluster_id = morpheme_assignment[index]
                n_si = len(cluster[cluster_id])
                prob.append(n_si / (L + self.A))

            # Sampling from the assigned clusters
            norm_prob = prob / np.sum(prob)
            prob_index = np.random.choice(len(norm_prob), 1, p=norm_prob)[0]
            return prob[prob_index]
        else:
            return self.A * g0(0.5, grapheme.length(morpheme)) / (L + self.A)
Example #20
def list_of_aligned_words(mphon_lst):
    """Converts a list of morphophonemes into a list of aligned words

mphon_lst -- list of same length morphophonemes, e.g.
    ["lll", "ooo", "vvv", "ieØ"]

Returns a list of words constructed out of the 1st, 2nd ... alphabetic
symbols of the morphophonemes, e.g.  ["lll", "ooo", "vvv", "ieØ"] -->
["lovi", "love", "lovØ"]

    """
    if not mphon_lst:
        return []
    lgth = grapheme.length(mphon_lst[0])
    res = []
    for i in range(lgth):
        syms = [grapheme.slice(itm,start=i, end=i+1) for itm in mphon_lst]
        res.append("".join(syms))
    return res
Example #21
def aligner(words, max_zeros_in_longest, line):
    """Aligns a list of words according to similarity of their phonemes

    words -- a list of words (or morphs) to be aligned

    max_zeros_in_longest -- maximum number of zeros to be inserted into
    the longest word

    line -- the input line (used only in warning messages)

    cfg.all_zero_weight -- if phoneme set is {"Ø"} (default 100.0)

    Returns the best alignment as a list of raw morphophonemes.
    """
    max_length = max([grapheme.length(x) for x in words])
    weighted_fsa = hfst.empty_fst()
    for m in range(max_length, max_length + max_zeros_in_longest + 1):
        R = multialign(words, m)
        if R.compare(hfst.empty_fst()):
            if cfg.verbosity > 1:
                print("target length", m, "failed")
            continue
        weighted_fsa.disjunct(R)
        weighted_fsa.minimize()
    weighted_fsa.n_best(10)
    weighted_fsa.minimize()  # accepts 10 best results
    results = weighted_fsa.extract_paths(output="raw")
    if cfg.verbosity >= 5:
        for w, sym_pair_seq in results:
            lst = [isym for isym, outsym in sym_pair_seq]
            mpw = ["{}::{:.2f}".format(x, mphon_weight(x)) for x in lst]
            print(" ".join(mpw), "total weight = {:.3f}".format(w))
    if len(results) < 1:
        print("*** NO ALIGNMENTS FOR:", line, "***", results)
        return []
    best_syl_struct = prefer_syl_struct(results)
    if cfg.final:
        best = prefer_final_zeros(best_syl_struct)
    else:
        best = best_syl_struct[0]
    return best
Example #22
def print_result(aligned_result, comments, weights, layout="horizontal"):
    """Prints the result of the alignment in one of the three formats

aligned_result -- tuple of the weight and a list of aligned words, where each aligned word is a zero-filled string of symbols

comments -- possible comments which will be passed over

weights -- whether to print also the overall weight of this alignment

layout -- one of "horizontal" (a sequence of morphophonemes on a single line), "vertical" (each zero-filled word on a line of its own) or "list" (all zero-filled words on a single line)"""

    weight, aligned_words_lst = aligned_result
    if cfg.verbosity >= 10:
        print("aligned_result", aligned_result)
    
    if layout == "horizontal":
        lgth = grapheme.length(aligned_words_lst[0])
        mphon_lst = []
        for i in range(lgth):
            lst = []
            for aligned_word in aligned_words_lst:
                symbol = grapheme.slice(aligned_word, start=i, end=i+1)
                lst.append(symbol)
            if len(set(lst)) == 1:
                mphon_str = lst[0]  # abbreviate if all identical
            else:
                mphon_str = "".join(lst)
            mphon_lst.append(mphon_str)
        mphonemic_str = " ".join(mphon_lst)
        if weights:
            print(mphonemic_str.ljust(40), weight)
        else:
            print(mphonemic_str)
    elif layout == "vertical":
        print("\n".join(aligned_words_lst))
        print()
    elif layout == "list":
        print(" ".join(aligned_words_lst))
    return
Example #23
def align_words(word_lst, zero="Ø", extra_zeros=0, best_count=10):
    """Aligns a list of words

word_lst -- the list of words to be aligned

zero -- the symbol inserted as a mark for deletion or epenthesis

extra_zeros -- the maximum number of zeros to be added to the longest
    words (the shorter ones may have more)

best_count -- the maximum number of results to be returned (possibly
    fewer if no feasible results are found)

Returns a list of tuples where each tuple consists of a weight and a
list of morphophonemes.
    """
    target_len = max([grapheme.length(x) for x in word_lst]) + extra_zeros
    fst = word_to_fsa_with_zeros(word_lst[0], target_len, zero)
    if cfg.verbosity >= 10:
        print("first fst with shuffled zeros:\n", fst)
    for word in word_lst[1:]:
        word_fsa = word_to_fsa_with_zeros(word, target_len, zero)
        fst.cross_product(word_fsa)
        fst = accum_input_labels(fst)
        if cfg.verbosity >= 10:
            print("fst accumulated with", word, ":\n", fst)

    fst.n_best(best_count)
    fst.minimize()
    if cfg.verbosity >= 10:
        print("aligned result:\n", fst)
    weight_path_lst = fst.extract_paths(output='raw')
    weight_mphon_lst_lst = []
    for weight, path_lst in weight_path_lst:
        mphon_lst = [insym for insym, outsym in path_lst]
        weight_mphon_lst_lst.append((weight, mphon_lst))
    if cfg.verbosity >= 10:
        print("weight_mphon_lst_lst:", weight_mphon_lst_lst)
    return weight_mphon_lst_lst
Example #24
def _grapheme_len(text, fail_with_zero=False):
    """Return the number of graphemes in `text`.

    This is the length of the `text` when printed::

        >>> s = 'Â'
        >>> len(s)
        2
        >>> _grapheme_len(s)
        1

    If `fail_with_zero` is given a True, return 0 if `text` is not a string,
    instead of throwing a TypeError::

        >>> _grapheme_len(None, fail_with_zero=True)
        0
    """
    try:
        return grapheme.length(text)
    except TypeError:
        if fail_with_zero:
            return 0
        raise
Example #25
def shuffle_with_zeros(string, target_length):
    """Return a fsa where zeros are inserted in all possible ways
    
    string -- the string to which zero symbols are inserted

    target_length -- how long the strings after insertions must be

    Returns an FSA which accepts all the strings with the inserted zeros.
    All strings have exactly target_length symbols.
    """
    ### result_fsa = hfst.fst(string) # not correct for composed graphemes !!!
    result_fsa = fs.string_to_fsa(string)
    l = grapheme.length(string)
    if l < target_length:
        n = target_length - l
        n_zeros_fsa = hfst.regex(" ".join(n * "Ø"))
        result_fsa.shuffle(n_zeros_fsa)
    result_fsa.minimize()
    result_fsa.set_name(string)
    if cfg.verbosity >= 30:
        print("shuffle_with_zeros:")
        print(result_fsa)
    return result_fsa
Example #26
import grapheme as gr
import json
import sys

with open("testdata.json", "r") as f:
    t_emoji = json.loads(f.read())

print("Grapheme LEN |    LEN    |   LEN (UTF8)   | System Size in Bytes")

for key in t_emoji:

    x = t_emoji[key]

    l1 = gr.length(x)
    l2 = len(x)
    l3 = len(x.encode("utf8"))
    l4 = sys.getsizeof(x)

    print(key)
    print(x, "\n", l1, "\t", l2, "\t", l3, "\t", l4, "\n")
Example #27
def main(
    max_characters: int,
    max_morphemes: int,
    alphabet_file: str,
    end_of_morpheme_symbol: str,
    morpheme_delimiter: str,
    input_file: str,
    output_file: str,
    verbose: int,
    blacklist_char: str,
) -> None:

    import pickle

    if grapheme.length(end_of_morpheme_symbol) != 1:
        raise RuntimeError(
            "The end of morpheme symbol must consist of a single grapheme cluster "
            + "(see Unicode Standard Annex #29).")

    with open(alphabet_file, "rb") as f:
        alphabet: Alphabet = pickle.load(f)

    with (sys.stdin
          if input_file == "-" else open(input_file)) as input_source:

        with gzip.open(output_file, "wb") as output:

            characters_dimension: Dimension = Dimension(
                "characters", max_characters)
            morphemes_dimension: Dimension = Dimension("morphemes",
                                                       max_morphemes)

            tpr: TensorProductRepresentation = TensorProductRepresentation(
                alphabet=alphabet, characters_dimension=characters_dimension)

            result: Dict[str, torch.Tensor] = {}
            skipped_morphemes: Set[str] = set()
            for number, line in enumerate(input_source):
                logging.debug(f"Processing line {number}\t{line.strip()}")
                for word in line.strip().split():
                    if blacklist_char in word:
                        logging.info(f"Skipping unanalyzed word {word}")
                    elif word not in result:
                        for character in grapheme.graphemes(word):
                            if character not in alphabet and character != morpheme_delimiter and character != end_of_morpheme_symbol:
                                logging.warning(
                                    f"WARNING - not in alphabet:\t{Alphabet.unicode_info(character)}"
                                )

                        morphemes = word.split(morpheme_delimiter)
                        for morpheme in morphemes:
                            if len(morpheme) == 0:
                                logging.debug(
                                    f"Line {number} - skipping morpheme of length 0 in word {word}"
                                )
                            elif len(morpheme) == max_characters:
                                logging.warning(
                                    f"Line {number} - skipping morpheme {morpheme} of {word} because its length {len(morpheme)} equals max length {max_characters}, and there is no space to insert the required end of morpheme symbol"
                                )
                            elif len(morpheme) > max_characters:
                                logging.warning(
                                    f"Line {number} - skipping morpheme {morpheme} of {word} because its length {len(morpheme)} exceeds max length {max_characters}"
                                )
                            else:
                                try:
                                    tensor: Tensor = tpr.process_morpheme(
                                        morpheme)
                                    #                                    if validate_tensors:
                                    #                                        reconstructed_surface_form = TensorProductRepresentation.extract_surface_form(alphabet=tpr.alphabet, morpheme_tensor=tensor.data, max_chars_per_morpheme=len(tpr.character_roles))
                                    #                                       assert(reconstructed_surface_form == morpheme)
                                    result[morpheme] = tensor.data
                                except IndexError:
                                    logging.warning(
                                        f"Line {number} - unable to process morpheme {morpheme} (length {len(morpheme)}) of {word}"
                                    )
                                    #                                    elif isinstance(e, AssertionError):
                                    #                                        logging.warning(f"Line {number} - unable to reconstruct morpheme {morpheme} (length {len(morpheme)}) of {word} from tensor representation")

                                    skipped_morphemes.add(morpheme)


#                                    raise e

            logging.info(
                f"Writing binary file containing {len(result)} morphemes to disk at {output}..."
            )
            pickle.dump(result, output)
            logging.info(f"...done writing binary file to disk at {output}",
                         file=sys.stderr)

            logging.info(
                f"Failed to process {len(skipped_morphemes)} morphemes:\n" +
                "\n".join(skipped_morphemes))
Example #28
File: data_prep.py Project: oya163/gibbs
def filter(text):
    text = re.sub(r'\[[^\]]*\]', r'', text)
    text = re.sub(r'<[^>]*>', r'', text)
    text = re.sub(r'[!।,\'\’\‘\—()?]', r'', text)
    text = re.sub(r'[०१२३४५६७८९]', r'', text)
    text = text.replace(u'\ufeff', '')
    text = text.replace(u'\xa0', u' ')
    text = re.sub(r'( )+', r' ', text)
    text = re.sub(r"^\s+", "", text)
    return text


data = set()
with open(
        '../data/test_corpus.txt', 'r',
        encoding='utf-8') as input_file, open('../data/gold_standard.txt',
                                              'w',
                                              encoding='utf-8') as output_file:
    sent = input_file.read()
    for each in sent.split():
        each = filter(each)
        # Do not collect no-split results
        # Do not collect same result twice
        # if each != result and result not in data:
        if each and each not in data and grapheme.length(each) > 1:
            result = nepstem.stem(each)
            data.add(result)
            if each == result:
                result = result + ' ' + '$'
            output_file.write(each + '\t' + result + '\n')
            print(each + '\t' + result)
Example #29
 def __len__(self):
     return grapheme.length(self)
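A plausible host for this method (the class name is hypothetical): a str subclass that reports its length in grapheme clusters rather than code points.

import grapheme

class GraphemeStr(str):
    def __len__(self):
        return grapheme.length(self)

assert len(GraphemeStr("👶🏻")) == 1   # plain str would report 2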
Example #30
def test_default_grapheme_suit(input_string, expected_graphemes, description):
    assert list(grapheme.graphemes(input_string)) == expected_graphemes
    assert grapheme.length(input_string) == len(expected_graphemes)
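An invocation sketch using the data from Example #15 (the description string is hypothetical):

test_default_grapheme_suit(
    input_string=" \U0001F476\U0001F3FB",
    expected_graphemes=[" ", "\U0001F476\U0001F3FB"],
    description="space + emoji with skin tone modifier",
)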