def split_over_length(word):
    """Return every (prefix, remainder) split of ``word`` on grapheme boundaries.

    The prefix grows from one grapheme cluster up to the whole word, so the
    last pair is ``(word, '')``.  An empty ``word`` yields an empty list.

    :param word: string to split
    :return: list of ``(prefix, remainder)`` string tuples, shortest prefix
        first
    """
    # Segment the word once instead of re-scanning it (length + two slices)
    # for every split point.
    clusters = list(grapheme.graphemes(word))
    return [
        (''.join(clusters[:n]), ''.join(clusters[n:]))
        for n in range(1, len(clusters) + 1)
    ]
def preview(html: str, min: int = 50, max: int = 158) -> str:  # NOQA: A002
    """
    Build a plain-text preview of a HTML blob, for use as a description tag.

    Text is taken one block at a time (as returned by ``text_blocks``) so that
    sentences are not truncated. Further blocks are appended while the preview
    is still shorter than ``min``. If the result is longer than ``max`` it is
    cut to ``max - 1`` graphemes and an ellipsis is appended.

    :param str html: HTML text to generate a preview from
    :param int min: Minimum number of characters in the preview (default 50)
    :param int max: Maximum number of characters in the preview (default 158,
        recommended for Google)
    :returns: The preview, or an empty string when there are no text blocks
    """
    blocks = text_blocks(html)
    if not blocks:
        return ''

    # Grapheme counting stops just past the larger of the two bounds, so a
    # large blob is never scanned in full. `max` can be smaller than `min`
    # when the caller overrides only `max`.
    count_cap = 1 + (min if min >= max else max)

    text = compress_whitespace(blocks.pop(0))
    while True:
        length = grapheme.length(text, count_cap)
        if length >= min or not blocks:
            break
        text += ' ' + compress_whitespace(blocks.pop(0))

    if length > max:
        text = grapheme.slice(text, 0, max - 1) + '…'
    return text
def _fit(self, data, init_data):
    """Run one pass over ``data``, re-drawing the cluster assignment of each item.

    For every item the clusters are rebuilt via ``remove_current_data``, a
    weight is computed for each existing cluster and for opening a new one,
    and a new assignment is drawn at random in proportion to those weights.

    Parameters:
        data = Sequence of strings; only their grapheme length is used here
        init_data = Current cluster assignment per item; overwritten in place

    Output:
        init_data = The updated assignments (same object that was passed in)
        performance = One entry per item: log of the summed, unnormalised
                      weights

    NOTE(review): this looks like a Gibbs-sampling sweep for a
    Dirichlet-process style mixture -- confirm against the caller.
    """
    H = len(init_data)
    performance = []
    for i, x in enumerate(data):
        # Build the clusters for this step.
        # NOTE(review): presumably excludes item i itself -- verify in
        # remove_current_data.
        cluster = remove_current_data(i, data, init_data)
        x_len = grapheme.length(x)
        # Re-key the clusters as 0..len-1 (in sorted key order) so that
        # cluster ids do not keep growing between iterations.
        keys = sorted(cluster.keys())
        for j in range(0, len(keys)):
            cluster[j] = cluster.pop(keys[j])
        # Per-cluster probability of x, and the weights actually sampled from
        cluster_prob = []
        final_prob = []
        # Estimate parameters of each cluster
        for k, v in sorted(cluster.items()):
            n = len(v)
            # Stays 0.0 when self.method is neither 'mle' nor 'collapsed'.
            curr_prob = 0.0
            sum_of_grapheme = sum([grapheme.length(d) for d in v])
            if self.method == 'mle':
                # Estimate theta using MLE: members / total grapheme count
                theta_es = n / sum_of_grapheme
                curr_prob = g0(theta_es, x_len)
            elif self.method == 'collapsed':
                curr_prob = self.beta_geometric_posterior(
                    x_len, n, sum_of_grapheme)
            cluster_prob.append(curr_prob)
            # Weight the cluster by its share of the data points
            likelihood = n / (H + self.A - 1)
            final_prob.append(likelihood * cluster_prob[-1])
        # Weight of joining a new cluster (always the last entry)
        final_prob.append(
            (self.A / (H + self.A - 1)) * g0(self.prob_c, x_len))
        # Normalize the weights into a probability vector
        norm_prob = final_prob / np.sum(final_prob)
        # Update cluster assignment based on the calculated probability
        init_data[i] = np.random.choice(len(norm_prob), 1, p=norm_prob)[0]
        # Compute log-likelihood
        performance.append(np.log(np.sum(final_prob)))
    return init_data, performance
def test_width_chars():
    """Check that each sample string is counted as exactly one grapheme."""
    expected_lengths = {
        # Thai
        'ผ': 1,
        'ผู': 1,
        'ผู้': 1,
        # Chinese
        '国': 1,
        '國': 1,
        # Japanese
        'ア': 1,
        'あ': 1,
        # Korean
        '한': 1,
        '헤': 1,
        '후': 1,
        '훼': 1,
        # Vietnamese
        'ế': 1,
        # Emoji
        "🏳️‍🌈": 1,
    }
    for sample in expected_lengths:
        assert expected_lengths[sample] == grapheme.length(sample)
def generate_splits(self, word):
    '''
    Enumerate every stem/suffix split of a word on grapheme boundaries.

    Parameter:
        word = Word to be split

    Output:
        splits = List of (stem, suffix) tuples, from the empty stem up to
                 the whole word; an empty side is written as '$'
    '''
    def or_placeholder(part):
        # An empty half of a split is represented by '$'.
        return part if grapheme.length(part) > 0 else '$'

    return [
        (or_placeholder(grapheme.slice(word, 0, cut)),
         or_placeholder(grapheme.slice(word, cut)))
        for cut in range(grapheme.length(word) + 1)
    ]
def update_parameters(self, data):
    '''
    Update the parameters of the geometric distribution from observed data.

    alpha grows by the number of observations; beta grows by the total of
    (grapheme length - 1) over the observations.

    Parameters:
        data = Observed data (strings)
    '''
    observation_count = len(data)
    excess_graphemes = np.sum([grapheme.length(item) - 1 for item in data])
    self.alpha = self.alpha + observation_count
    self.beta = self.beta + excess_graphemes
def evaluate(self, st_cluster, sf_cluster, stem_list, suffix_list, gold_file):
    """Score predicted stem/suffix splits against a gold-standard file.

    Each row of the tab-separated ``gold_file`` must have exactly two
    columns: the word, and its morphemes as at least two whitespace-separated
    parts (stem first). For every row, each split from
    ``split_over_length(word)`` is scored as the product of the stem and
    suffix values from ``get_posterior_by_sampling``; the best-scoring split
    is the prediction. The row and its prediction are written to
    ``self.result_filename``.

    Returns:
        (prec, recall, fscore) derived from the hit/insert/delete counts.

    NOTE(review): the final divisions raise ZeroDivisionError when
    ``hit + insert`` or ``hit + delete`` is 0 (e.g. an empty gold file).
    NOTE(review): both files are opened with the platform default encoding.
    """
    # Counters comparing predicted and gold stem lengths
    hit = 0
    insert = 0
    delete = 0
    with open(gold_file, 'r') as f, open(self.result_filename, 'w') as g:
        reader = csv.reader(f, delimiter='\t')
        for word, morphemes in reader:
            # Do this process for each word
            split_list = split_over_length(word)
            # NOTE(review): these arguments do not change between rows; the
            # two calls could be hoisted if clusterize is deterministic.
            stem_cluster = self.clusterize(st_cluster, stem_list)
            suffix_cluster = self.clusterize(sf_cluster, suffix_list)
            final_prob = []
            for stem, suffix in split_list:
                p_stem = self.get_posterior_by_sampling(
                    stem_cluster, st_cluster, stem_list, stem)
                p_suffix = self.get_posterior_by_sampling(
                    suffix_cluster, sf_cluster, suffix_list, suffix)
                final_prob.append(p_stem * p_suffix)
            # Prediction = split with the highest combined score
            best_split = split_list[np.argmax(final_prob)]
            pred_stem, pred_suffix = best_split[0], best_split[1]
            gold_stem, gold_suffix = morphemes.split()[0], morphemes.split(
            )[1]
            # Only the stem lengths (in graphemes) are compared
            pred_stem_len = grapheme.length(pred_stem)
            gold_stem_len = grapheme.length(gold_stem)
            if pred_stem_len == gold_stem_len:
                hit += 1
            elif pred_stem_len < gold_stem_len:
                insert += 1
            elif pred_stem_len > gold_stem_len:
                delete += 1
            # Output row: word <TAB> gold morphemes <TAB> predicted split
            note = word + '\t' + morphemes + '\t' + best_split[
                0] + ' ' + best_split[1] + '\n'
            g.write(note)
    # Return prec, recall, f1
    prec = hit / (hit + insert)
    recall = hit / (hit + delete)
    fscore = (2 * hit) / ((2 * hit) + insert + delete)
    return prec, recall, fscore
def generate_splits(self, no_of_splits, tokens):
    """Draw random stem/suffix splits for the first 100 tokens.

    For each token, ``no_of_splits`` split points are sampled from a
    geometric distribution with p=0.5 and applied on grapheme boundaries.
    An empty stem or suffix is written as '$'. The total count and the first
    five splits are printed.

    :param no_of_splits: number of splits to draw per token
    :param tokens: sequence of strings; only ``tokens[:100]`` is used
    :return: list of ``(stem, suffix)`` tuples
    """
    splits = []
    for token in tokens[:100]:
        for _ in range(no_of_splits):
            # Draw a sample from a Geometric Distribution
            cut = np.random.geometric(p=0.5)
            head = grapheme.slice(token, 0, cut)
            head = '$' if not grapheme.length(head) > 0 else head
            tail = grapheme.slice(token, cut)
            tail = '$' if not grapheme.length(tail) > 0 else tail
            splits.append((head, tail))
    print('Total data:', len(splits))
    print('Data Sample \n', splits[:5])
    return splits
def list_of_aligned_words(sym_lst):
    """Build words from the 1st, 2nd, ... grapheme of every item in sym_lst.

    sym_lst -- list of strings; the grapheme length of the first item
    decides how many words are produced

    Returns a list with one string per grapheme position, each being the
    concatenation of that position's grapheme from every item. An empty
    input gives an empty list.
    """
    if not sym_lst:
        return []
    return [
        "".join([grapheme.slice(item, start=pos, end=pos + 1)
                 for item in sym_lst])
        for pos in range(grapheme.length(sym_lst[0]))
    ]
def adjust_spaces(text):
    """Return ``text``, wrapped in cursor-control output when needed.

    When the grapheme count of ``text`` equals the code-point count of its
    NFKD form, ``text`` is returned unchanged. Otherwise the result is that
    many spaces followed by ``save_cursor()``, ``move_cursor_left(count)``,
    the NFKD-normalised text and ``restore_cursor()``.

    NOTE(review): the three cursor helpers presumably return terminal
    escape sequences -- confirm against their definitions.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    code_points = len(decomposed)
    if grapheme.length(text) == code_points:
        return text
    return ''.join((
        ' ' * code_points,
        save_cursor(),
        move_cursor_left(code_points),
        decomposed,
        restore_cursor(),
    ))
def geometric_split(word, prob):
    """Split ``word`` at randomly drawn grapheme positions.

    ``len(word)`` split points are sampled from a geometric distribution
    with success probability ``prob``; duplicates are discarded.

    :param word: string to split
    :param prob: parameter passed to ``np.random.geometric``
    :return: list of ``(prefix, remainder)`` tuples, one per distinct split
        point
    """
    # The number of draws is the code-point count of `word`, not its
    # grapheme count.
    cut_points = set(np.random.geometric(prob, size=len(word)))
    return [
        (grapheme.slice(word, 0, cut),
         grapheme.slice(word, cut, grapheme.length(word)))
        for cut in cut_points
    ]
def get_posterior_by_index(self, cluster, morpheme_assignment, initial_list, morpheme):
    """Return the probability weight of ``morpheme`` under the current clusters.

    If ``morpheme`` occurs in ``initial_list``, its first occurrence picks
    the cluster id from ``morpheme_assignment`` and the result is that
    cluster's size divided by ``len(morpheme_assignment) + self.A``.
    Otherwise the result is ``self.A * g0(self.prob_c, grapheme length)``
    over the same denominator.
    """
    if morpheme not in initial_list:
        # Unseen morpheme: fall back to the base measure g0.
        return self.A * g0(self.prob_c, grapheme.length(morpheme)) / (
            len(morpheme_assignment) + self.A)

    position = initial_list.index(morpheme)
    members = cluster[morpheme_assignment[position]]
    return len(members) / (len(morpheme_assignment) + self.A)
def preprocess(self) -> None:
    """Reset ``self.labels`` and run the ``self.parse`` handlers over the program.

    For each line (trailing newlines stripped) the grapheme length modulo 32
    selects a handler from ``self.parse`` when no command is pending;
    otherwise the pending command, if it is a key of ``self.parse``, selects
    it. A handler is called with ``(line, length, index)`` and its return
    value becomes the pending command for the next line.
    """
    self.labels = {}
    command: Optional[str] = None
    for index in range(len(self.program)):
        line: str = self.program[index].rstrip("\n")
        length: int = grapheme.length(line)
        selector = length % 32
        if command is None and selector in self.parse:
            handler = self.parse[selector]
        elif command in self.parse:
            handler = self.parse[command]
        else:
            continue
        command = handler(line, length, index)
def graphemecenter(text, width, fillchar=' '):
    """Return text centered in a string of *grapheme* length width (not len()).

    Padding is done using the specified fillchar (default is an ASCII
    space). When the padding cannot be split evenly, the extra fill
    character goes on the right. The original string is returned if width is
    less than the grapheme length of text.
    """
    deficit = width - grapheme.length(text)
    if deficit < 0:
        return text
    left, extra = divmod(deficit, 2)
    return (left * fillchar) + text + ((left + extra) * fillchar)
def test_mixed_text(self):
    """Segment, slice and measure a string mixing an emoji sequence, ASCII and CRLF."""
    input_str = " \U0001F476\U0001F3FB ascii \u000D\u000A"
    graphemes = [
        " ", "\U0001F476\U0001F3FB", " ", *"ascii", " ", input_str[-2:]
    ]

    self.assertEqual(list(grapheme.graphemes(input_str)), graphemes)
    self.assertEqual(list(grapheme.grapheme_lengths(input_str)),
                     [len(g) for g in graphemes])

    # (positional args, keyword args, expected slice)
    slice_cases = [
        ((0, 2), {}, " \U0001F476\U0001F3FB"),
        ((0, 3), {}, " \U0001F476\U0001F3FB "),
        ((), {"end": 3}, " \U0001F476\U0001F3FB "),
        ((1, 4), {}, "\U0001F476\U0001F3FB a"),
        ((2,), {}, input_str[3:]),
        ((2, 4), {}, " a"),
    ]
    for args, kwargs, expected in slice_cases:
        self.assertEqual(grapheme.slice(input_str, *args, **kwargs), expected)

    self.assertEqual(grapheme.length(input_str), 10)
    # `until` caps the count; values past the real length have no effect.
    for until, expected in [(0, 0), (1, 1), (4, 4), (10, 10), (11, 10)]:
        self.assertEqual(grapheme.length(input_str, until=until), expected)
def calculate_probability(self, data_i, cluster_data):
    '''
    Calculate the probability of new data after updating with a cluster.

    Parameters:
        data_i = New data (its grapheme length is what gets evaluated)
        cluster_data = Data of cluster k

    Output:
        p = Calculated probability
    '''
    # NOTE(review): this is self.base_distribution itself, not a copy, so
    # the update below is applied to it directly -- confirm that is intended.
    distribution = self.base_distribution
    distribution.update_parameters(cluster_data)
    return distribution.probability(grapheme.length(data_i))
def word_to_fsa_with_zeros(word, target_len, zero="Ø"):
    """Insert zeros freely to make the word into a fsa of target_len symbols

    word -- a string

    target_len -- length of the strings accepted by the result fsa

    zero -- the symbol used for padding

    Builds a fsa for the zero symbol repeated (target_len - grapheme length
    of word) times, shuffles it into the fsa of the word and minimizes the
    result. Returns that fsa.
    """
    padding_fsa = fs.string_to_fsa(zero)
    padding_fsa.repeat_n(target_len - grapheme.length(word))
    word_fsa = fs.string_to_fsa(word)
    word_fsa.shuffle(padding_fsa)
    word_fsa.minimize()
    return word_fsa
def process(self) -> None:
    """Reset the interpreter state and run the program through ``self.commands``.

    The stack state is re-initialised with a single empty stack keyed by
    ``self.program_name``. Then, starting at line 0, the grapheme length of
    the current line (trailing newlines stripped) modulo 32 selects a handler
    from ``self.commands`` when no command is pending; otherwise the pending
    command, if it is a key of ``self.commands``, selects it. A handler is
    called with ``(line, length, self.index)`` and its return value becomes
    the pending command. ``self.index`` is advanced by one after every line.
    """
    self.index = 0
    self.stack_key = self.program_name
    self.stacks = {self.stack_key: []}
    self.stack_indexes = {self.stack_key: -1}
    command: Optional[str] = None
    while self.index < len(self.program):
        line = self.program[self.index].rstrip("\n")
        length = grapheme.length(line)
        selector = length % 32
        if command is None and selector in self.commands:
            command = self.commands[selector](line, length, self.index)
        elif command in self.commands:
            command = self.commands[command](line, length, self.index)
        self.index += 1
def get_posterior_by_sampling(self, cluster, morpheme_assignment, initial_list, morpheme):
    """Return a probability weight for ``morpheme``, sampling among its occurrences.

    Every position where ``initial_list`` equals ``morpheme`` contributes
    the size of its assigned cluster divided by
    ``len(morpheme_assignment) + self.A``. One of these values is drawn at
    random, in proportion to the values themselves, and returned. When the
    morpheme does not occur, the result is
    ``self.A * g0(0.5, grapheme length)`` over the same denominator.

    NOTE(review): the fallback uses a fixed 0.5 where get_posterior_by_index
    uses self.prob_c -- confirm this is deliberate.
    """
    matches = np.where(np.array(initial_list) == morpheme)[0].tolist()
    L = len(morpheme_assignment)
    if not matches:
        return self.A * g0(0.5, grapheme.length(morpheme)) / (L + self.A)

    prob = [
        len(cluster[morpheme_assignment[position]]) / (L + self.A)
        for position in matches
    ]
    # Sampling from the assigned clusters
    norm_prob = prob / np.sum(prob)
    chosen = np.random.choice(len(norm_prob), 1, p=norm_prob)[0]
    return prob[chosen]
def list_of_aligned_words(mphon_lst):
    """Converts a list of morphophonemes into a list of aligned words

    mphon_lst -- list of same length morphophonemes, e.g.
    ["lll", "ooo", "vvv", "ieØ"]

    Returns a list of words built from the 1st, 2nd ... grapheme of each
    morphophoneme, e.g.
    ["lll", "ooo", "vvv", "ieØ"] --> ["lovi", "love", "lovØ"].
    The grapheme length of the first morphophoneme decides the number of
    words. An empty list gives an empty list.
    """
    if not mphon_lst:
        return []

    def word_at(position):
        # Pick the grapheme at `position` from every morphophoneme.
        return "".join([
            grapheme.slice(mphon, start=position, end=position + 1)
            for mphon in mphon_lst
        ])

    return [word_at(position)
            for position in range(grapheme.length(mphon_lst[0]))]
def aligner(words, max_zeros_in_longest, line):
    """Aligns a list of words and returns the preferred alignment

    words -- a list of words (or morphs) to be aligned

    max_zeros_in_longest -- maximum number of zeros to be inserted into
    the longest word

    line -- the input line (used only in warning messages)

    The alignments from multialign() for every target length from the
    longest word up to that length plus max_zeros_in_longest are collected
    and pruned to the 10 best paths. With cfg.verbosity >= 5 each of them is
    printed with its weights.

    Returns the best alignment as chosen by prefer_syl_struct() and, when
    cfg.final is set, prefer_final_zeros(); returns [] (after printing a
    message) if there is no alignment at all.
    """
    longest = max(grapheme.length(word) for word in words)
    weighted_fsa = hfst.empty_fst()
    for target_length in range(longest, longest + max_zeros_in_longest + 1):
        candidate = multialign(words, target_length)
        if candidate.compare(hfst.empty_fst()):
            # Nothing could be aligned at this length.
            if cfg.verbosity > 1:
                print("target length", target_length, "failed")
            continue
        weighted_fsa.disjunct(candidate)
        weighted_fsa.minimize()
    weighted_fsa.n_best(10)
    weighted_fsa.minimize()  # accepts 10 best results

    results = weighted_fsa.extract_paths(output="raw")
    if cfg.verbosity >= 5:
        for w, sym_pair_seq in results:
            mpw = ["{}::{:.2f}".format(isym, mphon_weight(isym))
                   for isym, outsym in sym_pair_seq]
            print(" ".join(mpw), "total weight = {:.3f}".format(w))

    if len(results) == 0:
        print("*** NO ALIGNMENTS FOR:", line, "***", results)
        return []

    best_syl_struct = prefer_syl_struct(results)
    if cfg.final:
        return prefer_final_zeros(best_syl_struct)
    return best_syl_struct[0]
def print_result(aligned_result, comments, weights, layout="horizontal"):
    """Prints the result of the alignment in one of the three formats

    aligned_result -- tuple of the weight and a list of aligned words,
    where each aligned word is a string

    comments -- possible comments (not used by this function)

    weights -- whether to print also the overall weight of this alignment
    (only used by the "horizontal" layout)

    layout -- one of "horizontal" (a sequence of morphophonemes on a single
    line), "vertical" (each zero-filled word on a line of its own) or
    "list" (all zero-filled words on a single line); any other value prints
    nothing
    """
    weight, aligned_words_lst = aligned_result
    if cfg.verbosity >= 10:
        print("aligned_result", aligned_result)

    if layout == "vertical":
        print("\n".join(aligned_words_lst))
        print()
    elif layout == "list":
        print(" ".join(aligned_words_lst))
    elif layout == "horizontal":
        columns = []
        for position in range(grapheme.length(aligned_words_lst[0])):
            symbols = [
                grapheme.slice(word, start=position, end=position + 1)
                for word in aligned_words_lst
            ]
            if len(set(symbols)) == 1:
                columns.append(symbols[0])  # abbreviate if all identical
            else:
                columns.append("".join(symbols))
        mphonemic_str = " ".join(columns)
        if weights:
            print(mphonemic_str.ljust(40), weight)
        else:
            print(mphonemic_str)
def align_words(word_lst, zero="Ø", extra_zeros=0, best_count=10):
    """Aligns a list of words

    word_lst -- the list of words to be aligned

    zero -- the symbol inserted as a mark for deletion or epenthesis

    extra_zeros -- the maximum number of zeros to be added in the longest
    words (the shorter may have more)

    best_count -- the maximum number of results to be returned (maybe
    less if no feasible results are found)

    Returns a list of tuples where each tuple consists of a weight and a
    list of morphophonemes. With cfg.verbosity >= 10 the intermediate
    results are printed.
    """
    target_len = extra_zeros + max(grapheme.length(word) for word in word_lst)
    first_word, *other_words = word_lst

    fst = word_to_fsa_with_zeros(first_word, target_len, zero)
    if cfg.verbosity >= 10:
        print("first fst with shuffled zeros:\n", fst)

    for word in other_words:
        fst.cross_product(word_to_fsa_with_zeros(word, target_len, zero))
        fst = accum_input_labels(fst)
        if cfg.verbosity >= 10:
            print("fst accumulated with", word, ":\n", fst)

    fst.n_best(best_count)
    fst.minimize()
    if cfg.verbosity >= 10:
        print("aligned result:\n", fst)

    # Keep only the input-side symbol of every pair on each path.
    weight_mphon_lst_lst = [
        (weight, [insym for insym, outsym in path_lst])
        for weight, path_lst in fst.extract_paths(output='raw')
    ]
    if cfg.verbosity >= 10:
        print("weight_mphon_lst_lst:", weight_mphon_lst_lst)
    return weight_mphon_lst_lst
def _grapheme_len(text, fail_with_zero=False):
    r"""Return the number of graphemes in `text`.

    A base letter followed by a combining mark is two code points but a
    single grapheme::

        >>> s = 'A\u0302'
        >>> len(s)
        2
        >>> _grapheme_len(s)
        1

    If `fail_with_zero` is true, a `TypeError` from counting (for example
    when `text` is not a string) is turned into a result of 0 instead of
    being raised::

        >>> _grapheme_len(None, fail_with_zero=True)
        0

    """
    try:
        count = grapheme.length(text)
    except TypeError:
        if not fail_with_zero:
            raise
        count = 0
    return count
def shuffle_with_zeros(string, target_length):
    """Return a fsa where zeros are inserted in all possible ways

    string -- the string to which zero symbols are inserted

    target_length -- how long the strings after insertions must be

    Returns a minimized fsa, named after string. When string has fewer
    graphemes than target_length, the missing number of "Ø" symbols is
    shuffled into it; otherwise it is the fsa of string alone.
    """
    # fs.string_to_fsa is used here because hfst.fst(string) is not correct
    # for composed graphemes.
    result_fsa = fs.string_to_fsa(string)
    missing = target_length - grapheme.length(string)
    if missing > 0:
        zeros_fsa = hfst.regex(" ".join("Ø" * missing))
        result_fsa.shuffle(zeros_fsa)
    result_fsa.minimize()
    result_fsa.set_name(string)
    if cfg.verbosity >= 30:
        print("shuffle_with_zeros:")
        print(result_fsa)
    return result_fsa
import grapheme as gr
import sys as sys
import json as json

# Print, for every entry of testdata.json, four different "sizes" of its
# string value: grapheme count, code-point count, UTF-8 byte count and the
# in-memory object size.

# The context manager closes the file handle, which the script previously
# left open. JSON is UTF-8, so the encoding is explicit to decode the emoji
# the same way on every platform.
with open("testdata.json", "r", encoding="utf-8") as f:
    t_emoji = json.load(f)

print("Grapheme LEN | LEN | LEN (UTF8) | System Size in Bytes")
for key, x in t_emoji.items():
    l1 = gr.length(x)
    l2 = len(x)
    l3 = len(x.encode("utf8"))
    l4 = sys.getsizeof(x)
    print(key)
    print(x, "\n", l1, "\t", l2, "\t", l3, "\t", l4, "\n")
def main(
    max_characters: int,
    max_morphemes: int,
    alphabet_file: str,
    end_of_morpheme_symbol: str,
    morpheme_delimiter: str,
    input_file: str,
    output_file: str,
    verbose: int,
    blacklist_char: str,
) -> None:
    """Convert the morphemes of a segmented corpus to tensors and pickle them.

    Every whitespace-separated word of the input is split on
    ``morpheme_delimiter``; each morpheme that fits is passed to
    ``TensorProductRepresentation.process_morpheme`` and the resulting
    ``tensor.data`` is stored under the morpheme string. The finished
    dictionary is pickled into a gzip file.

    :param max_characters: size of the "characters" dimension; morphemes
        whose ``len()`` is equal to or above it are skipped
    :param max_morphemes: size of the "morphemes" dimension
    :param alphabet_file: path of a pickled ``Alphabet``
    :param end_of_morpheme_symbol: must be exactly one grapheme cluster
    :param morpheme_delimiter: separator between the morphemes of a word
    :param input_file: path of the corpus, or ``"-"`` for standard input
    :param output_file: path of the gzip-compressed pickle to write
    :param verbose: not used by this function
    :param blacklist_char: words containing this string are skipped
    :raises RuntimeError: if ``end_of_morpheme_symbol`` is not a single
        grapheme cluster
    """
    import pickle

    if grapheme.length(end_of_morpheme_symbol) != 1:
        raise RuntimeError(
            "The end of morpheme symbol must consist of a single grapheme cluster "
            + "(see Unicode Standard Annex #29).")

    # NOTE(review): unpickling can execute arbitrary code; only load alphabet
    # files from a trusted source.
    with open(alphabet_file, "rb") as f:
        alphabet: Alphabet = pickle.load(f)

    with (sys.stdin
          if input_file == "-" else open(input_file)) as input_source:
        with gzip.open(output_file, "wb") as output:

            characters_dimension: Dimension = Dimension(
                "characters", max_characters)
            morphemes_dimension: Dimension = Dimension("morphemes",
                                                       max_morphemes)

            tpr: TensorProductRepresentation = TensorProductRepresentation(
                alphabet=alphabet, characters_dimension=characters_dimension)

            result: Dict[str, torch.Tensor] = {}
            skipped_morphemes: Set[str] = set()

            for number, line in enumerate(input_source):
                logging.debug(f"Processing line {number}\t{line.strip()}")
                for word in line.strip().split():
                    if blacklist_char in word:
                        logging.info(f"Skipping unanalyzed word {word}")
                    # NOTE(review): `result` is keyed by morpheme, so this
                    # only skips words that equal an already stored morpheme.
                    elif word not in result:
                        for character in grapheme.graphemes(word):
                            if character not in alphabet and character != morpheme_delimiter and character != end_of_morpheme_symbol:
                                logging.warning(
                                    f"WARNING - not in alphabet:\t{Alphabet.unicode_info(character)}"
                                )

                        morphemes = word.split(morpheme_delimiter)
                        for morpheme in morphemes:
                            # NOTE(review): lengths below are code-point
                            # counts (len), while the alphabet check above
                            # works on graphemes -- confirm which unit
                            # max_characters is meant in.
                            if len(morpheme) == 0:
                                logging.debug(
                                    f"Line {number} - skipping morpheme of length 0 in word {word}"
                                )
                            elif len(morpheme) == max_characters:
                                logging.warning(
                                    f"Line {number} - skipping morpheme {morpheme} of {word} because its length {len(morpheme)} equals max length {max_characters}, and there is no space to insert the required end of morpheme symbol"
                                )
                            elif len(morpheme) > max_characters:
                                logging.warning(
                                    f"Line {number} - skipping morpheme {morpheme} of {word} because its length {len(morpheme)} exceeds max length {max_characters}"
                                )
                            else:
                                try:
                                    tensor: Tensor = tpr.process_morpheme(
                                        morpheme)
                                    result[morpheme] = tensor.data
                                except IndexError:
                                    logging.warning(
                                        f"Line {number} - unable to process morpheme {morpheme} (length {len(morpheme)}) of {word}"
                                    )
                                    skipped_morphemes.add(morpheme)

            logging.info(
                f"Writing binary file containing {len(result)} morphemes to disk at {output}..."
            )
            pickle.dump(result, output)
            # logging calls take no `file` argument (unlike print); passing
            # one raised TypeError here, after the data had been written.
            logging.info(f"...done writing binary file to disk at {output}")

            logging.info(
                f"Failed to process {len(skipped_morphemes)} morphemes:\n" +
                "\n".join(skipped_morphemes))
text = re.sub(r'\[[^\]]*\]', r'', text) text = re.sub(r'<[^>]*>', r'', text) text = re.sub(r'[!।,\'\’\‘\—()?]', r'', text) text = re.sub(r'[०१२३४५६७८९]', r'', text) text = text.replace(u'\ufeff', '') text = text.replace(u'\xa0', u' ') text = re.sub(r'( )+', r' ', text) text = re.sub(r"^\s+", "", text) return text data = set() with open( '../data/test_corpus.txt', 'r', encoding='utf-8') as input_file, open('../data/gold_standard.txt', 'w', encoding='utf-8') as output_file: sent = input_file.read() for each in sent.split(): each = filter(each) # Do not collect no-split results # Do not collect same result twice # if each != result and result not in data: if each and each not in data and grapheme.length(each) > 1: result = nepstem.stem(each) data.add(result) if each == result: result = result + ' ' + '$' output_file.write(each + '\t' + result + '\n') print(each + '\t' + result)
def __len__(self):
    """Return the length of this object in graphemes, as counted by grapheme.length."""
    cluster_count = grapheme.length(self)
    return cluster_count
def test_default_grapheme_suit(input_string, expected_graphemes, description):
    """Check one segmentation case: both the clusters and their count must match.

    ``description`` is not used in the assertions.
    """
    segmented = list(grapheme.graphemes(input_string))
    assert segmented == expected_graphemes

    expected_count = len(expected_graphemes)
    assert grapheme.length(input_string) == expected_count