def analyse(self, vi_ch, smart=False):
    '''Score every candidate English letter for one Vietnamese mark.

    For each letter listed in ``vi_ch_alphabet[vi_ch]`` the mapping is
    evaluated through ``self.apply_ch`` and the resulting score is logged.
    Nothing is returned.

    Parameters
    ----------
    vi_ch : char
        Vietnamese mark, used as a key into ``vi_ch_alphabet``
    smart : bool
        forwarded unchanged to ``self.apply_ch``; per the original notes it
        selects whether the option with the higher probability is always
        chosen
    '''
    for candidate in vi_ch_alphabet[vi_ch]:
        table = self.apply_ch(vi_ch, candidate, smart=smart)
        # Two weighted sums over the rows of the returned frame; their
        # total is the score reported for this candidate.
        fer = (table['en_chosen'] * table['vi_prob']).sum()
        fvr = (table['vi_chosen'] * table['en_prob']).sum()
        total = fer + fvr
        logger.info("{}: {}={}+{}".format(candidate, total, fer, fvr))
def search_all(self, attempt_count=100, smart=False):
    '''Run ``self.search`` from several random starts and keep the best.

    Parameters
    ----------
    attempt_count : int
        number of random restarts to perform
    smart : bool
        forwarded unchanged to ``self.search``

    Returns
    -------
    tuple
        ``(input_method, df, fer, fvr, score)`` of the attempt with the
        lowest score. If no attempt scores below the starting threshold
        of 100.0 (which includes ``attempt_count <= 0``), the first four
        entries are ``None`` and the score is ``100.0``.
    '''
    # Pre-bind the results so the return below is always well defined,
    # even when the loop body never runs or never improves the score.
    best_input_method = best_df = best_fer = best_fvr = None
    best_score = 100.0
    for _ in range(attempt_count):
        # Random starting point: one candidate letter per entry of vi_code.
        picks = []
        for vi_ch in vi_code:
            s = vi_ch_alphabet[vi_ch]
            picks.append(s[randrange(len(s))])
        start = ''.join(picks)

        input_method, df, fer, fvr, score = self.search(start, smart=smart)
        if score < best_score:
            best_df = df
            best_fer = fer
            best_fvr = fvr
            best_score = score
            best_input_method = input_method
        logger.info(
            "------ Current best score {}={}+{} from {} -----".format(
                best_score, best_fer, best_fvr, best_input_method))
    return best_input_method, best_df, best_fer, best_fvr, best_score
def search(self, initial_input_method, smart=False):
    '''Hill-climb from ``initial_input_method`` to a locally best input method.

    Repeatedly sweeps over every position (one per entry of ``vi_code``)
    and every letter of ``vi_alphabet``, substituting that letter at that
    position in the current best string. A candidate accepted by
    ``valid`` replaces the current best as soon as its score is strictly
    lower, so later candidates in the same sweep build on it. The search
    stops after a full sweep that yields no improvement.

    Parameters
    ----------
    initial_input_method : str
        starting point of the search
    smart : bool
        forwarded unchanged to ``self.apply``

    Returns
    -------
    tuple
        ``(input_method, df, fer, fvr, score)`` for the best candidate
        found, where ``score == fer + fvr``.
    '''
    current = initial_input_method
    current_df, current_fer, current_fvr = self.apply(current, smart=smart)
    current_score = current_fer + current_fvr
    logger.info("Initial score {}={}+{} from {}".format(
        current_score, current_fer, current_fvr, current))

    improved = True
    while improved:
        improved = False
        for pos in range(len(vi_code)):
            for letter in vi_alphabet:
                # Swap a single position of the current best string.
                candidate = current[:pos] + letter + current[pos + 1:]
                if valid(candidate):
                    cand_df, cand_fer, cand_fvr = self.apply(
                        candidate, smart=smart)
                    cand_score = cand_fer + cand_fvr
                    if cand_score < current_score:
                        current = candidate
                        current_df = cand_df
                        current_fer = cand_fer
                        current_fvr = cand_fvr
                        current_score = cand_score
                        logger.info("Better score {}={}+{} from {}".format(
                            current_score, current_fer, current_fvr,
                            current))
                        improved = True

    logger.info("Best input method: {}".format(current))
    return current, current_df, current_fer, current_fvr, current_score
def make_corpus(in_f, trie_f, out_f):
    """Expand a viwiki split CSV with one trie-probability column per letter.

    The trie is loaded from ``trie_f`` and the table from ``in_f``; the
    table's columns are renamed to ``word``, ``count`` and ``vi_prob``.
    For every letter of the alphabet below, each word has its last
    character replaced by that letter and the result is looked up with
    ``trie.prob``; the values are stored in a column named after the
    letter. The expanded table is then passed to ``pc.to_csv`` with
    ``out_f``.

    Parameters
    ----------
    in_f : str
        path of the input split CSV (expected to have three columns)
    trie_f : str
        path of the trie file handed to ``Trie.from_file``
    out_f : str
        path of the expanded CSV to write
    """
    logger.info("Loading the enwiki trie '{}'...".format(trie_f))
    trie = Trie.from_file(trie_f)

    logger.info("Loading the viwiki split csv '{}'...".format(in_f))
    df = pc.read_csv(in_f)
    df.columns = ['word', 'count', 'vi_prob']

    # Every letter names exactly one output column, so each is listed once;
    # a repeated letter would only recompute and overwrite the same column.
    alphabet = 'bdfjklqrsvwxzaeou'
    # Each word with its final character removed.
    prefixes = df['word'].str.slice(stop=-1)
    for ch in alphabet:
        logger.info(ch)
        # Bind ch as a default so the lambda does not rely on the loop
        # variable's value at call time.
        df[ch] = prefixes.apply(lambda prefix, ch=ch: trie.prob(prefix + ch))

    logger.info("Saving the expanded viwiki split csv '{}'...".format(out_f))
    pc.to_csv(df, out_f, index=False)
def make_corpus(in_f, out_f):
    """Count token frequencies in a Wikipedia XML dump and save them as CSV.

    Every text yielded by ``WikiCorpus(in_f, token_min_len=1).get_texts()``
    is fed into a ``Counter``. The counts are turned into a table with the
    columns ``word`` and ``count``, sorted by ``count`` in descending
    order, and passed to ``pc.to_csv`` with ``out_f``. Progress is logged
    every 10000 articles.

    Parameters
    ----------
    in_f : str
        path of the Wikipedia dump handed to ``WikiCorpus``
    out_f : str
        path of the CSV file to write
    """
    logger.info("Opening the Wikipedia dump '{}'...".format(in_f))
    wiki = WikiCorpus(in_f, token_min_len=1)

    counter = Counter()
    article_count = 0
    word_count = 0
    for text in wiki.get_texts():
        if article_count % 10000 == 0:
            logger.info('Processed {} articles with {} words'.format(
                article_count, word_count))
        word_count += len(text)
        counter.update(text)
        article_count += 1
    logger.info('Processing {} articles with {} words complete!'.format(
        article_count, word_count))

    # Name the columns up front so that an empty corpus still produces a
    # frame with a 'count' column to sort on.
    df = pd.DataFrame(list(counter.items()), columns=['word', 'count'])
    df = df.sort_values('count', axis=0, ascending=False)
    pc.to_csv(df, out_f, index=False)
def make_corpus(in_f, trie_f, out_f):
    """Expand a viwiki split CSV with one trie-probability column per letter.

    The trie is loaded from ``trie_f`` and the table from ``in_f``; the
    table's columns are renamed to ``word``, ``count`` and ``vi_prob``.
    For every letter of the alphabet below, each word has its last
    character replaced by that letter and the result is looked up with
    ``trie.prob``; the values are stored in a column named after the
    letter. The expanded table is then passed to ``pc.to_csv`` with
    ``out_f``.

    Parameters
    ----------
    in_f : str
        path of the input split CSV (expected to have three columns)
    trie_f : str
        path of the trie file handed to ``Trie.from_file``
    out_f : str
        path of the expanded CSV to write
    """
    logger.info("Loading the enwiki trie '{}'...".format(trie_f))
    trie = Trie.from_file(trie_f)

    logger.info("Loading the viwiki split csv '{}'...".format(in_f))
    df = pc.read_csv(in_f)
    df.columns = ['word', 'count', 'vi_prob']

    # Every letter names exactly one output column, so each is listed once;
    # a repeated letter would only recompute and overwrite the same column.
    alphabet = 'bdfjklqrsvwxzaeou'
    # Each word with its final character removed.
    prefixes = df['word'].str.slice(stop=-1)
    for ch in alphabet:
        logger.info(ch)
        # Bind ch as a default so the lambda does not rely on the loop
        # variable's value at call time.
        df[ch] = prefixes.apply(lambda prefix, ch=ch: trie.prob(prefix + ch))

    logger.info("Saving the expanded viwiki split csv '{}'...".format(out_f))
    pc.to_csv(df, out_f, index=False)


if __name__ == '__main__':
    # Script entry: expects exactly three positional arguments
    # (input CSV, trie file, output CSV).
    if len(sys.argv) != 4:
        logger.info(
            'Usage: python expand_split_corpus.py <viwiki_prefix_splitX-....csv> <enwiki_restricted_....trie> <viwiki_prefix_splitX_expanded....csv>'
        )
        sys.exit(1)
    in_f = sys.argv[1]
    trie_f = sys.argv[2]
    out_f = sys.argv[3]
    make_corpus(in_f, trie_f, out_f)
s = vi_ch_alphabet[vi_ch] input_method += s[randrange(len(s))] input_method, df, fer, fvr, score = self.search(input_method, smart=smart) if score < best_score: best_df = df best_fer = fer best_fvr = fvr best_score = score best_input_method = input_method logger.info( "------ Current best score {}={}+{} from {} -----".format( best_score, best_fer, best_fvr, best_input_method)) return best_input_method, best_df, best_fer, best_fvr, best_score if __name__ == '__main__': if len(sys.argv) != 3: logger.info( 'Usage: python process_wiki_dump.py <wikipedia_dump_file_that_ends_with_articles.xml.bz2> <processed_csv_file>' ) sys.exit(1) in_f = sys.argv[1] out_f = sys.argv[2] make_corpus(in_f, out_f) # currently, the best input method is "qzxdzuzjfkaw" with fer = 0.0009225985366192045 and fvr = 0.002716891565571517 # itelex: "fesdowajrwwx" fer = 0.010456098466438603 and fvr = 0.014519511048841487 pass