def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
def db_rank(clue):
    scores = defaultdict(float)

    # Score matches for the whole clue, and spread each match's score across
    # its individual tokens as well.
    for match, score in db_search(clue).items():
        scores[slugify(match)] += score * 1000
        parts = tokenize(match, 'en')
        for part in parts:
            scores[slugify(part)] += score * 1000 / len(parts)

    # Then search for each word of the clue separately, boosting rare words
    # (those with a low log probability) more strongly.
    for word in tokenize(clue, 'en'):
        logprob_result = WORDS.segment_logprob(slugify(word))
        if logprob_result is not None:
            logprob, _ = logprob_result
        else:
            logprob = -1000.
        rare_boost = min(25., -logprob)

        for match, score in db_search(word).items():
            scores[slugify(match)] += rare_boost * score * 10
            parts = tokenize(match, 'en')
            for part in parts:
                scores[slugify(part)] += rare_boost * score * 10 / len(parts)

        # Also search with an expanded version of the word, at a lower weight.
        query = query_expand(word)
        for match, score in db_search(query).items():
            scores[slugify(match)] += rare_boost * score
            parts = tokenize(match, 'en')
            for part in parts:
                scores[slugify(part)] += rare_boost * score / len(parts)

    return scores
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (
        tokenize("Па, има ту много ствари које не схваташ.", 'sr')
        == ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )
    assert (
        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr')
        == ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
def test_catastrophes():
    # More apostrophes, but this time they're in Catalan, and there's other
    # mid-word punctuation going on too.
    eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
        ['m', 'acabo', 'd', 'instal·lar'])
    eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
        ["m'", 'acabo', "d'", 'instal·lar', '.'])
def test_gender_neutral_at():
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la", "protección", "de", "los", "derechos", "de", "tod@s", "l@s",
        "trabajador@s", "migrantes"
    ]

    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos", "e", "deveres", "para", "@s", "membr@s", "da",
        "comunidade", "virtual"
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    eq_(
        tokenize(hobart, 'zh'),
        ['加', '勒', '特', '霍', '巴特']
    )

    eq_(
        tokenize(fact_simplified, 'zh'),
        [
            # he / is / in history / #6 / counter for people
            '他', '是', '历史上', '第六', '位',
            # during / term of office / in / die
            '在', '任期', '内', '去世',
            # of / U.S. / deputy / president
            '的', '美国', '副', '总统'
        ]
    )

    # You match the same tokens if you look it up in Traditional Chinese.
    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
def main():
    arg1 = sys.argv[1]
    arg2 = sys.argv[2]

    urlInput = False
    if arg2.startswith("http://") or arg2.startswith("https://"):
        urlInput = True

    inp_file1 = open(arg1)
    if urlInput:
        response = urllib.request.urlopen(arg2)
        inp_file2 = response.read().decode("utf8").splitlines()
    else:
        inp_file2 = open(arg2)

    numPrints = int(sys.argv[3])

    tokenizedLines = wordfreq.tokenize(inp_file2)
    tokenizedStopWords = wordfreq.tokenize(inp_file1)

    inp_file1.close()
    if not urlInput:
        inp_file2.close()

    frequencies = wordfreq.countWords(tokenizedLines, tokenizedStopWords)
    wordfreq.printTopMost(frequencies, numPrints)
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = 'ひらがなカタカナromaji'
    eq_(tokenize(ja_text, 'en'), ['ひらがな', 'カタカナ', 'romaji'])
def test_ideographic_fallback():
    # Try tokenizing Chinese text -- it should remain stuck together.
    eq_(tokenize("中国文字", "zh"), ["中国文字"])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = "ひらがなカタカナromaji"
    eq_(tokenize(ja_text, "en"), ["ひらがな", "カタカナ", "romaji"])
def test_catastrophes():
    # More apostrophes, but this time they're in Catalan, and there's other
    # mid-word punctuation going on too.
    assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
    assert (
        tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True)
        == ["m'", 'acabo', "d'", 'instal·lar', '.']
    )
def test_punctuation_at():
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores", "de", "canal", "que", "são", "aqueles", "que", "têm",
        "um", "ao", "lado", "do", "nick"
    ]

    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores", "de", "canal", ",", "que", "são", "aqueles", "que",
        "têm", "um", "@", "ao", "lado", "do", "nick"
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un", "archivo", "hosts.deny", "que", "contiene", "la", "línea",
        "all:all", "all"
    ]

    # Make sure not to catch e-mail addresses
    text = "info@something.example"
    assert tokenize(text, "en") == [
        "info", "something.example"
    ]
def test_arabic():
    # Remove tatweels
    assert tokenize('متــــــــعب', 'ar') == ['متعب']

    # Remove combining marks
    assert tokenize('حَرَكَات', 'ar') == ['حركات']

    # An Arabic ligature that is affected by NFKC normalization
    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
def test_actually_russian():
    # This looks mostly like Serbian, but was probably actually Russian.
    # In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".
    assert tokenize("сто из ста, пацаны!", 'sr') == ['sto', 'iz', 'sta', 'pacany']
    assert tokenize("культуры", 'sr') == ["kul'tury"]
def test_ar():
    # Remove tatweels
    eq_(tokenize("متــــــــعب", "ar"), ["متعب"])

    # Remove combining marks
    eq_(tokenize("حَرَكَات", "ar"), ["حركات"])

    eq_(
        tokenize("\ufefb", "ar"),  # An Arabic ligature...
        ["\u0644\u0627"]           # ...that is affected by NFKC normalization
    )
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = 'ひらがなカタカナromaji'
    eq_(
        tokenize(ja_text, 'en'),
        ['ひらがな', 'カタカナ', 'romaji']
    )
def test_arabic():
    # Remove tatweels
    eq_(tokenize('متــــــــعب', 'ar'), ['متعب'])

    # Remove combining marks
    eq_(tokenize('حَرَكَات', 'ar'), ['حركات'])

    eq_(
        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
        ['\u0644\u0627']           # ...that is affected by NFKC normalization
    )
def test_actually_russian():
    # This looks mostly like Serbian, but was probably actually Russian.
    # In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".
    eq_(tokenize("сто из ста, пацаны!", 'sr'),
        ['sto', 'iz', 'sta', 'pacany'])
    eq_(tokenize("культуры", 'sr'), ["kul'tury"])
def test_apostrophes():
    # Test that we handle apostrophes in French reasonably.
    assert tokenize("qu'un", 'fr') == ['qu', 'un']
    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
def test_apostrophes():
    # Test that we handle apostrophes in French reasonably.
    assert tokenize("qu'un", 'fr') == ['qu', 'un']
    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']

    # Code with a region attached
    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)

    # Over-long codes for Chinese
    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
    eq_(tokenize('谢谢谢谢', 'zho'), tokens)

    # Separate codes for Mandarin and Cantonese
    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
    eq_(tokenize('谢谢谢谢', 'yue'), tokens)
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    eq_(
        tokenize("I don't split at apostrophes, you see.", "en"),
        ["i", "don't", "split", "at", "apostrophes", "you", "see"],
    )

    # Certain punctuation does not inherently split a word.
    eq_(
        tokenize("Anything is possible at zombo.com", "en"),
        ["anything", "is", "possible", "at", "zombo.com"]
    )

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize("😂test", "en"), ["😂", "test"])
    eq_(tokenize("flip-flop", "en"), ["flip", "flop"])
def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']

    # Code with a region attached
    assert tokenize('谢谢谢谢', 'zh-CN') == tokens

    # Over-long codes for Chinese
    assert tokenize('谢谢谢谢', 'chi') == tokens
    assert tokenize('谢谢谢谢', 'zho') == tokens

    # Separate codes for Mandarin and Cantonese
    assert tokenize('谢谢谢谢', 'cmn') == tokens
    assert tokenize('谢谢谢谢', 'yue') == tokens
def read_freqs(filename, cutoff=0, lang=None):
    """
    Read words and their frequencies from a CSV file.

    Only words with a frequency greater than or equal to `cutoff` are
    returned.

    If `cutoff` is greater than 0, the csv file must be sorted by frequency
    in descending order.

    If lang is given, read_freqs will apply language-specific preprocessing
    operations.
    """
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            if val < cutoff:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                raw_counts[fix_text(token)] += val
                total += val

    for word in raw_counts:
        raw_counts[word] /= total

    return raw_counts
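# A minimal usage sketch for read_freqs() above, not part of the original
# module. The file name, demo function name, and row contents are
# hypothetical; the "word,value" CSV layout and descending sort order come
# from the docstring and the reader loop, and the module's own imports
# (csv, tokenize, fix_text) are assumed to be available.
def _demo_read_freqs():
    import csv  # re-imported here only to keep the sketch self-contained
    with open('freqs_demo.csv', 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows([('the', 1000.0), ('word', 50.0), ('rare', 0.5)])
    freqs = read_freqs('freqs_demo.csv', cutoff=1.0, lang='en')
    # 'rare' falls below the cutoff, so only 'the' and 'word' remain, with
    # their values normalized so they sum to 1.0.
    print(freqs)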
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.

    Only words with a value greater than or equal to `cutoff` are returned.

    If `cutoff` is greater than 0, the csv file must be sorted by value
    in descending order.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_size:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
def main():
    # Add all stopwords to a list
    stop_file = open(sys.argv[1], encoding="utf-8")
    stop_words = []
    for stop in stop_file:
        stop_words.append(stop.strip())
    stop_file.close()

    inp_file = ""
    # Check if the file argument points to a local path or to http(s)
    if (str(sys.argv[2]).startswith('http://') or
            str(sys.argv[2]).startswith('https://')):
        response = urllib.request.urlopen(sys.argv[2])
        inp_file = response.read().decode("utf8").splitlines()
    else:
        local_file = open(sys.argv[2], encoding="utf-8")
        inp_file = local_file.read().splitlines()
        local_file.close()

    # Split all words
    t_file = w.tokenize(inp_file)

    # Count words
    countDic = w.countWords(t_file, stop_words)

    # Print top N
    w.printTopMost(countDic, int(sys.argv[3]))
def cld2_surface_tokenizer(text, mode='twitter'):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.

    The `mode` can be 'twitter' or 'reddit', which slightly changes the
    pre-processing of the text.
    """
    text = unescape_html(text)
    if mode == 'twitter':
        text = TWITTER_HANDLE_RE.sub('', text)
        text = TCO_RE.sub('', text)
    elif mode == 'reddit':
        text = URL_RE.sub('', text)
        text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)

    lang = cld2_detect_language(text)

    # If the detected language isn't in our pretty generous list of languages,
    # return no tokens.
    if lang not in KEEP_THESE_LANGUAGES:
        return 'xx', []

    # cld2's accuracy seems to improve dramatically with at least 50
    # bytes of input, so throw away non-English below this length.
    if len(text.encode('utf-8')) < 50 and lang != 'en':
        return 'xx', []

    tokens = tokenize(text, lang)
    return lang, tokens
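# A hedged usage sketch for cld2_surface_tokenizer() above, not from the
# original codebase. The demo function name and sample text are made up, and
# the module-level regexes and constants the tokenizer relies on
# (TWITTER_HANDLE_RE, TCO_RE, URL_RE, MARKDOWN_URL_RESIDUE_RE,
# KEEP_THESE_LANGUAGES) are assumed to be defined as in that module.
def _demo_surface_tokenizer():
    lang, tokens = cld2_surface_tokenizer(
        "@someone just listened to the new album and loved every track on it",
        mode='twitter'
    )
    # With enough input, `lang` is a detected language code such as 'en' and
    # `tokens` its wordfreq tokens; short or unrecognized text comes back as
    # ('xx', []).
    print(lang, tokens)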
def read_values(filename, cutoff=0, max_words=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.

    Only words with a value greater than or equal to `cutoff` are returned.
    In addition, only up to `max_words` words are read.

    If `cutoff` is greater than 0 or `max_words` is smaller than the list,
    the csv file must be sorted by value in descending order, so that the
    most frequent words are kept.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_words:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
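# A minimal usage sketch for read_values() above, not part of the original
# module; the demo function name, file name, and rows are hypothetical. It
# shows the return shape described in the docstring: a dict of values plus
# their total.
def _demo_read_values():
    import csv  # re-imported here only to keep the sketch self-contained
    with open('values_demo.csv', 'w', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows([('the', 1000.0), ('word', 50.0), ('rare', 0.5)])
    values, total = read_values('values_demo.csv', cutoff=1.0, max_words=100, lang='en')
    # 'rare' is below the cutoff, so values holds 'the' and 'word' and
    # total is 1050.0 for this data.
    print(values['word'], total)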
def wordfreqs(text):
    freqs = []
    for tok in wordfreq.tokenize(text, 'en'):
        freq = wordfreq.zipf_frequency(tok, 'en')
        if freq != 0:
            freqs.append(freq)
    return np.array(freqs)
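# A small usage sketch for wordfreqs() above; the sample sentence is
# illustrative. It assumes numpy has been imported as np, as the return
# statement implies.
sample = wordfreqs("The quick brown fox jumps over the lazy dog")
print(sample.mean(), sample.min())  # average and lowest Zipf frequency found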
def simple_tokenize(text):
    """
    Tokenize text using the default wordfreq rules.

    It depends on 'wordfreq', a Python 3 library, so it can tokenize
    multilingual text consistently: https://pypi.org/project/wordfreq/
    """
    return wordfreq.tokenize(text, 'xx')
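# A quick usage sketch for simple_tokenize() above; the sentence is
# illustrative. The 'xx' code asks wordfreq for its language-neutral rules,
# so the result is a list of lowercased word tokens, e.g.
# ['conceptnet', 'is', 'a', 'multilingual', 'knowledge', 'graph'].
print(simple_tokenize("ConceptNet is a multilingual knowledge graph"))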
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = 'ひらがなカタカナromaji'
    eq_(
        tokenize(ja_text, 'en'),
        ['ひらがな', 'カタカナ', 'romaji']
    )

    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
        ['การเล่นดนตรี', 'means', 'playing', 'music'])
def standardized(term):
    """
    Breaks a term into underscore-separated words and replaces numbers with
    '#' signs.
    """
    tokens = wordfreq.tokenize(term.replace('_', ' '), 'xx')
    if tokens[0] == 'to':
        tokens = tokens[1:]
    return replace_numbers('_'.join(tokens))
def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [(uri_prefix(standardized_uri(language, token)), 1.)
                      for token in tokens]
    return self.get_vector(weighted_terms, oov_vector=False)
def test_ar():
    # Remove tatweels
    eq_(
        tokenize('متــــــــعب', 'ar'),
        ['متعب']
    )

    # Remove combining marks
    eq_(
        tokenize('حَرَكَات', 'ar'),
        ['حركات']
    )

    eq_(
        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
        ['\u0644\u0627']           # ...that is affected by NFKC normalization
    )
def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [
        (uri_prefix(standardized_uri(language, token)), 1.)
        for token in tokens
    ]
    return self.get_vector(weighted_terms, oov_vector=False)
def cld2_surface_tokenizer(text):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
    """
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    lang = cld2_detect_language(text)
    tokens = tokenize(text, lang)
    return lang, tokens
def main():
    f1 = open(sys.argv[1], encoding="utf-8")
    stops = []
    for line in f1:
        stops.append(line.strip())
    f1.close()

    text = check(sys.argv[2])
    tokenz = wordfreq.tokenize(text)
    freks = wordfreq.countWords(tokenz, stops)
    wordfreq.printTopMost(freks, int(sys.argv[3]))
def test_number_smashing():
    assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
    assert (
        lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
        == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
    )
    assert lossy_tokenize('1', 'en') == ['1']
    assert lossy_tokenize('3.14', 'en') == ['0.00']
    assert lossy_tokenize('24601', 'en') == ['00000']
    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
def build_wp_database(db, filename):
    db.execute("DROP TABLE IF EXISTS words")
    with db as _transaction:
        for statement in SCHEMA:
            db.execute(statement)

    with db as _transaction:
        num_lines = sum(1 for line in open(filename))
        for line in tqdm(open(filename), total=num_lines):
            title, text = line.split('\t', 1)
            words = wordfreq.tokenize(text.rstrip(), 'en')
            for word in words:
                add_entry(db, title, word)
def test_gender_neutral_at():
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la", "protección", "de", "los", "derechos", "de", "tod@s", "l@s",
        "trabajador@s", "migrantes"
    ]

    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos", "e", "deveres", "para", "@s", "membr@s", "da",
        "comunidade", "virtual"
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])

    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
        ['this', 'text', 'has', '...', 'punctuation', ':)'])

    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
    # is up to date
    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])

    eq_(tokenize("👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
        ['👨🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's", 'nothing',
         'i', 'can', 'do', '🌎', '🚀'])

    # Water wave, surfer, flag of California (indicates ridiculously complete support
    # for Unicode 10 and Emoji 5.0)
    eq_(tokenize("Surf's up 🌊🏄🏴'", 'en'),
        ["surf's", "up", "🌊", "🏄", "🏴"])
def test_punctuation_at():
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores", "de", "canal", "que", "são", "aqueles", "que", "têm",
        "um", "ao", "lado", "do", "nick"
    ]

    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores", "de", "canal", ",", "que", "são", "aqueles", "que",
        "têm", "um", "@", "ao", "lado", "do", "nick"
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un", "archivo", "hosts.deny", "que", "contiene", "la", "línea",
        "all:all", "all"
    ]

    # Make sure not to catch e-mail addresses
    text = "info@something.example"
    assert tokenize(text, "en") == ["info", "something.example"]
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (tokenize("Па, има ту много ствари које не схваташ.", 'sr') == [
        'pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'
    ])
    assert (tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') == [
        'pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'
    ])

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    eq_(
        tokenize(hobart, 'zh'),
        ['加', '勒', '特', '霍', '巴特']
    )

    eq_(
        tokenize(fact_simplified, 'zh'),
        [
            # he / is / in history / #6 / counter for people
            '他', '是', '历史上', '第六', '位',
            # during / term of office / in / die
            '在', '任期', '内', '去世',
            # of / U.S. / deputy / president
            '的', '美国', '副', '总统'
        ]
    )

    # Jieba's original tokenizer knows a lot of names, it seems.
    eq_(
        tokenize(hobart, 'zh', external_wordlist=True),
        ['加勒特', '霍巴特']
    )

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "in history" as two words and
    # "sixth person" as one.
    eq_(
        tokenize(fact_simplified, 'zh', external_wordlist=True),
        [
            # he / is / history / in / sixth person
            '他', '是', '历史', '上', '第六位',
            # during / term of office / in / die
            '在', '任期', '内', '去世',
            # of / U.S. / deputy / president
            '的', '美国', '副', '总统'
        ]
    )

    # You match the same tokens if you look it up in Traditional Chinese.
    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
def test_number_smashing():
    assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
    assert (lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True) == [
        '"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'
    ])
    assert lossy_tokenize('1', 'en') == ['1']
    assert lossy_tokenize('3.14', 'en') == ['0.00']
    assert lossy_tokenize('24601', 'en') == ['00000']
    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
def cld2_surface_tokenizer(text):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
    """
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    lang = cld2_detect_language(text)

    # Don't allow tokenization in Chinese when language-detecting, because
    # the Chinese tokenizer may not be built yet
    if lang == 'zh':
        lang = 'en'

    tokens = tokenize(text, lang)
    return lang, tokens
def cld2_reddit_tokenizer(text):
    """
    A language-detecting tokenizer with special cases for handling text from
    Reddit.
    """
    text = URL_RE.sub('', text)
    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)

    lang = cld2_detect_language(text)
    if lang not in KEEP_THESE_LANGUAGES:
        # Reddit is 99.9% English, so if we detected a rare language, it's
        # much more likely that it's actually English.
        lang = 'en'

    tokens = tokenize(text, lang, include_punctuation=True)
    return lang, tokens
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    assert (
        tokenize("I don't split at apostrophes, you see.", 'en')
        == ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
    )
    assert (
        tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
        == ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
    )

    # Certain punctuation does not inherently split a word.
    assert (
        tokenize("Anything is possible at zombo.com", 'en')
        == ['anything', 'is', 'possible', 'at', 'zombo.com']
    )

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    assert tokenize('😂test', 'en') == ['😂', 'test']
    assert tokenize("flip-flop", 'en') == ['flip', 'flop']
    assert (
        tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
        == ['this', 'text', 'has', '...', 'punctuation', ':)']
    )

    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
    # is up to date
    assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
    assert (
        tokenize("👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
        == ['👨🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's", 'nothing',
            'i', 'can', 'do', '🌎', '🚀']
    )

    # Water wave, surfer, flag of California (indicates ridiculously complete support
    # for Unicode 10 and Emoji 5.0)
    assert tokenize("Surf's up 🌊🏄🏴'", 'en') == ["surf's", "up", "🌊", "🏄", "🏴"]
def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
    assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == [
        'การเล่นดนตรี', 'means', 'playing', 'music'
    ]

    # Test Khmer, a script similar to Thai
    assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']

    # Remove vowel points in Hebrew
    assert tokenize('דֻּגְמָה', 'he') == ['דגמה']

    # Deal with commas, cedillas, and I's in Turkish
    assert tokenize('kișinin', 'tr') == ['kişinin']
    assert tokenize('KİȘİNİN', 'tr') == ['kişinin']

    # Deal with cedillas that should be commas-below in Romanian
    assert tokenize('acelaşi', 'ro') == ['același']
    assert tokenize('ACELAŞI', 'ro') == ['același']
def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']

    assert tokenize(fact_simplified, 'zh') == [
        # he / is / history / in / #6 / counter for people
        '他', '是', '历史', '上', '第六', '位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
    ]

    # Jieba's original tokenizer knows a lot of names, it seems.
    assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "in history" as two words and
    # "sixth person" as one.
    assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
        # he / is / history / in / sixth person
        '他', '是', '历史', '上', '第六位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
    ]

    # Check that Traditional Chinese works at all
    assert word_frequency(fact_traditional, 'zh') > 0

    # You get the same token lengths if you look it up in Traditional Chinese,
    # but the words are different
    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
    assert ''.join(simp_tokens) == fact_simplified
    assert ''.join(trad_tokens) == fact_traditional
    simp_lengths = [len(token) for token in simp_tokens]
    trad_lengths = [len(token) for token in trad_tokens]
    assert simp_lengths == trad_lengths
def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
        ['การเล่นดนตรี', 'means', 'playing', 'music'])

    # Test Khmer, a script similar to Thai
    eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])

    # Remove vowel points in Hebrew
    eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])

    # Deal with commas, cedillas, and I's in Turkish
    eq_(tokenize('kișinin', 'tr'), ['kişinin'])
    eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin'])

    # Deal with cedillas that should be commas-below in Romanian
    eq_(tokenize('acelaşi', 'ro'), ['același'])
    eq_(tokenize('ACELAŞI', 'ro'), ['același'])
import csv
import html
import sys

import wordfreq

if len(sys.argv) != 3:
    print('Usage: python3 sort.py target-lang pairs.csv')
    sys.exit(1)

targetLang = sys.argv[1]
pairsPath = sys.argv[2]

pairs = {}
with open(pairsPath, 'r', encoding='utf-8') as pairsFile:
    reader = csv.reader(pairsFile, delimiter='\t')
    for row in reader:
        words = wordfreq.tokenize(html.unescape(row[0]), targetLang)
        freqs = [wordfreq.zipf_frequency(word, targetLang, wordlist='combined')
                 for word in words]
        minfreq = min(freqs)
        avgfreq = sum(freqs) / float(len(freqs))
        pairs[row[0]] = (minfreq, avgfreq, row[1])

pairList = list(pairs.items())
pairList.sort(reverse=True, key=lambda i: i[1])

for pair in pairList:
    sys.stdout.buffer.write((pair[0] + '\t' + pair[1][2] + '\n').encode('utf-8'))
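# Example invocation of the script above (file names are hypothetical),
# assuming pairs.csv is tab-separated with the target-language text in the
# first column and its counterpart in the second:
#
#     python3 sort.py en pairs.csv > pairs_sorted.tsv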
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    #
    # More complex examples like this, involving the multiple scripts of
    # Japanese, are in test_japanese.py.
    assert tokenize('中国文字', 'en') == ['中国文字']
def test_alternate_codes():
    # Try over-long language codes for French and Catalan
    assert tokenize("qu'un", 'fra') == ['qu', 'un']
    assert tokenize("qu'un", 'fre') == ['qu', 'un']
    assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']
def test_casefolding():
    assert tokenize('WEISS', 'de') == ['weiss']
    assert tokenize('weiß', 'de') == ['weiss']
    assert tokenize('İstanbul', 'tr') == ['istanbul']
    assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']