Example #1
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
Example #2
def db_rank(clue):
    scores = defaultdict(float)
    for match, score in db_search(clue).items():
        scores[slugify(match)] += score * 1000
        parts = tokenize(match, 'en')
        for part in parts:
            scores[slugify(part)] += score * 1000 / len(parts)

    for word in tokenize(clue, 'en'):
        logprob_result = WORDS.segment_logprob(slugify(word))
        if logprob_result is not None:
            logprob, _ = logprob_result
        else:
            logprob = -1000.
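        # Rarer words have a more negative log probability, so -logprob gives
        # them a larger boost, capped at 25.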
        rare_boost = min(25., -logprob)
        for match, score in db_search(word).items():
            scores[slugify(match)] += rare_boost * score * 10
            parts = tokenize(match, 'en')
            for part in parts:
                scores[slugify(part)] += rare_boost * score * 10 / len(parts)

        query = query_expand(word)
        for match, score in db_search(query).items():
            scores[slugify(match)] += rare_boost * score
            parts = tokenize(match, 'en')
            for part in parts:
                scores[slugify(part)] += rare_boost * score / len(parts)

    return scores
Example #3
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (
        tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )
    assert (
        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
Example #4
def test_catastrophes():
    # More apostrophes, but this time they're in Catalan, and there's other
    # mid-word punctuation going on too.
    eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
        ['m', 'acabo', 'd', 'instal·lar'])
    eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
        ["m'", 'acabo', "d'", 'instal·lar', '.'])
Example #5
def test_gender_neutral_at():
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la", "protección", "de", "los", "derechos", "de", "tod@s", "l@s",
        "trabajador@s", "migrantes"
    ]

    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(
        text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos", "e", "deveres", "para", "@s", "membr@s", "da",
        "comunidade", "virtual"
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text,
                    "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
Example #6
def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified  = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    eq_(
        tokenize(hobart, 'zh'),
        ['加', '勒', '特', '霍', '巴特']
    )

    eq_(
        tokenize(fact_simplified, 'zh'),
        [
         # he / is / in history / #6 / counter for people
         '他', '是',  '历史上', '第六', '位',
         # during / term of office / in / die
         '在', '任期', '内', '去世',
         # of / U.S. / deputy / president
         '的', '美国', '副', '总统'
        ]
    )

    # You get the same tokens if you look it up in Traditional Chinese.
    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
Example #7
def main():
    arg1 = sys.argv[1]
    arg2 = sys.argv[2]
    urlInput = False

    if arg2.startswith("http://") or arg2.startswith("https://"):
        urlInput = True
    
    inp_file1 = open(arg1)
    if urlInput:
        response = urllib.request.urlopen(arg2)
        inp_file2 = response.read().decode("utf8").splitlines()
    else:
        inp_file2 = open(arg2)
    
    numPrints = int(sys.argv[3])

    tokenizedLines = wordfreq.tokenize(inp_file2)
    tokenizedStopWords = wordfreq.tokenize(inp_file1)

    inp_file1.close()
    if not urlInput:
        inp_file2.close()
        
    frequencies = wordfreq.countWords(tokenizedLines, tokenizedStopWords)
    wordfreq.printTopMost(frequencies, numPrints)
Example #8
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = 'ひらがなカタカナromaji'
    eq_(tokenize(ja_text, 'en'), ['ひらがな', 'カタカナ', 'romaji'])
Example #9
def test_ideographic_fallback():
    # Try tokenizing Chinese text -- it should remain stuck together.
    eq_(tokenize("中国文字", "zh"), ["中国文字"])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = "ひらがなカタカナromaji"
    eq_(tokenize(ja_text, "en"), ["ひらがな", "カタカナ", "romaji"])
Example #10
def test_catastrophes():
    # More apostrophes, but this time they're in Catalan, and there's other
    # mid-word punctuation going on too.
    assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
    assert (
        tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
        ["m'", 'acabo', "d'", 'instal·lar', '.']
    )
Example #11
def test_punctuation_at():
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores",
        "de",
        "canal",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "ao",
        "lado",
        "do",
        "nick"
    ]

    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores",
        "de",
        "canal",
        ",",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "@",
        "ao",
        "lado",
        "do",
        "nick"
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un",
        "archivo",
        "hosts.deny",
        "que",
        "contiene",
        "la",
        "línea",
        "all:all",
        "all"
    ]

    # Make sure not to catch e-mail addresses
    text = "*****@*****.**"
    assert tokenize(text, "en") == [
        "info",
        "something.example"
    ]
Example #12
def test_arabic():
    # Remove tatweels
    assert tokenize('متــــــــعب', 'ar') == ['متعب']

    # Remove combining marks
    assert tokenize('حَرَكَات', 'ar') == ['حركات']

    # An Arabic ligature that is affected by NFKC normalization
    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
Example #13
def test_arabic():
    # Remove tatweels
    assert tokenize('متــــــــعب', 'ar') == ['متعب']

    # Remove combining marks
    assert tokenize('حَرَكَات', 'ar') == ['حركات']

    # An Arabic ligature that is affected by NFKC normalization
    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
Example #14
def test_actually_russian():
    # This looks mostly like Serbian, but was probably actually Russian.
    # In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".

    assert tokenize("сто из ста, пацаны!", 'sr') == ['sto', 'iz', 'sta', 'pacany']
    assert tokenize("культуры", 'sr') == ["kul'tury"]
Example #15
def test_ar():
    # Remove tatweels
    eq_(tokenize("متــــــــعب", "ar"), ["متعب"])

    # Remove combining marks
    eq_(tokenize("حَرَكَات", "ar"), ["حركات"])

    eq_(
        tokenize("\ufefb", "ar"),  # An Arabic ligature...
        ["\u0644\u0627"]  # ...that is affected by NFKC normalization
    )
Example #16
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = 'ひらがなカタカナromaji'
    eq_(
        tokenize(ja_text, 'en'),
        ['ひらがな', 'カタカナ', 'romaji']
    )
Example #17
def test_arabic():
    # Remove tatweels
    eq_(tokenize('متــــــــعب', 'ar'), ['متعب'])

    # Remove combining marks
    eq_(tokenize('حَرَكَات', 'ar'), ['حركات'])

    eq_(
        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
        ['\u0644\u0627']  # ...that is affected by NFKC normalization
    )
Example #18
def test_actually_russian():
    # This looks mostly like Serbian, but was probably actually Russian.
    # In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".

    eq_(tokenize("сто из ста, пацаны!", 'sr'), ['sto', 'iz', 'sta', 'pacany'])

    eq_(tokenize("культуры", 'sr'), ["kul'tury"])
Example #19
def test_apostrophes():
    # Test that we handle apostrophes in French reasonably.
    assert tokenize("qu'un", 'fr') == ['qu', 'un']
    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
Example #20
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])

    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
Example #21
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])

    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
Example #22
def test_apostrophes():
    # Test that we handle apostrophes in French reasonably.
    assert tokenize("qu'un", 'fr') == ['qu', 'un']
    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
Example #23
def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']

    # Code with a region attached
    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)

    # Over-long codes for Chinese
    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
    eq_(tokenize('谢谢谢谢', 'zho'), tokens)

    # Separate codes for Mandarin and Cantonese
    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
    eq_(tokenize('谢谢谢谢', 'yue'), tokens)
Example #24
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    eq_(
        tokenize("I don't split at apostrophes, you see.", "en"),
        ["i", "don't", "split", "at", "apostrophes", "you", "see"],
    )

    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", "en"), ["anything", "is", "possible", "at", "zombo.com"])

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize("😂test", "en"), ["😂", "test"])

    eq_(tokenize("flip-flop", "en"), ["flip", "flop"])
Example #25
def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']

    # Code with a region attached
    assert tokenize('谢谢谢谢', 'zh-CN') == tokens

    # Over-long codes for Chinese
    assert tokenize('谢谢谢谢', 'chi') == tokens
    assert tokenize('谢谢谢谢', 'zho') == tokens

    # Separate codes for Mandarin and Cantonese
    assert tokenize('谢谢谢谢', 'cmn') == tokens
    assert tokenize('谢谢谢谢', 'yue') == tokens
Example #26
def read_freqs(filename, cutoff=0, lang=None):
    """
    Read words and their frequencies from a CSV file.

    Only words with a frequency greater than or equal to `cutoff` are returned.

    If `cutoff` is greater than 0, the csv file must be sorted by frequency
    in descending order.

    If lang is given, read_freqs will apply language specific preprocessing
    operations.
    """
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            if val < cutoff:
                break
            tokens = tokenize(
                key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                raw_counts[fix_text(token)] += val
                total += val

    for word in raw_counts:
        raw_counts[word] /= total

    return raw_counts
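
A minimal usage sketch for read_freqs as defined above; the filename and the CSV rows are hypothetical:

# words.csv, sorted by value in descending order:
#   the,23135851162
#   of,13151942776
#   zyzzyva,1021
freqs = read_freqs('words.csv', cutoff=10000, lang='en')
# 'zyzzyva' falls below the cutoff and is dropped; the values that remain are
# divided by their combined total, so they sum to 1.0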
Example #27
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.

    Only words with a value greater than or equal to `cutoff` are returned.

    If `cutoff` is greater than 0, the csv file must be sorted by value
    in descending order.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_size:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
Example #28
def main():
    #Add all stopwords to a List
    stop_file = open(sys.argv[1], encoding="utf-8")
    stop_words = []
    for stop in stop_file:
        stop_words.append(stop.strip())
    stop_file.close()

    inp_file = ""
    #Check if file points to local dir or http
    if (str(sys.argv[2]).startswith('http://')
            or str(sys.argv[2]).startswith('https://')):
        response = urllib.request.urlopen(sys.argv[2])
        inp_file = response.read().decode("utf8").splitlines()
    else:
        local_file = open(sys.argv[2], encoding="utf-8")
        inp_file = local_file.read().splitlines()
        local_file.close()

    #Split all words
    t_file = w.tokenize(inp_file)
    #Count words
    countDic = w.countWords(t_file, stop_words)
    #Print top N
    w.printTopMost(countDic, int(sys.argv[3]))
Example #29
def cld2_surface_tokenizer(text, mode='twitter'):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.

    The `mode` can be 'twitter' or 'reddit', which slightly changes the
    pre-processing of the text.
    """
    text = unescape_html(text)
    if mode == 'twitter':
        text = TWITTER_HANDLE_RE.sub('', text)
        text = TCO_RE.sub('', text)
    elif mode == 'reddit':
        text = URL_RE.sub('', text)
        text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)

    lang = cld2_detect_language(text)

    # If the detected language isn't in our pretty generous list of languages,
    # return no tokens.
    if lang not in KEEP_THESE_LANGUAGES:
        return 'xx', []

    # cld2's accuracy seems to improve dramatically with at least 50
    # bytes of input, so throw away non-English below this length.
    if len(text.encode('utf-8')) < 50 and lang != 'en':
        return 'xx', []

    tokens = tokenize(text, lang)
    return lang, tokens
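
A hedged usage sketch for the function above, assuming the module-level regexes, cld2_detect_language, and KEEP_THESE_LANGUAGES from the same file are available; the tweet text is invented:

lang, tokens = cld2_surface_tokenizer(
    "@someone the weather in Berlin has been great all week http://t.co/abc1234",
    mode='twitter'
)
# The handle and the t.co link are stripped before language detection. Text
# that detects as a language outside KEEP_THESE_LANGUAGES, or that is shorter
# than 50 bytes and not English, comes back as ('xx', []).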
Example #30
def read_values(filename, cutoff=0, max_words=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.

    Only words with a value greater than or equal to `cutoff` are returned.
    In addition, only up to `max_words` words are read.

    If `cutoff` is greater than 0 or `max_words` is smaller than the list,
    the csv file must be sorted by value in descending order, so that the
    most frequent words are kept.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_words:
                break
            tokens = tokenize(
                key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
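
A minimal usage sketch for read_values as defined above (the filename is hypothetical); the second return value makes it easy to turn raw counts into frequencies:

values, total = read_values('counts.csv', cutoff=5, max_words=1e6, lang='en')
frequencies = {word: val / total for word, val in values.items()}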
Example #31
def cld2_surface_tokenizer(text, mode='twitter'):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.

    The `mode` can be 'twitter' or 'reddit', which slightly changes the
    pre-processing of the text.
    """
    text = unescape_html(text)
    if mode == 'twitter':
        text = TWITTER_HANDLE_RE.sub('', text)
        text = TCO_RE.sub('', text)
    elif mode == 'reddit':
        text = URL_RE.sub('', text)
        text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)

    lang = cld2_detect_language(text)

    # If the detected language isn't in our pretty generous list of languages,
    # return no tokens.
    if lang not in KEEP_THESE_LANGUAGES:
        return 'xx', []

    # cld2's accuracy seems to improve dramatically with at least 50
    # bytes of input, so throw away non-English below this length.
    if len(text.encode('utf-8')) < 50 and lang != 'en':
        return 'xx', []

    tokens = tokenize(text, lang)
    return lang, tokens
Example #32
def wordfreqs(text):
    freqs = []
    for tok in wordfreq.tokenize(text, 'en'):
        freq = wordfreq.zipf_frequency(tok, 'en')
        if freq != 0:
            freqs.append(freq)
    return np.array(freqs)
Example #33
def simple_tokenize(text):
    """
    Tokenize text using the default wordfreq rules.
    It depends on 'wordfreq', a Python 3 library, so it can tokenize multilingual
    text consistently: https://pypi.org/project/wordfreq/
    """
    return wordfreq.tokenize(text, 'xx')
Example #34
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = 'ひらがなカタカナromaji'
    eq_(
        tokenize(ja_text, 'en'),
        ['ひらがな', 'カタカナ', 'romaji']
    )

    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
        ['การเล่นดนตรี', 'means', 'playing', 'music'])
Example #35
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = 'ひらがなカタカナromaji'
    eq_(
        tokenize(ja_text, 'en'),
        ['ひらがな', 'カタカナ', 'romaji']
    )

    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
        ['การเล่นดนตรี', 'means', 'playing', 'music'])
Example #36
def standardized(term):
    """
    Breaks into underscore-separated words and replaces numbers with '#' signs.
    """
    tokens = wordfreq.tokenize(term.replace('_', ' '), 'xx')
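    # Drop a leading 'to' (as in English infinitive phrases) before joining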
    if tokens[0] == 'to':
        tokens = tokens[1:]
    return replace_numbers('_'.join(tokens))
Example #37
def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [(uri_prefix(standardized_uri(language, token)), 1.)
                      for token in tokens]
    return self.get_vector(weighted_terms, oov_vector=False)
Example #38
def test_ar():
    # Remove tatweels
    eq_(
        tokenize('متــــــــعب', 'ar'),
        ['متعب']
    )

    # Remove combining marks
    eq_(
        tokenize('حَرَكَات', 'ar'),
        ['حركات']
    )

    eq_(
        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
        ['\u0644\u0627']  # ...that is affected by NFKC normalization
    )
Example #39
def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.
    """
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [
        (uri_prefix(standardized_uri(language, token)), 1.) for token in tokens
    ]
    return self.get_vector(weighted_terms, oov_vector=False)
Example #40
def cld2_surface_tokenizer(text):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
    """
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    lang = cld2_detect_language(text)
    tokens = tokenize(text, lang)
    return lang, tokens
Example #41
def main():
    f1 = open(sys.argv[1], encoding="utf-8")
    stops = []
    for line in f1:
        stops.append(line.strip())
    f1.close()
    text = check(sys.argv[2])
    tokenz = wordfreq.tokenize(text)
    freks = wordfreq.countWords(tokenz, stops)
    wordfreq.printTopMost(freks, int(sys.argv[3]))
Example #42
def cld2_surface_tokenizer(text):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
    """
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    lang = cld2_detect_language(text)
    tokens = tokenize(text, lang)
    return lang, tokens
Example #43
def test_number_smashing():
    assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
    assert (
        lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
        == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
    )
    assert lossy_tokenize('1', 'en') == ['1']
    assert lossy_tokenize('3.14', 'en') == ['0.00']
    assert lossy_tokenize('24601', 'en') == ['00000']
    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
Example #44
def build_wp_database(db, filename):
    db.execute("DROP TABLE IF EXISTS words")
    with db as _transaction:
        for statement in SCHEMA:
            db.execute(statement)

    with db as _transaction:
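        # Read the file once just to count lines for the progress bar,
        # then iterate over it again to add the entries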
        num_lines = sum(1 for line in open(filename))
        for line in tqdm(open(filename), total=num_lines):
            title, text = line.split('\t', 1)
            words = wordfreq.tokenize(text.rstrip(), 'en')
            for word in words:
                add_entry(db, title, word)
Example #45
def test_gender_neutral_at():
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la",
        "protección",
        "de",
        "los",
        "derechos",
        "de",
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes"
    ]

    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos",
        "e",
        "deveres",
        "para",
        "@s",
        "membr@s",
        "da",
        "comunidade",
        "virtual"
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
Example #46
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    eq_(
        tokenize("I don't split at apostrophes, you see.",
                 'en',
                 include_punctuation=True),
        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])

    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])

    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

    eq_(
        tokenize('this text has... punctuation :)',
                 'en',
                 include_punctuation=True),
        ['this', 'text', 'has', '...', 'punctuation', ':)'])

    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
    # is up to date
    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])

    eq_(
        tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀",
                 'en'), [
                     '👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
                     'nothing', 'i', 'can', 'do', '🌎', '🚀'
                 ])

    # Water wave, surfer, flag of California (indicates ridiculously complete support
    # for Unicode 10 and Emoji 5.0)
    eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'", 'en'),
        ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"])
Example #47
def test_punctuation_at():
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores", "de", "canal", "que", "são", "aqueles", "que", "têm",
        "um", "ao", "lado", "do", "nick"
    ]

    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores", "de", "canal", ",", "que", "são", "aqueles", "que",
        "têm", "um", "@", "ao", "lado", "do", "nick"
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un", "archivo", "hosts.deny", "que", "contiene", "la", "línea",
        "all:all", "all"
    ]

    # Make sure not to catch e-mail addresses
    text = "*****@*****.**"
    assert tokenize(text, "en") == ["info", "something.example"]
Example #48
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (tokenize("Па, има ту много ствари које не схваташ.", 'sr') == [
        'pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'
    ])
    assert (tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') == [
        'pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'
    ])

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
Example #49
def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified  = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    eq_(
        tokenize(hobart, 'zh'),
        ['加', '勒', '特', '霍', '巴特']
    )

    eq_(
        tokenize(fact_simplified, 'zh'),
        [
            # he / is / in history / #6 / counter for people
            '他', '是',  '历史上', '第六', '位',
            # during / term of office / in / die
            '在', '任期', '内', '去世',
            # of / U.S. / deputy / president
            '的', '美国', '副', '总统'
        ]
    )

    # Jieba's original tokenizer knows a lot of names, it seems.
    eq_(
        tokenize(hobart, 'zh', external_wordlist=True),
        ['加勒特', '霍巴特']
    )

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "in history" as two words and
    # "sixth person" as one.
    eq_(
        tokenize(fact_simplified, 'zh', external_wordlist=True),
        [
            # he / is / history / in / sixth person
            '他', '是', '历史', '上', '第六位',
            # during / term of office / in / die
            '在', '任期', '内', '去世',
            # of / U.S. / deputy / president
            '的', '美国', '副', '总统'
        ]
    )

    # You get the same tokens if you look it up in Traditional Chinese.
    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
Example #50
def test_number_smashing():
    assert tokenize('"715 - CRΣΣKS" by Bon Iver',
                    'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver',
                          'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
    assert (lossy_tokenize('"715 - CRΣΣKS" by Bon Iver',
                           'en',
                           include_punctuation=True) == [
                               '"', '000', '-', 'crσσks', '"', 'by', 'bon',
                               'iver'
                           ])
    assert lossy_tokenize('1', 'en') == ['1']
    assert lossy_tokenize('3.14', 'en') == ['0.00']
    assert lossy_tokenize('24601', 'en') == ['00000']
    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
Example #51
def cld2_surface_tokenizer(text):
    """
    Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
    """
    text = unescape_html(text)
    text = TWITTER_HANDLE_RE.sub('', text)
    text = TCO_RE.sub('', text)
    lang = cld2_detect_language(text)

    # Don't allow tokenization in Chinese when language-detecting, because
    # the Chinese tokenizer may not be built yet
    if lang == 'zh':
        lang = 'en'

    tokens = tokenize(text, lang)
    return lang, tokens
Example #52
def cld2_reddit_tokenizer(text):
    """
    A language-detecting tokenizer with special cases for handling text from
    Reddit.
    """
    text = URL_RE.sub('', text)
    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)

    lang = cld2_detect_language(text)
    if lang not in KEEP_THESE_LANGUAGES:
        # Reddit is 99.9% English, so if we detected a rare language, it's
        # much more likely that it's actually English.
        lang = 'en'

    tokens = tokenize(text, lang, include_punctuation=True)
    return lang, tokens
Example #53
def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    assert (
        tokenize("I don't split at apostrophes, you see.", 'en')
        == ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
    )

    assert (
        tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
        == ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
    )

    # Certain punctuation does not inherently split a word.
    assert (
        tokenize("Anything is possible at zombo.com", 'en')
        == ['anything', 'is', 'possible', 'at', 'zombo.com']
    )

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    assert tokenize('😂test', 'en') == ['😂', 'test']
    assert tokenize("flip-flop", 'en') == ['flip', 'flop']
    assert (
        tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
        == ['this', 'text', 'has', '...', 'punctuation', ':)']
    )

    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
    # is up to date
    assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
    assert (
        tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
        == ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
            'nothing', 'i', 'can', 'do', '🌎', '🚀']
    )

    # Water wave, surfer, flag of California (indicates ridiculously complete support
    # for Unicode 10 and Emoji 5.0)
    assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en') == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]
Example #54
def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
    assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']

    # Test Khmer, a script similar to Thai
    assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']

    # Remove vowel points in Hebrew
    assert tokenize('דֻּגְמָה', 'he') == ['דגמה']

    # Deal with commas, cedillas, and I's in Turkish
    assert tokenize('kișinin', 'tr') == ['kişinin']
    assert tokenize('KİȘİNİN', 'tr') == ['kişinin']

    # Deal with cedillas that should be commas-below in Romanian
    assert tokenize('acelaşi', 'ro') == ['același']
    assert tokenize('ACELAŞI', 'ro') == ['același']
Example #55
def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified  = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']

    assert tokenize(fact_simplified, 'zh') == [
        # he / is / history / in / #6 / counter for people
        '他', '是',  '历史', '上', '第六', '位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
    ]

    # Jieba's original tokenizer knows a lot of names, it seems.
    assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "in history" as two words and
    # "sixth person" as one.
    assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
        # he / is / history / in / sixth person
        '他', '是', '历史', '上', '第六位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
    ]

    # Check that Traditional Chinese works at all
    assert word_frequency(fact_traditional, 'zh') > 0

    # You get the same token lengths if you look it up in Traditional Chinese,
    # but the words are different
    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
    assert ''.join(simp_tokens) == fact_simplified
    assert ''.join(trad_tokens) == fact_traditional
    simp_lengths = [len(token) for token in simp_tokens]
    trad_lengths = [len(token) for token in trad_tokens]
    assert simp_lengths == trad_lengths
Example #56
def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
        ['การเล่นดนตรี', 'means', 'playing', 'music'])

    # Test Khmer, a script similar to Thai
    eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])

    # Remove vowel points in Hebrew
    eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])

    # Deal with commas, cedillas, and I's in Turkish
    eq_(tokenize('kișinin', 'tr'), ['kişinin'])
    eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin'])

    # Deal with cedillas that should be commas-below in Romanian
    eq_(tokenize('acelaşi', 'ro'), ['același'])
    eq_(tokenize('ACELAŞI', 'ro'), ['același'])
Example #57
import csv
import html
import sys
import wordfreq

if len(sys.argv) != 3:
    print('Usage: python3 sort.py target-lang pairs.csv')
    sys.exit(1)

targetLang = sys.argv[1]
pairsPath = sys.argv[2]

pairs = {}

with open(pairsPath, 'r', encoding='utf-8') as pairsFile:
    reader = csv.reader(pairsFile, delimiter='\t')
    for row in reader:
        words = wordfreq.tokenize(html.unescape(row[0]), targetLang)

        freqs = [wordfreq.zipf_frequency(word, targetLang, wordlist='combined')
                     for word in words]

        minfreq = min(freqs)
        avgfreq = sum(freqs) / float(len(freqs))
        pairs[row[0]] = (minfreq, avgfreq, row[1])

pairList = list(pairs.items())
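# Sort by (minimum word frequency, average frequency), highest first, so the
# pairs whose rarest word is still common come out on top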
pairList.sort(reverse = True, key=lambda i: i[1])

for pair in pairList:
    sys.stdout.buffer.write((pair[0] + '\t' + pair[1][2] + '\n').encode('utf-8'))
Example #58
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    #
    # More complex examples like this, involving the multiple scripts of Japanese,
    # are in test_japanese.py.
    assert tokenize('中国文字', 'en') == ['中国文字']
Example #59
def test_alternate_codes():
    # Try over-long language codes for French and Catalan
    assert tokenize("qu'un", 'fra') == ['qu', 'un']
    assert tokenize("qu'un", 'fre') == ['qu', 'un']
    assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']
Example #60
def test_casefolding():
    assert tokenize('WEISS', 'de') == ['weiss']
    assert tokenize('weiß', 'de') == ['weiss']
    assert tokenize('İstanbul', 'tr') == ['istanbul']
    assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']