Example #1
def generate_language_df(test_column, rows=50):
    import random

    import numpy as np
    import pandas as pd
    from nltk.corpus import udhr

    random.seed(55)
    sents_eng = udhr.sents('English-Latin1')
    sents_bg = udhr.sents('Bulgarian_Balgarski-UTF8')
    sents_ger = udhr.sents('German_Deutsch-Latin1')
    # 40% English, 40% Bulgarian, 20% German; the last count absorbs any
    # rounding remainder so the three always sum to `rows`.
    cnt_en = int(rows * 0.4)
    cnt_bg = int(rows * 0.4)
    cnt_de = rows - cnt_en - cnt_bg

    df = pd.DataFrame(np.random.randn(rows, 4),
                      columns=['A', 'B', 'C', test_column])
    sentences = []
    for i in range(cnt_en):
        sentences.append(' '.join(random.choice(sents_eng)))
    for i in range(cnt_bg):
        sentences.append(' '.join(random.choice(sents_bg)))
    for i in range(cnt_de):
        sentences.append(' '.join(random.choice(sents_ger)))
    df[test_column] = sentences
    return df
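A minimal usage sketch (the column name 'text' is an arbitrary choice here; it assumes the UDHR corpus has been fetched once with nltk.download('udhr')):

df = generate_language_df('text', rows=50)
print(df['text'].head())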
Example #2
def demo():
    from nltk.classify.textcat import TextCat
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]

    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from the UDHR corpus
        raw_sentences = udhr.sents(cur_lang)

        # Generate a sample text of the language from every sentence
        sample = " ".join(word for sent in raw_sentences for word in sent)

        # Try to detect what it is
        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        print(f"Language detection: {guess} ({friendly[guess]})")
        print("#" * 140)
Example #3
def demo():
    from nltk.classify.textcat import TextCat
    from nltk.corpus import udhr

    langs = [
        'Kurdish-UTF8',
        'Abkhaz-UTF8',
        'Farsi_Persian-UTF8',
        'Hindi-UTF8',
        'Hawaiian-UTF8',
        'Russian-UTF8',
        'Vietnamese-UTF8',
        'Serbian_Srpski-UTF8',
        'Esperanto-UTF8',
    ]

    friendly = {
        'kmr': 'Northern Kurdish',
        'abk': 'Abkhazian',
        'pes': 'Iranian Persian',
        'hin': 'Hindi',
        'haw': 'Hawaiian',
        'rus': 'Russian',
        'vie': 'Vietnamese',
        'srp': 'Serbian',
        'epo': 'Esperanto',
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from the UDHR corpus
        raw_sentences = udhr.sents(cur_lang)

        # Generate a sample text of the language from every sentence
        sample = ' '.join(word for sent in raw_sentences for word in sent)

        # Try to detect what it is
        print('Language snippet: ' + sample[0:140] + '...')
        guess = tc.guess_language(sample)
        print('Language detection: %s (%s)' % (guess, friendly.get(guess, 'unknown')))
        print('#' * 140)
Example #4
import random

from nltk.corpus import udhr


def runLeaveOutSentTrial(language):
    # filterWords, makeTrigrams, makeTrigramGrammars, predictSentLanguage
    # and LANGUAGES are defined elsewhere in this snippet's source module.
    sents = udhr.sents(language)
    sent_idx = random.randint(0, len(sents) - 1)
    test_sent = list(filterWords(sents[sent_idx]))
    train_set = []
    for i in range(len(sents)):
        if i == sent_idx:
            continue
        train_set += filterWords(sents[i])

    trigrams = [(language, makeTrigrams(train_set))]
    for lang in LANGUAGES:
        if lang == language:
            continue
        trigrams.append((lang, makeTrigrams(udhr.words(lang))))

    grammars = makeTrigramGrammars(trigrams)

    return test_sent, predictSentLanguage(test_sent, grammars)
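makeTrigrams is not shown in this snippet. A hypothetical reconstruction, assuming it yields word-level trigrams via nltk.util.ngrams (the repo's real helper may differ):

from nltk.util import ngrams


def makeTrigrams(words):
    # Hypothetical helper: word-level trigrams over a token stream.
    return list(ngrams(words, 3))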
Example #5
import random

from nltk.corpus import udhr


def runLeaveOutSentTrial(language):
    # NGramClassifier, N and LANGUAGES are defined elsewhere in this
    # snippet's source module.
    sents = udhr.sents(language)
    sent_idx = random.randint(0, len(sents) - 1)
    test_sent = sents[sent_idx]
    train_set = []
    for i in range(len(sents)):
        if i == sent_idx:
            continue
        train_set += sents[i]

    ngrams = [(language, train_set)]
    for lang in LANGUAGES:
        if lang == language:
            continue
        ngrams.append((lang, udhr.words(lang)))

    classifier = NGramClassifier(N, ngrams)

    return test_sent, classifier.classifySent(test_sent)
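NGramClassifier is likewise defined outside this snippet. A self-contained sketch of one plausible design, character n-gram frequency profiles scored by overlap; the class shape matches the call above, but the internals are assumptions:

from collections import Counter

from nltk.util import ngrams


class NGramClassifier:
    def __init__(self, n, lang_tokens):
        # Build one character n-gram frequency profile per language.
        self.n = n
        self.profiles = {lang: Counter(ngrams(' '.join(tokens), n))
                         for lang, tokens in lang_tokens}

    def classifySent(self, sent):
        # Pick the language whose profile covers the most of the
        # sentence's n-gram mass.
        test = Counter(ngrams(' '.join(sent), self.n))
        return max(self.profiles,
                   key=lambda lang: sum(min(cnt, self.profiles[lang][g])
                                        for g, cnt in test.items()))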
Example #6
import nltk
from nltk.corpus import udhr, gutenberg

languages = [
    'English-Latin1', 'German_Deutsch-Latin1',
    'Greenlandic_Inuktikut-Latin1', 'Hungarian_Magyar-Latin1',
    'Ibibio_Efik-Latin1'
]  # , 'Chinese_Mandarin-UTF8']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word)) for lang in languages for word in udhr.words(lang))
cfd.plot(cumulative=True)
cfd.tabulate(samples=range(10), cumulative=True)
cfd.tabulate(conditions=['English-Latin1', 'German_Deutsch-Latin1'],
             samples=range(10),
             cumulative=True)
# Chinese is written in characters, so it cannot be read in word by word
chinese_mandarin_raw = udhr.raw('Chinese_Mandarin-UTF8')
print(chinese_mandarin_raw)
chinese_mandarin_words = udhr.words('Chinese_Mandarin-UTF8')
print(chinese_mandarin_words)
chinese_mandarin_sents = udhr.sents('Chinese_Mandarin-UTF8')
print(chinese_mandarin_sents)


def generate_model(cfdist, word, num=15):
    # Repeatedly emit the current word, then move to its most likely successor.
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()
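generate_model is never called in this snippet; the NLTK book exercises it with a bigram conditional frequency distribution (requires nltk.download('genesis')):

text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
generate_model(cfd, 'living')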


# 1.8. The structure of text corpora
raw = gutenberg.raw('burgess-busterbrown.txt')
print(raw[1:20])
words = gutenberg.words('burgess-busterbrown.txt')
print(words[1:20])
sents = gutenberg.sents('burgess-busterbrown.txt')
Example #7
#!/usr/bin/python3

from nltk.corpus import udhr
from nltk.corpus import swadesh

text = udhr.sents('Spanish-Latin1')
es = swadesh.words('es')
spanish_to_english = swadesh.entries(['es', 'en'])
trans = dict(spanish_to_english)

for sentence in text:
    for word in sentence:
        # Swadesh entries are lower-case; normalise before the lookup.
        key = word.lower()
        if key in es:
            print(trans[key], end=' ')
        else:
            print("UNK", end=' ')
    print('')
Example #8
import nltk
from nltk.corpus import udhr as u
# The full text of the declaration in Ibibio-Efik
print(u.raw('Ibibio_Efik-Latin1'))

# The length (in words) of the text in Amahuaca and in Greenlandic, and which one is longer
word_lenA = len(u.words('Amahuaca'))
word_lenG = len(u.words('Greenlandic_Inuktikut-Latin1'))
print('\nThe Amahuaca text has %s words and the Greenlandic text has %s words.' %
      (word_lenA, word_lenG))
if word_lenA > word_lenG:
    print('The Amahuaca text is longer.')
else:
    print('The Greenlandic text is longer.')

# The first sentence of the text in Turkish
sentences = u.sents('Turkish_Turkce-Turkish')
sentence1 = ' '.join(sentences[0])
print('\n', sentence1)
Example #9
import json

import pandas as pd
from nltk.corpus import udhr

df = pd.read_csv('language_speakers', index_col='language')

def exportToJSON(langName, passage):
    # Write the passage's sentences (token lists) to a JSON file.
    sents = list(passage)
    with open("passages/" + langName + ".json", 'w') as f:
        f.write(json.dumps(sents))


for lang in udhr.fileids():
    langName = ' '.join(lang.split('-')[:-1])
    try:
        print(' '.join(udhr.sents(lang)[0])[:50] + '...', langName)
        if langName in df.index and df.loc[langName].get('speakers_native(m)') > 1:
            exportToJSON(langName, udhr.sents(lang))
    except AssertionError:
        print('could not print... ', lang)