示例#1
0
def main():
    """Run ``translate_leeds_corpus`` once per Rosette-handled language.

    For every language code in the fixed tuple below, the code is printed
    and ``translate_leeds_corpus`` is called with the language's
    ``source-data/internet-<code>-forms.num`` path, its
    ``wordlists/leeds-internet-<code>.txt`` path, and the normalizer
    returned by ``make_rosette_normalizer`` for that code.
    """
    for language in ('pt', 'ru', 'es', 'fr', 'it', 'zh', 'de', 'ar'):
        # The parenthesized single-argument form prints the same text under
        # the Python 2 print statement and the Python 3 print function,
        # whereas the bare `print language` is a SyntaxError on Python 3.
        print(language)
        translate_leeds_corpus(
            '../metanl/data/source-data/internet-%s-forms.num' % language,
            '../metanl/data/wordlists/leeds-internet-%s.txt' % language,
            make_rosette_normalizer(language))
示例#2
0
def main():
    """Run ``translate_leeds_corpus`` for each language in a fixed list.

    Each iteration prints the language code, formats the two per-language
    paths (``source-data/internet-<code>-forms.num`` and
    ``wordlists/leeds-internet-<code>.txt``), builds a normalizer with
    ``make_rosette_normalizer`` and hands all three to
    ``translate_leeds_corpus``.
    """
    languages = ('pt', 'ru', 'es', 'fr', 'it', 'zh', 'de', 'ar')
    for language in languages:
        # `print(x)` with one argument behaves identically on Python 2 and
        # Python 3; the statement form `print x` does not parse on Python 3.
        print(language)
        source_path = (
            '../metanl/data/source-data/internet-%s-forms.num' % language
        )
        wordlist_path = (
            '../metanl/data/wordlists/leeds-internet-%s.txt' % language
        )
        translate_leeds_corpus(
            source_path,
            wordlist_path,
            make_rosette_normalizer(language)
        )
示例#3
0
    twitter = get_wordlist('en-twitter')
    combined = merge_lists([(books, '', 1e9), (twitter, '', 1e9)])
    combined.save('multi-en.txt')
    combined.save_logarithmic('multi-en-logarithmic.txt')
    total = sum(combined.worddict.values())
    print "Average frequency:", total / len(combined.worddict)

# Run the merge only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    merge_english()

########NEW FILE########
__FILENAME__ = reformat-leeds-ja
from metanl import japanese
from metanl.leeds_corpus_reader import translate_leeds_corpus

# Run the Leeds corpus translation for Japanese, passing metanl's
# japanese.normalize as the third argument.
# NOTE(review): the other Leeds reformat scripts pass a path under
# data/wordlists/ as the second argument; confirm that a path directly under
# data/ is intended here.
translate_leeds_corpus('../metanl/data/source-data/internet-ja-forms.num',
    '../metanl/data/leeds-internet-ja.txt', japanese.normalize)

########NEW FILE########
__FILENAME__ = reformat_using_rosette
from metanl.leeds_corpus_reader import translate_leeds_corpus
import socket, time

def make_rosette_normalizer(lcode):
    from lumi_pipeline.text_readers import get_reader
    reader = get_reader('rosette.%s' % lcode)
    def normalizer(text):
        try:
            triples = reader.text_to_token_triples(text)
        except socket.error:
            time.sleep(1)
            print 'backing off'
示例#4
0
from metanl import japanese
from metanl.leeds_corpus_reader import translate_leeds_corpus

# Run the Leeds corpus translation for Japanese, passing metanl's
# japanese.normalize as the third argument.
# NOTE(review): the other Leeds reformat scripts pass a path under
# data/wordlists/ as the second argument; confirm that a path directly under
# data/ is intended here.
translate_leeds_corpus('../metanl/data/source-data/internet-ja-forms.num',
                       '../metanl/data/leeds-internet-ja.txt',
                       japanese.normalize)
from metanl import freeling
from metanl.leeds_corpus_reader import translate_leeds_corpus

# Call translate_leeds_corpus for every language in freeling.LANGUAGES except
# the two skipped below, using that language's own `normalize` attribute.
for language in freeling.LANGUAGES:
    if language in ('cy', 'en'):
        # we don't have data for Welsh, and we have better data for English
        continue
    # Single-argument parenthesized print gives the same output on Python 2
    # and Python 3; the bare `print language` form fails to parse on Python 3.
    print(language)
    translate_leeds_corpus(
        '../metanl/data/source-data/internet-%s-forms.num' % language,
        '../metanl/data/wordlists/leeds-internet-%s.txt' % language,
        freeling.LANGUAGES[language].normalize
    )