def main():
    """Call translate_leeds_corpus once for each listed language code.

    For every code in the fixed tuple below, the code is printed and then
    translate_leeds_corpus receives, in order: the source-data path, the
    wordlist path, and the normalizer returned by
    make_rosette_normalizer for that code.
    """
    source_pattern = '../metanl/data/source-data/internet-%s-forms.num'
    wordlist_pattern = '../metanl/data/wordlists/leeds-internet-%s.txt'
    language_codes = ('pt', 'ru', 'es', 'fr', 'it', 'zh', 'de', 'ar')

    for language in language_codes:
        # A parenthesized single argument prints exactly like the
        # statement form.
        print(language)
        source_path = source_pattern % language
        wordlist_path = wordlist_pattern % language
        normalizer = make_rosette_normalizer(language)
        translate_leeds_corpus(source_path, wordlist_path, normalizer)
def main():
    """Print each language code and pass its Leeds files on for translation.

    The codes are taken from a fixed tuple. For each one, the input and
    output paths are filled in from their templates and handed to
    translate_leeds_corpus together with the result of
    make_rosette_normalizer(code).
    """
    for code in ('pt', 'ru', 'es', 'fr', 'it', 'zh', 'de', 'ar'):
        # Progress output: one language code per line.
        print(code)
        input_path, output_path = (
            '../metanl/data/source-data/internet-%s-forms.num' % code,
            '../metanl/data/wordlists/leeds-internet-%s.txt' % code,
        )
        translate_leeds_corpus(
            input_path, output_path, make_rosette_normalizer(code)
        )
twitter = get_wordlist('en-twitter') combined = merge_lists([(books, '', 1e9), (twitter, '', 1e9)]) combined.save('multi-en.txt') combined.save_logarithmic('multi-en-logarithmic.txt') total = sum(combined.worddict.values()) print "Average frequency:", total / len(combined.worddict) if __name__ == '__main__': merge_english() ########NEW FILE######## __FILENAME__ = reformat-leeds-ja from metanl import japanese from metanl.leeds_corpus_reader import translate_leeds_corpus translate_leeds_corpus('../metanl/data/source-data/internet-ja-forms.num', '../metanl/data/leeds-internet-ja.txt', japanese.normalize) ########NEW FILE######## __FILENAME__ = reformat_using_rosette from metanl.leeds_corpus_reader import translate_leeds_corpus import socket, time def make_rosette_normalizer(lcode): from lumi_pipeline.text_readers import get_reader reader = get_reader('rosette.%s' % lcode) def normalizer(text): try: triples = reader.text_to_token_triples(text) except socket.error: time.sleep(1) print 'backing off'
from metanl import japanese
from metanl.leeds_corpus_reader import translate_leeds_corpus

# Script: run translate_leeds_corpus on the Japanese Leeds file, using
# metanl's japanese.normalize as the normalizer argument.
_source_path = '../metanl/data/source-data/internet-ja-forms.num'
_output_path = '../metanl/data/leeds-internet-ja.txt'

translate_leeds_corpus(_source_path, _output_path, japanese.normalize)
from metanl import freeling
from metanl.leeds_corpus_reader import translate_leeds_corpus

# Script: for each language key in freeling.LANGUAGES (two exclusions
# below), print the key and call translate_leeds_corpus with that
# language's paths and its freeling normalize callable.
for language in freeling.LANGUAGES:
    # Skip Welsh (no data available) and English (better data exists).
    if language in ('cy', 'en'):
        continue

    print(language)
    translate_leeds_corpus(
        '../metanl/data/source-data/internet-%s-forms.num' % language,
        '../metanl/data/wordlists/leeds-internet-%s.txt' % language,
        freeling.LANGUAGES[language].normalize,
    )