示例#1
0
# Natural Language Toolkit: Europarl Corpus Readers
#
# Copyright (C) 2001-2009 NLTK Project
# Author:  Nitin Madnani <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

import re
from util import LazyCorpusLoader
from reader import *

# One module-level loader per European language.  All of them are built the
# same way: a corpus path under 'europarl_raw/', the EuroparlCorpusReader
# class, a pattern ending in the language's two-letter code (presumably
# selecting that language's files -- confirm against LazyCorpusLoader), and
# UTF-8 as the encoding.

# Danish: pattern ends in '.da'.
danish = LazyCorpusLoader(
    'europarl_raw/danish', EuroparlCorpusReader, r'ep-.*\.da',
    encoding='utf-8')

# Dutch: pattern ends in '.nl'.
dutch = LazyCorpusLoader(
    'europarl_raw/dutch', EuroparlCorpusReader, r'ep-.*\.nl',
    encoding='utf-8')

# English: pattern ends in '.en'.
english = LazyCorpusLoader(
    'europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en',
    encoding='utf-8')

# Finnish: pattern ends in '.fi'.
finnish = LazyCorpusLoader(
    'europarl_raw/finnish', EuroparlCorpusReader, r'ep-.*\.fi',
    encoding='utf-8')
示例#2
0
stored using U{Open Language Archives Community (OLAC)
<http://www.language-archives.org/>} metadata records.  These records
can be accessed using C{nltk.corpus.I{corpus}.olac()}.
"""

import re

from nltk.tokenize import RegexpTokenizer
from nltk.tag import simplify_brown_tag, simplify_wsj_tag,\
                     simplify_alpino_tag, simplify_indian_tag,\
                     simplify_tag

from util import LazyCorpusLoader
from reader import *

# 'abc': read with PlaintextCorpusReader.  The leading (?!\.) lookahead keeps
# the pattern from matching a name that starts with a dot (when the pattern
# is applied from the start of the name).
abc = LazyCorpusLoader(
    'abc', PlaintextCorpusReader, r'(?!\.).*\.txt')

# 'alpino': read with AlpinoCorpusReader; no pattern is passed here, and
# simplify_alpino_tag is supplied as the tag mapping function.
alpino = LazyCorpusLoader(
    'alpino', AlpinoCorpusReader,
    tag_mapping_function=simplify_alpino_tag)

# 'brown': categorized and tagged.  The pattern is 'c', one lowercase letter,
# then two digits; 'cats.txt' is passed as the category file and
# simplify_brown_tag as the tag mapping function.
brown = LazyCorpusLoader(
    'brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d',
    cat_file='cats.txt',
    tag_mapping_function=simplify_brown_tag)

# 'cess_cat': read with BracketParseCorpusReader from names ending in '.tbf'
# that do not start with a dot; simplify_tag is the tag mapping function.
cess_cat = LazyCorpusLoader(
    'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag)
cess_esp = LazyCorpusLoader('cess_esp',
                            BracketParseCorpusReader,
                            r'(?!\.).*\.tbf',