def raw(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ppattach", file)
        for line in open(path).readlines():
            yield tuple(line.split())

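# A minimal usage sketch for the reader above (assumes the ppattach
# corpus is installed under get_basedir(); each yielded tuple is one
# whitespace-separated record from the raw data file):
from itertools import islice

for record in islice(raw(), 3):
    print record
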
def raw(files=('female', 'male')):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "names", file + ".txt")
        for word in open(path).readlines():
            yield word.strip()

def raw(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "words", file)
        for word in open(path).readlines():
            yield word.strip()

def chunked(files=items, chunk_types=('NP',)):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield tree.conll_chunk(sent, chunk_types)

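# Usage sketch: chunked() yields one chunk tree per blank-line-separated
# sentence, reading the module's default items (assumes the conll2000
# corpus is installed under get_basedir()):
from itertools import islice

for chunk_tree in islice(chunked(), 2):
    print chunk_tree
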
def raw(files='english'):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "stopwords", file)
        for word in open(path).readlines():
            yield word.strip()

def tagged(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]

def bracket_parse(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for sent in data:
            yield tree.bracket_parse(sent)

def _read(files, conversion_function):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "brown", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            yield conversion_function(sent)

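# Sketch of how a public reader might wrap the private _read() above.
# The conversion function here is a stand-in for illustration, not part
# of the original module, and 'ca01' is an assumed Brown file name:
from itertools import islice

def _whitespace_tokens(sent):
    return list(tokenize.whitespace(sent))

for sent in islice(_read('ca01', _whitespace_tokens), 2):
    print sent
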
def raw(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ieer", file)
        for doc in open(path).read().split('</DOC>'):
            doc = doc.split('<DOC>')
            if len(doc) == 2:
                yield "<DOC>" + doc[1] + "</DOC>\n"

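# Usage sketch: raw() re-wraps each <DOC>...</DOC> block as a single
# string (assumes the ieer corpus is installed under get_basedir()):
from itertools import islice

for doc in islice(raw(), 1):
    print doc[:75]
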
def demo():
    path = os.path.join(get_basedir(), "shoebox", "rotokas.dic")
    l = Lexicon(path)
    l.parse(key_fields=['lx', 'ps', 'sn'], unique_entry=False)
    h = l.get_header()
    for e in l.get_entries():
        print "<%s><%s><%s>" % (e.get_field_as_string("lx", ""),
                                e.get_field_as_string("ps", ""),
                                e.get_field_as_string("sn", ""))

def raw(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "inaugural", file + ".txt")
        text = open(path).read()
        for t in tokenize.wordpunct(text):
            yield t

def raw(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "state_union", file + ".txt")
        text = open(path).read()
        for t in tokenize.wordpunct(text):
            yield t

def raw(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "gutenberg", file + ".txt")
        f = open(path)
        preamble = True
        for line in f.readlines():
            if not preamble:
                for t in tokenize.wordpunct(line):
                    yield t
            if line[:5] == '*END*':
                preamble = False

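# Usage sketch: tokens are only yielded after the line beginning with
# '*END*' that closes the Project Gutenberg preamble, so the first few
# tokens below come from the text proper (assumes the gutenberg corpus
# is installed under get_basedir()):
from itertools import islice

print list(islice(raw(), 10))
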
def raw(files='raw'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "sinica_treebank", file)
        for line in open(path).readlines():
            yield line.split()[1:]

def parsed(files='parsed'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "sinica_treebank", file)
        for sent in open(path).readlines():
            yield tree.bracket_parse(sent)

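# Usage sketch: parsed() yields one bracketed parse tree per line of the
# 'parsed' file (assumes the sinica_treebank corpus is installed under
# get_basedir()):
from itertools import islice

for t in islice(parsed(), 1):
    print t
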
def raw(files=items): """ @param files: One or more Senseval files to be processed @type files: L{string} or L{tuple(string)} @rtype: iterator over L{tuple} """ if type(files) is str: files = (files, ) parser = SensevalParser() for file in files: path = os.path.join(get_basedir(), "senseval", file + ".pos") f = open(path).read() for entry in parser.parse(f): yield entry
def raw(files = items): """ @param files: One or more Senseval files to be processed @type files: L{string} or L{tuple(string)} @rtype: iterator over L{tuple} """ if type(files) is str: files = (files,) parser = SensevalParser() for file in files: path = os.path.join(get_basedir(), "senseval", file+".pos") f = open(path).read() for entry in parser.parse(f): yield entry
def raw(files='english-kjv'):
    """
    @param files: One or more genesis files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "genesis", file + ".txt")
        s = open(path).read()
        for t in tokenize.whitespace(s):
            yield t

def raw(files='cmudict'):
    """
    @param files: One or more cmudict files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "cmudict", file)
        for line in open(path).readlines():
            fields = line.strip().split(' ')
            yield (fields[0], int(fields[1]), tuple(fields[2:]))

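# Usage sketch: each record is (word, variant number, phoneme tuple),
# matching the yield above (assumes the cmudict file is installed under
# get_basedir()):
from itertools import islice

for word, variant, phones in islice(raw(), 3):
    print word, variant, ' '.join(phones)
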
def _read(files, conversion_function):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/pos", file)
        f = open(path).read()
        rx_pattern = re.compile(r"""
                <.*>_CODE
                |\s.*_ID
        """, re.VERBOSE | re.UNICODE)
        mySents = tokenize.blankline(f)
        for sent in mySents:
            sent = re.sub(rx_pattern, '', sent)
            if sent != "":
                yield conversion_function(sent, sep="_")

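# Sketch of the cleanup that _read() performs before conversion: the
# verbose regex strips <...>_CODE and trailing _ID material.  The sample
# line below is invented for illustration:
import re

rx = re.compile(r"<.*>_CODE|\s.*_ID", re.UNICODE)
print re.sub(rx, '', "<T02070002200,15.28>_CODE +Da_CONJ se_D^N")
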
def chunked(files='chunked'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        s = open(path).read()
        for t in tokenize.blankline(s):
            yield tree.chunk(t)

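# Usage sketch: each blank-line-separated block in a 'chunked' treebank
# file becomes one chunk tree (assumes the treebank corpus is installed
# under get_basedir()):
from itertools import islice

for t in islice(chunked(), 1):
    print t
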
def raw(files='raw'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            yield list(tokenize.whitespace(sent))

def parse_corpus(file_name, key=None, **kwargs):
    """
    Return an element tree resulting from parsing the toolbox datafile.

    A convenience function that creates a C{ToolboxData} object, opens and
    parses the toolbox data file.  The data file is assumed to be in the
    toolbox subdirectory of the directory where NLTK looks for corpora,
    see L{corpora.get_basedir()}.

    @param file_name: Name of file in toolbox corpus directory
    @type file_name: string
    @param key: marker at the start of each record
    @type key: string
    @param kwargs: Keyword arguments passed to L{ToolboxData.parse()}
    @type kwargs: keyword arguments dictionary
    @rtype: ElementTree._ElementInterface
    @return: contents of toolbox data divided into header and records
    """
    db = ToolboxData()
    db.open(os.path.join(get_basedir(), 'toolbox', file_name))
    return db.parse(key, **kwargs)

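# Usage sketch (the file name and the 'record' element tag are assumed;
# per the docstring, the parsed tree is divided into a header and records):
tree = parse_corpus('rotokas.dic', key='lx')
for record in tree.findall('record')[:3]:
    for field in record:
        print field.tag, field.text
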
def raw(files='rotokas.dic', include_header=False, head_field_marker=None):
    """
    Deprecated: use C{StandardFormat.fields()}

    @param files: One or more toolbox files to be processed
    @type files: L{string} or L{tuple(string)}
    @param include_header: flag that determines whether to treat header as
        record (default is no)
    @type include_header: boolean
    @param head_field_marker: option for explicitly setting which marker to
        use as the head field when parsing the file (default is automatically
        determining it from the first field of the first record)
    @type head_field_marker: string
    @rtype: iterator over L{list(string)}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "toolbox", file)
        fc = open(path, "U").read()
        if fc.strip().startswith(r"\_"):
            (header, body) = split(fc, sep="\n\n", maxsplit=1)
            if include_header:
                yield list(_parse_record(header))
        else:
            body = fc
        # Deal with head field marker
        if head_field_marker:
            hfm_with_backslash = "\\" + head_field_marker
        else:
            ff = split(body, sep="\n", maxsplit=1)[0]               # first field
            hfm_with_backslash = split(ff, sep=" ", maxsplit=1)[0]  # raw marker of first field
        recordsep = "\n\n" + hfm_with_backslash  # separates records from one another
        # Parse records
        for r in split("\n\n" + body, sep=recordsep)[1:]:
            yield list(_parse_record(hfm_with_backslash + r))

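# Usage sketch for the deprecated reader above: each yielded item is the
# parsed field list for one record, as produced by _parse_record() (not
# shown here).  Assumes rotokas.dic is installed under the toolbox
# corpus directory:
from itertools import islice

for record in islice(raw('rotokas.dic'), 2):
    print record
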
def raw(files="rotokas.dic", include_header=False, head_field_marker=None): """ Deprecated: use C{StandardFormat.fields()} @param files: One or more toolbox files to be processed @type files: L{string} or L{tuple(string)} @param include_header: flag that determines whether to treat header as record (default is no) @type include_header: boolean @param head_field_marker: option for explicitly setting which marker to use as the head field when parsing the file (default is automatically determining it from the first field of the first record) @type head_field_marker: string @rtype: iterator over L{list(string)} """ # Just one file to process? If so convert to a tuple so we can iterate if type(files) is str: files = (files,) for file in files: path = os.path.join(get_basedir(), "toolbox", file) fc = open(path, "U").read() if fc.strip().startswith(r"\_"): (header, body) = split(fc, sep="\n\n", maxsplit=1) if include_header: yield list(_parse_record(header)) else: body = fc # Deal with head field marker if head_field_marker: hfm_with_backslash = "\\" + hfm else: ff = split(body, sep="\n", maxsplit=1)[0] # first field hfm_with_backslash = split(ff, sep=" ", maxsplit=1)[0] # raw marker of first field recordsep = "\n\n" + hfm_with_backslash # separates records from one another # Parse records for r in split("\n\n" + body, sep=recordsep)[1:]: yield list(_parse_record(hfm_with_backslash + r))
def _chunk_parse(files, chunk_types, top_node, partial_match,
                 collapse_partials, cascade):
    # allow any kind of bracketing for flexibility
    L_BRACKET = re.compile(r'[\(\[\{<]')
    R_BRACKET = re.compile(r'[\)\]\}>]')
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for s in data:
            bracket = 0
            itmType = None
            stack = [tree.Tree(top_node, [])]
            inTag = []
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    if partial_match == True:
                        for eachItm in chunk_types:
                            if (len(eachItm) <= len(itm) and
                                    eachItm == itm[:len(eachItm)]):
                                matched = True
                                if collapse_partials == True:
                                    itm = eachItm
                    else:
                        if chunk_types is not None and itm in chunk_types:
                            matched = True
                    if matched == True:  # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                if R_BRACKET.match(itm[-1]):
                    tmpItm = split(itm, itm[-1])
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket:
                            if cascade == True:
                                stack[-1].append((itmType, tmpItm[0]))
                            else:
                                stack[-1][-1].append((itmType, tmpItm[0]))
                        else:
                            if cascade == True:
                                if len(stack) > 1:
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append((itmType, tmpItm[0]))
                            inTag = [] + inTag[:-2]
                    bracket -= (len(tmpItm) - 1)
                    while len(inTag) > 0 and bracket < inTag[-1]:
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack

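# Sketch of a direct call to the private parser above (argument values
# are illustrative; the module's public readers are the normal entry
# points).  Each yielded item is the stack holding the top-level tree:
from itertools import islice

for stack in islice(_chunk_parse(items[:1], ('NP',), 'S',
                                 False, False, False), 1):
    print stack[0]
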
import sys, os, re
import ossaudiodev
import time
from en.parser.nltk_lite.corpora import get_basedir

if sys.platform.startswith('linux') or sys.platform.startswith('freebsd'):
    PLAY_ENABLED = True
else:
    PLAY_ENABLED = False

__all__ = ["items", "raw", "phonetic", "speakers", "dictionary",
           "spkrinfo", "audiodata", "play"]

PREFIX = os.path.join(get_basedir(), "timit")

speakers = []
items = []
dictionary = {}
spkrinfo = {}

for f in os.listdir(PREFIX):
    if re.match("^dr[0-9]-[a-z]{4}[0-9]$", f):
        speakers.append(f)
        for g in os.listdir(os.path.join(PREFIX, f)):
            if g.endswith(".txt"):
                items.append(f + ':' + g[:-4])
speakers.sort()
items.sort()

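# Usage sketch: after the directory scan above, speakers holds ids like
# 'dr1-fvmh0' and items holds 'speaker:utterance' pairs (assumes the
# timit corpus is installed under get_basedir()):
print speakers[:3]
print items[:3]
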
def loadParadigm(self, p_filename):
    """
    Load the given paradigm (XML file)

    Attributes are stored in self.attributes
    Data are stored in self.data

    They can be accessed as follows:
    self.attributes['gender']    # list of genders
    self.data[6]['gender']       # gender for the sixth data object
    self.data[6]['content']      # content for the sixth data object
    """
    from en.parser.nltk_lite.corpora import get_basedir
    from xml.dom.ext.reader import Sax2  # PyXML reader (assumed import; not shown in the original)
    basedir = get_basedir()

    # Look for the file
    try_filename = os.path.join(get_basedir(), "paradigms", p_filename)
    try:
        f = open(try_filename)
        p_filename = try_filename
    except IOError:
        print "Cannot find file"
        return None
    f.close()

    # These variables will be set by this method
    self.attributes = {}  # A new dictionary
    self.data = []        # A new list

    # XML admin: create Reader object, parse document
    reader = Sax2.Reader()
    doc = reader.fromStream(p_filename)

    # Cycle through the given attributes and add them to self.attributes
    # for <name> in <attributes>
    attributes = doc.getElementsByTagName('attributes')[0]
    for name in attributes.getElementsByTagName('name'):
        # Setup a list of attribute values
        tmp_list = []
        # for each value under name, store in list
        for value in name.getElementsByTagName('value'):
            tmp_list.append(value.getAttribute('value'))
        # Store list of values in dictionary
        self.attributes[name.getAttribute('name')] = tmp_list

    # Cycle through data objects and add them to self.data
    # for <form> in <paradigm>
    forms = doc.getElementsByTagName('paradigm')[0]
    for form in forms.getElementsByTagName('form'):
        # Initialise a temporary dictionary
        tmp_dict = {}
        for value in form.getElementsByTagName('attribute'):
            tmp_dict[value.getAttribute('name')] = value.getAttribute('value')
        # Add the new dictionary to the data list
        self.data.append(tmp_dict)

    # Talk to the user
    print "Paradigm information successfully loaded from file:", p_filename
    # State the number and print out a list of attributes
    print " " * 4 + str(len(self.attributes)) + " attributes imported:",
    for att in self.attributes:
        print att,
    print
    # State the number of paradigm objects imported
    print " " * 4 + str(len(self.data)) + " paradigm objects imported."
    return