Example #1
def raw(files=items):
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "ppattach", file)
        for line in open(path).readlines():
            yield tuple(line.split())
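
These readers are lazy generators, so a caller typically wraps them in itertools.islice to peek at a few items. A minimal usage sketch, assuming the "ppattach" data is installed under get_basedir() and that 'training' is one of the names in the module's items list:

from itertools import islice

# Print the first three whitespace-split attachment tuples.
for t in islice(raw('training'), 3):
    print t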
Example #2
def raw(files = ['female', 'male']):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "names", file+".txt")
        for word in open(path).readlines():
            yield word.strip()
Example #3
def raw(files = items):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "ppattach", file)
        for line in open(path).readlines():
            yield tuple(line.split())
Example #4
def raw(files = items):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "words", file)
        for word in open(path).readlines():
            yield word.strip()
Example #5
def chunked(files = items, chunk_types=('NP',)):
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield tree.conll_chunk(sent, chunk_types)
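
Each yield here is one chunk tree per blank-line-separated sentence. A quick peek, again a sketch that assumes the "conll2000" data is installed and that 'train' appears in the module's items list:

from itertools import islice

# Print the chunk tree for the first training sentence.
for t in islice(chunked('train'), 1):
    print t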
Example #6
def raw(files='english'):
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "stopwords", file)
        for word in open(path).readlines():
            yield word.strip()
Example #7
def raw(files = ['female', 'male']):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "names", file+".txt")
        for word in open(path).readlines():
            yield word.strip()
Example #8
def tagged(files = items):
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]
Example #9
def bracket_parse(files=items):
    if type(files) is str: files = (files, )
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for sent in data:
            yield tree.bracket_parse(sent)
Example #10
def _read(files, conversion_function):
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "brown", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            yield conversion_function(sent)
Example #11
def bracket_parse(files = items):
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for sent in data:
            yield tree.bracket_parse(sent)
Example #12
def _read(files, conversion_function):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "brown", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            yield conversion_function(sent)
Example #13
def chunked(files=items, chunk_types=("NP",)):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield tree.conll_chunk(sent, chunk_types)
Example #14
def tagged(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]
Example #15
def raw(files = items):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "ieer", file)
        for doc in open(path).read().split('</DOC>'):
            doc = doc.split('<DOC>')
            if len(doc) == 2:
                yield "<DOC>" + doc[1] + "</DOC>\n"
Example #16
def raw(files = items):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "ieer", file)
        for doc in open(path).read().split('</DOC>'):
            doc = doc.split('<DOC>')
            if len(doc) == 2:
                yield "<DOC>" + doc[1] + "</DOC>\n"
Example #17
def demo():
    path = os.path.join(get_basedir(), "shoebox", "rotokas.dic")
    l = Lexicon(path)
    l.parse(key_fields=['lx','ps','sn'], unique_entry=False)
    h = l.get_header()
    for e in l.get_entries():
        print "<%s><%s><%s>" % (e.get_field_as_string("lx", ""),
                                e.get_field_as_string("ps", ""),
                                e.get_field_as_string("sn", ""))
Example #18
def demo():
    path = os.path.join(get_basedir(), "shoebox", "rotokas.dic")
    l = Lexicon(path)
    l.parse(key_fields=['lx', 'ps', 'sn'], unique_entry=False)
    h = l.get_header()
    for e in l.get_entries():
        print "<%s><%s><%s>" % (e.get_field_as_string(
            "lx", ""), e.get_field_as_string(
                "ps", ""), e.get_field_as_string("sn", ""))
Example #19
def raw(files = items):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "inaugural", file + ".txt")
        f = open(path)
        preamble = True  # set but never used in this reader
        text = f.read()
        for t in tokenize.wordpunct(text):
            yield t
Example #20
def raw(files = items):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "state_union", file + ".txt")
        f = open(path)
        preamble = True  # set but never used in this reader
        text = f.read()
        for t in tokenize.wordpunct(text):
            yield t
Example #21
def raw(files=items):
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "gutenberg", file + ".txt")
        f = open(path)
        preamble = True
        for line in f.readlines():
            if not preamble:
                for t in tokenize.wordpunct(line):
                    yield t
            if line[:5] == '*END*':
                preamble = False
Example #22
def raw(files = items):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "gutenberg", file + ".txt")
        f = open(path)
        preamble = True
        for line in f.readlines():
            if not preamble:
                for t in tokenize.wordpunct(line):
                    yield t
            if line[:5] == '*END*':
                preamble = False
Example #23
def raw(files = 'raw'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """       

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "sinica_treebank", file)
        for line in open(path).readlines():
            yield line.split()[1:]
Example #24
def parsed(files = 'parsed'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """       

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "sinica_treebank", file)
        for sent in open(path).readlines():
            yield tree.bracket_parse(sent)
Example #25
def parsed(files = 'parsed'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """       

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "sinica_treebank", file)
        for sent in open(path).readlines():
            yield tree.bracket_parse(sent)
Example #26
def raw(files = 'raw'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """       

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "sinica_treebank", file)
        for line in open(path).readlines():
            yield line.split()[1:]
Example #27
def raw(files=items):
    """
    @param files: One or more Senseval files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple}
    """

    if type(files) is str: files = (files, )
    parser = SensevalParser()
    for file in files:
        path = os.path.join(get_basedir(), "senseval", file + ".pos")
        f = open(path).read()
        for entry in parser.parse(f):
            yield entry
Example #28
def raw(files = items):
    """
    @param files: One or more Senseval files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple}
    """       

    if type(files) is str: files = (files,)
    parser = SensevalParser()
    for file in files:
        path = os.path.join(get_basedir(), "senseval", file+".pos")
        f = open(path).read()
        for entry in parser.parse(f):
            yield entry
Example #29
def raw(files='english-kjv'):
    """
    @param files: One or more genesis files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "genesis", file + ".txt")
        s = open(path).read()
        for t in tokenize.whitespace(s):
            yield t
Example #30
def raw(files = 'english-kjv'):
    """
    @param files: One or more genesis files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string}
    """       

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "genesis", file+".txt")
        s = open(path).read()
        for t in tokenize.whitespace(s):
            yield t
Example #31
File: cmudict.py Project: mgolden/en
def raw(files='cmudict'):
    """
    @param files: One or more cmudict files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "cmudict", file)
        for line in open(path).readlines():
            fields = line.strip().split(' ')
            yield (fields[0], int(fields[1]), tuple(fields[2:]))
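
Per the split above, each yield is a (word, variant-number, phoneme-tuple) triple. A hedged sketch that collects one pronunciation per headword, assuming the variant numbering starts at 1:

from itertools import islice

# Map the first hundred headwords to their first-listed pronunciation.
prondict = {}
for word, variant, phones in islice(raw(), 100):
    if variant == 1:
        prondict[word] = phones
print len(prondict), "headwords collected"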
Example #32
def raw(files = 'cmudict'):
    """
    @param files: One or more cmudict files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple}
    """       

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "cmudict", file)
        for line in open(path).readlines():
            fields = line.strip().split(' ')
            yield (fields[0], int(fields[1]), tuple(fields[2:]))
Example #33
def _read(files, conversion_function):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "ycoe/pos", file)
        f = open(path).read()
        rx_pattern = re.compile(r"""
                <.*>_CODE
                |\s.*_ID
        """, re.VERBOSE|re.UNICODE)
        mySents = tokenize.blankline(f)
        for sent in mySents:
            sent = re.sub(rx_pattern, '', sent)
            if sent != "":
                yield conversion_function(sent, sep="_")
Example #34
def chunked(files='chunked'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        s = open(path).read()
        for t in tokenize.blankline(s):
            yield tree.chunk(t)
Example #35
def _read(files, conversion_function):
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "ycoe/pos", file)
        f = open(path).read()
        rx_pattern = re.compile(
            r"""
                <.*>_CODE
                |\s.*_ID
        """, re.VERBOSE | re.UNICODE)
        mySents = tokenize.blankline(f)
        for sent in mySents:
            sent = re.sub(rx_pattern, '', sent)
            if sent != "":
                yield conversion_function(sent, sep="_")
Example #36
def raw(files='raw'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            l = []
            for t in tokenize.whitespace(sent):
                l.append(t)
            yield l
Example #37
File: toolbox.py Project: mgolden/en
def parse_corpus(file_name, key=None, **kwargs):
    """
    Return an element tree resulting from parsing the toolbox datafile.
    
    A convenience function that creates a C{ToolboxData} object, opens and 
    parses the toolbox data file. The data file is assumed to be in the toolbox 
    subdirectory of the directory where NLTK looks for corpora, 
    see L{corpora.get_basedir()}.
    @param file_name: Name of file in toolbox corpus directory
    @type file_name: string
    @param key: marker at the start of each record
    @type key: string
    @param kwargs: Keyword arguments passed to L{ToolboxData.parse()}
    @type kwargs: keyword arguments dictionary
    @rtype:   ElementTree._ElementInterface
    @return:  contents of toolbox data divided into header and records
    """
    db = ToolboxData()
    db.open(os.path.join(get_basedir(), 'toolbox', file_name))
    return db.parse(key, **kwargs)
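
A usage sketch for the function above: 'rotokas.dic' is borrowed from the sibling raw() example, key='lx' follows the shoebox demos, and the 'record' tag is an assumption based on the docstring's header/records split:

# Parse a toolbox file and walk the fields of each record.
data = parse_corpus('rotokas.dic', key='lx')
for record in data.findall('record'):  # 'record' tag assumed, see above
    for field in record:
        print field.tag, field.text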
Example #38
def parse_corpus(file_name, key=None, **kwargs):
    """
    Return an element tree resulting from parsing the toolbox datafile.
    
    A convenience function that creates a C{ToolboxData} object, opens and 
    parses the toolbox data file. The data file is assumed to be in the toolbox 
    subdirectory of the directory where NLTK looks for corpora, 
    see L{corpora.get_basedir()}.
    @param file_name: Name of file in toolbox corpus directory
    @type file_name: string
    @param key: marker at the start of each record
    @type key: string
    @param kwargs: Keyword arguments passed to L{ToolboxData.parse()}
    @type kwargs: keyword arguments dictionary
    @rtype:   ElementTree._ElementInterface
    @return:  contents of toolbox data divided into header and records
    """
    db = ToolboxData()
    db.open(os.path.join(get_basedir(), "toolbox", file_name))
    return db.parse(key, **kwargs)
Example #39
File: toolbox.py Project: mgolden/en
def raw(files='rotokas.dic', include_header=False, head_field_marker=None):
    """
    Deprecated: use C{StandardFormat.fields()}
    
    @param files: One or more toolbox files to be processed
    @type files: L{string} or L{tuple(string)}
    @param include_header: flag that determines whether to treat header as record (default is no)
    @type include_header: boolean
    @param head_field_marker: option for explicitly setting which marker to use as the head field
                              when parsing the file (default is automatically determining it from
                              the first field of the first record)
    @type head_field_marker: string
    @rtype: iterator over L{list(string)}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files, )

    for file in files:
        path = os.path.join(get_basedir(), "toolbox", file)
        fc = open(path, "U").read()
        if fc.strip().startswith(r"\_"):
            (header, body) = split(fc, sep="\n\n", maxsplit=1)
            if include_header:
                yield list(_parse_record(header))
        else:
            body = fc

        # Deal with head field marker
        if head_field_marker:
            hfm_with_backslash = "\\" + hfm
        else:
            ff = split(body, sep="\n", maxsplit=1)[0]  # first field
            hfm_with_backslash = split(
                ff, sep=" ", maxsplit=1)[0]  # raw marker of first field
        recordsep = "\n\n" + hfm_with_backslash  # separates records from one another

        # Parse records
        for r in split("\n\n" + body, sep=recordsep)[1:]:
            yield list(_parse_record(hfm_with_backslash + r))
Example #40
def raw(files="rotokas.dic", include_header=False, head_field_marker=None):
    """
    Deprecated: use C{StandardFormat.fields()}
    
    @param files: One or more toolbox files to be processed
    @type files: L{string} or L{tuple(string)}
    @param include_header: flag that determines whether to treat header as record (default is no)
    @type include_header: boolean
    @param head_field_marker: option for explicitly setting which marker to use as the head field
                              when parsing the file (default is automatically determining it from
                              the first field of the first record)
    @type head_field_marker: string
    @rtype: iterator over L{list(string)}
    """

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "toolbox", file)
        fc = open(path, "U").read()
        if fc.strip().startswith(r"\_"):
            (header, body) = split(fc, sep="\n\n", maxsplit=1)
            if include_header:
                yield list(_parse_record(header))
        else:
            body = fc

        # Deal with head field marker
        if head_field_marker:
            hfm_with_backslash = "\\" + hfm
        else:
            ff = split(body, sep="\n", maxsplit=1)[0]  # first field
            hfm_with_backslash = split(ff, sep=" ", maxsplit=1)[0]  # raw marker of first field
        recordsep = "\n\n" + hfm_with_backslash  # separates records from one another

        # Parse records
        for r in split("\n\n" + body, sep=recordsep)[1:]:
            yield list(_parse_record(hfm_with_backslash + r))
Example #41
def _chunk_parse(files, chunk_types, top_node, partial_match, collapse_partials, cascade):
    # allow any kind of bracketing for flexibility

    L_BRACKET = re.compile(r'[\(\[\{<]')
    R_BRACKET = re.compile(r'[\)\]\}>]')

    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for s in data:
            bracket = 0
            itmType = None
            stack = [tree.Tree(top_node, [])]
            inTag = []
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    if partial_match == True:
                        for eachItm in chunk_types:
                            if (len(eachItm) <= len(itm)
                                    and eachItm == itm[:len(eachItm)]):
                                matched = True
                                if collapse_partials == True:
                                    itm = eachItm
                    else:
                        if (chunk_types is not None and
                            itm in chunk_types):
                            matched = True
                    if matched == True: # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                if R_BRACKET.match(itm[-1]):
                    tmpItm = split(itm, itm[-1])
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket: #inTag <= bracket:
                            if cascade == True:
                                stack[-1].append( (itmType, tmpItm[0]) )
                            else:
                                stack[-1][-1].append( (itmType, tmpItm[0]) )
                        else:
                            if cascade == True:
                                if len(stack) > 1:
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append( (itmType, tmpItm[0]) )
                            inTag = [] + inTag[:-2]
                    bracket -= (len(tmpItm)-1)
                    while( len(inTag) > 0 and bracket < inTag[-1] ):
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack
Example #42
import sys, os, re
import ossaudiodev
import time
from en.parser.nltk_lite.corpora import get_basedir

if sys.platform.startswith('linux') or sys.platform.startswith('freebsd'):
    PLAY_ENABLED = True
else:
    PLAY_ENABLED = False

__all__ = [
    "items", "raw", "phonetic", "speakers", "dictionary", "spkrinfo",
    "audiodata", "play"
]

PREFIX = os.path.join(get_basedir(), "timit")

speakers = []
items = []
dictionary = {}
spkrinfo = {}

for f in os.listdir(PREFIX):
    if re.match("^dr[0-9]-[a-z]{4}[0-9]$", f):
        speakers.append(f)
        for g in os.listdir(os.path.join(PREFIX, f)):
            if g.endswith(".txt"):
                items.append(f + ':' + g[:-4])
speakers.sort()
items.sort()
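
Since speakers and items are populated at import time, a quick sanity check of the scan above (assuming a TIMIT subset is installed under get_basedir()):

# Show how many speakers and utterances were indexed, plus a sample of each.
print len(speakers), "speakers, e.g.", speakers[:3]
print len(items), "items, e.g.", items[:3]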
Example #43
    def loadParadigm(self, p_filename):
        """
        Load the given paradigm (XML file)
        Attributes are stored in self.attributes
        Data are stored in self.data
    
        They can be accessed as follows:
        self.attributes['gender']   # list of genders
        self.data[6]['gender']      # gender for the sixth data object
        self.data[6]['content']     # content for the sixth data object
        """

        from en.parser.nltk_lite.corpora import get_basedir
        basedir = get_basedir()

        # Look for the file
        try_filename = os.path.join(basedir, "paradigms", p_filename)
        try:
            f = open(try_filename)
            p_filename = try_filename
        except IOError:
            print "Cannot find file"
            return None
        f.close()

        # These variables will be set by this method
        self.attributes = {}  # A new dictionary
        self.data = []        # A new list

        # XML admin: create Reader object, parse document
        reader = Sax2.Reader()
        doc = reader.fromStream(p_filename)

        # Cycle through the given attributes and add them to self.attributes
        # for <name> in <attributes>
        attributes = doc.getElementsByTagName('attributes')[0]
        for name in attributes.getElementsByTagName('name'):

            # Setup a list of attribute values
            tmp_list = []

            # for each value under name, store in list
            for value in name.getElementsByTagName('value'):
                tmp_list.append(value.getAttribute('value'))

            # Store list of values in dictionary
            self.attributes[name.getAttribute('name')] = tmp_list


        # Cycle through data objects and add them to self.data
        # for <form> in <paradigm>
        forms = doc.getElementsByTagName('paradigm')[0]
        for form in forms.getElementsByTagName('form'):
            # Initialise a temporary dictionary
            tmp_dict = {}
            for value in form.getElementsByTagName('attribute'):
                tmp_dict[value.getAttribute('name')] = value.getAttribute('value')
            # Add the new dictionary to the data list
            self.data.append(tmp_dict)

        # Talk to the user
        print "Paradigm information successfully loaded from file:", p_filename
        # State the number and print out a list of attributes
        print " "*4 + str(len(self.attributes)) + " attributes imported:",
        for att in self.attributes:
            print att,
        print
        # State the number of paradigm objects imported
        print " "*4 + str(len(self.data)) + " paradigm objects imported."

        return
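
Reading off the tag and attribute names the method queries, the paradigm file it expects would look roughly like this hypothetical minimal document (the root element name is arbitrary, since the code only uses getElementsByTagName):

<!-- hypothetical paradigm file inferred from loadParadigm -->
<paradigm-definition>
  <attributes>
    <name name="gender">
      <value value="masculine"/>
      <value value="feminine"/>
    </name>
  </attributes>
  <paradigm>
    <form>
      <attribute name="gender" value="masculine"/>
      <attribute name="content" value="example-form"/>
    </form>
  </paradigm>
</paradigm-definition>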
Example #44
def _chunk_parse(files, chunk_types, top_node, partial_match,
                 collapse_partials, cascade):
    # allow any kind of bracketing for flexibility

    L_BRACKET = re.compile(r'[\(\[\{<]')
    R_BRACKET = re.compile(r'[\)\]\}>]')

    if type(files) is str: files = (files, )
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for s in data:
            bracket = 0
            itmType = None
            stack = [tree.Tree(top_node, [])]
            inTag = []
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    if partial_match == True:
                        for eachItm in chunk_types:
                            if (len(eachItm) <= len(itm)
                                    and eachItm == itm[:len(eachItm)]):
                                matched = True
                                if collapse_partials == True:
                                    itm = eachItm
                    else:
                        if (chunk_types is not None and itm in chunk_types):
                            matched = True
                    if matched == True:  # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                if R_BRACKET.match(itm[-1]):
                    tmpItm = split(itm, itm[-1])
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket:  # inTag <= bracket
                            if cascade == True:
                                stack[-1].append((itmType, tmpItm[0]))
                            else:
                                stack[-1][-1].append((itmType, tmpItm[0]))
                        else:
                            if cascade == True:
                                if len(stack) > 1:
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append((itmType, tmpItm[0]))
                            inTag = [] + inTag[:-2]
                    bracket -= (len(tmpItm) - 1)
                    while (len(inTag) > 0 and bracket < inTag[-1]):
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack
Example #45
from en.parser.nltk_lite.corpora import get_basedir
from en.parser.nltk_lite import tokenize
from itertools import islice
import ossaudiodev, time
import sys, os, re

if sys.platform.startswith('linux') or sys.platform.startswith('freebsd'):
    PLAY_ENABLED = True
else:
    PLAY_ENABLED = False
    
__all__ = ["items", "raw", "phonetic", "speakers", "dictionary", "spkrinfo",
           "audiodata", "play"]

PREFIX = os.path.join(get_basedir(),"timit")

speakers = []
items = []
dictionary = {}
spkrinfo = {}

for f in os.listdir(PREFIX):
    if re.match("^dr[0-9]-[a-z]{4}[0-9]$", f):
        speakers.append(f)
        for g in os.listdir(os.path.join(PREFIX,f)):
            if g.endswith(".txt"):
                items.append(f+':'+g[:-4])
speakers.sort()
items.sort()
Example #46
    def loadParadigm(self, p_filename):
        """
        Load the given paradigm (XML file)
        Attributes are stored in self.attributes
        Data are stored in self.data
    
        They can be accessed as follows:
        self.attributes['gender']   # list of genders
        self.data[6]['gender']      # gender for the sixth data object
        self.data[6]['content']     # content for the sixth data object
        """

        from en.parser.nltk_lite.corpora import get_basedir
        basedir = get_basedir()

        # Look for the file
        try_filename = os.path.join(basedir, "paradigms", p_filename)
        try:
            f = open(try_filename)
            p_filename = try_filename
        except IOError:
            print "Cannot find file"
            return None
        f.close()

        # These variables will be set by this method
        self.attributes = {}  # A new dictionary
        self.data = []  # A new list

        # XML admin: create Reader object, parse document
        reader = Sax2.Reader()
        doc = reader.fromStream(p_filename)

        # Cycle through the given attributes and add them to self.attributes
        # for <name> in <attributes>
        attributes = doc.getElementsByTagName('attributes')[0]
        for name in attributes.getElementsByTagName('name'):

            # Setup a list of attribute values
            tmp_list = []

            # for each value under name, store in list
            for value in name.getElementsByTagName('value'):
                tmp_list.append(value.getAttribute('value'))

            # Store list of values in dictionary
            self.attributes[name.getAttribute('name')] = tmp_list

        # Cycle through data objects and add them to self.data
        # for <form> in <paradigm>
        forms = doc.getElementsByTagName('paradigm')[0]
        for form in forms.getElementsByTagName('form'):
            # Initialise a temporary dictionary
            tmp_dict = {}
            for value in form.getElementsByTagName('attribute'):
                tmp_dict[value.getAttribute('name')] = value.getAttribute(
                    'value')
            # Add the new dictionary to the data list
            self.data.append(tmp_dict)

        # Talk to the user
        print "Paradigm information successfully loaded from file:", p_filename
        # State the number and print out a list of attributes
        print " " * 4 + str(len(self.attributes)) + " attributes imported:",
        for att in self.attributes:
            print att,
        print
        # State the number of paradigm objects imported
        print " " * 4 + str(len(self.data)) + " paradigm objects imported."

        return