예제 #1
0
def make_parse_dictionary():
    '''
    Build a stem -> (root, pattern) lookup from 'annotated-dictionary.xls'.

    Reads the 'dictionary' sheet of the workbook located at
    baampath + 'annotated-dictionary.xls' (loaded with form.loadexcel).
    Column 0 supplies the keys (stems), column 1 the roots and column 2
    the patterns. Rows whose column-2 value is one of the nonword type
    labels listed in 'nonwordtypes' are omitted.

    Requires module 'form' and the module-level name 'baampath'.

    Returns:
    ---------------
    dict
        Maps each kept column-0 entry to the pair
        (root.encode(), pattern.encode()). If the same entry appears on
        several kept rows, the last such row wins.
    '''
    # Labels that mark a row as a nonword when found in the pattern column.
    # The workbook's 'types' sheet is not consulted; filtering relies solely
    # on this hard-coded list.
    nonwordtypes = (
        'abbreviation', 'compound', 'dialect_word', 'foreign_word',
        'function_word', 'interjection', 'letter_name', 'proper_name',
    )

    workbook = form.loadexcel(baampath + 'annotated-dictionary.xls')
    sheet = workbook.sheet_by_name('dictionary')
    entries = sheet.col_values(0)
    roots = sheet.col_values(1)
    patterns = sheet.col_values(2)

    # The three columns come from the same sheet, so they are presumably the
    # same length and line up row by row -- TODO confirm for this loader.
    return {
        entry: (root.encode(), pattern.encode())
        for entry, root, pattern in zip(entries, roots, patterns)
        if pattern not in nonwordtypes
    }
예제 #2
0
def make_BANARM_dictionary(source):
    '''
    Create a BANARM dictionary for a named tab ('source') in the
    spreadsheet 'root-pattern-frequencies.xls'.

    The workbook is loaded from baampath + 'root-pattern-frequencies.xls'
    with form.loadexcel. Column 0 of the tab supplies the names (each one
    passed through .encode()) and column 1 supplies the tokens; both lists
    are handed to make_dictionary.

    Requires module 'form'

    Parameter:
    ---------------
    source: string
        Name of the sheet (tab) in the workbook to read.

    Returns:
    ---------------
    The result of make_dictionary(names, tokens).
    '''
    databook = form.loadexcel(baampath + r'root-pattern-frequencies.xls')
    sheet = databook.sheet_by_name(source)
    # Encode every name up front so make_dictionary receives encoded keys.
    names = [name.encode() for name in sheet.col_values(0)]
    tokens = sheet.col_values(1)
    return make_dictionary(names, tokens)