def __init__(self, form, msd=None, v_insts=None):
    """Build the matching regexes for one '+'-separated form template.

    Args:
        form: template string whose segments are separated by '+'
            (e.g. '1+a+2'). A segment consisting only of digits is a
            variable slot and becomes the capture group '(.+)'; any other
            segment is copied into the pattern as-is.
        msd: stored unchanged as ``self.msd``. When omitted, each instance
            gets its own new empty list.
        v_insts: iterable of iterables of ``(index, value)`` pairs. The
            values are grouped by index and each group is handed to
            ``genregex.genregex(..., pvalue=0.05)``. When omitted, no
            variable regexes are built.

    Sets ``form`` (list of segments), ``msd``, ``scount`` (total length of
    the non-digit segments), ``regex`` / ``cregex`` (pattern string and its
    compiled form) and ``v_regex`` (list of compiled per-variable regexes).
    """
    self.form = form.split('+')
    # A fresh list per instance: a shared ``[]`` default would let a
    # mutation of one instance's msd show up in every other instance.
    self.msd = [] if msd is None else msd
    if v_insts is None:
        v_insts = ()

    self.scount = 0
    pieces = []
    for f in self.form:
        if f.isdigit():
            pieces.append('(.+)')
        else:
            # NOTE(review): literal segments are inserted unescaped, so any
            # regex metacharacter in them is interpreted as regex syntax.
            pieces.append(f)
            self.scount += len(f)
    self.regex = ''.join(pieces)
    self.cregex = re.compile(self.regex)

    # Group the observed values of every variable by its index.
    collect_vars = defaultdict(set)
    for vs in v_insts:
        for (i, v) in vs:
            collect_vars[i].add(v)

    # Only the value sets are needed; .values() also works on Python 3,
    # where dict.iteritems() no longer exists.
    self.v_regex = [
        re.compile(genregex.genregex(ss, pvalue=0.05).pyregex())
        for ss in collect_vars.values()
    ]
def paradigms_to_foma(paradigms, grammarname, pval):
    """Converts iterable of paradigms to foma-script (as a string).

    The returned script is the concatenation of, in order:

    1. ``def Alph ...;`` -- the union of the double-quoted symbols returned
       by ``paradigms_to_alphabet(paradigms)``. The symbols pass through a
       set, so their order in the output is unspecified.
    2. One ``def <paradigm>=var<idx> ...;`` line per variable slot. Its body
       is the foma regex from ``genregex.genregex`` with every ``?``
       replaced by ``Alph``.
    3. One ``def <paradigm>...;`` per paradigm: the ``|``-union of its forms,
       each form being its slots followed by the form's tag block.
    4. ``def <grammarname> ...;`` -- the union of all paradigm names.

    Args:
        paradigms: the paradigms to convert. Each item is read through its
            ``name``, ``forms`` and ``slots`` attributes. The collection is
            handed to ``paradigms_to_alphabet`` and then looped over here,
            so presumably it must be re-iterable -- confirm against callers.
        grammarname: name given to the final, top-level definition.
        pval: forwarded to ``genregex.genregex`` as ``pvalue``.

    Returns:
        The complete foma script as one string.
    """
    alphabet = paradigms_to_alphabet(paradigms)
    quoted_symbols = {'"' + a + '"' for a in alphabet}
    alphstring = 'def Alph ' + u'|'.join(quoted_symbols) + ';\n'

    defined_vars = set()   # variable names that already have a definition
    var_defs = []          # 'def <var> ...;' lines, in first-seen order
    paradigm_defs = []     # 'def <paradigm>...;' lines
    parnames = []          # paradigm names for the final grammar union

    for paradigm in paradigms:
        parname = nospace(paradigm.name)
        parnames.append(parname)
        parstrings = []
        for formnumber, form in enumerate(paradigm.forms):
            # A comprehension instead of a tuple-parameter lambda, which is
            # a syntax error on Python 3.
            tagstrings = [
                u'"' + feature + u'"' + u' = ' + u'"' + value + u'"'
                for feature, value in form.msd
            ]
            parstring = u''
            for idx, (is_var, slot) in enumerate(paradigm.slots):
                if is_var:
                    parvarname = parname + '=var' + str(idx)
                    # The variable is shared by all forms of the paradigm,
                    # so it is defined only the first time it is seen.
                    if parvarname not in defined_vars:
                        r = genregex.genregex(slot, pvalue=pval, length=False)
                        defined_vars.add(parvarname)
                        var_defs.append(
                            'def ' + parvarname + ' '
                            + r.fomaregex().replace('?', 'Alph') + ';\n')
                    parstring += ' [' + parvarname + '] '
                else:
                    # Fixed slot: pair this form's string with the string
                    # of form 0 (treated as the base form, judging by the
                    # original variable naming).
                    thisslot = escape_fixed_string(slot[formnumber])
                    baseformslot = escape_fixed_string(slot[0])
                    parstring += u' [' + thisslot + u':' + baseformslot + u'] '
            parstring += u'0:["[" ' + u' " " '.join(tagstrings) + u' "]"]'
            parstrings.append(parstring)
        paradigm_defs.append(
            u'def ' + parname + u'|\n'.join(parstrings) + u';\n')

    grammar_def = u'def ' + grammarname + u' ' + u' | '.join(parnames) + u';'
    return (alphstring + u''.join(var_defs) + u''.join(paradigm_defs)
            + grammar_def)
def __init__(self, form, msd=None, v_insts=None):
    """Build the matching regexes for one '+'-separated form template.

    Args:
        form: template string whose segments are separated by '+'
            (e.g. '1+a+2'). A segment consisting only of digits is a
            variable slot and becomes the capture group '(.+)'; any other
            segment is copied into the pattern as-is.
        msd: stored unchanged as ``self.msd``. When omitted, each instance
            gets its own new empty list.
        v_insts: iterable of iterables of ``(index, value)`` pairs. The
            values are grouped by index and each group is handed to
            ``genregex.genregex(..., pvalue=0.05)``. When omitted, no
            variable regexes are built.

    Sets ``form`` (list of segments), ``msd``, ``scount`` (total length of
    the non-digit segments), ``regex`` / ``cregex`` (pattern string and its
    compiled form) and ``v_regex`` (list of compiled per-variable regexes).
    """
    self.form = form.split('+')
    # A fresh list per instance: a shared ``[]`` default would let a
    # mutation of one instance's msd show up in every other instance.
    self.msd = [] if msd is None else msd
    if v_insts is None:
        v_insts = ()

    self.scount = 0
    pieces = []
    for f in self.form:
        if f.isdigit():
            pieces.append('(.+)')
        else:
            # NOTE(review): literal segments are inserted unescaped, so any
            # regex metacharacter in them is interpreted as regex syntax.
            pieces.append(f)
            self.scount += len(f)
    self.regex = ''.join(pieces)
    self.cregex = re.compile(self.regex)

    # Group the observed values of every variable by its index.
    collect_vars = defaultdict(set)
    for vs in v_insts:
        for (i, v) in vs:
            collect_vars[i].add(v)

    # Only the value sets are needed; .values() also works on Python 3,
    # where dict.iteritems() no longer exists.
    self.v_regex = [
        re.compile(genregex.genregex(ss, pvalue=0.05).pyregex())
        for ss in collect_vars.values()
    ]