def group_words(csv):
    """[[str]] -> {str:{str:{str:[float]}}}, i.e. {Word:{Segment:{Feature:[Value]}}}."""
    def label_end(s):
        # index just past the first digit run, which closes a segment label
        return re.search('[0-9]', s).end()
    segment_of = fnc.pipe(car, dropwhile(str.islower), lambda s: s[:label_end(s)])
    feature_of = lambda s: s[label_end(s):]
    # split a row's title column into (feature, numeric values)
    row_to_feature = carcdr(lambda title, data: (feature_of(title), map(float, data)))
    phones_of = lambda rows: dct.map(dict, dct.collapse(rows, segment_of, row_to_feature))
    # pad each segment dict out to the full default feature set
    pad = curried(dct.map_items)(makesegment)
    # group the data rows (header dropped) by their leading lowercase word prefix
    by_word = dct.collapse(cdr(csv), fnc.pipe(car, takewhile(str.islower)), fnc.ident)
    return dct.map(fnc.pipe(phones_of, pad), by_word)
def corpus(speakers): "Warning! This contains a hard-coded path specific to jones" #@typecheck((str,str), [(str, [object])]) def per_speaker((fname,speaker)): return sentences(open('/Volumes/Data/Corpora/en/ice-gb/ice-gb-2/data/'+ fname.lower()+'.cor'))[speaker] return dct.map(lambda files: mapn(per_speaker, files), speakers)
def group_regions(regions, words):
    """{str:[int]}*{str:{str:{str:[float]}}} -> {str:{str:{str:{str:[float]}}}}
    that is, {Region:{Word:{Segment:(Type,{Feature:[Value]})}}}"""
    # region indices are shifted down by 2 before extraction — presumably to
    # skip two leading columns; TODO confirm the offset's origin.
    sub2 = lambda n: n - 2
    dctmapper = curried(dct.map)
    def outermost(columns):  # renamed from `range`: don't shadow the builtin
        # For every word/segment/feature, keep only the value positions
        # belonging to this region.
        inner = dctmapper(dctmapper(lst_extract(map(sub2, columns))))
        return dct.map(inner, words)
    return dct.map(outermost, regions)
def makesegment(type,d):
    """Pad a segment's measured features out to the full default feature set.

    type : segment label; chop(type) presumably strips its digits to pick
           the default feature table — TODO confirm chop's behavior.
    d    : {feature: [value]}; all value lists are assumed equal-length
           (size is taken from an arbitrary one).

    Returns a dict holding every default feature for this segment type as a
    constant list of length `size`, overridden by the measured entries in d.
    NOTE(review): empty d raises StopIteration (py2 .next() on an empty
    itervalues()) — confirm callers never pass an empty dict.
    """
    # C's numbers:
    # GL=PV: {0,.5,1}, H/HW/W: {0,1}, V=C=PL=IR=VO={0,1}, L={0,1,2}
    # I think H/HW/W should be collapsed at read time. L(6), PV(5) and C(4) not
    # also not IR,VO,PL(2) but I wish we had more of them.
    size = len(d.itervalues().next())
    # default value per feature, keyed by segment class
    features = dict(C=dict(GL=0.0, V=0.0, H=0.0, PV=0.0, L=0.0),  #H=HW=W total(6)
                    V=dict(B=1.0, H=1.0, L=1.0, R=1.0),  #Got rid of '' and "RH"
                    R=dict(MN=1.5, PL=1.0),  # mult's range is 0.0 - 2.0 but its meaning varies?
                    MULT=dict(MULT=1.0),
                    VC=dict())  # VC is erroneous data eh.
    #TODO:Collapse H/HW/W
    #TODO:Decide if V's L and C's L are different and if so make them different
    keys = dct.map(lambda default:[default]*size, features[chop(type)])
    keys.update(d)
    return keys
def outermost(range):
    # NOTE(review): this appears to be a module-level duplicate of the inner
    # helper of the same name inside group_regions.  As written it depends on
    # `dctmapper`, `sub2`, `lst_extract`, `dct`, and `words` being bound at
    # module scope — confirm those exist and that this copy is still used.
    # It also shadows the builtin `range` with its parameter.
    inner = dctmapper(dctmapper(lst_extract(map(sub2, range))))
    return dct.map(inner, words)
def classify(row):
    """[[lev.Rule]] -> {utf-8-char:set<lev.Rule>}."""
    # flatten the nested rule lists, discarding self-substitutions
    real_rules = filter(negate(self_sub), concat(row))
    # group rules by their source character
    by_src = dct.collapse(real_rules, keymap=lambda rule: rule.src)
    # collapse_envs,
    return dct.map(set, by_src)
def find_collapsed(f, collapsed):
    """f*{char:[int]} -> [(char, f-result)] sorted descending on the result."""
    scored = dct.map(f, collapsed)
    return sorted(scored.items(), key=snd, reverse=True)
def readcorpus(extractor, speakers, delimiter='\t'):
    """Read the corpus for `speakers` and apply `extractor` to every record."""
    # 12 is an inherited magic argument to iceread.read — presumably a column
    # count or field index; confirm against iceread's signature.
    raw = iceread.read(speakers, 12, delimiter)
    extract_all = cur(map, extractor)
    return dct.map(extract_all, raw)
def tinify(regions):
    """Re-encode every token in `regions` through a compact code table."""
    tokens = mapn(concat, regions.values())
    # tokens ordered by ascending frequency (rarest first) feed the encoder;
    # encode's exact numbering scheme lives elsewhere — confirm if it matters.
    by_count = sorted(dct.count(tokens).items(), key=snd)
    code = encode(map(fst, by_count))
    recode_seq = cur(map, code.__getitem__)
    return dct.map(cur(map, recode_seq), regions)
def corpusSize(path, regions):
    """path*{region:[filename]} -> [(region, total size)] sorted ascending."""
    # size of one region = sum of len(read(path)(file)) over its files
    per_file_sizes = dct.map(lambda files: lap(pipe(read(path), len), files),
                             regions)
    totals = dct.map(sum, per_file_sizes)
    return sorted(totals.items(), key=snd)
def groupedRegions(path, regions):
    """path*{region:[site]} -> {region:[filename]}."""
    # sites -> grouped dict -> flattened list of its filename values
    sites_to_files = pipe(cur(groupedSites)(path), dict.values, concat)
    return dct.map(sites_to_files, regions)