def generate(division):
    """Feed every (region, feature) combination to ``cat`` via ``each``.

    When ``division`` is ``'ns'`` the module-level ``northern`` and
    ``southern`` regions are used as they are.  For any other value a
    randomised control is built instead: the second elements of
    ``northern`` and ``southern`` are concatenated, shuffled in place, and
    split back into two groups labelled ``'NorthernRandom'`` and
    ``'SouthernRandom'``; the first group has ``len(northern[1])`` members.

    Returns None; the work is done by ``each(cat, ...)``.
    """
    features = 'path trigram'.split()
    if division == 'ns':
        regions = [northern, southern]
    else:
        pool = northern[1] + southern[1]
        random.shuffle(pool)
        # Split at the original northern size so group sizes are preserved.
        cut = len(northern[1])
        regions = [('NorthernRandom', pool[:cut]),
                   ('SouthernRandom', pool[cut:])]
    each(cat, cross(regions, features))
def groupavg(pair):
    """Group-average linkage between the two clusters in ``pair``.

    ``pair`` is a 2-sequence ``(c1, c2)``.  Both clusters are flattened,
    every element of ``cross(flatten(c1), flatten(c2))`` is passed through
    ``compose(edges.__getitem__, frozenset)``, and ``avg`` of the results
    is returned.

    The argument is unpacked in the body rather than in the signature
    because tuple parameters are a syntax error on Python 3; callers still
    pass a single pair.
    """
    c1, c2 = pair
    # Presumably looks up the edge weight stored under the unordered pair --
    # confirm against compose's argument order.
    weight_of = compose(edges.__getitem__, frozenset)
    # Build a real list so ``avg`` gets a sequence even where map() is lazy.
    weights = [weight_of(link) for link in cross(flatten(c1), flatten(c2))]
    return avg(weights)
def complete(pair):
    """Complete-link distance between the two clusters in ``pair``.

    ``pair`` is a 2-sequence ``(c1, c2)``.  Both clusters are flattened,
    every element of ``cross(flatten(c1), flatten(c2))`` is passed through
    ``compose(edges.__getitem__, frozenset)``, and the largest result is
    returned.  ``max`` raises ValueError if there are no such elements.

    The argument is unpacked in the body rather than in the signature
    because tuple parameters are a syntax error on Python 3; callers still
    pass a single pair.
    """
    c1, c2 = pair
    # Presumably looks up the edge weight stored under the unordered pair --
    # confirm against compose's argument order.
    weight_of = compose(edges.__getitem__, frozenset)
    return max(weight_of(link) for link in cross(flatten(c1), flatten(c2)))
def single(pair):
    """Single-link distance between the two clusters in ``pair``.

    ``pair`` is a 2-sequence ``(c1, c2)``.  Both clusters are flattened,
    every element of ``cross(flatten(c1), flatten(c2))`` is passed through
    ``compose(edges.__getitem__, frozenset)``, and the smallest result is
    returned.  ``min`` raises ValueError if there are no such elements.

    The argument is unpacked in the body rather than in the signature
    because tuple parameters are a syntax error on Python 3; callers still
    pass a single pair.
    """
    c1, c2 = pair
    # Presumably looks up the edge weight stored under the unordered pair --
    # confirm against compose's argument order.
    weight_of = compose(edges.__getitem__, frozenset)
    return min(weight_of(link) for link in cross(flatten(c1), flatten(c2)))
def sed_avg(ws1, ws2):
    """[{str:[float]}]*[{str:[float]}] -> float

    Average ``feature_sub`` cost over every pairing of a segment from
    ``ws1`` with a segment from ``ws2``.

    Each word list is run through ``transpose_word`` and flattened with
    ``concat``; ``lst.cross`` then pairs the two segment lists and
    ``fnc.uncurry(feature_sub)`` is applied to each pairing (presumably
    adapting the two-argument function to take one pair -- confirm).
    """
    segs1 = concat(transpose_word(ws1))
    # The second list must come from ws2; it was previously built from ws1,
    # which left ws2 unused and compared ws1 with itself.
    segs2 = concat(transpose_word(ws2))
    pair_cost = fnc.uncurry(feature_sub)
    return lst.avg([pair_cost(pairing) for pairing in lst.cross(segs1, segs2)])
avgregions = lst.avg(map(sed_avg_total, regions)) return dict(zip(keys, map(sed_distance(avgregions), regions))) def feature_sub(seg1, seg2): "({str:float}*{str:float}) -> float" return (len(set(seg1) ^ set(seg2)) + sum(abs(f1-f2) for f1,f2 in dct.zip(seg1,seg2).values())) @curried def sed_distance(avg, (region1, region2)): "float*([[{str:[float]}]],[[{str:[float]}]])->float" return sum(map(sed_levenshtein(avg), zip(region1, region2))) def transpose_word(word): "[{str:[float]}] -> [[{str:float}]]" def transpose_segment(seg): return [dict(zip(seg.keys(), ns)) for ns in lst.transpose(seg.values())] return lst.transpose(map(transpose_segment, word)) @curried def sed_levenshtein(avg,(ws1,ws2)): "float*([{str:[float]}],[{str:[float]}])->float" def levenshtein((w1, w2)): return lev._levenshtein(w1, w2, avg, (lambda _:avg,lambda _:avg,feature_sub))[-1][-1] return lst.avg(map(levenshtein, lst.cross(transpose_word(ws1), transpose_word(ws2)))) def sed_avg(ws1, ws2): "[{str:[float]}]*[{str:[float]}] -> float" segs1,segs2 = (concat(transpose_word(ws1)), concat(transpose_word(ws1))) return lst.avg(map(fnc.uncurry(feature_sub), lst.cross(segs1, segs2))) def sed_avg_total((region1, region2)): "([[{str:[float]}]],[[{str:[float]}]]) -> float" return lst.avg(map(sed_avg, region1, region2)) / 2