def testDeletions(self):
    """Check that deleted keys read back as 0 while the others keep their value.

    The tree is filled with one value per key produced by ``cartezian``,
    20 randomly chosen keys are deleted, and the contents are verified both
    before and after ``dt.tree.vanish()`` is called.
    """
    tree = DT(self.w, 'mydt', self.V_N, DT.NullTree)
    axes = [range(self.V_LEN) for _ in range(self.V_N)]
    numbered = list(enumerate(cartezian(*axes)))
    for value, key in numbered:
        tree[key] = value

    # Random order so that an arbitrary subset of keys gets deleted.
    random.shuffle(numbered)
    removed = numbered[:20]
    kept = numbered[20:]
    for _, key in removed:
        del tree[key]

    def verify():
        # Deleted entries must read as 0, surviving ones as stored.
        for _, key in removed:
            self.assertEqual(0, tree[key])
        for value, key in kept:
            self.assertEqual(value, tree[key])

    verify()
    tree.tree.vanish()
    verify()
def testCartezian(self):
    """Store one value per key produced by ``cartezian`` and read them all back."""
    tree = DT(self.w, 'mydt', self.V_N, DT.NullTree)
    axes = [range(self.V_LEN) for _ in range(self.V_N)]
    numbered = list(enumerate(cartezian(*axes)))

    # Fill the whole tree first, then verify, so that later writes cannot
    # mask a corrupted earlier entry.
    for value, key in numbered:
        tree[key] = value
    for expected, key in numbered:
        self.assertEqual(expected, tree[key])
def testIO(self):
    """Round-trip a filled tree through a file and check the reloaded contents."""
    path = 'gmtk/test.dt'
    tree = DT(self.w, 'mydt', self.V_N, DT.NullTree)
    axes = [range(self.V_LEN) for _ in range(self.V_N)]
    numbered = list(enumerate(cartezian(*axes)))
    for value, key in numbered:
        tree[key] = value

    # Write, drop the in-memory object from the workspace, then read back so
    # the assertions below exercise the object loaded from the file.
    self.w.writeToFile(DT, path)
    del self.w[DT, 'mydt']
    self.w.readFromFile(DT, path)

    reloaded = self.w[DT, 'mydt']
    for expected, key in numbered:
        self.assertEqual(expected, reloaded[key])
def genStates(self):
    """Generate the arcs of the state network, expanding states best-first.

    Expansion starts from the state made of four ``EMPTY_CONCEPT`` entries.
    States wait in a sorted work list keyed by
    ``(backoff level, accumulated -log probability, state)`` and the
    smallest entry is expanded first.  For every pop count whose
    probability exceeds ``self.cutoff_trans`` an arc leads to an interim
    state ``(c_t, c_inter)``; from there, for every push whose transition
    probability exceeds the cut-off, arcs lead to the resulting state
    ``c_new``, one per input symbol whose probability exceeds
    ``self.cutoff_sym``.

    Yields:
        Tuples ``(source_state, target_state, isym, osym, prob)``.  The
        states and ``osym`` are the ids returned by ``SymMap.add``;
        ``isym`` is the renumbered input symbol (or ``self._pop_`` for
        pop arcs).

    Side effects:
        After the work list is exhausted (or ``self.max_states`` states
        have been expanded) sets ``self.stateMap``, ``self.osymMap``,
        ``self.isymMaps`` and ``self.ipteMap`` and logs backoff
        statistics.  Because this is a generator, none of that happens
        if the caller stops iterating early.
    """
    processed = set()
    backoff_stat = ADict(default=set)
    osym_map = SymMap()
    osym_map['epsilon'] = 0

    # Stack operation distributions.
    pop_Given_C = self.workspace[gmtk.SCPT, 'popGivenC1C2C3C4']
    push_Given_C = self.workspace[gmtk.SCPT, 'pushGivenC1C2C3C4']

    # Distributions of pushed concepts, with the backoff level selector.
    c1_Given_C234 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3C4']
    c1_Given_C23 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3']
    c1_Given_C2 = self.workspace[gmtk.DCPT, 'concept1GivenC2']
    c1_backoff = self.workspace[gmtk.DT, 'backoffC2C3C4']

    c2_Given_C = self.workspace[gmtk.SCPT, 'concept2GivenC3C4']

    # Input symbol distributions for the three symbol streams; all three
    # share the backoff level selector `s1_backoff`.
    s1_Given_C1234 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3C4']
    s1_Given_C123 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3']
    s1_Given_C12 = self.workspace[gmtk.SCPT, 's1GivenC1C2']
    s1_Given_C1 = self.workspace[gmtk.DCPT, 's1GivenC1']
    s1_Unigram = self.workspace[gmtk.DCPT, 's1Unigram']
    s1_backoff = self.workspace[gmtk.DT, 'backoffC1C2C3C4']

    s2_Given_C1234 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3C4']
    s2_Given_C123 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3']
    s2_Given_C12 = self.workspace[gmtk.SCPT, 's2GivenC1C2']
    s2_Given_C1 = self.workspace[gmtk.DCPT, 's2GivenC1']
    s2_Unigram = self.workspace[gmtk.DCPT, 's2Unigram']

    s3_Given_C1234 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3C4']
    s3_Given_C123 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3']
    s3_Given_C12 = self.workspace[gmtk.SCPT, 's3GivenC1C2']
    s3_Given_C1 = self.workspace[gmtk.DCPT, 's3GivenC1']
    s3_Unigram = self.workspace[gmtk.DCPT, 's3Unigram']

    conceptMap = self.conceptMap
    _EMPTY_ = conceptMap[EMPTY_CONCEPT]
    _DUMMY_ = conceptMap.get(DUMMY_CONCEPT, None)
    allConcepts = sorted(conceptMap.values())

    # Renumber all input symbols into one id space.  `count` is the offset
    # of the current symbol table; it starts at 1 (id 0 is used for
    # 'epsilon' in `osym_map`; presumably the input side reserves it too).
    symbols = []
    maps = []    # per stream: original symbol id -> renumbered id
    maps2 = []   # per stream: symbol key -> renumbered id
    count = 1

    pte_map = SymMap()
    pte_map2 = SymMap()
    if self.pteMap:
        pte_symbols = sorted(self.pteMap.values())
        for key, value in sorted(self.pteMap.items()):
            pte_map[value] = value+count
            pte_map2[key] = value+count
        count += len(pte_map)
    else:
        pte_symbols = []

    for sym_map in self.symbolMaps:
        if sym_map is None:
            sym_map = {}
        symbols.append(sorted(sym_map.values()))
        new_map = SymMap()
        new_map2 = SymMap()
        for key, value in sorted(sym_map.items()):
            new_map[value] = value+count
            new_map2[key] = value+count
        count += len(new_map)
        maps.append(new_map)
        maps2.append(new_map2)

    s0 = (_EMPTY_,)*4

    cutoff_sym = self.cutoff_sym
    cutoff_trans = self.cutoff_trans
    max_states = self.max_states
    logger = self.logger

    # Work list kept sorted by `insort`; `stack_set` mirrors its states
    # for fast membership tests.
    stack = [(0, 0, s0)]
    stack_set = set([s0])

    state_map = SymMap()
    state_map[s0] = 0

    _pop_ = self._pop_

    # Number of interim (post-pop) states; subtracted from the size of
    # `state_map` so that the progress log counts only regular states.
    interim_counter = 0
    n_arcs = 0

    while stack:
        if max_states is None:
            total_states = len(state_map) - interim_counter
        else:
            total_states = max_states
        if logger is not None:
            logger.debug(' #states (unexpanded/total) %.2f%%, %d/%d, #arcs %d', 100.*len(processed)/total_states, total_states-len(processed), total_states, n_arcs)

        # Smallest (backoff, distance, state) entry first.
        c_t_backoff, c_t_dist, c_t = stack.pop(0)
        backoff_stat[c_t_backoff].add(c_t)
        if logger is not None:
            logger.debug(' %.2f: %s, backoff=%d', c_t_dist, self.strState(c_t), c_t_backoff)
        # NOTE(review): the looked-up value is never used; the lookup is
        # kept because it may be relied on to fail for a state that was
        # queued but never entered into `state_map` -- confirm.
        state_c_t = state_map[c_t]
        processed.add(c_t)
        stack_set.remove(c_t)

        ret = []

        pop_pmf = list(pop_Given_C[: c_t[0], c_t[1], c_t[2], c_t[3]])
        push_pmf = list(push_Given_C[: c_t[0], c_t[1], c_t[2], c_t[3]])

        for pop in range(0, MAX_POP+1):
            prob_pop = pop_pmf[pop]
            if prob_pop <= cutoff_trans:
                continue

            interim_counter += 1
            # Stack after popping: shifted by `pop`, padded with empties.
            c_inter = c_t[pop:] + (_EMPTY_, ) * pop
            osym = ')'*pop
            if not osym:
                osym = 'epsilon'
            ret.append( (prob_pop, c_t, (c_t, c_inter), _pop_, osym) )

            for push in range(0, MAX_PUSH+1):
                prob_push = push_pmf[push]

                if push == 0:
                    to_push_all = [()]
                else:
                    to_push_all = cartezian(*[allConcepts]*push)

                for to_push in to_push_all:
                    c_new = (to_push + c_inter)[:DEPTH]

                    # Only the pure no-op (no pop, no push) may stay in
                    # the same state.
                    if (c_t == c_new) and not (push == pop == 0):
                        continue
                    if _DUMMY_ in c_new[1:]:
                        continue

                    # Output symbol
                    osym = ''
                    for push_concept in reversed(to_push):
                        osym += conceptMap.inverse[push_concept]+'('
                    if not osym:
                        osym = 'epsilon'

                    # Smoothing
                    backoff = c1_backoff[c_new[1], c_new[2], c_new[3]]
                    if backoff == 0:
                        c1_pmf = c1_Given_C234[: c_new[1], c_new[2], c_new[3]]
                    elif backoff == 1:
                        c1_pmf = c1_Given_C23[: c_new[1], c_new[2]]
                    else:
                        c1_pmf = c1_Given_C2[: c_new[1]]
                    c2_pmf = c2_Given_C[: c_new[2], c_new[3]]

                    # NOTE(review): only push counts 0..2 are handled; for
                    # a larger MAX_PUSH `prob_new_c` would keep the value
                    # of an earlier iteration -- confirm MAX_PUSH <= 2.
                    if push == 0:
                        prob_new_c = 1.0
                    elif push == 1:
                        prob_new_c = c1_pmf[to_push[0]]
                    elif push == 2:
                        prob_new_c = c1_pmf[to_push[0]] * c2_pmf[to_push[1]]

                    prob_trans = prob_push * prob_new_c

                    # Do cut-off
                    if prob_trans <= cutoff_trans:
                        continue

                    # Smoothing
                    backoff = s1_backoff[c_new[0], c_new[1], c_new[2], c_new[3]]
                    if backoff == 0:
                        s_pmf = [list(s1_Given_C1234[: c_new[0], c_new[1], c_new[2], c_new[3]]),
                                 list(s2_Given_C1234[: c_new[0], c_new[1], c_new[2], c_new[3]]),
                                 list(s3_Given_C1234[: c_new[0], c_new[1], c_new[2], c_new[3]])]
                    elif backoff == 1:
                        s_pmf = [list(s1_Given_C123[: c_new[0], c_new[1], c_new[2]]),
                                 list(s2_Given_C123[: c_new[0], c_new[1], c_new[2]]),
                                 list(s3_Given_C123[: c_new[0], c_new[1], c_new[2]])]
                    elif backoff == 2:
                        s_pmf = [list(s1_Given_C12[: c_new[0], c_new[1]]),
                                 list(s2_Given_C12[: c_new[0], c_new[1]]),
                                 list(s3_Given_C12[: c_new[0], c_new[1]])]
                    elif backoff == 3:
                        s_pmf = [list(s1_Given_C1[: c_new[0]]),
                                 list(s2_Given_C1[: c_new[0]]),
                                 list(s3_Given_C1[: c_new[0]])]
                    else:
                        s_pmf = [list(s1_Unigram),
                                 list(s2_Unigram),
                                 list(s3_Unigram)]

                    # Queue the target state once, keyed by its backoff
                    # level and accumulated -log probability.
                    if c_new not in processed and c_new not in stack_set:
                        stack_set.add(c_new)
                        c_new_dist = c_t_dist-log(prob_trans)
                        insort(stack, (backoff, c_new_dist, c_new))

                    c_next = (c_t, c_inter)

                    if pte_symbols and c_inter == (_EMPTY_,)*4 and push != 0:
                        for pte_sym in pte_symbols:
                            prob_ptesym = 1.0
                            pte_sym = pte_map[pte_sym]
                            pte_osym = pte_map2.inverse[pte_sym]
                            ret.append( (prob_trans*prob_ptesym, c_next, c_new, pte_sym, pte_osym) )
                        # The transition probability has been spent on the
                        # arcs above; the following arcs start in c_new.
                        prob_trans = 1.0
                        c_next = c_new

                    for sym, stream_map, pmf in zip(symbols, maps, s_pmf):
                        if stream_map is None:
                            continue

                        for isym in sym:
                            prob_isym = pmf[isym]

                            # Do cut-off
                            if prob_isym <= cutoff_sym:
                                continue
                            else:
                                isym = stream_map[isym]
                                ret.append( (prob_trans*prob_isym, c_next, c_new, isym, osym) )

                        # For symbols other than the first
                        prob_trans = 1.0
                        c_next = c_new
                        osym = 'epsilon'

        for prob, c_t, c_new, isym, osym in ret:
            # Target is registered before source; this order determines
            # the ids handed out by `state_map`.
            state_c_new = state_map.add(c_new)
            state_c_t = state_map.add(c_t)
            osym = osym_map.add(osym)
            n_arcs += 1

            yield state_c_t, state_c_new, isym, osym, prob

        if max_states is not None and len(processed) >= max_states:
            break

    self.stateMap = self.convertStateMap(state_map)
    self.osymMap = osym_map
    self.isymMaps = maps2
    self.ipteMap = pte_map2

    # Collapse the per-level state sets into counts.  `items()` is used
    # instead of the Python-2-only `iteritems()`.
    backoff_stat = ADict((k, len(v)) for (k, v) in backoff_stat.items())
    if logger is not None:
        logger.debug('Backoff statistics:')
        logger.debug('===================')
        total = backoff_stat.sum()
        for key, value in sorted(backoff_stat.items()):
            logger.debug(' backoff=%d: #%d (%.2f%%)', key, value, 100.*value/total)
def getPossibleParents(self):
    """Return every combination of parent values.

    Builds ``range(card)`` for each entry of ``self.parentCards`` and hands
    them to ``cartezian``.  When there are no parents the result is a list
    holding the single empty tuple.
    """
    value_ranges = [range(card) for card in self.parentCards]
    if not value_ranges:
        # No parents: exactly one (empty) parent assignment.
        return [()]
    return cartezian(*value_ranges)
def genStates(self):
    """Generate the arcs of the state network, expanding states best-first.

    Expansion starts from the state made of four ``EMPTY_CONCEPT`` entries.
    States wait in a sorted work list keyed by
    ``(backoff level, accumulated -log probability, state)`` and the
    smallest entry is expanded first.  For every pop count whose
    probability exceeds ``self.cutoff_trans`` an arc leads to an interim
    state ``(c_t, c_inter)``; from there, for every push whose transition
    probability exceeds the cut-off, arcs lead to the resulting state
    ``c_new``, one per input symbol whose probability exceeds
    ``self.cutoff_sym``.

    Yields:
        Tuples ``(source_state, target_state, isym, osym, prob)``.  The
        states and ``osym`` are the ids returned by ``SymMap.add``;
        ``isym`` is the renumbered input symbol (or ``self._pop_`` for
        pop arcs).

    Side effects:
        After the work list is exhausted (or ``self.max_states`` states
        have been expanded) sets ``self.stateMap``, ``self.osymMap``,
        ``self.isymMaps`` and ``self.ipteMap`` and logs backoff
        statistics.  Because this is a generator, none of that happens
        if the caller stops iterating early.
    """
    processed = set()
    backoff_stat = ADict(default=set)
    osym_map = SymMap()
    osym_map['epsilon'] = 0

    # Stack operation distributions.
    pop_Given_C = self.workspace[gmtk.SCPT, 'popGivenC1C2C3C4']
    push_Given_C = self.workspace[gmtk.SCPT, 'pushGivenC1C2C3C4']

    # Distributions of pushed concepts, with the backoff level selector.
    c1_Given_C234 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3C4']
    c1_Given_C23 = self.workspace[gmtk.SCPT, 'concept1GivenC2C3']
    c1_Given_C2 = self.workspace[gmtk.DCPT, 'concept1GivenC2']
    c1_backoff = self.workspace[gmtk.DT, 'backoffC2C3C4']

    c2_Given_C = self.workspace[gmtk.SCPT, 'concept2GivenC3C4']

    # Input symbol distributions for the three symbol streams; all three
    # share the backoff level selector `s1_backoff`.
    s1_Given_C1234 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3C4']
    s1_Given_C123 = self.workspace[gmtk.SCPT, 's1GivenC1C2C3']
    s1_Given_C12 = self.workspace[gmtk.SCPT, 's1GivenC1C2']
    s1_Given_C1 = self.workspace[gmtk.DCPT, 's1GivenC1']
    s1_Unigram = self.workspace[gmtk.DCPT, 's1Unigram']
    s1_backoff = self.workspace[gmtk.DT, 'backoffC1C2C3C4']

    s2_Given_C1234 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3C4']
    s2_Given_C123 = self.workspace[gmtk.SCPT, 's2GivenC1C2C3']
    s2_Given_C12 = self.workspace[gmtk.SCPT, 's2GivenC1C2']
    s2_Given_C1 = self.workspace[gmtk.DCPT, 's2GivenC1']
    s2_Unigram = self.workspace[gmtk.DCPT, 's2Unigram']

    s3_Given_C1234 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3C4']
    s3_Given_C123 = self.workspace[gmtk.SCPT, 's3GivenC1C2C3']
    s3_Given_C12 = self.workspace[gmtk.SCPT, 's3GivenC1C2']
    s3_Given_C1 = self.workspace[gmtk.DCPT, 's3GivenC1']
    s3_Unigram = self.workspace[gmtk.DCPT, 's3Unigram']

    conceptMap = self.conceptMap
    _EMPTY_ = conceptMap[EMPTY_CONCEPT]
    _DUMMY_ = conceptMap.get(DUMMY_CONCEPT, None)
    allConcepts = sorted(conceptMap.values())

    # Renumber all input symbols into one id space.  `count` is the offset
    # of the current symbol table; it starts at 1 (id 0 is used for
    # 'epsilon' in `osym_map`; presumably the input side reserves it too).
    symbols = []
    maps = []    # per stream: original symbol id -> renumbered id
    maps2 = []   # per stream: symbol key -> renumbered id
    count = 1

    pte_map = SymMap()
    pte_map2 = SymMap()
    if self.pteMap:
        pte_symbols = sorted(self.pteMap.values())
        for key, value in sorted(self.pteMap.items()):
            pte_map[value] = value + count
            pte_map2[key] = value + count
        count += len(pte_map)
    else:
        pte_symbols = []

    for sym_map in self.symbolMaps:
        if sym_map is None:
            sym_map = {}
        symbols.append(sorted(sym_map.values()))
        new_map = SymMap()
        new_map2 = SymMap()
        for key, value in sorted(sym_map.items()):
            new_map[value] = value + count
            new_map2[key] = value + count
        count += len(new_map)
        maps.append(new_map)
        maps2.append(new_map2)

    s0 = (_EMPTY_, ) * 4

    cutoff_sym = self.cutoff_sym
    cutoff_trans = self.cutoff_trans
    max_states = self.max_states
    logger = self.logger

    # Work list kept sorted by `insort`; `stack_set` mirrors its states
    # for fast membership tests.
    stack = [(0, 0, s0)]
    stack_set = set([s0])

    state_map = SymMap()
    state_map[s0] = 0

    _pop_ = self._pop_

    # Number of interim (post-pop) states; subtracted from the size of
    # `state_map` so that the progress log counts only regular states.
    interim_counter = 0
    n_arcs = 0

    while stack:
        if max_states is None:
            total_states = len(state_map) - interim_counter
        else:
            total_states = max_states
        if logger is not None:
            logger.debug(
                ' #states (unexpanded/total) %.2f%%, %d/%d, #arcs %d',
                100. * len(processed) / total_states,
                total_states - len(processed), total_states, n_arcs)

        # Smallest (backoff, distance, state) entry first.
        c_t_backoff, c_t_dist, c_t = stack.pop(0)
        backoff_stat[c_t_backoff].add(c_t)
        if logger is not None:
            logger.debug(' %.2f: %s, backoff=%d', c_t_dist,
                         self.strState(c_t), c_t_backoff)
        # NOTE(review): the looked-up value is never used; the lookup is
        # kept because it may be relied on to fail for a state that was
        # queued but never entered into `state_map` -- confirm.
        state_c_t = state_map[c_t]
        processed.add(c_t)
        stack_set.remove(c_t)

        ret = []

        pop_pmf = list(pop_Given_C[:c_t[0], c_t[1], c_t[2], c_t[3]])
        push_pmf = list(push_Given_C[:c_t[0], c_t[1], c_t[2], c_t[3]])

        for pop in range(0, MAX_POP + 1):
            prob_pop = pop_pmf[pop]
            if prob_pop <= cutoff_trans:
                continue

            interim_counter += 1
            # Stack after popping: shifted by `pop`, padded with empties.
            c_inter = c_t[pop:] + (_EMPTY_, ) * pop
            osym = ')' * pop
            if not osym:
                osym = 'epsilon'
            ret.append((prob_pop, c_t, (c_t, c_inter), _pop_, osym))

            for push in range(0, MAX_PUSH + 1):
                prob_push = push_pmf[push]

                if push == 0:
                    to_push_all = [()]
                else:
                    to_push_all = cartezian(*[allConcepts] * push)

                for to_push in to_push_all:
                    c_new = (to_push + c_inter)[:DEPTH]

                    # Only the pure no-op (no pop, no push) may stay in
                    # the same state.
                    if (c_t == c_new) and not (push == pop == 0):
                        continue
                    if _DUMMY_ in c_new[1:]:
                        continue

                    # Output symbol
                    osym = ''
                    for push_concept in reversed(to_push):
                        osym += conceptMap.inverse[push_concept] + '('
                    if not osym:
                        osym = 'epsilon'

                    # Smoothing
                    backoff = c1_backoff[c_new[1], c_new[2], c_new[3]]
                    if backoff == 0:
                        c1_pmf = c1_Given_C234[:c_new[1], c_new[2], c_new[3]]
                    elif backoff == 1:
                        c1_pmf = c1_Given_C23[:c_new[1], c_new[2]]
                    else:
                        c1_pmf = c1_Given_C2[:c_new[1]]
                    c2_pmf = c2_Given_C[:c_new[2], c_new[3]]

                    # NOTE(review): only push counts 0..2 are handled; for
                    # a larger MAX_PUSH `prob_new_c` would keep the value
                    # of an earlier iteration -- confirm MAX_PUSH <= 2.
                    if push == 0:
                        prob_new_c = 1.0
                    elif push == 1:
                        prob_new_c = c1_pmf[to_push[0]]
                    elif push == 2:
                        prob_new_c = c1_pmf[to_push[0]] * c2_pmf[to_push[1]]

                    prob_trans = prob_push * prob_new_c

                    # Do cut-off
                    if prob_trans <= cutoff_trans:
                        continue

                    # Smoothing
                    backoff = s1_backoff[c_new[0], c_new[1], c_new[2],
                                         c_new[3]]
                    if backoff == 0:
                        s_pmf = [
                            list(s1_Given_C1234[:c_new[0], c_new[1],
                                                c_new[2], c_new[3]]),
                            list(s2_Given_C1234[:c_new[0], c_new[1],
                                                c_new[2], c_new[3]]),
                            list(s3_Given_C1234[:c_new[0], c_new[1],
                                                c_new[2], c_new[3]])
                        ]
                    elif backoff == 1:
                        s_pmf = [
                            list(s1_Given_C123[:c_new[0], c_new[1],
                                               c_new[2]]),
                            list(s2_Given_C123[:c_new[0], c_new[1],
                                               c_new[2]]),
                            list(s3_Given_C123[:c_new[0], c_new[1],
                                               c_new[2]])
                        ]
                    elif backoff == 2:
                        s_pmf = [
                            list(s1_Given_C12[:c_new[0], c_new[1]]),
                            list(s2_Given_C12[:c_new[0], c_new[1]]),
                            list(s3_Given_C12[:c_new[0], c_new[1]])
                        ]
                    elif backoff == 3:
                        s_pmf = [
                            list(s1_Given_C1[:c_new[0]]),
                            list(s2_Given_C1[:c_new[0]]),
                            list(s3_Given_C1[:c_new[0]])
                        ]
                    else:
                        s_pmf = [
                            list(s1_Unigram),
                            list(s2_Unigram),
                            list(s3_Unigram)
                        ]

                    # Queue the target state once, keyed by its backoff
                    # level and accumulated -log probability.
                    if c_new not in processed and c_new not in stack_set:
                        stack_set.add(c_new)
                        c_new_dist = c_t_dist - log(prob_trans)
                        insort(stack, (backoff, c_new_dist, c_new))

                    c_next = (c_t, c_inter)

                    if pte_symbols and c_inter == (
                            _EMPTY_, ) * 4 and push != 0:
                        for pte_sym in pte_symbols:
                            prob_ptesym = 1.0
                            pte_sym = pte_map[pte_sym]
                            pte_osym = pte_map2.inverse[pte_sym]
                            ret.append((prob_trans * prob_ptesym, c_next,
                                        c_new, pte_sym, pte_osym))
                        # The transition probability has been spent on the
                        # arcs above; the following arcs start in c_new.
                        prob_trans = 1.0
                        c_next = c_new

                    for sym, stream_map, pmf in zip(symbols, maps, s_pmf):
                        if stream_map is None:
                            continue

                        for isym in sym:
                            prob_isym = pmf[isym]

                            # Do cut-off
                            if prob_isym <= cutoff_sym:
                                continue
                            else:
                                isym = stream_map[isym]
                                ret.append((prob_trans * prob_isym, c_next,
                                            c_new, isym, osym))

                        # For symbols other than the first
                        prob_trans = 1.0
                        c_next = c_new
                        osym = 'epsilon'

        for prob, c_t, c_new, isym, osym in ret:
            # Target is registered before source; this order determines
            # the ids handed out by `state_map`.
            state_c_new = state_map.add(c_new)
            state_c_t = state_map.add(c_t)
            osym = osym_map.add(osym)
            n_arcs += 1

            yield state_c_t, state_c_new, isym, osym, prob

        if max_states is not None and len(processed) >= max_states:
            break

    self.stateMap = self.convertStateMap(state_map)
    self.osymMap = osym_map
    self.isymMaps = maps2
    self.ipteMap = pte_map2

    # Collapse the per-level state sets into counts.  `items()` is used
    # instead of the Python-2-only `iteritems()`.
    backoff_stat = ADict(
        (k, len(v)) for (k, v) in backoff_stat.items())
    if logger is not None:
        logger.debug('Backoff statistics:')
        logger.debug('===================')
        total = backoff_stat.sum()
        for key, value in sorted(backoff_stat.items()):
            logger.debug(' backoff=%d: #%d (%.2f%%)', key, value,
                         100. * value / total)