def _merge_terms(self, iterlist): # Merge-sorts terms coming from a list of term iterators. # Create a map so we can look up each iterator by its id() value itermap = {} for it in iterlist: itermap[id(it)] = it # Fill in the list with the head term from each iterator. current = [] for it in iterlist: term = next(it) current.append((term, id(it))) heapify(current) # Number of active iterators active = len(current) while active: # Peek at the first term in the sorted list term = current[0][0] # Re-iterate on all items in the list that have that term while active and current[0][0] == term: it = itermap[current[0][1]] try: nextterm = next(it) heapreplace(current, (nextterm, id(it))) except StopIteration: heappop(current) active -= 1 # Yield the term yield term
def __iter__(self): ids = iter(self.idset) try: nx = next(ids) except StopIteration: nx = -1 for i in range(self.limit): if i == nx: try: nx = next(ids) except StopIteration: nx = -1 else: yield i
def __iter__(self):
    """Yield each number below ``self.limit`` that is not the next id due
    from ``self.idset``.

    Ids are pulled from ``self.idset`` one at a time as the counter reaches
    them, so this presumably relies on the id set iterating in ascending
    order -- verify against the idset implementation.
    """
    NO_MORE = -1  # never equal to a value produced by xrange()
    remaining_ids = iter(self.idset)
    upcoming = next(remaining_ids, NO_MORE)
    for number in xrange(self.limit):
        if number == upcoming:
            # This number is in the id set: leave it out and fetch the next id
            upcoming = next(remaining_ids, NO_MORE)
            continue
        yield number
def remap(state):
    """Return the number recorded for ``state`` in ``mapping``.

    A state seen for the first time is given the next value drawn from ``c``
    and remembered. ``mapping`` and ``c`` come from the enclosing scope.
    """
    if state not in mapping:
        mapping[state] = next(c)
    return mapping[state]
def _merge_terms(self, iterlist): # Merge-sorts terms coming from a list of term iterators. # Create a map so we can look up each iterator by its id() value itermap = {} for it in iterlist: itermap[id(it)] = it # Fill in the list with the head term from each iterator. current = [] for it in iterlist: try: term = next(it) except StopIteration: continue current.append((term, id(it))) # Number of active iterators active = len(current) # If only one iterator is active, just yield from it and return if active == 1: term, itid = current[0] it = itermap[itid] yield term for term in it: yield term return # Otherwise, do a streaming heap sort of the terms from the iterators heapify(current) while active: # Peek at the first term in the sorted list term = current[0][0] # Re-iterate on all items in the list that have that term while active and current[0][0] == term: it = itermap[current[0][1]] try: nextterm = next(it) heapreplace(current, (nextterm, id(it))) except StopIteration: heappop(current) active -= 1 # Yield the term yield term
def get_texts(archive):
    """Yield the contents of every file member of a gzip-compressed tar archive.

    :param archive: path of the ``.tar.gz`` file to read.
    :returns: a generator producing the bytes of each member for which
        ``extractfile()`` returns a file object; members for which it returns
        ``None`` are skipped.
    """
    # The context manager closes the archive when the generator is exhausted
    # or closed early.
    with tarfile.open(archive, "r:gz") as tar:
        while True:
            # TarFile has a next() *method* but no __next__, so the next()
            # builtin raises TypeError on Python 3. The method returns None
            # once the archive is exhausted.
            entry = tar.next()
            # TarFile remembers every member it has read; dropping that list
            # keeps memory flat on large archives.
            tar.members = []
            if entry is None:
                break
            f = tar.extractfile(entry)
            if f is not None:
                try:
                    text = f.read()
                finally:
                    f.close()
                yield text
def __call__(self, tokens): from itertools import tee count = len(self.filters) # Tee the token iterator and wrap each teed iterator with the # corresponding filter gens = [filter(t.copy() for t in gen) for filter, gen in zip(self.filters, tee(tokens, count))] # Keep a count of the number of running iterators running = count while running: for i, gen in enumerate(gens): if gen is not None: try: yield next(gen) except StopIteration: gens[i] = None running -= 1
def u_to_utf8(dfa, base=0):
    """Replace text-labelled transitions of ``dfa`` with UTF-8 byte chains.

    Every transition whose label is a text string is removed and replaced by
    one transition per byte of the label's UTF-8 encoding. All bytes but the
    last lead to a newly numbered intermediate state; the last byte leads to
    the original destination. ``EPSILON`` transitions are left untouched.

    :param dfa: automaton exposing ``transitions`` (source -> {label: dest})
        and ``add_transition(src, label, dest)``; modified in place.
    :param base: first number handed out to intermediate states.
        NOTE(review): presumably callers pass a value above every existing
        state number so new states cannot collide -- confirm.
    :raises Exception: if a transition is labelled ``ANY``.
    """
    c = itertools.count(base)
    transitions = dfa.transitions

    # Iterate over snapshots: add_transition() is called while we loop and
    # may add entries both to the per-state dict and (for the new
    # intermediate states, presumably) to the top-level mapping.
    for src, trans in list(iteritems(transitions)):
        for label, dest in list(iteritems(trans)):
            if label is EPSILON:
                continue
            if label is ANY:
                raise Exception("Cannot convert an ANY transition to UTF-8")
            assert isinstance(label, text_type)
            label8 = label.encode("utf8")

            # Drop the text transition before adding the byte chain, so the
            # removal can never hit a freshly added key that compares equal
            # to the label.
            del trans[label]

            # Walk the chain with a separate cursor. Reusing ``src`` here
            # would make the remaining labels of this state start from the
            # last intermediate state instead of from ``src``.
            state = src
            last = len(label8) - 1
            for i, byte in enumerate(label8):
                if i < last:
                    st = next(c)
                    dfa.add_transition(state, byte, st)
                    state = st
                else:
                    dfa.add_transition(state, byte, dest)
def strings_dfa(strings):
    """Build a DFA from a strictly ascending sequence of non-empty strings.

    Nodes for the current string are kept on a stack (``nodes``). For each
    new string, the part of the previous string beyond their common prefix is
    handed to ``add_suffix`` and fresh nodes are pushed for the new string's
    remaining characters.

    :param strings: iterable of strings, each greater than the one before.
    :returns: the ``DFA`` instance created here and passed to ``add_suffix``.
    :raises Exception: if a string is empty, or is not greater than the
        previous one.
    """
    dfa = DFA(0)
    c = itertools.count(1)

    last = ""
    seen = {}
    nodes = [DMNode(0)]

    for string in strings:
        # Test for emptiness first: "" <= last is always true, so with the
        # checks the other way round an empty string is reported as an
        # ordering problem and this message can never be produced.
        if not string:
            raise Exception("Can't add empty string")
        if string <= last:
            raise Exception("Strings must be in order")

        # Find the common prefix with the previous string
        prefixlen = 0
        for prev_char, char in zip(last, string):
            if prev_char != char:
                break
            prefixlen += 1

        # Freeze the transitions after the prefix, since they're not shared
        add_suffix(dfa, nodes, last, prefixlen + 1, seen)

        # Create new nodes for the substring after the prefix
        for label in string[prefixlen:]:
            node = DMNode(next(c))
            # Create an arc from the previous node to this node
            nodes[-1].arcs[label] = node.n
            nodes.append(node)
        # Mark the last node as an accept state
        nodes[-1].final = True

        last = string

    # Anything beyond the root node belongs to the final string and still
    # has to be handed to add_suffix.
    if len(nodes) > 1:
        add_suffix(dfa, nodes, last, 0, seen)
    return dfa
def __call__(self, tokens): # Only selects on the first token t = next(tokens) filter = self.filters.get(t.mode, self.default_filter) return filter(chain([t], tokens))