def test_irange():
    ss = SortedSet(load=7)

    assert [] == list(ss.irange())

    values = list(range(53))
    ss.update(values)

    for start in range(53):
        for end in range(start, 53):
            assert list(ss.irange(start, end)) == values[start:(end + 1)]
            assert list(ss.irange(start, end, reverse=True)) == values[start:(end + 1)][::-1]

    for start in range(53):
        for end in range(start, 53):
            assert list(range(start, end)) == list(ss.irange(start, end, (True, False)))

    for start in range(53):
        for end in range(start, 53):
            assert list(range(start + 1, end + 1)) == list(ss.irange(start, end, (False, True)))

    for start in range(53):
        for end in range(start, 53):
            assert list(range(start + 1, end)) == list(ss.irange(start, end, (False, False)))

    for start in range(53):
        assert list(range(start, 53)) == list(ss.irange(start))

    for end in range(53):
        assert list(range(0, end)) == list(ss.irange(None, end, (True, False)))

    assert values == list(ss.irange(inclusive=(False, False)))

    assert [] == list(ss.irange(53))
    assert values == list(ss.irange(None, 53, (True, False)))
def create(self, head_vars, affected_lines, primary_loc_keys, ircfg, address):
    self._address = hex(address) if address else "None"
    affected_exprs = {}
    dp = depgraph.DependencyGraph(ircfg, True)
    for block_loc_key in affected_lines:
        block = ircfg.blocks[block_loc_key]
        cur_affected_exprs = SortedSet(key=lambda x: str(x))
        for line_nb in affected_lines[block_loc_key]:
            affected_assignments = block.assignblks[line_nb]
            for ind, (dst, src) in enumerate(affected_assignments.items()):
                if type(src) not in [ExprInt, ExprMem]:
                    res = next(dp.get(block_loc_key, {dst}, ind, {block_loc_key}))
                    cur_affected_exprs.update(
                        filter(lambda x: not is_bad_expr(x), res.pending.keys()))
        affected_exprs[block_loc_key] = cur_affected_exprs
    loop = FlatteningLoop(list(head_vars), primary_loc_keys, affected_lines,
                          affected_exprs, self.loc_db.add_location())
    upd = {}
    for i in loop.primary_loc_keys:
        if i in self._loc_key_to_loop:
            raise RuntimeError("Overlap of primary blocks of the flattening loops")
        upd[i] = loop
    self._loc_key_to_loop.update(upd)
    self.loops.append(loop)
    return loop
def test_islice():
    ss = SortedSet(load=7)

    assert [] == list(ss.islice())

    values = list(range(53))
    ss.update(values)

    for start in range(53):
        for stop in range(53):
            assert list(ss.islice(start, stop)) == values[start:stop]

    for start in range(53):
        for stop in range(53):
            assert list(ss.islice(start, stop, reverse=True)) == values[start:stop][::-1]

    for start in range(53):
        assert list(ss.islice(start=start)) == values[start:]
        assert list(ss.islice(start=start, reverse=True)) == values[start:][::-1]

    for stop in range(53):
        assert list(ss.islice(stop=stop)) == values[:stop]
        assert list(ss.islice(stop=stop, reverse=True)) == values[:stop][::-1]
class Unit:
    __doc_id_count__ = 0

    @classmethod
    def genDocId(cls):
        cls.__doc_id_count__ += 1
        return cls.__doc_id_count__ - 1

    def __init__(self, file, unit_number):
        self.file = file
        self.unit_number = unit_number
        self.words = SortedSet([])
        self.docId = self.genDocId()

    def add(self, iterable):
        self.words.update(iterable)

    def keywords(self):
        return self.words

    def id(self):
        return self.docId

    def string(self):
        return "<" + str(self.docId) + ", " + self.file + ", unit " + str(self.unit_number) + ">"

    def __lt__(self, unit):
        return self.docId < unit.docId
def _calcGeneSNPcorr(self, cr, gene, REF, useAll=False):
    if self._joint and self._MAP is not None:
        G = self._GENEID[gene]
        P = SortedSet(REF[str(cr)][1].irange(G[1] - self._window, G[2] + self._window))
        if gene in self._MAP:
            P.update(list(REF[str(cr)][0].getSNPsPos(list(self._MAP[gene].keys()))))
        #P = list(set(P))
    elif self._MAP is None:
        G = self._GENEID[gene]
        P = REF[str(cr)][1].irange(G[1] - self._window, G[2] + self._window)
    else:
        if gene in self._MAP:
            P = set(REF[str(cr)][0].getSNPsPos(list(self._MAP[gene].keys())))
            useAll = True
        else:
            P = []

    DATA = REF[str(cr)][0].get(list(P))

    filtered = {}
    #use = []
    #RID = []

    # Sort out
    for D in DATA:
        # Select
        if (D[0] in self._GWAS or useAll) and (D[1] > self._MAF) and (
                D[0] not in filtered or D[1] < filtered[D[0]][0]):
            filtered[D[0]] = [D[1], D[2]]
            #use.append(D[2])
            #RID.append(s)

    # Calc corr
    RID = list(filtered.keys())
    use = []
    for i in range(0, len(RID)):
        use.append(filtered[RID[i]][1])

    use = np.array(use)

    if len(use) > 1:
        if self._useGPU:
            C = cp.asnumpy(cp.corrcoef(cp.asarray(use)))
        else:
            C = np.corrcoef(use)
    else:
        C = np.ones((1, 1))

    return C, np.array(RID)
class _CoinDataSet(object):
    def __init__(self, init_from_file=True):
        self._market = coinmarketcap.Market()
        self._data = SortedSet()
        if init_from_file:
            for filename in os.listdir(STORAGE_DIR):
                with open(os.path.join(STORAGE_DIR, filename), 'r') as fp:
                    datapoint_list = json.load(fp, object_hook=_CDPEncoder.decode_hook)
                    self._data.update(datapoint_list)

    def _DownloadNewDataPoint(self):
        cmc_dict = self._market.ticker(limit=0)
        data_to_store = {coin["symbol"]: coin["price_usd"] for coin in cmc_dict}
        self._data.add(_CoinDataPoint(timestamp(), data_to_store))
        self._DumpCurrentDayToFile()

    def _DumpAllToFile(self, filestr):
        data_to_dump = list(self._data)
        with open(filestr, 'w') as fp:
            json.dump(data_to_dump, fp, cls=_CDPEncoder)

    def _DumpCurrentDayToFile(self):
        # Midnight in unix time (system time zone)
        midnight = datetime.combine(date.today(), time.min)
        midnight_unix = int(midnight.timestamp())

        # All data since midnight.
        data_to_dump = list(self._data.irange(_CoinDataPoint(midnight_unix)))

        filestr = os.path.join(STORAGE_DIR, midnight.strftime('%Y-%m-%d.coinjson'))
        with open(filestr, 'w') as fp:
            json.dump(data_to_dump, fp, cls=_CDPEncoder)

    def GetValue(self, symbol, time=None):
        try:
            if not time:
                return float(self._data[-1].coin_data[symbol.upper()])
            else:
                bisect_point = self._data.bisect(_CoinDataPoint(time))
                # '== 0' rather than 'is 0': identity checks on ints are unreliable
                if bisect_point == 0:
                    return None
                return float(self._data[bisect_point - 1].coin_data[symbol.upper()])
        except (IndexError, KeyError):
            return None

    def GetDayChange(self, symbol):
        currentVal = self.GetValue(symbol)
        yesterday_time = datetime.today() - timedelta(days=1)
        oldVal = self.GetValue(symbol, yesterday_time.timestamp())
        if oldVal is None:
            return None
        return 100 * ((currentVal - oldVal) / oldVal)
def fit_note(self, note):
    # TODO: possibly add scale notes to valid notes
    chord = self.midi_state.active_notes(self.chord_channel)
    # TODO: this currently maps to black AND white keys, MelodicFlow maps only to white keys.
    # This extends the range on the keyboard, but this solution should be more easily compatible
    # with generated output, as we don't have to transpose the black keys.
    # TODO: do not recompute if same chord as before (cache valid notes)
    if chord:
        middle_octave_chords = 4
        middle_octave_melody = 8

        # If the input note is too low, transpose it upwards
        # to apply the harmonisation, then transpose it back down.
        transposed_octave = 0
        while note < (middle_octave_melody * 12):
            note += 12
            transposed_octave += 1

        # Root C note of all octaves
        octaves = list(range(0, 127, 12))

        # normalize chord to C0, then generate transposed chords for every octave
        lowest, count = min(chord), -1
        while lowest >= 0:
            count += 1
            lowest -= 12
        mapped_over_range = [[e - (12 * count) + octave for e in chord]
                             for octave in octaves]

        # get valid notes, split for positive and negative movement
        f_a = SortedSet([e for l in mapped_over_range[:middle_octave_chords] for e in l])
        f_a.update([e for l in major_notes[:middle_octave_chords] for e in l])
        f_b = SortedSet([e for l in mapped_over_range[middle_octave_chords:] for e in l])
        f_b.update([e for l in major_notes[middle_octave_chords:] for e in l])

        # get relative distance from played key to middle C of melody
        diff = note - octaves[middle_octave_melody]
        note -= transposed_octave * 12

        # clamp to valid note range
        diff = max(-len(f_a), min(diff, len(f_b) - 1))

        # jump to next valid note, either up or down
        if diff < 0:
            note = f_a[len(f_a) + diff]
        else:
            note = f_b[diff]

    # clamp note to valid MIDI note range
    note = max(0, min(note, 127))
    return note
def getOperators(self, *channels):
    operator_set = SortedSet()
    if channels:
        for channel in channels:
            operator_set.update(self._operators[channel])
    else:
        operator_set.update(self.operators)
    return operator_set
def monotone_bfs(self, iu):
    seen = set([iu])
    layers = [SortedSet([iu])]
    while True:
        next_layer = SortedSet()
        for iv in layers[-1]:
            next_layer.update(set(self.in_neighbours[iv]) - seen)
        if len(next_layer) == 0:
            break
        seen.update(next_layer)
        layers.append(next_layer)
    return layers
class SortedSetKey:
    def __init__(self):
        self.dict = dict()
        self.sorted_set = SortedSet(key=self.get_key)

    def __getitem__(self, item):
        return self.sorted_set[item]

    def __len__(self):
        return len(self.sorted_set)

    def __str__(self):
        return str(self.sorted_set)

    def get_key(self, value):
        return self.dict[value]

    def get_reversed_list(self, index, count):
        return self[-1 - index:-1 - index - count:-1]

    def values(self):
        for value in self.sorted_set:
            yield value

    def clear(self):
        self.sorted_set.clear()
        self.dict.clear()

    def destroy(self):
        self.sorted_set = None

    def index(self, value):
        return self.sorted_set.index(value)

    def pop(self, index=-1):
        return self.sorted_set.pop(index)

    def add(self, value, rank):
        # Remove before changing the rank: mutating the sort key of a value
        # still stored in the SortedSet would corrupt its internal ordering.
        if value in self.sorted_set:
            self.sorted_set.remove(value)
        self.dict[value] = rank
        self.sorted_set.add(value)

    def remove(self, value):
        self.sorted_set.remove(value)
        del self.dict[value]

    def update(self, value_list, rank_list):
        self.sorted_set.difference_update(value_list)
        for i, value in enumerate(value_list):
            self.dict[value] = rank_list[i]
        self.sorted_set.update(value_list)
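# Hedged usage sketch for SortedSetKey above: the ranks drive the ordering,
# and add() re-inserts a value when its rank changes. Names are illustrative.
def _sorted_set_key_demo():
    ssk = SortedSetKey()
    ssk.add('b', rank=2)
    ssk.add('a', rank=3)
    assert list(ssk.values()) == ['b', 'a']  # ordered by rank, not by name
    ssk.add('a', rank=1)                     # re-rank: 'a' now sorts first
    assert list(ssk.values()) == ['a', 'b']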
def preprocArgs():
    '''
    For each verb and associated set of questions, for each question,
    get known argument
    if not already processed
        get the lemma for that argument, get the list of ... NOT SURE
    '''
    for r, qs in USP.rel_qs.items():
        ignoredQs = set()
        for q in qs:
            if q.getArg() in USP.arg_cis:
                continue
            cis = SortedSet()
            ts = q.getArg().split()
            isIgnored = False
            for f in ts:
                if f in ['the', 'of', 'in']:
                    continue
                if f not in USP.form_lemma:
                    isIgnored = True
                    break
                else:
                    ls = USP.form_lemma[f]
                    for l in ls:
                        if l in USP.lemma_clustIdxs:
                            cis.update(USP.lemma_clustIdxs[l])
            if isIgnored:
                ignoredQs.add(q)
                continue
            if len(ts) >= 2:
                hs = USP.form_lemma[ts[-1]]
                ds = USP.form_lemma[ts[-2]]
                for h in hs:
                    for d in ds:
                        if (h, d) in USP.headDep_clustIdxs:
                            cis.add(USP.headDep_clustIdxs[(h, d)])
            USP.arg_cis[q.getArg()] = cis
        qs = [x for x in qs if x not in ignoredQs]
        USP.rel_qs[r] = qs
    return None
def __init__(self, sentences):
    # self.sentences = sentences
    self.word2idx = {}
    self.idx2word = {}
    vocabulary = SortedSet()
    for phrase in sentences:
        vocabulary.update(phrase.split(' '))
    self.word2idx['<pad>'] = 0
    for index, word in enumerate(vocabulary):
        self.word2idx[word] = index + 1
    for word, index in self.word2idx.items():
        self.idx2word[index] = word
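# Hedged usage sketch for the vocabulary builder above: the SortedSet makes
# the word-to-index mapping deterministic across runs. The enclosing class
# name is not shown in the snippet; 'Vocabulary' is assumed for illustration.
def _vocabulary_demo():
    v = Vocabulary(['the cat', 'the dog'])  # hypothetical class name
    assert v.word2idx['<pad>'] == 0         # index 0 is reserved for padding
    assert v.idx2word[v.word2idx['cat']] == 'cat'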
def get_probe_contexts(
    probes: SortedSet,
    tokens: List[str],
    context_size: int,
    preserve_order: bool,
    min_num_contexts: int = 2,
) -> Tuple[Dict[str, Tuple[str]], SortedSet, SortedSet]:
    # get all probe contexts
    probe2contexts = SortedDict({p: [] for p in probes})
    contexts_in_order = get_sliding_windows(context_size, tokens)
    context_types = SortedSet()
    for n, context in enumerate(contexts_in_order[:-context_size]):
        next_context = contexts_in_order[n + 1]
        # todo this only works for LEFT contexts
        target = next_context[-1]
        if target == 'Monster_cookie':  # debugging output
            print(target)
        if target in probes:
            if preserve_order:
                probe2contexts[target].append(context)
                context_types.add(context)
            else:
                single_word_contexts = [(w,) for w in context]
                probe2contexts[target].extend(single_word_contexts)
                context_types.update(single_word_contexts)

    # exclude entries with too few contexts
    excluded = []
    included = SortedSet()
    for probe, contexts in probe2contexts.items():
        if len(contexts) < min_num_contexts:
            excluded.append(probe)
        else:
            included.add(probe)
    for p in excluded:
        print(f'WARNING: Excluding "{p}" because it occurs {len(probe2contexts[p])} times')
        del probe2contexts[p]

    return probe2contexts, context_types, included
class _PortfolioSet(object):
    def __init__(self, user_id):
        self.user_id = user_id
        self._file = os.path.join(STORAGE_DIR, str(user_id))
        self._data = SortedSet()
        if os.path.exists(self._file):
            with open(self._file, 'r') as fp:
                datapoint_list = json.load(fp, object_hook=_PortfolioEncoder.decode_hook)
                self._data.update(datapoint_list)

    def GetPortfolio(self, timestamp=None):
        rval = None
        try:
            if not timestamp:
                rval = self._data[-1]
            else:
                bisect_point = self._data.bisect(_PortfolioAtTimestamp(0, timestamp))
                if bisect_point > 0:
                    rval = self._data[bisect_point - 1]
        except (IndexError, KeyError):
            pass

        # No portfolio at the specified time, return empty portfolio.
        if not rval:
            return _PortfolioAtTimestamp(self.user_id, timestamp or time.time())

        # We need to use copy.deepcopy here because the stored data must not
        # be modified. The timestamp is set as requested, so the copied
        # portfolio can simply be saved.
        rval = copy.deepcopy(rval)
        rval.timestamp = timestamp or time.time()
        return rval

    def AddPortfolio(self, portfolio):
        self._data.add(portfolio)

    def Save(self):
        data_to_dump = list(self._data)
        with open(self._file, 'w') as fp:
            json.dump(data_to_dump, fp, cls=_PortfolioEncoder)
def __work(self, w: str):
    dw, dm = self.__dw, self.__dm
    sw = str.startswith
    lw = []
    for (k, v) in map(dw.peekitem, range(dw.bisect_left(w), len(self.__dw))):
        if not sw(k, w):
            break
        lw.append(v)
    lm = SortedSet(key=lambda x: x.word)
    for (k, v) in map(dm.peekitem, range(dm.bisect_left(w), len(self.__dm))):
        if not sw(k, w):
            break
        lm.update(v)
    lw.extend(lm)
    self.__curl = lw
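# A hedged, self-contained sketch of the prefix-scan pattern used in __work
# above: bisect_left locates the first key >= the prefix, and iteration stops
# at the first key that no longer starts with it. Names are illustrative.
from sortedcontainers import SortedDict

def prefix_values(d, prefix):
    out = []
    for i in range(d.bisect_left(prefix), len(d)):
        k, v = d.peekitem(i)
        if not k.startswith(prefix):
            break
        out.append(v)
    return out

assert prefix_values(SortedDict(ab=1, ac=2, b=3), 'a') == [1, 2]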
def getTreeCis(ptId):
    cis = SortedSet()
    cis.add(USP.ptId_clustIdxStr[ptId][0])
    if ptId in USP.ptId_aciChdIds:
        for cids in USP.ptId_aciChdIds[ptId].values():
            for cid in cids:
                if USP.ptId_parDep[cid] not in USP.allowedDeps:
                    continue
                # SortedSet.update returns the set itself, so the original
                # reassignment was redundant; the plain call is equivalent.
                cis.update(USP.getTreeCis(cid))
    return cis
class History(object):
    def __init__(self, history=None, modification_history=None):
        # Dict var_name -> Timeline
        self.chunk_history = Timeline() if history is None else Timeline(history)
        if modification_history is None:
            self.modification_history = {}
            for c in self.chunk_history:
                for p in c.modifications:
                    if p not in self.modification_history:
                        self.modification_history[p] = Timeline()
                    self.modification_history[p].add(c)
                for p in c.dependencies:
                    if p not in self.modification_history:
                        raise Exception(
                            'Illegal sequence of operations was supplied! Referenced dependency {} does not exist at time {}'
                            .format(p, c.stamp))
                    self.modification_history[p][-1].dependents.add(c)
        else:
            self.modification_history = modification_history
        self.dirty_chunks = SortedSet()

    def __iter__(self):
        return iter(self.chunk_history)

    def __len__(self):
        return len(self.modification_history)

    def get_time_stamp(self, before=None, after=None):
        if before is not None:
            pos, succ = self.chunk_history.get_ceil(before) if type(before) != Chunk \
                        else self.chunk_history.get_ceil(before.stamp)
            return 0.5 * (succ.stamp + self.chunk_history[pos - 1].stamp) if pos > 0 \
                   else succ.stamp - 1
        elif after is not None:
            pos, succ = self.chunk_history.get_floor(after) if type(after) != Chunk \
                        else self.chunk_history.get_floor(after.stamp)
            return 0.5 * (succ.stamp + self.chunk_history[pos + 1].stamp) \
                   if pos < len(self.chunk_history) - 1 else succ.stamp + 1
        return self.chunk_history[-1].stamp + 1 if len(self.chunk_history) > 0 else 1

    @profile
    def _insert_modification(self, chunk, path):
        if path not in self.modification_history:
            self.modification_history[path] = Timeline()
        _, pred = self.modification_history[path].get_floor(chunk.stamp)
        if pred is not None:
            to_remove = set()
            for d in pred.dependents:
                # Fetch all dependents from predecessor which are going to depend on the new chunk
                # Save them as dependents and mark them as dirty
                if d.stamp > chunk.stamp:
                    dep_overlap_diff = d.dependencies.difference(chunk.modifications)
                    # Is there at least one element overlap
                    if len(dep_overlap_diff) < len(d.dependencies):
                        chunk.dependents.add(d)
                        self.dirty_chunks.add(d)
                        # If there is no remaining overlap with pred anymore, remove d
                        if len(dep_overlap_diff.difference(pred.modifications)) == len(dep_overlap_diff):
                            to_remove.add(d)
            pred.dependents -= to_remove
        self.modification_history[path].add(chunk)

    @profile
    def insert_chunk(self, chunk):
        for p in chunk.dependencies:
            if p not in self.modification_history:
                raise Exception(
                    'Chunk depends on attribute without history!\n Operation "{}" at {}\n Attribute: {}\n'
                    .format(chunk.operation.name, chunk.stamp, p))
            _, pred = self.modification_history[p].get_floor(chunk.stamp)
            if pred is None:
                raise Exception(
                    'Chunk at time {} executing "{}" depends on attributes with empty history! Attributes:\n {}'
                    .format(chunk.stamp, chunk.operation.name,
                            '\n '.join([str(p) for p in chunk.dependencies
                                        if p not in self.modification_history or
                                        self.modification_history[p].get_floor(chunk.stamp)[1] is None])))
            pred.dependents.add(chunk)
        for p in chunk.modifications:
            self._insert_modification(chunk, p)
        self.chunk_history.add(chunk)

    @profile
    def remove_chunk(self, chunk):
        for p in chunk.modifications:
            if self.modification_history[p][0] == chunk and len(chunk.dependents) > 0 \
                    and max([p in c.dependencies for c in chunk.dependents]):
                raise Exception(
                    'Can not remove chunk at timestamp {} because it is the founding chunk in the history of {} and would create dangling dependencies.'
                    .format(chunk.stamp, p))

        for p in chunk.modifications:
            self.modification_history[p].discard(chunk)
            _, pred = self.modification_history[p].get_floor(chunk.stamp)
            # Copy dependents that depend on this variable to predecessor
            if pred is not None:
                pred.dependents.update({d for d in chunk.dependents if p in d.dependencies})

        for p in chunk.dependencies:
            pos, pred = self.modification_history[p].get_floor(chunk.stamp)
            if pred is None:
                raise Exception('Chunk depends on attribute with empty history!')
            # It can happen that this chunk modifies the variable it depends on.
            # In this case it needs to be removed from the history and from
            if pred == chunk:
                pos -= 1
                pred = self.modification_history[p][pos]
            pred.dependents.discard(chunk)

        self.chunk_history.remove(chunk)
        self.dirty_chunks.update(chunk.dependents)

    @profile
    def replace_chunk(self, c_old, c_new):
        if c_old.stamp != c_new.stamp:
            raise Exception(
                'Can only replace chunk if stamps match. Stamps:\n Old: {:>8.3f}\n New: {:>8.3f}'
                .format(c_old.stamp, c_new.stamp))

        overlap = c_old.modifications.intersection(c_new.modifications)
        if len(overlap) != len(c_old.modifications):
            raise Exception(
                'Chunks can only be replaced by others with at least the same definition coverage. Missing variables:\n {}'
                .format('\n '.join(sorted(c_old.modifications.difference(c_new.modifications)))))

        new_deps = {p: self.modification_history[p].get_floor(c_new.stamp)[1]
                    if p in self.modification_history else None
                    for p in c_new.dependencies.difference(overlap)}
        if None in new_deps.values():
            # The original .format() call was missing the stamp argument for
            # the first placeholder; c_new.stamp is supplied here.
            raise Exception(
                'Replacement chunk at {} tries to depend on variables with insufficient histories. variables:\n {}'
                .format(c_new.stamp, '\n '.join(sorted(new_deps.keys()))))

        for p in overlap:
            pos, _ = self.modification_history[p].get_floor(c_old.stamp)
            # If we are already here, we might as well remove old and establish new deps
            if p in c_old.dependencies:
                self.modification_history[p][pos - 1].dependents.discard(c_old)
            if p in c_new.dependencies:
                self.modification_history[p][pos - 1].dependents.add(c_new)
            self.modification_history[p].remove(c_old)
            self.modification_history[p].add(c_new)

        c_new.dependents = c_old.dependents.copy()
        self.flag_dirty(*c_new.dependents)

        # Remove old, non-modified deps
        for p in c_old.dependencies.difference(overlap):
            self.modification_history[p].get_floor(c_old.stamp)[1].dependents.remove(c_old)

        # Insert additional modifications
        for p in c_new.modifications.difference(overlap):
            self._insert_modification(c_new, p)

        for c in new_deps.values():
            c.dependents.add(c_new)

        self.chunk_history.remove(c_old)
        self.chunk_history.add(c_new)

    def get_chunk_by_index(self, idx):
        return self.chunk_history[idx]

    def get_chunk(self, stamp):
        return self.get_chunk_pos(stamp)[0]

    def get_chunk_pos(self, stamp):
        pos, chunk = self.chunk_history.get_floor(stamp)
        return (chunk, pos) if chunk is None or chunk.stamp == stamp else (None, None)

    def flag_dirty(self, *chunks):
        self.dirty_chunks.update(chunks)

    def flag_clean(self, *chunks):
        for c in chunks:
            self.dirty_chunks.discard(c)

    def expand_dirty_set(self):
        active_set = set(self.dirty_chunks)
        while len(active_set) > 0:
            a = active_set.pop()
            u = a.dependents.difference(self.dirty_chunks)
            active_set.update(u)
            self.dirty_chunks.update(u)

    def get_dirty(self):
        return self.dirty_chunks.copy()

    def get_subhistory(self, time):
        if len(self.chunk_history) > 0 and self.chunk_history[0].stamp >= time:
            chunks = self.chunk_history[:self.chunk_history.get_floor(time)[0] + 1]
            # get_floor() returns a (pos, chunk) pair, so the slice bound needs
            # the position component, mirroring the chunks slice above.
            mod_history = {p: Timeline(h[:h.get_floor(time)[0] + 1])
                           for p, h in self.modification_history.items()
                           if h[0].stamp >= time}
            return History(chunks, mod_history)
        return History()

    def get_history_of(self, *paths):
        out = set()
        remaining = set()
        for p in paths:
            if p in self.modification_history:
                remaining.update(self.modification_history[p])
        while len(remaining) > 0:
            chunk = remaining.pop()
            out.add(chunk)
            for p in chunk.dependencies:
                pos, dep = self.modification_history[p].get_floor(chunk.stamp)
                if dep == chunk:  # Catch if predecessor is chunk itself
                    dep = self.modification_history[p][pos - 1]
                if dep not in out:
                    remaining.add(dep)
        return Timeline(out)

    def str_history_of(self, p):
        if p not in self.modification_history:
            raise Exception('Path {} has no history.'.format(p))
        return '\n'.join(['{:>8.3f} : {}'.format(chunk.stamp, str(chunk.op))
                          for chunk in self.modification_history[p]])

    def str_history(self):
        return '\n'.join(['{:>8.3f} : {}'.format(chunk.stamp, str(chunk.op))
                          for chunk in self.chunk_history])

    def __eq__(self, other):
        if isinstance(other, History):
            return self.chunk_history == other.chunk_history
        return False
def in_place_stoplist(self, stoplist=None, freq=0):
    """
    Changes a Corpus object with words in the stoplist removed and
    with words of frequency <= `freq` removed.

    :param stoplist: The list of words to be removed.
    :type stoplist: list

    :param freq: A threshold where words of frequency <= 'freq' are
        removed. Default is 0.
    :type freq: integer, optional

    :returns: Copy of corpus with words in the stoplist and words
        of frequency <= 'freq' removed.

    :See Also: :class:`Corpus`
    """
    from sortedcontainers import SortedSet, SortedList
    stop = SortedSet()

    if stoplist:
        for t in stoplist:
            if t in self.words_int:
                stop.add(self.words_int[t])

    if freq:
        cfs = np.bincount(self.corpus)
        freq_stop = np.where(cfs <= freq)[0]
        stop.update(freq_stop)

    if not stop:
        # print 'Stop list is empty.'
        return self

    # print 'Removing stop words', datetime.now()
    f = np.vectorize(stop.__contains__)

    # print 'Rebuilding context data', datetime.now()
    context_data = []
    BASE = len(self.context_data) - 1

    # gathering list of new indices from narrowest tokenization
    def find_new_indexes(INTO, BASE=-1):
        locs = np.where(np.in1d(self.context_data[BASE]['idx'],
                                self.context_data[INTO]['idx']))[0]

        # creating a list of locations that are non-identical
        new_locs = np.array([loc for i, loc in enumerate(locs)
                             if i + 1 == len(locs) or
                             self.context_data[BASE]['idx'][locs[i]] !=
                             self.context_data[BASE]['idx'][locs[i + 1]]])

        # creating a search for locations that ARE identical
        idxs = np.insert(self.context_data[INTO]['idx'], [0, -1], [-1, -1])
        same_spots = np.where(np.equal(idxs[:-1], idxs[1:]))[0]

        # re-adding the identical locations
        really_new_locs = np.insert(new_locs, same_spots, new_locs[same_spots - 1])
        return really_new_locs

    # Calculate new base tokens
    tokens = self.view_contexts(self.context_types[BASE])
    new_corpus = []
    spans = []
    for t in tokens:
        new_t = t[np.logical_not(f(t))] if t.size else t
        # TODO: append to new_corpus as well
        spans.append(new_t.size if new_t.size else 0)
        if new_t.size:
            new_corpus.append(new_t)

    # Stopped all words from Corpus
    if not new_corpus:
        return Corpus([])

    new_base = self.context_data[BASE].copy()
    new_base['idx'] = np.cumsum(spans)

    context_data = []
    # calculate new tokenizations for every context_type
    for i in range(len(self.context_data)):
        if i == BASE:
            context_data.append(new_base)
        else:
            context = self.context_data[i].copy()
            context['idx'] = new_base['idx'][find_new_indexes(i, BASE)]
            context_data.append(context)

    del self.context_data
    self.context_data = context_data

    # print 'Rebuilding corpus and updating stop words', datetime.now()
    self.corpus = np.concatenate(new_corpus)  # self.corpus[f(self.corpus)]
    self.stopped_words.update(self.words[stop])

    # print 'adjusting words list', datetime.now()
    new_words = np.delete(self.words, stop)

    # print 'rebuilding word dictionary', datetime.now()
    new_words_int = dict((word, i) for i, word in enumerate(new_words))
    old_to_new = dict((self.words_int[word], i) for i, word in enumerate(new_words))

    # print "remapping corpus", datetime.now()
    f = np.vectorize(old_to_new.__getitem__)
    self.corpus[:] = f(self.corpus)

    # print 'storing new word dicts', datetime.now()
    self.words = new_words
    self.words_int = new_words_int

    return self
class ARG(object):
    '''
    Ancestral Recombination Graph
    '''

    def __init__(self):
        self.nodes = {}
        self.roots = bintrees.AVLTree()  # root indexes
        self.rec = bintrees.AVLTree()  # arg rec parents nodes
        self.coal = bintrees.AVLTree()  # arg CA parent node
        self.num_ancestral_recomb = 0
        self.num_nonancestral_recomb = 0
        self.branch_length = 0
        self.nextname = 1  # next node index
        self.available_names = SortedSet()

    def __iter__(self):
        '''iterate over nodes in the arg'''
        # must return an iterator, not a list, to satisfy the iter protocol
        return iter(self.nodes)

    def __len__(self):
        '''number of nodes'''
        return len(self.nodes)

    def __getitem__(self, index):
        '''returns node by key: item'''
        return self.nodes[index]

    def __setitem__(self, index, node):
        '''adds a node to the ARG'''
        node.index = index
        self.add(node)

    def __contains__(self, index):
        '''if ARG contains node key'''
        return index in self.nodes

    def copy(self):
        '''return a copy of the ARG'''
        arg = ARG()
        for node in self.nodes.values():
            arg.nodes[node.index] = node.copy()
        # connect nodes
        for node in self.nodes.values():
            node2 = arg.__getitem__(node.index)
            if node.left_child != None:
                node2.left_child = arg.__getitem__(node.left_child.index)
                node2.right_child = arg.__getitem__(node.right_child.index)
            if node.left_parent != None:
                node2.left_parent = arg.__getitem__(node.left_parent.index)
                node2.right_parent = arg.__getitem__(node.right_parent.index)
        arg.roots = self.roots.copy()  # root indexes
        arg.rec = self.rec.copy()  # arg rec parents nodes
        arg.coal = self.coal.copy()  # arg CA parent node
        arg.num_ancestral_recomb = self.num_ancestral_recomb
        arg.num_nonancestral_recomb = self.num_nonancestral_recomb
        arg.branch_length = self.branch_length
        arg.nextname = self.nextname  # next node index
        arg.available_names = self.available_names.copy()
        return arg

    def equal(self, other):
        '''if self is equal with other (structural equality)
        TODO : complete this'''
        if self.__len__() != other.__len__():
            return False
        else:
            for node in self.nodes.values():
                if node.index not in other:
                    return False
                if not node.equal(other[node.index]):
                    return False
            return True

    def leaves(self, node=None):
        """
        Iterates over the leaves of the ARG.
        """
        if node is None:
            for node in self.nodes.values():
                if node.left_child == None:
                    yield node
        else:
            for node in self.preorder(node):
                if node.left_child == None:
                    yield node

    def preorder(self, node=None):
        """
        Iterates through nodes in preorder traversal.
        """
        visit = set()
        if node is None:
            node = self.__getitem__(self.roots.max_key())
        queue = [node]
        for node in queue:
            if node in visit:
                continue
            yield node
            visit.add(node)
            if node.left_child != None:
                queue.append(node.left_child)
                if node.left_child.index != node.right_child.index:
                    queue.append(node.right_child)

    def postorder(self, node=None):
        """
        Iterates through nodes in postorder traversal.
        """
        visit = collections.defaultdict(lambda: 0)
        queue = list(self.leaves(node))
        for node in queue:
            yield node
            if node.left_parent != None:
                visit[node.left_parent] += 1
                if node.left_parent.left_child.index != node.left_parent.right_child.index:
                    num_child = 2
                else:
                    num_child = 1
                # if all child has been visited then queue parent
                if visit[node.left_parent] == num_child:
                    queue.append(node.left_parent)
                if node.right_parent.index != node.left_parent.index:
                    visit[node.right_parent] += 1
                    # if all child has been visited then queue parent
                    if visit[node.right_parent] == num_child:
                        queue.append(node.right_parent)

    def set_roots(self):
        self.roots.clear()
        for node in self.nodes.values():
            if node.left_parent is None:
                self.roots[node.index] = node.index

    def get_times(self):
        '''return a sorted set of the ARG node.time'''
        times = SortedSet()
        for node in self.nodes.values():
            times.add(node.time)
        return times

    def get_higher_nodes(self, t):
        ''':return nodes.index of nodes with node.time >= t
        TODO: a more efficient search option
        '''
        return [key for key in self.nodes if self.nodes[key].time >= t]

    #==========================
    # node manipulation

    def alloc_segment(self, left=None, right=None, node=None,
                      samples=bintrees.AVLTree(), prev=None, next=None):
        """
        alloc a new segment
        """
        s = Segment()
        s.left = left
        s.right = right
        s.node = node
        s.samples = samples
        s.next = next
        s.prev = prev
        return s

    def alloc_node(self, index=None, time=None, left_child=None, right_child=None):
        """
        alloc a new Node
        """
        node = Node(index)
        node.time = time
        node.first_segment = None
        node.left_child = left_child
        node.right_child = right_child
        node.left_parent = None
        node.right_parent = None
        node.breakpoint = None
        node.snps = bintrees.AVLTree()
        return node

    def store_node(self, segment, node):
        '''store node with segments: segment'''
        x = segment
        if x is not None:
            while x.prev is not None:
                x = x.prev
            s = self.alloc_segment(x.left, x.right, node, x.samples.copy())
            node.first_segment = s
            x.node = node
            x = x.next
            while x is not None:
                s = self.alloc_segment(x.left, x.right, node, x.samples.copy(), s)
                s.prev.next = s
                x.node = node
                x = x.next
        else:
            node.first_segment = None
        self.nodes[node.index] = node

    def copy_node_segments(self, node):
        '''
        copy the segments of a node, in CA event or Rec events, we need to
        copy the first node in order to make changes on them
        '''
        x = node.first_segment
        if x is None:
            return None
        else:
            assert x.prev is None
            s = self.alloc_segment(x.left, x.right, node, x.samples.copy())
            x.node = node
            x = x.next
            while x is not None:
                s = self.alloc_segment(x.left, x.right, node, x.samples.copy(), s)
                s.prev.next = s
                x.node = node
                x = x.next
            return s

    def get_available_names(self):
        '''get free names from 0 to max(nodes)'''
        self.available_names = SortedSet()
        current_names = SortedSet(self.__iter__())
        counter = 0
        prev = current_names[0]
        while counter < len(current_names):
            if current_names[counter] != prev + 1:
                self.available_names.update(range(prev + 1, current_names[counter]))
            prev = current_names[counter]
            counter += 1

    def new_name(self):
        '''returns a new name for a node'''
        if self.available_names:
            name = self.available_names.pop()
        else:
            name = self.nextname
            self.nextname += 1
        return name

    def add(self, node):
        '''add a ready node to the ARG'''
        self.nodes[node.index] = node
        return node

    def rename(self, oldindex, newindex):
        '''renames a node in the ARG'''
        node = self.nodes[oldindex]
        node.index = newindex
        del self.nodes[oldindex]
        self.nodes[newindex] = node

    def total_branch_length(self):
        '''the ARG total branch length'''
        total_material = 0
        for node in self.nodes.values():
            if node.left_parent is not None:
                age = node.left_parent.time - node.time
                seg = node.first_segment
                while seg is not None:
                    total_material += ((seg.right - seg.left) * age)
                    seg = seg.next
        return total_material

    #=======================
    # spr related

    def detach(self, node, sib):
        '''
        Detaches a specified coalescence node from the rest of the ARG
        '''
        # print("Detach()", node.index, "sib", sib.index, "p", node.left_parent.index)
        assert node.left_parent.index == node.right_parent.index
        parent = node.left_parent
        sib.left_parent = parent.left_parent
        sib.right_parent = parent.right_parent
        sib.breakpoint = parent.breakpoint
        grandparent = parent.left_parent
        if grandparent is not None:
            grandparent.update_child(parent, sib)
            grandparent = parent.right_parent
            grandparent.update_child(parent, sib)

    def reattach(self, u, v, t, new_names):
        # Reattaches node u above node v at time t, new_names is an avltree of
        # all new nodes.index in a new ARG in mcmc
        assert t > v.time
        # assert v.left_parent == None or t < v.left_parent.time
        if u.left_parent is None:  # new_name
            new_name = self.new_name()
            new_names[new_name] = new_name
            # self.coal[new_name] = new_name  # add the new CA parent to the ARG.coal
            parent = self.add(self.alloc_node(new_name))
            parent.left_child = u
            u.left_parent = parent
            u.right_parent = parent
        else:
            assert u.left_parent.index == u.right_parent.index
            parent = u.left_parent
        parent.time = t
        parent.breakpoint = v.breakpoint
        v.breakpoint = None
        parent.left_parent = v.left_parent
        grandparent = v.left_parent
        if grandparent is not None:
            grandparent.update_child(v, parent)
        parent.right_parent = v.right_parent
        grandparent = v.right_parent
        if grandparent is not None:
            grandparent.update_child(v, parent)
        v.left_parent = parent
        v.right_parent = parent
        if parent.left_child.index == u.index:
            parent.right_child = v
        else:
            parent.left_child = v
        return new_names

    def push_mutation_down(self, node, x):
        '''
        for a given node push the mutation (at x) as down as possible
        normally mutations automatically should stay at their lowest possible
        position. This might be useful for initial ARG
        '''
        block = False
        while not block:
            node, block = node.push_snp_down(x)

    def push_all_mutations_down(self, node):
        '''push down all mutations on node as low as possible'''
        snp_keys = [k for k in node.snps]
        for x in snp_keys:
            self.push_mutation_down(node, x)
        # iter = len(node.snps)
        # i = 0
        #
        # while iter > 0:
        #     x = node.snps[i]
        #     self.push_mutation_down(node, x)
        #     iter -= 1
        #     if node.snps and len(node.snps) > i:
        #         if node.snps[i] == x:
        #             i += 1

    def find_tmrca(self, node, x):
        '''
        check the parent of node to see if it is mrca for site x
        '''
        if node.left_parent is None:
            block = True
            return node, block
        # '!=' rather than 'is not': identity checks on ints are unreliable
        elif node.left_parent.index != node.right_parent.index:
            assert node.left_parent.contains(x) + node.right_parent.contains(x) == 1
            block = False
            if node.left_parent.contains(x):
                return node.left_parent, block
            else:
                return node.right_parent, block
        elif node.left_parent.contains(x):
            block = False
            return node.left_parent, block
        else:  # it is mrca for x
            block = True
            return node.left_parent, block

    def tmrca(self, x):
        '''tmrca for site x
        1. start from a leaf
        2. follow the path of x until its mrca
        '''
        node = self.__getitem__(0)
        block = False
        while not block:
            node, block = self.find_tmrca(node, x)
        return node.time

    def total_tmrca(self, sequence_length):
        '''
        return the tmrca of all the sites in the ARG
        '''
        break_points = self.breakpoints(only_ancRec=True, set=True)
        break_points.add(0)
        break_points.add(sequence_length)
        tot_tmrca = np.zeros(int(sequence_length))
        count = 0
        while count < len(break_points) - 1:
            x_tmrca = self.tmrca(break_points[count])
            tot_tmrca[int(break_points[count]):int(break_points[count + 1])] = x_tmrca
            count += 1
        return tot_tmrca

    def mean_tmrca(self, sequence_length):
        '''return a value for tmrca of the ARG, which is the mean over all tmrcas'''
        break_points = self.breakpoints(only_ancRec=True, set=True)
        break_points.add(0)
        break_points.add(sequence_length)
        tmrca_list = []
        count = 0
        while count < len(break_points) - 1:
            x_tmrca = self.tmrca(break_points[count])
            tmrca_list.append(
                x_tmrca * (int(break_points[count + 1]) - int(break_points[count])))
            count += 1
        return np.mean(tmrca_list)

    def allele_age(self):
        '''
        :return a pd df with four columns:
            1. site: the genomic position of the SNP
            2. recent age: the most recent age for the allele
            3. mid age: the midpoint of node age and its parent (tree node) time
            4. latest age: the latest time (back in time) for the mutation
        The df is sorted based on site.
        '''
        # find the nodes with mutations
        snp_nodes = []  # nodes with len(snps) > 0
        for node in self.nodes.values():
            if node.snps:
                snp_nodes.append(node)
        # now for each node and find age for each mut
        age_df = pd.DataFrame(columns=["site", "recent age", "mid age", "latest age"])
        for node in snp_nodes:
            # num_branches = collections.defaultdict(list)
            node_time = node.time
            for x in node.snps:
                parent_age = node.tree_node_age(x, return_parent_time=True)
                age_df.loc[age_df.shape[0]] = [
                    x, node_time, (node_time + parent_age) / 2, parent_age]
        age_df.sort_values(by=['site'], ascending=True, inplace=True)
        age_df.reset_index(inplace=True, drop=True)
        return age_df

    def invisible_recombs(self):
        '''return the proportion of invisible recombs'''
        invis_count = 0
        for node in self.nodes.values():
            if node.breakpoint != None and node.is_invisible_recomb():
                invis_count += 1
        return invis_count / (self.num_ancestral_recomb + self.num_nonancestral_recomb)

    #@property
    def breakpoints(self, only_ancRec=False, set=True):
        '''
        :param only_ancRec: only ancestral rec with repetition
        :param set: if set, only unique positions are returned
        :param invisible count the number of invisible recombs
        :return: either a list/set of all recombs or a list of anc rec that has repetition
        '''
        if set:
            br = SortedSet()
        else:
            br = SortedList()
        if not only_ancRec:
            for node in self.nodes.values():
                if node.breakpoint != None:
                    br.add(node.breakpoint)
        else:
            for node in self.nodes.values():
                if node.breakpoint != None and \
                        node.contains(node.breakpoint):  # ancestral
                    br.add(node.breakpoint)
        return br

    #========== probabilites

    def log_likelihood(self, mutation_rate, data):
        '''
        log_likelihood of mutations on a given ARG up to a normalising
        constant that depends on the pattern of observed mutations, but not
        on the ARG or the mutation rate.
        Note after spr and before clean up we might have NAM lineages; this
        method takes this into account.
        :param m : is number of snps
        '''
        snp_nodes = []  # nodes with len(snps) > 0
        total_material = 0
        number_of_mutations = 0
        # get total material and nodes with snps
        for node in self.nodes.values():
            if node.first_segment != None:
                assert node.left_parent != None
                age = node.left_parent.time - node.time
                seg = node.first_segment
                assert seg.prev == None
                while seg is not None:
                    total_material += ((seg.right - seg.left) * age)
                    seg = seg.next
                if node.snps:
                    number_of_mutations += len(node.snps)
                    snp_nodes.append(node)
        self.branch_length = total_material
        # print("number_of_mutations", number_of_mutations, "m", len(data))
        assert number_of_mutations == len(data)  # num of snps
        if mutation_rate == 0:
            if number_of_mutations == 0:
                ret = 0
            else:
                ret = -float("inf")
        else:
            ret = number_of_mutations * math.log(total_material * mutation_rate) - \
                  (total_material * mutation_rate)
        # now calc prob of having this particular mutation pattern
        for node in snp_nodes:
            # num_branches = collections.defaultdict(list)
            for x in node.snps:
                potential_branch_length = node.tree_node_age(x)
                ret += math.log(potential_branch_length / total_material)
            # verify the mutation is on the correct spot
            verify_mutation_node(node, data)
        return ret

    def log_prior(self, sample_size, sequence_length, recombination_rate, Ne,
                  NAM=True, new_roots=False, kuhner=False):
        '''
        probability of the ARG under coalescent with recombination
        this is after a move and before clean up. then there might be some
        extra NAM lineages, we ignore them.
        :param NAM: no-ancestral material node. If NAM node is allowed.
            note after spr and before clean up step there might be some NAM
            in the ARG which is ok. But after clean up or on the initial ARG
            there should not be any.
        '''
        # order nodes by time
        # TODO: find an efficient way to order nodes
        ordered_nodes = [v for k, v in sorted(self.nodes.items(),
                                              key=lambda item: item[1].time)]
        number_of_lineages = sample_size
        number_of_links = number_of_lineages * (sequence_length - 1)
        number_of_nodes = self.__len__()
        counter = sample_size
        time = 0
        ret = 0
        rec_count = 0
        coal_count = 0
        roots = bintrees.AVLTree()
        new_coal = bintrees.AVLTree()
        if kuhner:
            self.rec.clear()
            self.num_ancestral_recomb = 0
            self.num_nonancestral_recomb = 0
        while counter < number_of_nodes:
            node = ordered_nodes[counter]
            assert node.time >= time  # make sure it is ordered
            rate = (number_of_lineages * (number_of_lineages - 1) / (4 * Ne)) + \
                   (number_of_links * (recombination_rate))
            # ret -= rate * (node.time - time)
            if node.left_child.index == node.right_child.index:  # rec
                assert node.left_child.first_segment != None
                assert node.left_child.left_parent.first_segment != None
                assert node.left_child.right_parent.first_segment != None
                ret -= rate * (node.time - time)
                gap = node.left_child.num_links() - \
                      (node.left_child.left_parent.num_links() +
                       node.left_child.right_parent.num_links())
                ret += math.log(recombination_rate)
                assert gap >= 1
                if gap == 1:
                    self.num_ancestral_recomb += 1
                else:
                    self.num_nonancestral_recomb += 1
                number_of_links -= gap
                number_of_lineages += 1
                if kuhner:  # add rec
                    self.rec[node.index] = node.index
                    self.rec[ordered_nodes[counter + 1].index] = \
                        ordered_nodes[counter + 1].index
                counter += 2
                time = node.time
                rec_count += 1
            elif node.left_child.first_segment != None and \
                    node.right_child.first_segment != None:
                ret -= rate * (node.time - time)
                ret -= math.log(2 * Ne)
                if node.first_segment == None:
                    node_numlink = 0
                    number_of_lineages -= 2
                    counter += 1
                    if new_roots:
                        roots[node.index] = node.index
                else:
                    node_numlink = node.num_links()
                    number_of_lineages -= 1
                    counter += 1
                lchild_numlink = node.left_child.num_links()
                rchild_numlink = node.right_child.num_links()
                number_of_links -= (lchild_numlink + rchild_numlink) - node_numlink
                time = node.time
                coal_count += 1
                if new_roots:
                    new_coal[node.index] = node.index
            else:
                counter += 1
            if not NAM:
                assert node.left_child.first_segment != None
                assert node.right_child.first_segment != None
        if new_roots:
            return ret, roots, new_coal
        else:
            return ret

    def dump(self, path=' ', file_name='arg.arg'):
        output = path + "/" + file_name
        with open(output, "wb") as file:
            pickle.dump(self, file)

    def load(self, path=' '):
        with open(path, "rb") as file:
            return pickle.load(file)

    def verify(self):
        '''
        verify arg:
        1. a node with parent must have seg
        2. a node with no parent
            a. must be in roots
            b. different child
        3. node.parent_time > node.time
        4. arg name == node.index
        5. recomb parent must have self.snps.empty()
        6. nodes with child = None must be leaf
        7. number coal + rec + roots check
        8. seg.samples is not empty, seg.left < seg.right
        '''
        for node in self.nodes.values():
            assert self.nodes[node.index].index == node.index
            if node.left_parent is None:  # roots
                if node.first_segment is not None:
                    print("in verify node is", node.index)
                    self.print_state()
                assert node.first_segment == None
                assert node.index in self.roots
                assert node.breakpoint == None
                assert node.left_child.index != node.right_child.index
                assert node.right_parent == None
                assert node.index in self.coal
                assert node.time > node.left_child.time
                assert node.time > node.right_child.time
            else:  # rest
                assert node.first_segment != None
                assert node.first_segment.prev == None
                assert node.get_tail().next == None
                assert node.index not in self.roots
                assert node.left_parent.time > node.time
                if node.left_child is None:  # leaves
                    assert node.right_child is None
                    assert node.time == 0
                if node.left_parent.index != node.right_parent.index:
                    assert node.breakpoint != None
                    assert node.left_parent.left_child.index == \
                           node.left_parent.right_child.index
                    assert node.right_parent.left_child.index == \
                           node.right_parent.right_child.index
                    assert node.right_parent.left_child.index == node.index
                    assert not node.left_parent.snps
                    assert not node.right_parent.snps
                    assert node.left_parent.time == node.right_parent.time
                    assert node.left_parent.index in self.rec
                    assert node.right_parent.index in self.rec
                    if node.left_parent.first_segment.left > node.right_parent.first_segment.left:
                        print("in verify node", node.index)
                        print("node.left_parent", node.left_parent.index)
                        print("node.right_parent", node.right_parent.index)
                    assert node.left_parent.first_segment.left < node.right_parent.first_segment.left
                else:
                    assert node.left_parent.index in self.coal
                    assert node.left_parent.left_child.index != \
                           node.left_parent.right_child.index
                    assert node.breakpoint == None
            if node.first_segment is not None:
                seg = node.first_segment
                assert seg.prev is None
                while seg is not None:
                    assert seg.samples
                    assert seg.left < seg.right
                    assert seg.node.index == node.index
                    seg = seg.next

    def print_state(self):
        print("self.arg.coal", self.coal)
        print("self.arg.rec", self.rec)
        print("self.arg.roots", self.roots)
        print("node", "time", "left", "right", "l_chi", "r_chi", "l_par",
              "r_par", "l_bp", "snps", "fir_seg_sam", sep="\t")
        for j in self.nodes:
            node = self.__getitem__(j)
            if node.left_parent is not None or node.left_child is not None:
                s = node.first_segment
                if s is None:
                    print(j, "%.5f" % node.time, "root", "root",
                          node.left_child.index, node.right_child.index,
                          node.left_parent, node.right_parent,
                          node.breakpoint, node.snps, None, sep="\t")
                while s is not None:
                    l = s.left
                    r = s.right
                    if node.left_child is None:
                        print(j, "%.5f" % node.time, l, r, "Leaf", "Leaf",
                              node.left_parent.index, node.right_parent.index,
                              node.breakpoint, node.snps, s.samples, sep="\t")
                    elif node.left_parent is None:
                        print(j, "%.5f" % node.time, l, r,
                              node.left_child.index, node.right_child.index,
                              "Root", "Root", node.breakpoint, node.snps,
                              s.samples, sep="\t")
                    else:
                        print(j, "%.5f" % node.time, l, r,
                              node.left_child.index, node.right_child.index,
                              node.left_parent.index, node.right_parent.index,
                              node.breakpoint, node.snps, s.samples, sep="\t")
                    s = s.next
if state == '1:start':
    prev_len = len(available_res)
    (new_available, alloc) = take_first_resources(available_res, job.nb_res)
    available_res = new_available
    job.resources = alloc

    if len(job.resources) != job.nb_res:
        raise Exception('Invalid number of resources ({}, expected {})'
                        .format(job.resources, job.nb_res))

    if len(available_res) != prev_len - job.nb_res:
        raise Exception('Invalid number of available resources '
                        '({}, expected {})'
                        .format(len(available_res), prev_len - job.nb_res))

elif state == '0:finish':
    available_res.update(job.resources)

##############
# Export CSV #
##############
writer = csv.DictWriter(args.outputCSV,
                        fieldnames=["job_id",
                                    "submission_time",
                                    "requested_number_of_resources",
                                    "requested_time",
                                    "success",
                                    "starting_time",
                                    "execution_time",
                                    "finish_time",
                                    "waiting_time",
def extend_function(self, sortedSet, a):
    SortedSet.update(sortedSet, a)
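# The unbound call above is equivalent to sortedSet.update(a); a minimal,
# hedged sanity check (assumes sortedcontainers' SortedSet):
from sortedcontainers import SortedSet

_s = SortedSet([1, 3])
SortedSet.update(_s, [2, 4])
assert list(_s) == [1, 2, 3, 4]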
    prev_len = len(available_res)
    (new_available, alloc) = take_first_resources(available_res, job.nb_res)
    available_res = new_available
    job.resources = alloc

    if len(job.resources) != job.nb_res:
        raise Exception(
            'Invalid number of resources ({}, expected {})'.format(
                job.resources, job.nb_res))

    if len(available_res) != prev_len - job.nb_res:
        raise Exception('Invalid number of available resources '
                        '({}, expected {})'.format(len(available_res),
                                                   prev_len - job.nb_res))

elif state == '0:finish':
    available_res.update(job.resources)

##############
# Export CSV #
##############
writer = csv.DictWriter(args.outputCSV,
                        fieldnames=[
                            "job_id",
                            "submission_time",
                            "requested_number_of_processors",
                            "requested_time",
                            "success",
                            "starting_time",
                            "execution_time",
                            "finish_time",
                            "waiting_time",
                            "turnaround_time",
                            "stretch",
                            "consumed_energy",
                            "allocated_processors"
                        ])
writer.writeheader()
def in_place_stoplist(self, stoplist=None, freq=0):
    """
    Changes a Corpus object with words in the stoplist removed and
    with words of frequency <= `freq` removed.

    :param stoplist: The list of words to be removed.
    :type stoplist: list

    :param freq: A threshold where words of frequency <= 'freq' are
        removed. Default is 0.
    :type freq: integer, optional

    :returns: Copy of corpus with words in the stoplist and words
        of frequency <= 'freq' removed.

    :See Also: :class:`Corpus`
    """
    from sortedcontainers import SortedSet, SortedList
    stop = SortedSet()

    if stoplist:
        for t in stoplist:
            if t in self.words_int:
                stop.add(self.words_int[t])

    if freq:
        cfs = np.bincount(self.corpus)
        freq_stop = np.where(cfs <= freq)[0]
        stop.update(freq_stop)

    if not stop:
        # print 'Stop list is empty.'
        return self

    # print 'Removing stop words', datetime.now()
    f = np.vectorize(stop.__contains__)

    # print 'Rebuilding context data', datetime.now()
    context_data = []
    BASE = len(self.context_data) - 1

    # gathering list of new indices from narrowest tokenization
    def find_new_indexes(INTO, BASE=-1):
        locs = np.where(np.in1d(self.context_data[BASE]['idx'],
                                self.context_data[INTO]['idx']))[0]

        # creating a list of locations that are non-identical
        new_locs = np.array([loc for i, loc in enumerate(locs)
                             if i + 1 == len(locs) or
                             self.context_data[BASE]['idx'][locs[i]] !=
                             self.context_data[BASE]['idx'][locs[i + 1]]])

        # creating a search for locations that ARE identical
        idxs = np.insert(self.context_data[INTO]['idx'], [0, -1], [-1, -1])
        same_spots = np.where(np.equal(idxs[:-1], idxs[1:]))[0]

        # re-adding the identical locations
        really_new_locs = np.insert(new_locs, same_spots, new_locs[same_spots - 1])
        return really_new_locs

    # Calculate new base tokens
    tokens = self.view_contexts(self.context_types[BASE])
    new_corpus = []
    spans = []
    for t in tokens:
        new_t = t[np.logical_not(f(t))] if t.size else t
        # TODO: append to new_corpus as well
        spans.append(new_t.size if new_t.size else 0)
        if new_t.size:
            new_corpus.append(new_t)

    # Stopped all words from Corpus
    if not new_corpus:
        return Corpus([])

    new_base = self.context_data[BASE].copy()
    new_base['idx'] = np.cumsum(spans)

    context_data = []
    # calculate new tokenizations for every context_type
    for i in xrange(len(self.context_data)):
        if i == BASE:
            context_data.append(new_base)
        else:
            context = self.context_data[i].copy()
            context['idx'] = new_base['idx'][find_new_indexes(i, BASE)]
            context_data.append(context)

    del self.context_data
    self.context_data = context_data

    # print 'Rebuilding corpus and updating stop words', datetime.now()
    self.corpus = np.concatenate(new_corpus)  # self.corpus[f(self.corpus)]
    self.stopped_words.update(self.words[stop])

    # print 'adjusting words list', datetime.now()
    new_words = np.delete(self.words, stop)

    # print 'rebuilding word dictionary', datetime.now()
    new_words_int = dict((word, i) for i, word in enumerate(new_words))
    old_to_new = dict((self.words_int[word], i) for i, word in enumerate(new_words))

    # print "remapping corpus", datetime.now()
    f = np.vectorize(old_to_new.__getitem__)
    self.corpus[:] = f(self.corpus)

    # print 'storing new word dicts', datetime.now()
    self.words = new_words
    self.words_int = new_words_int

    return self
class Crawler:
    def __init__(self, platform):
        # setup configuration
        self.config = Config()
        term = self.config.types[platform]['term']
        extension = self.config.types[platform]['extension']
        self.language = self.config.types[platform]['language']
        self.platform = self.config.types[platform]['platform']
        # setup request handler
        self.requester = RequestHandler(term, extension)
        # setup data handler
        self.data = DataHandler()
        # configure crawler specifics
        self.size_range = SortedSet()
        self.size_range.update([0, 384001])  # stick to GitHub size restrictions
        self.initial_items = []
        print "Started GitHub crawler at {}".format(asctime(localtime(time())))

    def crawl(self):
        total_count = self.requester.get_total_count()
        target_count = total_count
        print "Crawler found {} items to store and fetch".format(total_count)
        item_count = 0
        current_item = 0
        next_item = 1
        start_time = int(time())
        # sort items differently to get more items
        # order_state 0 = default ordering (best match according to "score")
        # order_state 1 = last indexed
        # order_state 2 = first indexed
        order_state = 1
        # GitHub only provides 1000 items per request
        while item_count < total_count:
            print "Crawler looks in range {} to {} Byte".format(
                self.size_range[current_item],
                (self.size_range[next_item] - 1),
            )
            # We might get everything from just one request
            # ('==' rather than 'is': identity checks on ints are unreliable)
            if (len(self.size_range) == 2) and (total_count < 1000):
                # excluding the lower and upper bound will use the items we got from our initial request
                lower = None
                upper = None
            # in case we need more than one request
            else:
                lower = self.size_range[current_item]
                upper = self.size_range[next_item]
                print "Setting lower and upper bound to {} and {}".format(lower, upper)
            # get items, request item count and incomplete status
            items, this_item_count, incomplete_items = self.requester.get_items(
                lower, upper, target_count, order_state)
            # update item count
            item_count += this_item_count
            print "Crawler got {} out of {} items".format(this_item_count, target_count)
            # write the items we got in this request to the DB
            new_items = 0
            updated_items = 0
            for item in items:
                self.data.update_owner_table(item)
                self.data.update_repository_table(item)
                local_path, download_url, content = self.requester.store_locally(
                    item["url"],
                    item["repository"]["id"],
                    item["path"]
                )
                new, updated = self.data.update_code_table(
                    item=item,
                    language=self.language,
                    platform=self.platform,
                    local_path=local_path,
                    download_url=download_url,
                    content=content,
                )
                if (new or updated) == 1:
                    self.requester.download(local_path, download_url)
                new_items += new
                updated_items += updated
            # update target count for new items
            target_count -= (new_items + updated_items)
            print "Crawler stored {} new items and updated {} items in the database".format(
                new_items, updated_items)
            # in case our results are incomplete or we have more than 1000 items we need to narrow down our search field
            if (incomplete_items or (this_item_count > 1000)) and \
                    ((next_item + 1) == len(self.size_range)):
                # get items with different ordering
                if order_state == 0:
                    order_state = 1
                elif order_state == 1:
                    order_state = 2
                elif order_state == 2:
                    new_boundaries = []
                    for i in xrange(len(self.size_range) - 1):
                        new_boundaries.append(
                            int((self.size_range[i] + self.size_range[i + 1]) / 2) + 1)
                    # include the new boundaries into our sorted list
                    self.size_range.update(new_boundaries)
                    print "Crawler introduced new boundaries: {}".format(self.size_range)
                    current_item = 0
                    next_item = 1
                    order_state = 1
            # jump to the next search area until we are at the end
            elif (next_item + 1) < len(self.size_range):
                current_item += 1
                next_item += 1
            timeout = True if (start_time < (int(time() - (60 * 60 * 8)))) else False
            if (target_count == 0) or timeout:
                print "Crawler is finished"
                if timeout:
                    print "Timeout after 8 hours"
                break
def test_update():
    temp = SortedSet(range(0, 80))
    temp._reset(7)
    temp.update(range(80, 90), range(90, 100))
    assert all(temp[val] == val for val in range(100))
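# Compatibility note (hedged): the test above uses _reset(7), while the
# variant further below passes load=7 to the constructor. Newer
# sortedcontainers releases removed the load= keyword; _reset() is the
# replacement. A minimal check against the current API:
from sortedcontainers import SortedSet

_t = SortedSet(range(10))
_t._reset(4)  # adjust the internal load factor; the contents are unchanged
assert list(_t) == list(range(10))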
class RequestHandler:
    def __init__(self, term, extension):
        # setup configuration
        self.config = Config()
        # setup GitHub OAuth
        self.auth = HTTPBasicAuth(self.config.github['user'],
                                  self.config.github['token'])
        # configure crawler specifics
        self.github_url = 'https://api.github.com/search/code?q='  # use the GitHub search API
        # search for contract in files with extension .sol
        self.query = '{}+extension:{}'.format(term, extension)
        self.sort = '&sort='
        self.order = '&order='
        self.size_range = SortedSet()
        self.size_range.update([0, 384001])  # stick to GitHub size restrictions
        self.initial_items = []

    def rate_limit(self, request):
        limit = requests.get('https://api.github.com/rate_limit', auth=self.auth)
        limit_json = limit.json()
        # '==' rather than 'is': identity checks on strings/ints are unreliable
        if request == 'search':
            remaining_search = limit_json["resources"]["search"]["remaining"]
            reset_time = limit_json["resources"]["search"]["reset"]
            if remaining_search == 0:
                # wait until we can do search requests again
                sleep_time = reset_time - int(time())
                print "Search limit reached. Waiting {} seconds".format(sleep_time)
                sleep(sleep_time)
        elif request == 'core':
            remaining_download = limit_json["resources"]["core"]["remaining"]
            reset_time = limit_json["resources"]["core"]["reset"]
            if remaining_download == 0:
                # wait until we can do core requests again
                sleep_time = reset_time - int(time())
                print "Core limit is reached. Waiting {} seconds".format(sleep_time)
                sleep(sleep_time)

    def search_github(self, lower, upper, order_state):
        self.rate_limit(request='search')
        if isinstance(lower, int) and isinstance(upper, int) and isinstance(order_state, int):
            base_url = self.github_url + self.query + \
                       "+size:>{}+size:<{}+size:{}".format(lower, upper, upper)
            if order_state == 1:
                url = base_url + self.sort + "indexed" + self.order + "desc"
            elif order_state == 2:
                url = base_url + self.sort + "indexed" + self.order + "asc"
            else:
                url = base_url
            print "Get contracts from {}".format(url)
            response = requests.get(url, auth=self.auth)
        else:
            response = requests.get(self.github_url + self.query, auth=self.auth)
        if response.status_code == 200:
            result = response.json()
        else:
            print "No valid GitHub credentials found."
            result = None
        return result

    def get_total_count(self):
        incomplete_results = True
        result = dict()
        # Get total number of files that contain search term
        while incomplete_results:
            print "Get total number of contracts from {}".format(
                self.github_url + self.query)
            try:
                result = self.search_github(None, None, None)
                incomplete_results = result["incomplete_results"]
            except TypeError:
                print "Could not search GitHub"
                break
        # in case we have less than 1000 results, store this to limit API calls
        self.initial_items = result["items"]
        total_count = result["total_count"]
        return total_count

    def get_items(self, lower, upper, target_count, order_state):
        items = self.initial_items
        this_item_count = len(items)
        incomplete_items = False
        try:
            result = self.search_github(lower, upper, order_state)
            items = result["items"]
            this_item_count = len(items)
            incomplete_items = True if (this_item_count < target_count) else False
        except TypeError:
            print "Could not search GitHub"
        return items, this_item_count, incomplete_items

    def get_download_url_content(self, url):
        self.rate_limit(request='core')
        # GitHub only gives you the download url when you request it for each file
        response = requests.get(url, auth=self.auth)
        if response.status_code == 200:
            result = response.json()
            download_url = result["download_url"]
            # This is the hash for the complete file line by line
            content_full = result["content"]
            # We want just one hash for the whole file for faster comparison of changes
            content = hashlib.md5(content_full).hexdigest()
        else:
            print "No valid GitHub credentials found."
            download_url = None
            content = None
        return download_url, content

    def store_locally(self, url, repository_id, remote_path):
        # get download url
        download_url, content = self.get_download_url_content(url)
        # create folder structure
        current_path = path.dirname(path.abspath(__file__))
        file_path = '{}/code-folder/{}/{}'.format(current_path, repository_id, remote_path)
        local_path = file_path.rpartition("/")[0]
        if not path.exists(local_path):
            makedirs(local_path)
        return file_path, download_url, content

    def download(self, file_path, download_url):
        self.rate_limit(request='core')
        print "Downloading {}".format(file_path)
        response = requests.get(download_url, auth=self.auth)
        with open(file_path, 'wb') as out_file:
            out_file.write(response.content)
        del response
def stress_issubset(sst):
    that = SortedSet(sst)
    that.update(range(1000))
    assert sst.issubset(that)
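# For reference, a small standalone check of the subset relation the stress
# test above exercises (assumes sortedcontainers is installed).
from sortedcontainers import SortedSet

small = SortedSet([2, 4, 6])
big = SortedSet(range(10))
assert small.issubset(big)   # every element of small is in big
assert not big.issubset(small)
assert small <= big          # operator form of the same relation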
def _calcMultiGeneSNPcorr(self, cr, genes, REF, wAlleles=True):
    filtered = {}
    pos = []
    for gene in genes:
        if self._joint and self._MAP is not None:
            G = self._GENEID[gene]
            P = SortedSet(REF[str(cr)][1].irange(G[1] - self._window,
                                                 G[2] + self._window))
            if gene in self._MAP:
                P.update(list(REF[str(cr)][0].getSNPsPos(self._MAP[gene][0])))
        elif self._MAP is None:
            G = self._GENEID[gene]
            P = REF[str(cr)][1].irange(G[1] - self._window,
                                       G[2] + self._window)
        else:
            if gene in self._MAP:
                P = set(REF[str(cr)][0].getSNPsPos(self._MAP[gene][0]))
            else:
                P = []

        DATA = REF[str(cr)][0].get(list(P))

        # Select: keep, per SNP id, the record with the largest frequency
        # value that passes the GWAS, MAF, and (optionally) allele filters
        for D in DATA:
            if D[0] in self._GWAS and D[1] > self._MAF and (
                    D[0] not in filtered or filtered[D[0]][0] < D[1]) and (
                    not wAlleles or
                    (self._GWAS_alleles[D[0]][0] == D[3]
                     and self._GWAS_alleles[D[0]][1] == D[4])):
                filtered[D[0]] = [D[1], D[2]]

        pos.append(len(filtered))

    # Calc corr
    RID = list(filtered.keys())
    use = np.array([filtered[rid][1] for rid in RID])

    if len(use) > 1:
        if self._useGPU:
            C = cp.asnumpy(cp.corrcoef(cp.asarray(use)))
        else:
            C = np.corrcoef(use)
    else:
        C = np.ones((1, 1))

    return C, np.array(RID), pos
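# As a sanity check on the correlation step above: np.corrcoef treats each
# row of `use` as one variable (here, one SNP's genotype vector across
# samples). A toy example, unrelated to any real reference panel:
import numpy as np

use = np.array([[0., 1., 2., 1.],
                [0., 2., 4., 2.],    # row 1 = 2 * row 0
                [2., 1., 0., 1.]])   # row 2 = 2 - row 0
C = np.corrcoef(use)
assert C.shape == (3, 3)
assert np.allclose(np.diag(C), 1.0)
assert np.isclose(C[0, 1], 1.0)     # perfectly correlated
assert np.isclose(C[0, 2], -1.0)    # perfectly anti-correlated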
def test_update():
    temp = SortedSet(range(0, 80), load=7)
    temp.update(range(80, 90), range(90, 100))
    assert all(temp[val] == val for val in range(100))
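# SortedSet.update accepts any number of iterables, merging, deduplicating,
# and keeping sorted order in one call. A quick illustration (the `load`
# argument used in the test above is specific to older sortedcontainers
# releases and is omitted here):
from sortedcontainers import SortedSet

ss = SortedSet([5, 1])
ss.update(range(8, 10), [3, 3, 7])   # duplicates collapse, order is maintained
assert list(ss) == [1, 3, 5, 7, 8, 9]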
# Python 2 script; `preco`, `bolao`, and the globals below are defined
# elsewhere in the script. Comments and messages translated from Portuguese.
import sys
import random
from ast import literal_eval

from sortedcontainers import SortedSet


def jogar(lista):
    global jogos, universo, valortotal, valorporjogador
    print
    print 'user game list:', lista
    print
    # iterate over the list of tuples
    for tupla in lista:
        # parse the script arguments and turn each string into a list;
        # append the third element (shares) if the user did not pass it,
        # then unpack into three variables
        t = list(literal_eval(tupla))
        if len(t) < 3:
            t.append(1)
        print t
        n, d, c = t
        # Mega-Sena accepts at most 15 numbers per ticket
        if d > 15:
            print 'the number of picks cannot exceed 15'
            print
            sys.exit()
        # compute prices
        ci, ct = preco(n, d)
        cb = bolao(ct, c)
        valortotal += ct
        print 'the pool has', len(universo), 'numbers available'
        print 'generating', n, 'ticket(s) of', d, 'numbers split into', c, 'share(s)'
        print 'cost', 'individual:', ci, '| total:', ct, '| pool share:', cb
        print
        # iterate over the requested number of tickets
        for j in range(0, n):
            # if the pool does not have enough numbers left, use the ones we
            # still have, reset the pool to the numbers 1 to 60, and fill the
            # missing picks with a random sample
            if len(universo) < d:
                print 'the pool has', len(universo), 'numbers available but we need', d
                print 'adding numbers to the pool'
                print
                jogo = SortedSet(universo)
                universo = range(1, 61)
                while len(jogo) < d:
                    jogo.update(random.sample(universo, d - len(jogo)))
            # otherwise take a random sample straight from the pool
            else:
                jogo = SortedSet(random.sample(universo, d))
            # remove the used numbers from the pool
            universo = [e for e in universo if e not in jogo]
            # append the ticket to the list of tickets
            jogos.append(jogo)
    valorporjogador = valortotal / c
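# The pool-refill pattern above (draw without replacement, reseed when the
# pool runs dry) isolated into a small helper. A sketch with hypothetical
# names, not part of the original script:
import random
from sortedcontainers import SortedSet

def draw_ticket(pool, picks, full_range=range(1, 61)):
    """Draw `picks` distinct numbers, reseeding from `full_range` if needed."""
    if len(pool) < picks:
        ticket = SortedSet(pool)   # keep whatever is left of the pool
        pool = list(full_range)    # reseed the pool
        while len(ticket) < picks:
            ticket.update(random.sample(pool, picks - len(ticket)))
    else:
        ticket = SortedSet(random.sample(pool, picks))
    pool = [n for n in pool if n not in ticket]
    return ticket, pool

ticket, pool = draw_ticket(list(range(1, 61)), 6)
assert len(ticket) == 6 and all(n not in pool for n in ticket)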
def update_keys(self, x):
    # rebuild self.lines as a SortedSet so the keys stay ordered
    # (an unused `global X` declaration was dropped)
    temp_lines = SortedSet()
    temp_lines.update(self.lines)
    self.lines = temp_lines
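# The temp-set dance above is equivalent to constructing the SortedSet
# directly from the old container; a one-line demonstration:
from sortedcontainers import SortedSet

lines = [40, 10, 10, 25]   # e.g. an unsorted list with duplicates
assert list(SortedSet(lines)) == [10, 25, 40]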
# PriorityQueue, Schedule, GanttChart, and config are project-local modules.
from sortedcontainers import SortedSet


class SchedSimulator(object):
    def __init__(self, app_list, pe_list):
        self.app_list = app_list
        # self.app_list = sorted(app_list, key=lambda _app: _app.priority)
        self.layer_list = [l for _app in app_list for l in _app.layer_list]
        self.layer_set = set(self.layer_list)
        self.gene2fit = {}  # FIXME
        self.single_mode = False
        first_node = app_list[0].layer_list[0]
        if len(app_list) == 1 and first_node.get_index() != 0:
            self.single_mode = True
        self.num_layer = len(self.layer_list)
        self.throughput_thresh = 100
        self.draw_iteration = 1
        self.num_pe = len(pe_list)
        self.prio_step = len(self.layer_list)  # total number of layers
        self.pe_list = pe_list
        # variables for CPU utilization constraint
        self.elapsed_time_per_pe = [0] * self.num_pe
        self._ready_queues = [PriorityQueue() for _ in range(self.num_pe)]
        self._rq_set = set()

    def _init_ready_queue(self):
        self._ready_queues = [PriorityQueue() for _ in range(self.num_pe)]
        self._rq_set = set()

    def _init_all_apps(self):
        # application initialization => edge and layer initialization
        for app in self.app_list:
            app.do_init()

    def do_init(self):
        self.iteration = [0] * self.num_layer
        self.response_time = [0.0] * len(self.app_list)
        self.pe_start_time = [-1 for _ in range(self.num_pe)]
        self.pe_end_time = [[-1] for _ in range(self.num_pe)]
        self._init_ready_queue()
        self._init_all_apps()
        self.timeline = SortedSet()  # FIXME need to change name (next_sim_time?)
        offset_set = set()
        for l in self.layer_list:
            if l.offset >= 0:
                offset_set.add(l.offset)
        self.timeline.update(list(offset_set))
        self.occupy_times = [0] * self.num_pe

    def find_runnable_layers(self, t):
        runnable_layers = []
        layer_list = self.layer_set - self._rq_set
        for app in self.app_list:
            for l in app.layer_list:
                if l.need_in_edge_check and l.need_out_edge_check \
                        and l.offset <= t and app.check_runnable(l, t):
                    runnable_layers.append(l)
        return runnable_layers

    def _enqueue(self, t):
        runnable_layers = self.find_runnable_layers(t)
        for l in runnable_layers:
            if self.single_mode:
                idx = l.get_app_index()
            else:
                idx = l.get_index()
            pe = l.pe.get_idx()
            # prio = l.get_priority() + l.iteration * self.prio_step
            prio = l.get_app_priority() * self.prio_step + l.iteration
            # print("Queue " + str(pe) + " insert : " + l.name + " prio : " + str(prio))
            self._ready_queues[pe].insert(prio, l)
            self._rq_set.add(l)

    def _set_pe_time(self, pe, iteration, start_time, end_time):
        if self.pe_start_time[pe] == -1:
            self.pe_start_time[pe] = start_time
        try:
            if self.pe_end_time[pe][iteration] < end_time:
                self.pe_end_time[pe][iteration] = end_time
        except IndexError:
            assert iteration == len(self.pe_end_time[pe])
            self.pe_end_time[pe].append(end_time)

    @staticmethod
    def _get_csts_and_objs(fitness, mapping):
        objs = []
        csts = []
        for idx, cst in enumerate(fitness.csts):
            if cst is not None:
                cst_value = cst.constraint_function(mapping)
                csts.append(cst_value)
            else:
                csts.append((0, ))
        for idx, obj in enumerate(fitness.objs):
            obj_value = obj.objective_function(mapping)
            objs.append(obj_value)
        return csts, objs

    def _draw_gantt(self, gantt, gantt_name, mapping, fitness):
        csts, objs = SchedSimulator._get_csts_and_objs(fitness, mapping)
        available_results = True
        for idx, (cst, value) in enumerate(zip(fitness.csts, csts)):
            if value[0] != 0:
                available_results = False
                break
        config.available_results = available_results
        if available_results:
            print("\nPE Mapping per layer: " + str(mapping))
            if len(objs) > config.num_of_app:
                print("\n\t[ Whole Objective ]")
                for idx, value in enumerate(objs):
                    if idx >= config.num_of_app:
                        print("\t\tObjective function value [by Energy Consumption] :\t %d"
                              % value[0])
            if len(csts) > config.num_of_app:
                print("\n\t[ Whole Constraint ]")
                for idx, (cst, value) in enumerate(zip(fitness.csts, csts)):
                    if idx >= config.num_of_app:
                        print("\t\tConstraint function value [by %s] :\t %.2f -> %.2f"
                              % (type(cst).__name__, value[-1], value[0]))
            objs_result = []
            for idx, app in enumerate(self.app_list):
                print("\n\t[ %s (Period: %d, Priority: %d) ]"
                      % (app.name, app.get_period(), app.get_priority()))
                print("\t\tObjective function value [by %s]:\t%.2f"
                      % (config.app_to_obj_dict[idx], objs[idx][0]))
                if config.app_to_cst_dict[idx] != 'None':
                    print("\t\tConstraint function value [by %s]:\t%.2f"
                          % (config.app_to_cst_dict[idx], csts[idx][-1]))
                config.objs_result_by_app[idx].append(round(objs[idx][0], 2))
                objs_result.append(round(objs[idx][0], 2))
            config.file_name = "{}{}_{}_{}_{}_{}_{}_{}_{}_{}_{}".format(
                config.save_path + "/" + config.name, str(config.sched_method),
                str(config.hyper_parameter), str(config.processor),
                str(config.priority), str(config.period),
                str(config.cpu_config), str(config.objs), str(objs_result),
                str(config.csts), config.analyzer)
            gantt.file_name = config.file_name + "#{}.png".format(
                config.gantt_chart_idx)
            gantt.draw_gantt_chart()

    def _pop_and_get_layer_info(self, q):
        _, l = q.pop()  # pop layer from _ready_queues
        self._rq_set.remove(l)
        if self.single_mode:
            layer_idx = l.get_app_index()
        else:
            layer_idx = l.get_index()
        pe = l.pe.get_idx()
        app = l.get_app()
        return l, layer_idx, pe, app

    def _update_timeline(self, l, time):
        timeline = self.timeline
        if l.offset >= 0:
            # print("ID: {} Name: {} Time: {} {} update".format(id(l), l.name, l.offset, l.offset + l.get_period()))
            # timeline.add(l.offset)
            # timeline.add((l.iteration + 1) * l.get_period())
            timeline.add(l.get_period() + l.offset)
            l.set_offset(l.get_period() + l.offset)
        timeline.add(time)

    def do_simulation(self, mapping, iterations=(0, 1), draw_gantt=False,
                      gantt_name="test.png", fitness=None):
        if draw_gantt:
            pe_names = [pe.name for pe in self.pe_list]
            gantt = GanttChart(gantt_name, pe_names)
        sim_iteration = iterations[0]
        end_iteration = iterations[1]
        timeline = self.timeline
        occupy_times = self.occupy_times
        sched = Schedule(self.num_pe)
        # Start scheduling simulation
        while sim_iteration < end_iteration:
            t = timeline.pop(0)
            self._enqueue(t)
            # Check every PE's ready queue (priority queue)
            for pe_idx, q in enumerate(self._ready_queues):
                if occupy_times[pe_idx] > t or q.size() == 0:
                    continue
                l, layer_idx, pe, app = self._pop_and_get_layer_info(q)
                execution_time, transition_time, transition_time_list = \
                    app.do_layer(l, pe, t)
                end_time = t + execution_time
                occupy_times[pe_idx] = end_time + transition_time
                # Update the iteration's end time
                self._set_pe_time(pe, l.iteration, t, occupy_times[pe_idx])
                self.iteration[layer_idx] = self.iteration[layer_idx] + 1
                # XXX: Fix for a Gantt chart bug (SqueezeNet transition time
                # issue); option 4 below was chosen.
                # 1. Original:
                # self._update_timeline(l, occupy_times[pe_idx])
                # 2. Only possible in single-app scheduling:
                # self._update_timeline(l, occupy_times[pe_idx])
                # for time in transition_time_list:
                #     self._update_timeline(l, time)
                # 3. Multi-app scheduling possible, but '[]' can occur in
                #    transition_time_list:
                # for time in transition_time_list:
                #     self._update_timeline(l, time)
                # 4. Final implementation:
                if transition_time_list == []:
                    self._update_timeline(l, occupy_times[pe_idx])
                else:
                    for time in transition_time_list:
                        self._update_timeline(l, time)
                l.increase_iter()
                if l.iteration <= 1:
                    time_tuple = (pe, l, t, end_time, transition_time)
                    sched.add_sched(time_tuple)
                # FIXME What is the second condition for?
                if draw_gantt and occupy_times[pe_idx] != t \
                        and l.iteration <= self.draw_iteration:
                    time_tuple = (l.get_name(), self.pe_list[pe].name, t,
                                  end_time, transition_time)
                    gantt.add_task(time_tuple)
                self.elapsed_time_per_pe[pe_idx] += (end_time - t)
                if l.is_end_node and l.iteration == 1:
                    self.response_time[self.app_list.index(app)] = end_time
            inc_sim_iteration = True
            for n in self.iteration:
                if n < end_iteration:
                    inc_sim_iteration = False
            if inc_sim_iteration:
                sim_iteration += 1
        if draw_gantt:
            self._draw_gantt(gantt, gantt_name, mapping, fitness)
        return sched

    def get_response_time(self, app):
        return self.response_time[self.app_list.index(app)]
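# SchedSimulator only assumes a PriorityQueue with insert(prio, item), pop()
# returning the (prio, item) pair with the smallest priority, and size().
# A minimal heapq-based sketch of that assumed interface; the project's real
# class may differ.
import heapq
import itertools

class PriorityQueue:
    """Minimal priority queue matching the interface used by SchedSimulator."""
    def __init__(self):
        self._heap = []
        self._counter = itertools.count()  # tie-breaker for equal priorities

    def insert(self, prio, item):
        heapq.heappush(self._heap, (prio, next(self._counter), item))

    def pop(self):
        prio, _, item = heapq.heappop(self._heap)
        return prio, item

    def size(self):
        return len(self._heap)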
# `startNodes`, `visited`, `allNodes`, `prereqs`, and `graph` are built
# earlier in the script.
# All nodes adjacent to our current subgraph
adjacents = SortedSet(startNodes)
answer = []
startNode = adjacents[0]
visited.add(startNode)
while visited != allNodes:
    # The next node to visit is (1) the first in the alphabet (2) whose
    # prereqs have all been visited. We check each candidate node's prereqs
    # and see whether they form a subset of the visited set.
    idx = 0
    while not (SortedSet(prereqs[adjacents[idx]]) <= visited):
        idx += 1
    nextNode = adjacents.pop(idx)
    visited.add(nextNode)
    adjacents.update(graph[nextNode])
    answer.append(nextNode)

answer = ''.join(answer)
with open('part1output.txt', 'w') as answerFile:
    answerFile.write(answer)
assert len(answer) == len(visited)
print(answer)
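# The same alphabetical topological order on a toy dependency graph,
# self-contained so the loop above can be checked in isolation. The edge
# list and expected answer come from a well-known puzzle example; the
# variable names here are illustrative.
from sortedcontainers import SortedSet

edges = [('C', 'A'), ('C', 'F'), ('A', 'B'), ('A', 'D'),
         ('B', 'E'), ('D', 'E'), ('F', 'E')]
nodes = {n for e in edges for n in e}
graph = {n: set() for n in nodes}
prereqs = {n: set() for n in nodes}
for pre, post in edges:
    graph[pre].add(post)
    prereqs[post].add(pre)

order = []
visited = set()
ready = SortedSet(n for n in nodes if not prereqs[n])
while ready:
    # first-in-alphabet node whose prerequisites are all visited
    node = next(n for n in ready if prereqs[n] <= visited)
    ready.remove(node)
    visited.add(node)
    order.append(node)
    ready.update(post for post in graph[node] if post not in visited)
assert ''.join(order) == 'CABDFE'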