def generate(config, dnat=False, test=True):
    public_ip = config["public_ip"]
    current_ip = config["base_ip"]
    dnsmasq_content = ""
    for group in config["groups"].values():
        if not dnat:
            c = chunks([proxy["domain"] for proxy in group["proxies"]], 5)
        else:
            c = chunks([proxy["domain"] for proxy in group["proxies"] if proxy["dnat"]], 5)
        for chunk in c:
            if not dnat:
                dnsmasq_content += generate_dns(chunk, public_ip)
            else:
                dnsmasq_content += generate_dns(chunk, current_ip)
    if test:
        if not dnat:
            dnsmasq_content += generate_dns('ptest.verdandi.is', public_ip)
            dnsmasq_content += generate_dns('ptest2.verdandi.is', public_ip)
        else:
            dnsmasq_content += generate_dns('ptest.verdandi.is', current_ip)
            dnsmasq_content += generate_dns('ptest2.verdandi.is', current_ip)
    if dnat:
        for group in config["groups"].values():
            for proxy in group["proxies"]:
                if not proxy["dnat"]:
                    current_ip = long2ip(ip2long(current_ip) + 1)
                    dnsmasq_content += generate_dns(proxy["domain"], current_ip)
    return dnsmasq_content
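# Note: every snippet in this collection calls a chunks() helper (also seen as
# util.chunks / u.chunks) whose definition is not included here. The sketch below
# is only an assumption of what such a helper typically looks like; the real
# signature differs per project (several call sites index or take len() of the
# result, implying a list-returning variant, and some pass extra arguments).
def chunks(seq, n):
    """Split seq into consecutive pieces of length n; the last piece may be shorter."""
    return [seq[i:i + n] for i in range(0, len(seq), n)]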
def train(self, X_train, X_val):
    train_true = filter(lambda x: x[2] == 1, X_train)
    train_false = filter(lambda x: x[2] == 0, X_train)
    val_true = filter(lambda x: x[2] == 1, X_val)
    val_false = filter(lambda x: x[2] == 0, X_val)

    n_train_true = len(train_true)
    n_val_true = len(val_true)

    make_epoch_helper = functools.partial(make_epoch,
                                          train_true=train_true,
                                          train_false=train_false,
                                          val_true=val_true,
                                          val_false=val_false)

    logging.info("Starting training...")
    epoch_iterator = ParallelBatchIterator(make_epoch_helper, range(P.N_EPOCHS),
                                           ordered=False, batch_size=1,
                                           multiprocess=False, n_producers=1)

    for epoch_values in epoch_iterator:
        self.pre_epoch()

        train_epoch_data, val_epoch_data = epoch_values
        train_epoch_data = util.chunks(train_epoch_data, P.BATCH_SIZE_TRAIN)
        val_epoch_data = util.chunks(val_epoch_data, P.BATCH_SIZE_VALIDATION)

        self.do_batches(self.train_fn, train_epoch_data, self.train_metrics)
        self.do_batches(self.val_fn, val_epoch_data, self.val_metrics)

        self.post_epoch()

        logging.info("Setting learning rate to {}".format(P.LEARNING_RATE * ((0.985)**self.epoch)))
        self.l_r.set_value(P.LEARNING_RATE * ((0.985)**self.epoch))
def __call__(self, message, state=None, *, pad=True):
    state = state or self.initial_state
    prepared_message = message + (self.padding(len(message)) if pad else b"")
    assert len(prepared_message) % self.block_size == 0
    for block in chunks(prepared_message, self.block_size):
        state = self.compress(state, block)
    return state
def getstatusforfids(self, fids):
    status = {}
    for chunk in chunks(fids, 50):
        for f in arlalow.fetchbulkstatus(self.fsconn, chunk):
            status[f["fid"]] = f["status"]
    return status
def extract_all_labels(filenames, out_filepath=DATA_FOLDER+'labels.p', chunk_size=2000):
    print "EXTRACTING ALL LABELS INTO {0}".format(out_filepath)
    all_labels = []
    label_dict = {}

    filenames_chunks = util.chunks(filenames, chunk_size)
    for i, chunk in enumerate(filenames_chunks):
        pool = Pool(processes=util.CPU_COUNT)
        chunk_labels = pool.map(extract_labels, chunk)
        pool.close()

        for filepath, labels in zip(chunk, chunk_labels):
            if labels is not None:
                file_id = util.filename_without_extension(filepath)
                label_dict[file_id] = labels
                all_labels += labels
        print i+1, '/', len(filenames_chunks)

    # Write labels to file
    with open(out_filepath, 'w') as f:
        pickle.dump(label_dict, f)

    print '\nLabels:'
    print len(set(all_labels))
    print Counter(all_labels)
def predict(self, data, modes):
    """Predict whether a list of positions follows a train route by detecting
    the nearest train stops. Input is the pandas data frame of measurements
    and an array of current mode predictions. Returns an array of predicted
    modes of the same size as the input data frame has rows.
    """
    # extract lat/lon from data frame
    lat = data['WLATITUDE'].values
    lon = data['WLONGITUDE'].values

    # chunk is a tuple (start_idx, end_idx, mode)
    for start_idx, end_idx, _ in ifilter(lambda chunk: chunk[2] in [MODE_CAR, MODE_BUS, MODE_TRAIN],
                                         chunks(modes, include_values=True)):
        # test for distance first
        lat_seg = lat[start_idx:end_idx]
        lon_seg = lon[start_idx:end_idx]
        valid_lat_seg = lat_seg[np.where(np.invert(np.isnan(lat_seg)))[0]]
        valid_lon_seg = lon_seg[np.where(np.invert(np.isnan(lon_seg)))[0]]
        if len(valid_lon_seg) == 0:
            continue

        # TODO: parameters have to be tuned carefully
        is_train = predict_mode_by_location(valid_lat_seg, valid_lon_seg,
                                            self.train_location_tree,
                                            self.train_location_dict,
                                            self.train_route_dict,
                                            dist_thre=400,
                                            dist_pass_thres=7,
                                            num_stops_thre=3,
                                            dist_pass_thres_perc=0.7)

        # check entry point distance
        entry_pt_near = -1
        exit_pt_near = -1
        if start_idx - 1 >= 0:
            if not np.isnan(lat[start_idx-1]):
                nearest_station = find_nearest_station(lat[start_idx-1], lon[start_idx-1],
                                                       self.train_location_tree,
                                                       self.dist_thres_entry_exit)
                if len(nearest_station) != 0:
                    entry_pt_near = 1
                else:
                    entry_pt_near = 0
        if end_idx < len(modes):
            if not np.isnan(lat[end_idx]):
                nearest_station = find_nearest_station(lat[end_idx], lon[end_idx],
                                                       self.train_location_tree,
                                                       self.dist_thres_entry_exit)
                if len(nearest_station) != 0:
                    exit_pt_near = 1
                else:
                    exit_pt_near = 0

        if is_train or entry_pt_near + exit_pt_near == 2:
            modes[start_idx:end_idx] = MODE_TRAIN
        else:
            modes[start_idx:end_idx] = MODE_CAR

    return modes
def collect_tweets_by_ids(tweet_ids_config_filepath, output_folder, config):
    apikeys = list(config['apikeys'].values()).pop()

    tweet_ids_config = {}
    with open(os.path.abspath(tweet_ids_config_filepath), 'r') as tweet_ids_config_rf:
        tweet_ids_config = json.load(tweet_ids_config_rf)

    max_range = 100
    current_ix = tweet_ids_config['current_ix'] if ('current_ix' in tweet_ids_config) else 0
    total = len(tweet_ids_config['tweet_ids'][current_ix:])
    tweet_id_chuncks = util.chunks(tweet_ids_config['tweet_ids'][current_ix:], max_range)

    for tweet_ids in tweet_id_chuncks:
        try:
            twitterCralwer = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS, output_folder=output_folder)
            twitterCralwer.lookup_tweets_by_ids(tweet_ids)
            current_ix += len(tweet_ids)
        except Exception as exc:
            logger.error(exc)
            logger.error(util.full_stack())
            # Ctrl+C is not handled specially: on restart, collection resumes from the
            # last saved chunk, so a few duplicate tweets are acceptable.
            pass

        tweet_ids_config['current_ix'] = current_ix
        flash_cmd_config(tweet_ids_config, tweet_ids_config_filepath, output_folder)

        logger.info('COMPLETED -> (current_ix: [%d/%d])' % (current_ix, total))
        logger.info('PAUSE %ds to CONTINUE...' % WAIT_TIME)
        time.sleep(WAIT_TIME)
    else:
        logger.info('[tweets_by_ids] ALL COMPLETED')
def decode(self, server, block_header, target, job_id=None, extranonce2=None):
    if block_header:
        job = Object()

        binary_data = block_header.decode('hex')
        data0 = np.zeros(64, np.uint32)
        data0 = np.insert(data0, [0] * 16, unpack('IIIIIIIIIIIIIIII', binary_data[:64]))

        job.target = np.array(unpack('IIIIIIII', target.decode('hex')), dtype=np.uint32)
        job.header = binary_data[:68]
        job.merkle_end = np.uint32(unpack('I', binary_data[64:68])[0])
        job.time = np.uint32(unpack('I', binary_data[68:72])[0])
        job.difficulty = np.uint32(unpack('I', binary_data[72:76])[0])
        job.state = sha256(STATE, data0)
        job.f = np.zeros(8, np.uint32)
        job.state2 = partial(job.state, job.merkle_end, job.time, job.difficulty, job.f)
        job.targetQ = 2**256 / int(''.join(list(chunks(target, 2))[::-1]), 16)
        job.job_id = job_id
        job.extranonce2 = extranonce2
        job.server = server

        calculateF(job.state, job.merkle_end, job.time, job.difficulty, job.f, job.state2)

        if job.difficulty != self.difficulty:
            self.set_difficulty(job.difficulty)

        return job
def decode(self, server, block_header, target, job_id=None, extranonce2=None):
    if block_header:
        job = Object()

        binary_data = block_header.decode('hex')
        # data0 = list(unpack('<16I', binary_data[:64])) + ([0] * 48)

        job.headerX = binary_data[:76]
        job.dataX = unpack('<19I', job.headerX)

        job.target = unpack('<8I', target.decode('hex'))
        job.header = binary_data[:68]
        job.merkle_end = uint32(unpack('<I', binary_data[64:68])[0])
        job.time = uint32(unpack('<I', binary_data[68:72])[0])
        job.difficulty = uint32(unpack('<I', binary_data[72:76])[0])
        # job.state = sha256(STATE, data0)
        job.targetQ = 2**256 / int(''.join(list(chunks(target, 2))[::-1]), 16)
        job.job_id = job_id
        job.extranonce2 = extranonce2
        job.server = server

        if job.difficulty != self.difficulty:
            self.set_difficulty(job.difficulty)

        return job
def crack_ecb_oracle(oracle_fn, prefix_length=0):
    block_size = guess_block_size(oracle_fn)
    if not looks_like_ecb(oracle_fn(b"A" * 100), block_size):
        raise ValueError("oracle_fn does not appear to produce ECB mode output")
    result = bytearray()
    while True:
        short_block_length = (block_size - len(result) - 1 - prefix_length) % block_size
        short_input_block = b"A" * short_block_length
        block_index = (len(result) + prefix_length) // block_size
        block_to_look_for = chunks(oracle_fn(short_input_block))[block_index]
        for guess in all_bytes_by_frequency:
            test_input = short_input_block + result + bytes([guess])
            if chunks(oracle_fn(test_input))[block_index] == block_to_look_for:
                result.append(guess)
                break
        else:  # if no byte matches
            return pkcs7_unpad(result)
def add_text(self, text):
    if len(text) + len(self._lines[self.point[0]]) > self.draw_width:
        self.point_to_next_line()
    if len(text) > self.draw_width:
        lines_to_add = chunks(text, self.draw_width)
        lines_to_advance = len(lines_to_add)
        for line in lines_to_add:
            self._lines.append(line)
        self.adjust_point_by_lines(lines_to_advance)
    else:
        self._lines[self.point[0]] += text
        self.point_to_end_of_line()
def cluster_to_kml(user, cluster, cluster_id):
    """
    Creates a single, or possibly multiple, KML files for a given cluster.
    A KML file is limited by MyMaps to having only 10 layers, so only 10
    sections will be in a given KML file.

    It is the responsibility of the caller to check the existence and
    formatting of the cluster.
    """
    Sections = get_section_db()
    for i, chunk in enumerate(chunks(cluster, 10)):
        sections = map(lambda section_id: Sections.find_one({'_id': section_id}), chunk)
        sections_to_kml("%s_cluster_data_kml/CLUSTER_%s_%i" % (user, str(cluster_id), i), sections)
def cross_validation(self, fold, epoch):
    print 'doing cross validation...'
    splited_data = list(chunks(self.data, fold))
    hyper_test = defaultdict(int)
    for idx, (train, test) in enumerate(splited_data):
        for c in self.C:
            for rho_0 in self.RHO_0:
                weight = self.train(train, rho_0, c, epoch=epoch)
                precision = self.test(test, weight)
                print 'done fold %i' % idx, ' on [rho_0: %s, c: %s]' % (rho_0, c)
                hyper_test[(rho_0, c)] += precision
    return map(lambda (x, y): (x, y/fold), hyper_test.iteritems())
def start_producers(self, result_queue):
    jobs = Queue()
    n_workers = params.N_PRODUCERS
    batch_count = 0

    # Flag used for keeping values in queue in order
    last_queued_job = Value('i', -1)

    for job_index, batch in enumerate(util.chunks(self.X, self.batch_size)):
        batch_count += 1
        jobs.put((job_index, batch))

    # Define producer (putting items into queue)
    def produce(id):
        while True:
            job_index, task = jobs.get()
            if task is None:
                # print id, " fully done!"
                break
            result = self.gen(task)
            while True:
                # My turn to add job done
                if last_queued_job.value == job_index - 1:
                    with last_queued_job.get_lock():
                        result_queue.put(result)
                        last_queued_job.value += 1
                        # print id, " worker PUT", job_index
                    break

    # Start workers
    for i in xrange(n_workers):
        if params.MULTIPROCESS:
            p = Process(target=produce, args=(i,))
        else:
            p = Thread(target=produce, args=(i,))
        p.daemon = True
        p.start()

    # Add poison pills to queue (to signal workers to stop)
    for i in xrange(n_workers):
        jobs.put((-1, None))

    return batch_count, jobs
def profile(subset=1000, multi=True, n_threads=4, batch_size=64, thread_pool=False):
    # Load a bunch of imagenames
    y = util.load_labels()
    y = y[:subset]
    keys = y.index.values

    # Create sublists (batches)
    batched_keys = util.chunks(keys, batch_size)

    if multi:
        augment_multithreaded(batched_keys, n_threads=n_threads, thread_pool=thread_pool)
    else:
        augment_singlethreaded(batched_keys)
def threshold_optimization(p, y):
    print "Optimizing threshold"
    y_images = util.chunks(y, 384*512)

    def dice_objective(threshold):
        p_binary = np.where(p > threshold, 1, 0)
        p_images_binary = util.chunks(p_binary, 384*512)
        mean, std, dices = dice(p_images_binary, y_images)
        return -mean

    x, v, message = scipy.optimize.fmin_l_bfgs_b(dice_objective, 0.5, approx_grad=True,
                                                 bounds=[(0, 1)], epsilon=1e-03)
    print "Optimized, threshold {0}, objective value {1}, termination because {2}".format(x, v, message)
    return x[0]
def refresh_job(self, j):
    j.extranonce2 = self.increment_nonce(j.extranonce2)
    coinbase = j.coinbase1 + self.extranonce + j.extranonce2 + j.coinbase2
    merkle_root = sha256(sha256(unhexlify(coinbase)).digest()).digest()
    for hash_ in j.merkle_branch:
        merkle_root = sha256(sha256(merkle_root + unhexlify(hash_)).digest()).digest()
    merkle_root_reversed = ''
    for word in chunks(merkle_root, 4):
        merkle_root_reversed += word[::-1]
    merkle_root = hexlify(merkle_root_reversed)

    j.block_header = ''.join([j.version, j.prevhash, merkle_root, j.ntime, j.nbits])
    j.time = time()
    return j
def call(self, orderlist):
    assert isinstance(orderlist, list)
    orders = {}
    MAXORDERS = 50
    for ol in util.chunks(orderlist, MAXORDERS):
        # make BDAQ representation of orders from orderlist passed
        self.req.Orders.Order = self.makeorderlist(ol)
        apilog.info('calling BDAQ Api PlaceOrdersNoReceipt')
        result = self.client.service.PlaceOrdersNoReceipt(self.req)
        ors = apiparse.ParsePlaceOrdersNoReceipt(result, orderlist)
        orders.update(ors)
    # note: could put result.Timestamp in order object so that we
    # are saving the BDAQ time.
    return orders
def status_iter(iterable, callback, chunksize=1, reportsize=10):
    itersize = len(iterable)
    starttime = time.time()
    for i, item in enumerate(util.chunks(iterable, chunksize), 1):
        callback(item)
        if i % reportsize == 0:
            done = i * chunksize
            nowtime = time.time()
            numblocks = itersize * 1.0 / (reportsize*chunksize)
            curblock = done / (reportsize*chunksize)
            position = curblock / numblocks
            duration = round(nowtime - starttime)
            durdelta = datetime.timedelta(seconds=duration)
            remaining = round((duration / position) - duration)
            remdelta = datetime.timedelta(seconds=remaining)
            lookuplog.info("Done %s/%s in %s; %s remaining", done, itersize, str(durdelta), str(remdelta))
    lookuplog.info("Finished")
def nfold_cross_validate(data, n=4):
    data_chunks = chunks(data, len(data) / n)
    rmse_values = []
    for i in range(n):
        train_set = flatten(data_chunks[:i] + data_chunks[i + 1:])
        test_set = data_chunks[i]
        classif = nltk.MaxentClassifier.train(train_set)
        test_fs, test_ratings = zip(*test_set)
        results = classif.batch_classify(test_fs)
        set_rmse = rmse(test_ratings, results)
        print 'RMSE: ', set_rmse
        rmse_values.append(set_rmse)
    print 'Average RMSE:', sum(rmse_values) / float(len(rmse_values))
def submit_events(self, events):
    headers = {"Content-Type": "application/json"}
    event_chunk_size = self.event_chunk_size

    for chunk in chunks(events, event_chunk_size):
        payload = {
            "apiKey": self.api_key,
            "events": {"api": chunk},
            "uuid": get_uuid(),
            "internalHostname": get_hostname(),
        }
        params = {}
        if self.api_key:
            params["api_key"] = self.api_key
        url = "%s/intake?%s" % (self.api_host, urlencode(params))

        self.submit_http(url, json.dumps(payload), headers)
def submit_events(self, events):
    headers = {'Content-Type': 'application/json'}
    event_chunk_size = self.event_chunk_size

    for chunk in chunks(events, event_chunk_size):
        payload = {
            'apiKey': self.api_key,
            'events': {'api': chunk},
            'uuid': get_uuid(),
            'internalHostname': get_hostname()
        }
        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '%s/intake?%s' % (self.api_host, urlencode(params))

        self.submit_http(url, json.dumps(payload), headers)
def _start_producers(self, result_queue):
    jobs = Queue()
    n_workers = self.n_producers
    batch_count = 0

    # Flag used for keeping values in queue in order
    last_queued_job = Value('i', -1)

    chunks = util.chunks(self.X, self.batch_size)

    # Add jobs to queue
    for job_index, X_batch in enumerate(chunks):
        batch_count += 1
        jobs.put((job_index, X_batch))

    # Add poison pills to queue (to signal workers to stop)
    for i in xrange(n_workers):
        jobs.put((-1, None))

    # Define producer function
    produce = partial(_produce_helper,
                      generator=self.generator,
                      jobs=jobs,
                      result_queue=result_queue,
                      last_queued_job=last_queued_job,
                      ordered=self.ordered)

    # Start worker processes or threads
    for i in xrange(n_workers):
        name = "ParallelBatchIterator worker {0}".format(i)

        if self.multiprocess:
            p = Process(target=produce, args=(i,), name=name)
        else:
            p = Thread(target=produce, args=(i,), name=name)

        # Make the process daemon, so the main process can die without these finishing
        # p.daemon = True
        p.start()

    return batch_count, jobs
def call(self, mids):
    """
    Return all selections for Market ids in mids, where mids is a list
    of market ids.
    """
    allselections = []
    # split up mids into groups of size MAXMIDS
    for (callnum, ids) in enumerate(util.chunks(mids, ApiGetPrices.MAXMIDS)):
        self.req.MarketIds = ids
        if callnum > 0:
            # sleep for some time before calling Api again
            time.sleep(self.throttl)
        apilog.info('calling BDAQ Api GetPrices')
        result = self.client.service.GetPrices(self.req)
        selections = apiparse.ParseGetPrices(ids, result)
        allselections = allselections + selections
    return allselections
def threshold_optimization_naive(p, y):
    print "Optimizing threshold"
    y_images = util.chunks(y, 384*512)

    candidates = np.arange(0.25, 0.75, 1/2500)

    def dice_objective(threshold):
        p_binary = np.where(p > threshold, 1, 0)
        p_images_binary = util.chunks(p_binary, 384*512)
        mean, std, dices = dice(p_images_binary, y_images)
        return mean

    # score = map(dice_objective, tqdm(candidates))
    scores = []
    for t in tqdm(candidates):
        score = dice_objective(t)
        scores.append(score)

    print np.argmax(scores)
    threshold = candidates[np.argmax(scores)]
    print "Best threshold ", threshold
    return threshold
def lookup():
    """ returns (done, remaining)"""
    songs = db.data.get_pending_songs()
    songcount = db.data.get_count_pending_songs()
    if not songs:
        return (0, 0)

    # We can use a with statement to ensure threads are cleaned up promptly
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Start the load operations and mark each future with its URL
        i = 0
        future_to_song = {}
        for songchunk in util.chunks(songs, 10):
            future_to_song[executor.submit(query, songchunk, i)] = songchunk
            i = 1 - i
        for future in concurrent.futures.as_completed(future_to_song):
            songchunk = future_to_song[future]
            # For each set of songs, get them from the response
            # for songs not in the response, add an empty response
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (songchunk, exc))
            else:
                gotsongs = set()
                waitings = set(songchunk)
                results = data["response"].get("songs", [])
                for s in results:
                    songid = s["id"]
                    gotsongs.add(songid)
                    response = {"response": {"songs": [s], "status": data["response"]["status"]}}
                    db.data.add_response_if_not_exists(echonest.SONG_PROFILE, songid, response)
                nosongs = waitings - gotsongs
                for s in list(nosongs):
                    db.data.add_response_if_not_exists(echonest.SONG_PROFILE, s, {})

    return (len(songs), songcount - len(songs))
def filter_and_lemma(chunk_size=2000):
    files = glob.glob(INPUT_FOLDER+'*.frog.out')
    lemmatized = {}

    # Split all files in the list into chunks
    file_chunks = util.chunks(files, chunk_size)

    for i, chunk in enumerate(tqdm(file_chunks)):
        pool = Pool(processes=util.CPU_COUNT)
        filtered_lemmatized = pool.map(process, chunk)
        pool.close()

        for filename, value in zip(chunk, filtered_lemmatized):
            file_id = util.filename_without_extension(filename, '.frog.out')
            lemmatized[file_id] = value

    # Order by key
    ordered = OrderedDict(sorted(lemmatized.items()))

    with open(DATA_FOLDER+'processed.p', 'w') as f:
        pickle.dump(ordered, f)

    print "Done!"
def submit_events(self, events):
    headers = {'Content-Type': 'application/json'}
    method = 'POST'

    events_len = len(events)
    event_chunk_size = self.event_chunk_size

    for chunk in chunks(events, event_chunk_size):
        payload = {
            'apiKey': self.api_key,
            'events': {'api': chunk},
            'uuid': get_uuid(),
            'internalHostname': get_hostname()
        }
        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/intake?%s' % urlencode(params)

        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            start_time = time()
            conn.request(method, url, json.dumps(payload), headers)

            response = conn.getresponse()
            status = response.status
            response.close()
            duration = round((time() - start_time) * 1000.0, 4)
            log.debug("%s %s %s%s (%sms)" % (status, method, self.api_host, url, duration))
        finally:
            conn.close()
def getActivations(x_train, numActivationTrainingInstances, model, dnnModel, y_train):
    util.thisLogger.logInfo("------ start of activation data extraction for training data -------")
    startTime = datetime.datetime.now()

    # Only get activations from the instances that are correctly classified
    y_predict = np.argmax(dnnModel.predict(x_train), axis=1)

    # The DNN is trained to output 0 or 1 only.
    # get the original classes it was trained on and transform the outputs
    classes = util.getParameter('DataClasses')
    classes = np.asarray(classes.replace('[', '').replace(']', '').split(',')).astype(int)
    util.thisLogger.logInfo('Data classes to be used: %s' % (classes))

    count = 0
    for c in classes:
        y_predict = np.where(y_predict == count, c, y_predict)
        count += 1

    incorrectPredictIndexes = []
    for i in range(0, len(y_predict) - 1):
        if (y_predict[i] != y_train[i]):
            incorrectPredictIndexes.append(i)

    x_train = np.delete(x_train, incorrectPredictIndexes, axis=0)
    y_train = np.delete(y_train, incorrectPredictIndexes, axis=0)
    y_predict = np.delete(y_predict, incorrectPredictIndexes, axis=0)

    # train in batches
    activationTrainingBatchSize = util.getParameter('ActivationTrainingBatchSize')
    if numActivationTrainingInstances == -1:
        numActivationTrainingInstances = len(x_train)

    xData = x_train[:numActivationTrainingInstances, ]
    batchData = list(util.chunks(xData, activationTrainingBatchSize))
    activationData = []
    numBatches = len(batchData)
    batchActivationData = [[] for i in range(numBatches)]

    for batchIndex in range(numBatches):
        batch = batchData[batchIndex]
        util.thisLogger.logInfo("Training batch " + str(batchIndex + 1) + " of " + str(len(batchData)) +
                                " (" + str(len(batch)) + " instances)")

        # Get activations and set up streams for the training data
        # get reduced activations for all training data in one go
        # Train in a loop
        util.thisLogger.logInfo(str(len(batch)) + " instances selected from training data")

        activations, numLayers = extract.getActivationData(model, batch)
        batchActivationData[batchIndex].append(activations)
        activationData.append(activations)

        util.thisLogger.logInfo(
            "Filter Layers: DNN has %s activation layers, getting activation data for %s instances."
            % (numLayers, len(batch)))

    endTime = datetime.datetime.now()
    util.thisLogger.logInfo('Total training time: ' + str(endTime - startTime))

    util.thisLogger.logInfo("------- end of activation data extraction for training data --------")
    util.thisLogger.logInfo("")

    return numLayers, batchData, activationData, batchActivationData
def NARR_to_EPIC(vals):
    lat,lon = vals
    # Output pandas frame into EPIC weather file
    out_fl = constants.epic_dly+os.sep+str(lat)+'_'+str(lon)+'.txt'
    if not(os.path.isfile(out_fl)):
        logging.info(out_fl)
        # List all years for which we will create EPIC file
        lst_yrs = rrule(YEARLY, dtstart=constants.strt_date, until=constants.end_date)
        # Create pandas data frame, fill with 0.0s, for 1st year.
        epic_df = pandas.DataFrame(index=pandas.date_range(constants.strt_date,constants.end_date),\
                                   columns=[constants.vars_to_get.keys()])
        epic_out = open(out_fl,'w')
        # Loop across years
        for idx_yr in range(lst_yrs.count()):
            cur_strt_date = datetime.date(lst_yrs[idx_yr].year,1,1)
            cur_end_date = datetime.date(lst_yrs[idx_yr].year,12,31)
            cur_date_range = pandas.date_range(cur_strt_date,cur_end_date)
            tmp_df = pandas.DataFrame(index=cur_date_range,columns=[constants.vars_to_get.keys()])
            tmp_df.fillna(0.0,inplace=True)
            # Loop across variables
            for cur_var in constants.vars_to_get.keys():
                e_fl = open(constants.data_dir + os.sep + 'Data' + os.sep + cur_var + os.sep + str(lst_yrs[idx_yr].year)+\
                            os.sep + str(lat) + '_' + str(lon) + '.txt')
                epic_vars = filter(None,e_fl.readlines()[0].strip().split("'"))
                if cur_var == 'air.2m':
                    epic_min_tmp = util.chunks(epic_vars,8,True)
                    epic_max_tmp = util.chunks(epic_vars,8,False)
                    tmp_df[cur_var] = pandas.Series(epic_min_tmp,index=cur_date_range)
                    tmp_df[cur_var] = tmp_df[cur_var].map(lambda x:float(x)+constants.K_To_C)
                    tmp_df['tmax'] = pandas.Series(epic_max_tmp,index=cur_date_range)
                    tmp_df['tmax'] = tmp_df['tmax'].map(lambda x:float(x)+constants.K_To_C)
                    tmp_df['tmin'] = tmp_df['air.2m']
                else:
                    tmp_df[cur_var] = pandas.Series(epic_vars,index=cur_date_range)
                    tmp_df[cur_var] = tmp_df[cur_var].map(lambda x:float(x))
            # Get into right units
            tmp_df['wnd'] = pandas.Series(tmp_df['uwnd.10m'].astype(float)**2.0+\
                                          tmp_df['vwnd.10m'].astype(float)**2.0,index=tmp_df.index)
            tmp_df['wnd'] = tmp_df['wnd']**0.5
            tmp_df['rhum.2m'] = tmp_df['rhum.2m'].map(lambda x:float(x)/100.0)
            tmp_df['swr_diff'] = pandas.Series(tmp_df['dswrf']-tmp_df['uswrf.sfc'],index=tmp_df.index)
            tmp_df['srad'] = tmp_df['swr_diff'].map(lambda x:constants.WMsq_MjMsq*x)
            tmp_df['year'] = tmp_df.index.year
            tmp_df['month'] = tmp_df.index.month
            tmp_df['day'] = tmp_df.index.day
            epic_df = epic_df.combine_first(tmp_df)
        # Output dataframe to text file with right formatting
        for index, row in epic_df.iterrows():
            epic_out.write(('%6d%4d%4d'+6*'%6.2f'+'\n') %
                           (row['year'],row['month'],row['day'],
                            row['srad'],row['tmax'],row['tmin'],
                            row['apcp'],row['rhum.2m'],row['wnd']))
        epic_out.close()
    else:
        logging.info('File exists: '+out_fl)
def main(args_list: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Autoencoder for coq terms")
    add_std_args(parser)
    parser.add_argument("--gamma", default=.9, type=float)
    parser.add_argument("--epoch-step", default=5, type=int)
    parser.add_argument("--num-decoder-layers", dest="num_decoder_layers", default=3, type=int)
    args = parser.parse_args(args_list)

    curtime = time.time()
    print("Loading data...", end="")
    sys.stdout.flush()
    dataset = list(itertools.islice(read_text_data(args.scrape_file), args.max_tuples))
    print(" {:.2f}s".format(time.time() - curtime))

    curtime = time.time()
    print("Extracting terms...", end="")
    sys.stdout.flush()
    term_strings = list(
        chain.from_iterable(
            [[hyp.split(":")[1].strip() for hyp in datum.context.focused_hyps] +
             [datum.context.focused_goal] for datum in dataset]))
    print(" {:.2f}s".format(time.time() - curtime))

    curtime = time.time()
    print("Building tokenizer...", end="")
    sys.stdout.flush()
    tokenizer = tk.make_keyword_tokenizer_topk(term_strings,
                                               tk.tokenizers[args.tokenizer],
                                               args.num_keywords, 2)
    print(" {:.2f}s".format(time.time() - curtime))

    curtime = time.time()
    print("Tokenizing {} strings...".format(len(term_strings)), end="")
    sys.stdout.flush()
    with multiprocessing.Pool(None) as pool:
        tokenized_data_chunks = pool.imap_unordered(
            functools.partial(use_tokenizer, tokenizer, args.max_length),
            chunks(term_strings, 32768))
        tokenized_data = list(chain.from_iterable(tokenized_data_chunks))
    print(" {:.2f}s".format(time.time() - curtime))

    checkpoints = train(tokenized_data, tokenizer.numTokens(), args.max_length,
                        args.hidden_size, args.learning_rate, args.epoch_step,
                        args.gamma, args.num_encoder_layers, args.num_decoder_layers,
                        args.num_epochs, args.batch_size, args.print_every,
                        optimizers[args.optimizer])

    for epoch, (encoder_state, decoder_state, training_loss) in enumerate(checkpoints):
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'tokenizer': tokenizer,
            'tokenizer-name': args.tokenizer,
            'optimizer': args.optimizer,
            'learning-rate': args.learning_rate,
            'encoder': encoder_state,
            'decoder': decoder_state,
            'num-encoder-layers': args.num_encoder_layers,
            'num-decoder-layers': args.num_decoder_layers,
            'max-length': args.max_length,
            'hidden-size': args.hidden_size,
            'num-keywords': args.num_keywords,
            'context-filter': args.context_filter,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
def expect_layout(self, layout):
    compressed = lzma.compress(layout.encode("utf-8"))
    self.expect("FE01", struct.pack("<I", len(compressed)))
    for idx, chunk in enumerate(chunks(compressed, 32)):
        self.expect(struct.pack("<BBI", 0xFE, 0x02, idx), chunk)
def NARR_to_EPIC(vals):
    lat, lon = vals
    # Output pandas frame into EPIC weather file
    out_fl = constants.epic_dly + os.sep + str(lat) + '_' + str(lon) + '.txt'
    if not (os.path.isfile(out_fl)):
        logging.info(out_fl)
        # List all years for which we will create EPIC file
        lst_yrs = rrule(YEARLY, dtstart=constants.strt_date, until=constants.end_date)
        # Create pandas data frame, fill with 0.0s, for 1st year.
        epic_df = pandas.DataFrame(index=pandas.date_range(constants.strt_date, constants.end_date),
                                   columns=[constants.vars_to_get.keys()])
        epic_out = open(out_fl, 'w')
        # Loop across years
        for idx_yr in range(lst_yrs.count()):
            cur_strt_date = datetime.date(lst_yrs[idx_yr].year, 1, 1)
            cur_end_date = datetime.date(lst_yrs[idx_yr].year, 12, 31)
            cur_date_range = pandas.date_range(cur_strt_date, cur_end_date)
            tmp_df = pandas.DataFrame(index=cur_date_range,
                                      columns=[constants.vars_to_get.keys()])
            tmp_df.fillna(0.0, inplace=True)
            # Loop across variables
            for cur_var in constants.vars_to_get.keys():
                e_fl = open(constants.data_dir + os.sep + 'Data' + os.sep + cur_var + os.sep +
                            str(lst_yrs[idx_yr].year) + os.sep + str(lat) + '_' + str(lon) + '.txt')
                epic_vars = filter(None, e_fl.readlines()[0].strip().split("'"))
                if cur_var == 'air.2m':
                    epic_min_tmp = util.chunks(epic_vars, 8, True)
                    epic_max_tmp = util.chunks(epic_vars, 8, False)
                    tmp_df[cur_var] = pandas.Series(epic_min_tmp, index=cur_date_range)
                    tmp_df[cur_var] = tmp_df[cur_var].map(
                        lambda x: float(x) + constants.K_To_C)
                    tmp_df['tmax'] = pandas.Series(epic_max_tmp, index=cur_date_range)
                    tmp_df['tmax'] = tmp_df['tmax'].map(
                        lambda x: float(x) + constants.K_To_C)
                    tmp_df['tmin'] = tmp_df['air.2m']
                else:
                    tmp_df[cur_var] = pandas.Series(epic_vars, index=cur_date_range)
                    tmp_df[cur_var] = tmp_df[cur_var].map(lambda x: float(x))
            # Get into right units
            tmp_df['wnd'] = pandas.Series(tmp_df['uwnd.10m'].astype(float)**2.0 +
                                          tmp_df['vwnd.10m'].astype(float)**2.0, index=tmp_df.index)
            tmp_df['wnd'] = tmp_df['wnd']**0.5
            tmp_df['rhum.2m'] = tmp_df['rhum.2m'].map(
                lambda x: float(x) / 100.0)
            tmp_df['swr_diff'] = pandas.Series(tmp_df['dswrf'] - tmp_df['uswrf.sfc'],
                                               index=tmp_df.index)
            tmp_df['srad'] = tmp_df['swr_diff'].map(
                lambda x: constants.WMsq_MjMsq * x)
            tmp_df['year'] = tmp_df.index.year
            tmp_df['month'] = tmp_df.index.month
            tmp_df['day'] = tmp_df.index.day
            epic_df = epic_df.combine_first(tmp_df)
        # Output dataframe to text file with right formatting
        for index, row in epic_df.iterrows():
            epic_out.write(('%6d%4d%4d' + 6 * '%6.2f' + '\n') %
                           (row['year'], row['month'], row['day'],
                            row['srad'], row['tmax'], row['tmin'],
                            row['apcp'], row['rhum.2m'], row['wnd']))
        epic_out.close()
    else:
        logging.info('File exists: ' + out_fl)
def maximize(self):
    print 'mini-batch gd: examples = {}, batch size = {}'.format(len(self.train), self.batch_size)

    # these are for multithreading
    q_in = Queue()
    q_out = Queue()

    def worker():
        while True:
            ex = q_in.get()
            q_out.put(self.objective.gradient(self.params, ex))
            q_in.task_done()

    # launch workers
    for i in range(self.num_threads):
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    # no. of mini-batch steps taken
    self.steps = 0

    while True:
        # form fresh batches
        train_copy = list(self.train)
        random.shuffle(train_copy)
        batches = list(util.chunks(train_copy, self.batch_size))

        for batch in batches:
            grad = SparseVector()
            if self.num_threads == 1:
                for ex in batch:
                    grad_ex = self.objective.gradient(self.params, ex)
                    grad += grad_ex
            else:
                # WARNING: this is only safe if examples in the batch are mutually exclusive
                for ex in batch:
                    q_in.put(ex)
                q_in.join()
                while not q_out.empty():
                    grad += q_out.get()

            for frozen in self.freeze_params:
                grad.remove(frozen)

            # normalize by batch size
            grad *= 1.0 / len(batch)

            # add regularization gradient
            if self.l1_reg != 0.0 or self.l2_reg != 0.0:
                reg_grad = self.reg_gradient(self.params, grad, self.approx_reg)
                grad += reg_grad

            # record gradient norm, before gradient gets modified by various algorithms
            self.gnorm = grad.norm2()

            delta = grad

            # check if Adagrad controller is being used
            adagrad = next((controller for controller in self.controllers if isinstance(controller, AdaGrad)), None)
            if adagrad is None:
                delta *= self.step_size
                self.delta = delta
            else:
                # this controller will modify self.delta
                self.delta = delta
                adagrad.control(self)

            # these controllers will modify self.delta, and maybe also self.halt
            for controller in self.controllers:
                if isinstance(controller, AdaGrad):
                    continue
                controller.control(self)

            # update params
            self.params += self.delta

            # check if unit normalization controller
            unit_norm = next((controller for controller in self.controllers if isinstance(controller, UnitNorm)), None)
            if unit_norm is not None:
                unit_norm.control(self)

            self.track()
            self.steps += 1

            if self.halt:
                return self.params
def plot_importances(article_sents, importances, abstracts_text, save_location=None, save_name=None):
    plt.ioff()
    sents_per_figure = 40
    max_importance = np.max(importances)
    chunked_sents = util.chunks(article_sents, sents_per_figure)
    chunked_importances = util.chunks(importances, sents_per_figure)

    for chunk_idx in range(len(chunked_sents)):
        my_article_sents = chunked_sents[chunk_idx]
        my_importances = chunked_importances[chunk_idx]

        if len(my_article_sents) < sents_per_figure:
            my_article_sents += [''] * (sents_per_figure - len(my_article_sents))
            my_importances = np.concatenate([
                my_importances,
                np.zeros([sents_per_figure - len(my_importances)])
            ])

        y_pos = np.arange(len(my_article_sents))

        fig, ax1 = plt.subplots()
        fig.subplots_adjust(left=0.9, top=1.0, bottom=0.03, right=1.0)
        ax1.barh(y_pos, my_importances, align='center', color='green', ecolor='black')
        ax1.set_yticks(y_pos)
        ax1.set_yticklabels(my_article_sents)
        ax1.invert_yaxis()  # labels read top-to-bottom
        ax1.set_xlabel('Performance')
        ax1.set_title('How fast do you want to go today?')
        ax1.set_xlim(right=max_importance)

        fig.set_size_inches(18.5, 10.5)
        plt.savefig(os.path.join(save_location, save_name + '_' + str(chunk_idx) + '.jpg'))
        plt.close(fig)

    plt.figure()
    fig_txt = tw.fill(tw.dedent(abstracts_text), width=80)
    plt.figtext(0.5, 0.5, fig_txt, horizontalalignment='center',
                fontsize=9, multialignment='left',
                bbox=dict(boxstyle="round", facecolor='#D8D8D8',
                          ec="0.5", pad=0.5, alpha=1),
                fontweight='bold')
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.savefig(os.path.join(save_location, save_name + '_' + str(chunk_idx + 1) + '.jpg'))
    plt.close(fig)
        resvar = np.asarray([np.linalg.norm(r)**2 for r in R])
        losses.append(np.sum(resvar))
        D2 = np.diag(1 / resvar)
        precision2 = D2 @ (np.identity(n) - B)
        err = (precision2 - precision)
        loss2 = np.trace(err @ err.T)
        B = B - lr * G
        print(loss2)

    test_points = 10
    losses = np.asarray(losses)[:test_points]
    target_losses = [
        118., 41.150800000000004, 33.539355199999996, 29.747442032320002,
        27.450672271574934, 25.95846376879459, 24.917943341139274,
        24.139761502111114, 23.519544126307142, 22.998235729589265
    ]
    u.check_equal(losses[:test_points], target_losses[:test_points])
    print('mismatch is ', np.max(losses - target_losses))


if __name__ == '__main__':
    numbers = [(x + 1)**3 for x in range(16)]
    list(u.chunks(numbers, 4))
    X = np.array(list(u.chunks(numbers, 4)))
    X = np.asarray([[5, 1, 0, 4], [0, 4, 1, 2], [1, 0, 3, 3], [4, 2, 0, 4]])
    test_numpy(X)
lastTimeStamp = None
lnameDict = lname(s.LDBPATH)
connections = {}
whoCache = {}
hostnames = []
hostnameToCluster = {}
for cluster in s.MACHINES['clusters']:
    if cluster not in whoCache:
        whoCache[cluster] = OrderedDict()
    for hostname in s.MACHINES['clusters'][cluster]['hostnames']:
        hostnames.append(hostname)
        hostnameToCluster[hostname] = cluster
hostnamesChunked = list(util.chunks(hostnames, len(hostnames)//s.THREADS))
threads = []
clients = []
thread_times = []


def sshAndGetWho(client, hostname):
    # s.log('sshing into %s', hostname)
    who = []
    try:
        client.connect(
            hostname,
            username=s.USERNAME,
            password=s.PASSWORD,
        )
        stdin, stdout, stderr = client.exec_command('w')
        # get rid of first two lines of w output
def enqueue_jobs(cls,
                 method,
                 ids_q_or_list,
                 queue_number,
                 use_rq=True,
                 append=False,
                 chunk_size=25,
                 shortcut_fn=None):
    """
    Takes sqlalchemy query with IDs, runs fn on those repos.
    """

    shortcut_data = None
    if use_rq:
        if shortcut_fn:
            raise ValueError("you can't use RQ with a shortcut_fn")
    else:
        if shortcut_fn:
            shortcut_data_start = time()
            logger.info(u"Getting shortcut data...")
            shortcut_data = shortcut_fn()
            logger.info(u"Got shortcut data in {} seconds".format(elapsed(shortcut_data_start)))

    chunk_size = int(chunk_size)

    start_time = time()
    new_loop_start_time = time()
    index = 0

    try:
        logger.info(u"running this query: \n{}\n".format(
            ids_q_or_list.statement.compile(dialect=postgresql.dialect())))
        row_list = ids_q_or_list.all()
    except AttributeError:
        logger.info(u"running this query: \n{}\n".format(ids_q_or_list))
        row_list = db.engine.execute(sql.text(ids_q_or_list)).fetchall()

    if row_list is None:
        logger.info(u"no IDs, all done.")
        return None

    logger.info(u"finished enqueue_jobs query in {} seconds".format(elapsed(start_time)))
    object_ids = [row[0] for row in row_list]

    # do this as late as possible so things can keep using queue
    if use_rq:
        if append:
            logger.info(u"not clearing queue. queue currently has {} jobs".format(ti_queues[queue_number].count))
        else:
            empty_queue(queue_number)

    num_items = len(object_ids)
    logger.info(u"adding {} items to queue...".format(num_items))

    # iterate through chunks of IDs like [[id1, id2], [id3, id4], ... ]
    object_ids_chunk = []

    for object_ids_chunk in chunks(object_ids, chunk_size):
        update_fn_args = [cls, method, object_ids_chunk]

        if use_rq:
            job = ti_queues[queue_number].enqueue_call(
                func=update_fn,
                args=update_fn_args,
                timeout=60 * 10,
                result_ttl=0  # number of seconds
            )
            job.meta["object_ids_chunk"] = object_ids_chunk
            job.save()
            # logger.info(u"saved job {}".format(job))
        else:
            update_fn_args.append(shortcut_data)
            update_fn(*update_fn_args, index=index)

        if True:  # index % 10 == 0 and index != 0:
            num_jobs_remaining = num_items - (index * chunk_size)
            try:
                jobs_per_hour_this_chunk = chunk_size / float(elapsed(new_loop_start_time) / 3600)
                predicted_mins_to_finish = round(
                    (num_jobs_remaining / float(jobs_per_hour_this_chunk)) * 60, 1)
                logger.info(u"\n\nWe're doing {} jobs per hour. At this rate, done in {}min".format(
                    int(jobs_per_hour_this_chunk), predicted_mins_to_finish))
                logger.info(u"(finished chunk {} of {} chunks in {} seconds total, {} seconds this loop)\n".format(
                    index, num_items/chunk_size, elapsed(start_time), elapsed(new_loop_start_time)))
            except ZeroDivisionError:
                # logger.info(u"not printing status because divide by zero")
                logger.info(u".")
            new_loop_start_time = time()

        index += 1

    logger.info(u"last chunk of ids: {}".format(list(object_ids_chunk)))

    db.session.remove()  # close connection nicely
    return True
def correct_raw_data(raw_data_path, channel, subsample_factor=2, log_s3_path=None, background_correction=True):
    total_n_jobs = cpu_count()
    # overwrite existing raw data with corrected data
    outdir = raw_data_path

    # get list of all tiles to correct for given channel
    all_files = np.sort(glob.glob(f'{raw_data_path}/*/*.tiff'))
    if background_correction:
        background_val = get_background_value(raw_data_path)
    total_files = len(all_files)

    bias_path = f'{outdir}/CHN0{channel}_bias.tiff'
    if os.path.exists(bias_path):
        bias = tf.imread(bias_path)
    else:
        # subsample tiles
        files_cb = all_files[::subsample_factor]
        num_files = len(files_cb)

        # compute running sums in parallel
        sums = Parallel(total_n_jobs, verbose=10)(
            delayed(sum_tiles)(f)
            for f in chunks(files_cb, math.ceil(num_files // (total_n_jobs)) + 1))
        sums = [i[:, :, None] for i in sums]
        mean_tile = np.squeeze(np.sum(np.concatenate(sums, axis=2), axis=2)) / num_files
        if background_correction:
            # subtract background out from bias correction
            mean_tile -= background_val
        mean_tile = sitk.GetImageFromArray(mean_tile)

        # get the bias correction tile using N4ITK
        bias = sitk.GetArrayFromImage(get_bias_field(mean_tile, scale=1.0))

        # save bias tile to local directory
        tf.imsave(bias_path, bias.astype('float32'))

        # save bias tile to S3
        if log_s3_path:
            s3 = boto3.resource('s3')
            img = Image.fromarray(bias)
            fp = BytesIO()
            img.save(fp, format='TIFF')
            # reset pointer to beginning of file
            fp.seek(0)
            log_s3_url = S3Url(log_s3_path.strip('/'))
            bias_path = f'{log_s3_url.key}/CHN0{channel}_bias.tiff'
            s3.Object(log_s3_url.bucket, bias_path).upload_fileobj(fp)

    # correct all the files and save them
    files_per_proc = math.ceil(total_files / total_n_jobs) + 1
    work = chunks(all_files, files_per_proc)
    with tqdm_joblib(tqdm(desc="Correcting tiles", total=total_n_jobs)) as progress_bar:
        Parallel(n_jobs=total_n_jobs, verbose=10)(
            delayed(correct_tiles)(files, outdir, bias, background_val)
            for files in work)
def queue_work(self, work, miner=None):
    target = ''.join(list(chunks('%064x' % self.server_difficulty, 2))[::-1])
    self.switch.queue_work(self, work.block_header, target, work.job_id, work.extranonce2, miner)
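# Note: the ''.join(list(chunks(..., 2))[::-1]) idiom above (also used in both
# decode() snippets) splits a hex string into two-character bytes and reverses
# their order, i.e. it swaps the byte order (endianness) of the encoded value.
# A small illustrative example, assuming the list-returning chunks() sketched
# near the top of this file:
hex_le = "efbeadde"                               # 0xdeadbeef stored little-endian
hex_be = ''.join(list(chunks(hex_le, 2))[::-1])
assert hex_be == "deadbeef"                       # byte order reversed, digits within each byte kept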
def subwindow_shape(self):
    return tuple((b - a for a, b in util.chunks(self.subwindow, 2)))
def build_lines(self):
    self._lines = chunks(self._text, self.draw_width)
    self.scroll["maxCurrentLine"] = len(self._lines)
def enqueue_jobs(cls,
                 method,
                 ids_q_or_list,
                 queue_number,
                 use_rq=True,
                 chunk_size=25,
                 shortcut_fn=None):
    """
    Takes sqlalchemy query with IDs, runs fn on those repos.
    """

    shortcut_data = None
    if use_rq:
        empty_queue(queue_number)
        if shortcut_fn:
            raise ValueError("you can't use RQ with a shortcut_fn")
    else:
        if shortcut_fn:
            shortcut_data_start = time()
            print "Getting shortcut data..."
            shortcut_data = shortcut_fn()
            print "Got shortcut data in {}sec".format(elapsed(shortcut_data_start))

    chunk_size = int(chunk_size)

    start_time = time()
    new_loop_start_time = time()
    index = 0

    print "running this query: \n{}\n".format(
        ids_q_or_list.statement.compile(dialect=postgresql.dialect()))
    row_list = ids_q_or_list.all()
    print "finished query in {}sec".format(elapsed(start_time))

    if row_list is None:
        print "no IDs, all done."
        return None

    object_ids = [row[0] for row in row_list]
    num_jobs = len(object_ids)
    print "adding {} jobs to queue...".format(num_jobs)

    # iterate through chunks of IDs like [[id1, id2], [id3, id4], ... ]
    object_ids_chunk = []

    for object_ids_chunk in chunks(object_ids, chunk_size):
        update_fn_args = [cls, method, object_ids_chunk]

        if use_rq:
            job = ti_queues[queue_number].enqueue_call(
                func=update_fn,
                args=update_fn_args,
                timeout=60 * 10,
                result_ttl=0  # number of seconds
            )
            job.meta["object_ids_chunk"] = object_ids_chunk
            job.save()
            # print u"saved job {}".format(job)
        else:
            update_fn_args.append(shortcut_data)
            update_fn(*update_fn_args, index=index)

        if True:  # index % 10 == 0 and index != 0:
            num_jobs_remaining = num_jobs - (index * chunk_size)
            try:
                jobs_per_hour_this_chunk = chunk_size / float(elapsed(new_loop_start_time) / 3600)
                predicted_mins_to_finish = round(
                    (num_jobs_remaining / float(jobs_per_hour_this_chunk)) * 60, 1)
                print "\n\nWe're doing {} jobs per hour. At this rate, done in {}min".format(
                    int(jobs_per_hour_this_chunk), predicted_mins_to_finish)
                print "(finished chunk {} of {} chunks in {}sec total, {}sec this loop)\n".format(
                    index, num_jobs / chunk_size, elapsed(start_time), elapsed(new_loop_start_time))
            except ZeroDivisionError:
                # print u"not printing status because divide by zero"
                print ".",
            new_loop_start_time = time()

        index += 1

    print "last chunk of ids: {}".format(list(object_ids_chunk))

    db.session.remove()  # close connection nicely
    return True
def enqueue_jobs(cls,
                 method,
                 ids_q_or_list,
                 queue_number,
                 use_rq="rq",
                 chunk_size=10,
                 shortcut_fn=None):
    """
    Takes sqlalchemy query with (login, repo_name) IDs, runs fn on those repos.
    """

    shortcut_data = None
    if use_rq == "rq":
        empty_queue(queue_number)
        if shortcut_fn:
            raise ValueError("you can't use RQ with a shortcut_fn")
    else:
        if shortcut_fn:
            shortcut_data_start = time()
            print "Getting shortcut data..."
            shortcut_data = shortcut_fn()
            print "Got shortcut data in {}sec".format(elapsed(shortcut_data_start))

    chunk_size = int(chunk_size)

    start_time = time()
    new_loop_start_time = time()
    index = 0

    print "running this query: \n{}\n".format(
        ids_q_or_list.statement.compile(dialect=postgresql.dialect()))
    row_list = ids_q_or_list.all()
    print "finished query in {}sec".format(elapsed(start_time))

    if row_list is None:
        print "no IDs, all done."
        return None

    object_ids = [row[0] for row in row_list]
    num_jobs = len(object_ids)
    print "adding {} jobs to queue...".format(num_jobs)

    # iterate through chunks of IDs like [[id1, id2], [id3, id4], ... ]
    object_ids_chunk = []

    for object_ids_chunk in chunks(object_ids, chunk_size):
        update_fn_args = [cls, method, object_ids_chunk]

        if use_rq == "rq":
            job = ti_queues[queue_number].enqueue_call(
                func=update_fn,
                args=update_fn_args,
                timeout=60 * 10,
                result_ttl=0  # number of seconds
            )
            job.meta["object_ids_chunk"] = object_ids_chunk
            job.save()
        else:
            update_fn_args.append(shortcut_data)
            update_fn(*update_fn_args)

        if index % 1000 == 0 and index != 0:
            print "added {} jobs to queue in {}sec total, {}sec this loop".format(
                index, elapsed(start_time), elapsed(new_loop_start_time))
            new_loop_start_time = time()

        index += 1

    print "last object added to the queue was {}".format(list(object_ids_chunk))
    db.session.remove()  # close connection nicely
    return True
def iterstories(stories, include_tasks=False):
    for s in stories:
        yield s
        if include_tasks:
            for t in chunks(s.tasks, 2):
                yield PivotalTaskPair(t)