def addDownload2(self):
    start = timeit.default_timer()
    global urlname
    global urlsize
    global urlstatus
    global urltype
    global model
    global idx
    url = str(self.dialog.lineEdit_6.text())
    urlname = url.split('/')[-1]
    response = urllib2.urlopen(url)
    self.meta = response.info()
    urlsize = int(self.meta.getheaders('Content-Length')[0])
    if 1000 < urlsize < 1000000:
        urlsize = str(urlsize / 1000.0) + " KB"
    elif 1000000 < urlsize < 10**9:
        urlsize = str(1.0 * urlsize / 10**6) + " MB"
    else:
        urlsize = str(1.0 * urlsize / 10**9) + " GB"
    self.dialog.label_17.setText(urlsize)
    self.dialog.label_68.setText(urlname)
    f = open(urlname, 'wb')
    f.write(response.read())
    stop = timeit.default_timer()
    speed = float(urlsize.split(' ')[0]) / (stop - start)
    speed = str(speed) + " " + urlsize.split(' ')[-1] + "/s"
    self.dialog.label_67.setText("100 %")
    self.dialog.label_69.setText(speed)
    self.dialog.label_70.setText(str(stop - start))
def test(self, view):
    """
    Calls the given view and measures the time for it to return. The
    garbage collector is disabled during execution.
    """
    gc_old = gc.isenabled()
    gc.disable()
    try:
        start = timeit.default_timer()
        if view.method == 'GET':
            response = self.client.get(view.url, view.data)
        elif view.method == 'POST':
            response = self.client.post(view.url, view.data)
        else:
            raise ValueError('Unknown view method: %s' % view.method)
        end = timeit.default_timer()
        # Return result in milliseconds
        time_ms = (end - start) * 1000
        # Try to get version information
        version = subprocess.check_output(['git', 'describe'])

        from .models import TestResult
        return TestResult(view=view, time=time_ms, result=response,
                          result_code=response.status_code, version=version)
    finally:
        if gc_old:
            gc.enable()
def measured(self, msg, *args, **kwargs):
    self.info(msg + '..', *args, **kwargs)
    start_time = timeit.default_timer()
    yield
    end_time = timeit.default_timer()
    self.info("%s took %.2fs." % (msg, end_time - start_time), *args, **kwargs)
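The bare yield suggests this method is wrapped with contextlib.contextmanager on a logger-like object. A minimal self-contained sketch of the same pattern, assuming only the standard library (the MeasuringLogger class name and the example workload are illustrative, not taken from the original code):

import logging
import timeit
from contextlib import contextmanager

logging.basicConfig(level=logging.INFO)

class MeasuringLogger:
    """Thin wrapper that adds a timing context manager around logging.info()."""

    def __init__(self, name):
        self._log = logging.getLogger(name)

    def info(self, msg, *args, **kwargs):
        self._log.info(msg, *args, **kwargs)

    @contextmanager
    def measured(self, msg, *args, **kwargs):
        # Log the start, yield control to the caller, then log the elapsed time.
        self.info(msg + '..', *args, **kwargs)
        start_time = timeit.default_timer()
        yield
        end_time = timeit.default_timer()
        self.info("%s took %.2fs." % (msg, end_time - start_time), *args, **kwargs)

# Usage: the body of the with-block is what gets timed.
log = MeasuringLogger(__name__)
with log.measured("sorting a large list"):
    sorted(range(1000000), reverse=True)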
def __download_range(self, k, dst): try: _, ext = os.path.splitext(dst) ds = [] parts = [] logging.info("Download %s start", k.name) for startByte in range(0, k.size, self.splitMB): output_part = self.new_temp_file(suffix=ext) parts.append(output_part) endByte = min(startByte + self.splitMB - 1, k.size) logging.debug( "deferToThreadPool %s start=%d end=%d size=%d cnt=%d", k.name, startByte, endByte, endByte - startByte, len(ds), ) d = twisted.internet.threads.deferToThreadPool( reactor, reactor.getThreadPool(), # @UndefinedVariable self.__downloadOne, k, startByte, endByte, output_part, len(ds), ) ds.append(d) if os.path.exists(dst): os.remove(dst) fout = file(dst, "wb") start = timeit.default_timer() for cnt, p in enumerate(parts): yield ds[cnt] shutil.copyfileobj(file(p, "rb"), fout) size = min(k.size, (cnt + 1) * self.splitMB) elapsed = timeit.default_timer() - start speedstr = formatFileSize(size / elapsed) sizestr = formatFileSize(size) percent = (float(cnt) / len(parts)) * 100.0 logging.info( "%03d/%03d (%.2f%%) speed=%s/s, elapsed=%.2f, size=%s", cnt, len(parts), percent, speedstr, elapsed, sizestr, ) except Exception: logging.error("download error", exc_info=True) raise
def main():
    start_time = timeit.default_timer()
    proxies = []
    targets = ['http://www.google-proxy.net/', 'http://free-proxy-list.net/']
    for i in range(len(targets)):
        proxy = proxy_scraper(targets[i])
        for u in range(len(proxy)):
            proxy_found = str(proxy[u]['ip']) + ":" + str(proxy[u]['port'])
            if proxy_found not in proxies:
                if is_proxy_existed(proxy[u]['ip']) != True:
                    print proxy[u]['ip'] + " - " + proxy[u]['port'] + " - " + proxy[u]['hostname']
                    create_proxy(proxy[u]['ip'], proxy[u]['port'], proxy[u]['hostname'],
                                 proxy[u]['service'], proxy[u]['latitude'], proxy[u]['longitude'],
                                 proxy[u]['city'], proxy[u]['country'])
                proxies.append(proxy_found)

    # save to a file
    file_name = "data_proxies.cfg"
    write_file(file_name, "\n".join(proxies))
    print("\n%s proxies found. File saved. You can find it under '%s'." % (len(proxies), file_name))

    # measure time
    print "\nElapsed time: %d sec" % (timeit.default_timer() - start_time)
def create_features(features):
    import timeit
    source = load_source()
    start = timeit.default_timer()
    compute_features(source, features)
    end = timeit.default_timer()
    print("save all features takes ", (end - start))
def __execEvent__(self, eventName, ntime, commandHandler):
    last = self.__events__[eventName]["lastExecTime"]
    timeInterval = self.__events__[eventName]["timeInterval"]

    if ntime - last >= timeInterval:
        start = default_timer()
        self.__events__[eventName]["function"](commandHandler,
                                               self.__events__[eventName]["channels"])
        timeTaken = default_timer() - start

        stats = self.__events__[eventName]["stats"]
        if stats["average"] == None:
            stats["average"] = timeTaken
            stats["min"] = timeTaken
            stats["max"] = timeTaken
        else:
            stats["average"] = (stats["average"] + timeTaken) / 2.0
            if timeTaken < stats["min"]:
                stats["min"] = timeTaken
            if timeTaken > stats["max"]:
                stats["max"] = timeTaken

        self.__events__[eventName]["lastExecTime"] = time.time()
def main(): print "[Facebook Album Downloader v1]" start = timeit.default_timer() # hide images prefs = {"profile.managed_default_content_settings.images": 2} extensions = webdriver.ChromeOptions() extensions.add_experimental_option("prefs", prefs) browser = webdriver.Chrome(executable_path="chromedriver", chrome_options=extensions) findAlbum(browser) createAlbumPath() queue = Queue() for x in range(max_workers): worker = DownloadWorker(queue) worker.daemon = True worker.start() print "[Getting Image Links]" linkImages = getImageLinks(browser) print "[Found: " + str(len(linkImages)) + "]" for fullRes in linkImages: queue.put(fullRes) print "[Downloading...]" queue.join() browser.quit() stop = timeit.default_timer() print "[Time taken: %ss]" % str(stop - start) raw_input("Press any key to continue...")
def read_features(features):
    """
    read all the features in the 'features' array and return a numpy array
    currently only compute the grand mean and std
    """
    start = timeit.default_timer()
    x = []
    y = []
    for fn in glob.glob(os.path.join(FT_DIR, "*.npy")):
        # extract "<genre>_<id>" from the file name without clobbering the timer variables
        name_start = fn.rfind('/')
        name_end = fn.rfind('.')
        ext = fn[name_start + 1:name_end]
        genre, _ = ext.split('_')
        data = np.load(fn)
        surface_ft = data[:-1]  # 5 features
        ft_vec = [np.mean(ft) for ft in surface_ft] + [np.std(ft) for ft in surface_ft]
        ceps = data[-1]  # mfcc features
        cep_len = len(ceps)
        ft_vec += np.mean(ceps[int(cep_len / 10.):int(cep_len * 9 / 10.)], axis=0).tolist()
        x.append(ft_vec)
        y.append(GENRE_DICT[genre])
    end = timeit.default_timer()
    print("reading all features takes: ", (end - start))
    return np.array(x), np.array(y)
def index_project(self, project_name):
    project_data = self.watcher.projects[project_name]["project_data"]
    cfc_folders = project_data.get(self.folder_key, [])
    mappings = project_data.get("mappings", [])
    project_file_dir = os.path.dirname(project_name)
    if len(cfc_folders) == 0:
        return
    start_time = timeit.default_timer()
    index = {}
    print("CFML: indexing components in project '" + project_name + "'")
    for cfc_folder in sorted(cfc_folders, key=lambda d: d["path"]):
        root_path = utils.normalize_path(cfc_folder["path"], project_file_dir)
        path_index = self.parser.parse_directory(root_path)
        index.update(path_index)
    self.data[project_name] = {
        "index": index,
        "cache": {file_path: {} for file_path in index}
    }
    self.build_project_data(project_name)
    index_time = timeit.default_timer() - start_time
    message = "CFML: indexing components in project '{}' completed - {} files indexed in {:.2f} seconds"
    print(message.format(project_name, str(len(index)), index_time))
    self.notify_listeners(project_name)
def trim_data(crime_data, part, total_parts):
    print 'Trimming unnecessary data...',
    time1 = tm.default_timer()
    crime_data = crime_data[crime_data['YEAR'] >= 2006]
    crime_data = crime_data[crime_data['YEAR'] <= 2015]
    crime_data = crime_data[pd.notnull(crime_data['NEIGHBOURHOOD'])]
    crime_data = crime_data.drop('HUNDRED_BLOCK', axis=1)
    crime_data = crime_data.sort_index()
    if TEST_VAL:
        print 'Taking subset of crime data (1000 row sample)...',
        crime_data = crime_data.head(1005)
    if part is not None and total_parts is not None:
        start_index = int(1.0 * (part - 1) / total_parts * crime_data['YEAR'].count())
        end_index = int(1.0 * part / total_parts * crime_data['YEAR'].count())
        if part == total_parts:
            end_index = crime_data['YEAR'].count()
        crime_data = crime_data[start_index:end_index]
        print 'Start index, end index, size:', start_index, end_index, crime_data['YEAR'].count()
    print 'Finished'
    print 'Time taken:', tm.default_timer() - time1, ' seconds\n'
    return crime_data
def launch_jobs(quandl_codes, num_workers, calc_date, authtoken="", freq='M', span=60):
    job_queue = Queue.Queue()
    for b in quandl_codes:
        job_queue.put(b)
    print "Length %d" % job_queue.qsize()
    thlist = []
    s_time = timeit.default_timer()
    fp = open("output.csv", "w")
    heading = "Ticker, Date, " + ",".join(Worker._itemlist) + "\n"
    fp.write(heading)
    s_date = dutil.shift_months(calc_date, -(span + 6))
    trim_start = s_date.strftime('%Y-%m-%d')
    trim_end = calc_date.strftime('%Y-%m-%d')
    calc_param = {"calc_date": calc_date, "freq": freq, "span": 60}
    for i in range(num_workers):
        th = Worker(job_queue, trim_start, trim_end, calc_param, authtoken, fp)
        th.daemon = True
        th.start()
        thlist.append(th)
    print "Finished launching jobs"
    e_time = timeit.default_timer()
    print "Time taken ", (e_time - s_time)
    # block until the queue is empty
    job_queue.join()
def spawn_runpy(cp, wait=60, cb=check_rst):
    "as decorator to run job"
    global WAITQ, RUNQ, CFG
    pool = Pool(processes=CFG['MAXJOBS'])
    while len(WAITQ) > 0 or len(RUNQ) > 0:
        if len(RUNQ) <= CFG['MAXJOBS'] and len(WAITQ) > 0:
            path, test = WAITQ.pop()
            rst = pool.apply_async(call_runpy, (cp, path, test,))
            RUNQ.append((rst, test, timeit.default_timer()))
        else:
            for r in RUNQ:
                usec = float("%.2f" % (timeit.default_timer() - r[2]))
                # AsyncResult.successful() must be called as a method, and only
                # once the result is ready; a bare attribute check is always truthy.
                if r[0].ready() and r[0].successful():
                    print "[{0}] success used {1} usec".format(r[1], usec)
                    RUNQ.remove(r)
                    if cb:
                        cb(r[1], 'pass', usec)
                else:
                    if usec > CFG['TIMEOUT']:
                        print "[{0}] unsuccess used timeout {1} usec".format(r[1], usec)
                        r[0].terminate()
                        if cb:
                            cb(r[1], 'fail', usec)
        time.sleep(float(wait))
def test_exercise_6(self):
    con = self.con
    con.isolation_level = None
    cur = con.cursor()
    N = 30000

    #############################
    # Exercise 6
    #
    # Change the following schema to include an index on column "a".
    cur.execute('CREATE TABLE "numbers" (a INTEGER)')
    #
    #
    #############################

    rows = []
    for i in range(0, N):
        rows.append((i,))
    cur.executemany('INSERT INTO "numbers" VALUES (?)', rows)

    start_time = timeit.default_timer()
    cur.execute('select min(a) from numbers')
    print("exercise_6: That took %f ms." % ((timeit.default_timer() - start_time) * 1000,))
    data = cur.fetchall()
    cur.close()
    self.assertTrue(data[0][0] == 0)
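For reference, one way to satisfy the exercise's comment is to add an explicit index on column "a" after creating the table, which lets SQLite answer min(a) from the index instead of scanning. A minimal standalone sketch, assuming an in-memory database (the index name idx_numbers_a is chosen here for illustration):

import sqlite3
import timeit

con = sqlite3.connect(":memory:")
con.isolation_level = None
cur = con.cursor()

# Same schema as the exercise, plus an explicit index on column "a".
cur.execute('CREATE TABLE "numbers" (a INTEGER)')
cur.execute('CREATE INDEX "idx_numbers_a" ON "numbers" (a)')

cur.executemany('INSERT INTO "numbers" VALUES (?)', [(i,) for i in range(30000)])

start_time = timeit.default_timer()
cur.execute('select min(a) from numbers')
print("min(a) with index took %f ms." % ((timeit.default_timer() - start_time) * 1000,))
cur.close()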
def evaluate(im, algo, gt_illuminant, i, range_thresh, bin_num, dst_folder):
    new_im = None
    start_time = timeit.default_timer()
    if algo == "grayworld":
        new_im = cv2.xphoto.autowbGrayworld(im, 0.95)
    elif algo == "nothing":
        new_im = im
    elif algo == "learning_based":
        new_im = cv2.xphoto.autowbLearningBased(im, None, range_thresh, 0.98, bin_num)
    elif algo == "GT":
        gains = gt_illuminant / min(gt_illuminant)
        g1 = float(1.0 / gains[2])
        g2 = float(1.0 / gains[1])
        g3 = float(1.0 / gains[0])
        new_im = cv2.xphoto.applyChannelGains(im, g1, g2, g3)
    time = 1000 * (timeit.default_timer() - start_time)  # time in ms

    if len(dst_folder) > 0:
        if not os.path.exists(dst_folder):
            os.makedirs(dst_folder)
        im_name = ("%04d_" % i) + algo + ".jpg"
        cv2.imwrite(os.path.join(dst_folder, im_name), stretch_to_8bit(new_im))

    # recover the illuminant from the color balancing result, assuming the standard model:
    estimated_illuminant = [0, 0, 0]
    eps = 0.01
    estimated_illuminant[2] = np.percentile((im[:, :, 0] + eps) / (new_im[:, :, 0] + eps), 50)
    estimated_illuminant[1] = np.percentile((im[:, :, 1] + eps) / (new_im[:, :, 1] + eps), 50)
    estimated_illuminant[0] = np.percentile((im[:, :, 2] + eps) / (new_im[:, :, 2] + eps), 50)

    res = np.arccos(np.dot(gt_illuminant, estimated_illuminant) /
                    (np.linalg.norm(gt_illuminant) * np.linalg.norm(estimated_illuminant)))
    return (time, (res / np.pi) * 180)
def execute(self):
    start_time = timeit.default_timer()
    response = self.svc.call()
    end_time = timeit.default_timer()
    self.elapsed_time = end_time - start_time
    return self.validate(response)
def runTestCode(self):
    """
    This function ties into the debug menu. It is meant to allow execution
    of some test code. Feel free to change the contents of this function.
    """
    start = timeit.default_timer()

    monsters = []
    lib = Libraries.MonsterLibrary()

    stop = timeit.default_timer()
    time = stop - start
    print "Created library in " + str(time) + " seconds"

    for i in range(0, 10000):
        myRandom = lib.getRandomMonster(random.randint(0, 80))
        monsters.append(myRandom)

    # lib = Libraries.ItemLibrary()
    # myItem = lib.createItem('heal')
    # print myItem
    # myItem = lib.createItem('sword')
    # print myItem
    # myItem = lib.createItem('cloak')
    # print myItem
    # myItem = lib.createItem('fireball')
    # print myItem

    stop = timeit.default_timer()
    time = stop - start
    print "Created " + str(len(monsters)) + " monsters in " + str(time) + " seconds"
def scan_vocab(self, documents, progress_per=10000, trim_rule=None): logger.info("collecting all words and their counts") document_no = -1 total_words = 0 min_reduce = 1 interval_start = default_timer() - 0.00001 # guard against next sample being identical interval_count = 0 vocab = defaultdict(int) for document_no, document in enumerate(documents): if document_no % progress_per == 0: interval_rate = (total_words - interval_count) / (default_timer() - interval_start) logger.info("PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", document_no, total_words, interval_rate, len(vocab), len(self.docvecs)) interval_start = default_timer() interval_count = total_words document_length = len(document.words) for tag in document.tags: self.docvecs.note_doctag(tag, document_no, document_length) for word in document.words: vocab[word] += 1 total_words += len(document.words) if self.max_vocab_size and len(vocab) > self.max_vocab_size: utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 logger.info("collected %i word types and %i unique tags from a corpus of %i examples and %i words", len(vocab), len(self.docvecs), document_no + 1, total_words) self.corpus_count = document_no + 1 self.raw_vocab = vocab
def run(self): try: self.running = True self.logger.info("[Relay] Relay controller activated in " "{}ms".format((timeit.default_timer()-self.thread_startup_timer)*1000)) while (self.running): current_time = datetime.datetime.now() for relay_id in self.relay_id: if (self.relay_on_until[relay_id] < current_time and self.relay_on_duration[relay_id] and self.relay_pin[relay_id]): # Use threads to prevent a slow execution of a # process that could slow the loop turn_relay_off = threading.Thread( target=self.relay_on_off, args=(relay_id, 'off',)) turn_relay_off.start() if self.relay_last_duration[relay_id] > 0: write_db = threading.Thread( target=write_influxdb, args=(self.logger, INFLUXDB_HOST, INFLUXDB_PORT, INFLUXDB_USER, INFLUXDB_PASSWORD, INFLUXDB_DATABASE, 'relay', relay_id, 'duration_sec', float(self.relay_last_duration[relay_id]),)) write_db.start() time.sleep(0.01) finally: self.all_relays_off() self.running = False self.logger.info("[Relay] Relay controller deactivated in " "{}ms".format((timeit.default_timer()-self.thread_shutdown_timer)*1000))
def train(self, examples, cv_extract, epochs, learning_rate):
    """
    Specializes the network for prediction on the given examples, using the
    given center extract function, the given number of epochs, and the given
    learning rate.
    """
    input = T.vector(name="training_input", dtype=theano.config.floatX)
    tf = self.get_specialization_function(input, cv_extract, learning_rate)
    indices = list(range(examples.get_value(borrow=True).shape[0]))
    start_time = timeit.default_timer()
    # TODO: batches?
    for epoch in range(epochs):
        self.rng.shuffle(indices)
        costs = []
        for j in indices:
            cost = tf(examples.get_value(borrow=True)[j].reshape(-1))
            costs.append(cost)
        debug(
            "... [{}] epoch {: 3d} done {} ...".format(
                str(datetime.timedelta(seconds=timeit.default_timer() - start_time)),
                epoch + 1,
                "(min/avg cost {:0.3f}/{:0.3f})".format(
                    float(min(costs)),
                    float(sum(costs) / float(len(costs)))
                )
            )
        )
def simulate(new_N = N, new_R = R, new_D = D):
    global N
    N = new_N
    global R
    R = new_R
    global D
    D = new_D

    global distance_arr
    distance_arr = [([0] * N) for i in xrange(N)]
    global sensor_network
    sensor_network = [Sensor(i) for i in range(N)]

    for x in xrange(N):
        for y in xrange(N):
            if x != y and distance_arr[x][y] == 0:
                distance_arr[x][y] = sensor_distance(sensor_network[x].position,
                                                     sensor_network[y].position)
                distance_arr[y][x] = distance_arr[x][y]

    start = timeit.default_timer()
    [s.start() for s in sensor_network]
    [s.join() for s in sensor_network]
    stop = timeit.default_timer()
    return stop - start
def evaluate(self, p, sim, plt):
    start = timeit.default_timer()
    sim.run(p.T)
    end = timeit.default_timer()
    speed = p.T / (end - start)

    data = sim.data[self.p_ens]
    last = []
    for row in data.T:
        nz = np.nonzero(row > 0.05)[0]
        if len(nz) == 0:
            last.append(0)
        else:
            last.append(nz[-1])
    time_to_inhibit = np.array(last) * p.dt

    if plt:
        plt.plot(sim.trange(), sim.data[self.p_ens])
        for t in time_to_inhibit:
            plt.axvline(t)
        plt.axhline(0.05, linestyle='--', c='k')
        plt.xlabel('time (s) with increasing inhibition')
        plt.ylabel('decoded output')

    return dict(time_to_inhibit=np.mean(time_to_inhibit), speed=speed)
def load_indicators_to_mongo_zh(is_incremental): print("start loading indicator data(zh) from JSON file to MongoDB...") all_start = timeit.default_timer() static = Static() f = io.open(static.output_folder + '/worldbank_wdi_indicators_zh.json', 'r', encoding='utf8', errors='ignore') json_str = f.readline() indicator_array = json.loads(json_str) f.close() client = MongoClient(static.mongo_url, static.mongo_port) db = client[static.database_name] ## print(db.collection_names()) indicator_col = db[static.indicator_col_name] if not is_incremental: indicator_col.drop() for ind in indicator_array: indicator_key = ind['id'].replace('.', '_') + '_ZH' data_type = 'number' if(ind['name'].find('百分比') > -1): data_type = 'percentage' topics = [] for topic in ind['topics']: topics.append(topic['value']) indicator_rec = {'indicator_key': indicator_key, 'original_id': ind['id'], 'indicator_text': ind['name'], 'data_type': data_type, 'sourceOrganization': ind['sourceOrganization'], 'sourceNote': ind['sourceNote'], 'topics': topics, 'data_source': '世界发展指标', 'dimension': [{'dimension_key': 'year', 'dimension_text': '年'}, {'dimension_key': 'region', 'dimension_text': '区域'}, {'dimension_key': 'country', 'dimension_text': '国家'}]} pk = indicator_col.insert(indicator_rec) print(indicator_key + ' ' + ind['name'] + ' inserted.') print("job is complete.") print("total records: " + str(indicator_col.count())) print("total time cost: " + str(round(timeit.default_timer() - all_start)) + 's')
def _run_analyzers_on_event(self): '''Run all analysers on the current event, self.event. Returns a tuple (success?, last_analyzer_name). ''' for i,analyzer in enumerate(self._analyzers): if not analyzer.beginLoopCalled: analyzer.beginLoop(self.setup) start = timeit.default_timer() if self.memReportFirstEvent >=0 and iEv >= self.memReportFirstEvent: memNow=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss if memNow > self.memLast : print "Mem Jump detected before analyzer %s at event %s. RSS(before,after,difference) %s %s %s "%( analyzer.name, iEv, self.memLast, memNow, memNow-self.memLast) self.memLast=memNow ret = analyzer.process( self.event ) if self.memReportFirstEvent >=0 and iEv >= self.memReportFirstEvent: memNow=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss if memNow > self.memLast : print "Mem Jump detected in analyzer %s at event %s. RSS(before,after,difference) %s %s %s "%( analyzer.name, iEv, self.memLast, memNow, memNow-self.memLast) self.memLast=memNow if self.timeReport: self.timeReport[i]['events'] += 1 if self.timeReport[i]['events'] > 0: self.timeReport[i]['time'] += timeit.default_timer() - start if ret == False: return (False, analyzer.name) return (True, analyzer.name)
def load_rowdata_to_mongo_zh(is_incremental):
    print("start loading row data(zh) from JSON file to MongoDB...")
    all_start = timeit.default_timer()
    static = Static()
    bydim_dir = static.output_folder + static.dataset_bydim_folder

    client = MongoClient(static.mongo_url, static.mongo_port)
    db = client[static.database_name]
    dataset_col = db[static.dataset_col_name]
    if not is_incremental:
        dataset_col.drop()

    file_path_array = []
    for idx, file in enumerate(os.listdir(bydim_dir)):
        file_path = os.path.join(bydim_dir, file)
        if os.path.isfile(file_path):
            file_path_array.append(file_path)
    print(str(len(file_path_array)) + " files are loaded")

    counter = []
    mapfunc = partial(insert_by_dim, counter=counter, dataset_col=dataset_col, all_start=all_start)
    pool = ThreadPool(12)
    pool.map(mapfunc, file_path_array)
    pool.close()
    pool.join()

    print("All the threads are completed. Total number is " + str(len(counter)) + "\n")
    print("total time cost: " + str(round(timeit.default_timer() - all_start)) + 's')
def loop_sd_mean(alphabet):
    print("======== sd-mean test===========")
    start = timeit.default_timer()
    count = 0
    letters_number_list = []
    entropy_list = []
    for i in list(range(1, 101)):      # this is sd
        alphabet1 = eliminate_sd(alphabet, i)
        for j in list(range(1, 101)):  # this is mean
            alphabet2 = eliminate_mean(alphabet1, j)
            letters_number = len(alphabet2)
            letters_number_list.append((i, j, letters_number))
            balanced_alphabet = rebalance(alphabet2)
            entropy = calculate_entropy(balanced_alphabet)
            entropy_list.append((i, j, entropy))
            count = count + 1
            print(count)
    stop = timeit.default_timer()
    time = (stop - start)
    print(letters_number_list)
    print(entropy_list)
    print("======== sd-mean test===========")
    print('Running Time (s): %f' % time)
def worker(F, chargers, sensors, p_list, sensors_p, p_list_p):
    """worker function, used to create processing"""
    result = {}

    tic = timeit.default_timer()
    anser = reconfiguration.iaa.solution(chargers, sensors, p_list, args['B'],
                                         sensors_p, p_list_p, F, args['p_min'])
    toc = timeit.default_timer()
    result['IAA'] = (toc - tic, anser)
    if DEBUG:
        print "============================================="
        print "#              solution IAA                 #"
        print "============================================="
        pprint(anser)

    tic = timeit.default_timer()
    anser = solution.solutionOpt.solution(chargers, sensors_p, p_list_p)
    toc = timeit.default_timer()
    result['Opt'] = (toc - tic, anser)
    if DEBUG:
        print "============================================="
        print "#              solution Opt                 #"
        print "============================================="
        pprint(anser)

    return result
def main(): """ """ logging.info("Reading file:%s", "data/sample.avi") vid = AoRecording.AoRecording(filepath="data/sample.avi") vid.load_video() logging.info("Starting parallel processing") tic = timeit.default_timer() vid.filter_frames() vid.fixed_align_frames() vid.complete_align_parallel() vid.create_average_frame() vid.create_stdev_frame() toc = timeit.default_timer() print "Parallel Process took {}:".format(toc - tic) vid.create_stdev_frame() logging.info("writing output") vid.write_video("output/output_parallel.avi") vid.write_average_frame("output/lucky_average_parallel.png") vid.write_frame("output/lucky_stdev.png", "stdev") logging.info("Starting serial processing") tic = timeit.default_timer() vid.filter_frames() vid.fixed_align_frames() vid.complete_align() vid.create_average_frame() toc = timeit.default_timer() print "Serial Process took {}:".format(toc - tic) logging.info("writing output") vid.write_video("output/output_serial.avi") vid.write_frame("output/lucky_average_serial.png", "average")
def analyze_files(self, iterCount, loci_classes, adapt_threshold): Rmodel = VRmodel.VregMRmodel(iterCount, loci_classes, adapt_threshold) print "len(Rmodel.rfmodels)=", len(Rmodel.rfmodels) ofile = open("bkg_out.dat","a+") Rmodel.set_bckgoutfile( ofile ) for species in self.speciesList: fbar= self.S[species]["WGS"] print fbar outFile = self.outDir + os.path.basename(fbar).replace(".fasta", "_"+str(iterCount)+"_outRF.fasta") ofile = open(outFile,"w") Rmodel.set_outfile( ofile ) fb = self.outDir + os.path.basename(fbar).replace(".fasta", "_"+str(iterCount)+"_exon.fasta") exfile1 = open(fb,"w") Rmodel.set_exon_outfiles( exfile1 ) start_time = timeit.default_timer() gene_cnt=0 for strand in [1, -1]: qbar=deepcopy(self.contigs) print "STRAND=", strand for record in SeqIO.parse(fbar, "fasta"): if self.check_contigs: if ( record.id.split("|")[3] not in self.contigs): continue print "record.id=", record.id print "cnts=",record.id.split("|")[3] print "qbar=", qbar if self.check_contigs: qbar.remove(record.id.split("|")[3]) if strand == 1: seq=record.seq else: seq=record.seq.reverse_complement() Rmodel.set_record(record.id, record.name, record.description) seq_size=len(seq) res= self.mapper( divide_work(seq) ) """ print "len(res)=", len(res) for ix in range(2): print res[ix][0], res[ix][1], type(res[ix][2]) """ Elist=Rmodel.exon_MRprobabilities(res) gene_cnt = Rmodel.V_exon_model(gene_cnt, seq, strand, Elist) #res=None #Elist=None if len(qbar)==0: break ofile.close() elapsed = timeit.default_timer() - start_time print "ELAPSED TIME =", elapsed
def pretrain(self, examples, epoch_counts, corruption_rates, learning_rates):
    """
    Trains the network for autoencoding on the given examples, given lists of
    epoch counts, corruption rates, and learning rates each equal in length to
    the number of layers in the stack.
    """
    tfs = self.get_training_functions(corruption_rates, learning_rates)
    indices = list(range(examples.get_value(borrow=True).shape[0]))
    start_time = timeit.default_timer()
    for i in range(len(self.layers)):
        # TODO: batches?
        for epoch in range(epoch_counts[i]):
            self.rng.shuffle(indices)
            costs = []
            for j in indices:
                cost = tfs[i](examples.get_value(borrow=True)[j].reshape(-1))
                costs.append(cost)
            debug(
                "... [{}] epoch {: 3d} at layer {: 2d} done {} ...".format(
                    str(datetime.timedelta(seconds=timeit.default_timer() - start_time)),
                    epoch + 1,
                    i,
                    "(min/avg cost {:0.3f}/{:0.3f})".format(
                        float(min(costs)),
                        float(sum(costs) / float(len(costs))),
                    )
                )
            )
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.feature_selection import SelectFromModel
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
import timeit

# In[2]: Import data

startzeitDaten = timeit.default_timer()

data = pd.read_csv(r"Data\SP500_data_new.csv",
                   parse_dates=["adate", "qdate", "public_date"],
                   dayfirst=True)  # , index_col=["gvkey", "datadate"])
data_NaN = data.dropna()
data_y = data.dropna(subset=["splticrm"])

Names1 = pd.read_excel(r"Data\Names1.xlsx", header=0)
Names1 = Names1.drop(["Data Type", "Help"], axis=1)
Names1.columns = ["Name", "ExName"]

Names2 = pd.read_excel(r"Data\Names2.xlsx", header=0)
Names2 = Names2.drop(["Data Type", "Help"], axis=1)
Names2.columns = ["Name", "ExName"]

features1RF_mean = pd.read_csv(r"Data\RF1_mean.csv", header=0).dropna()
features1RF_mean.columns = ["Ort", "Name", "Wert"]
def __exit__(self, _1, _2, _3):
    self.t_end = timeit.default_timer()
    self.dt = self.t_end - self.t_start
def __enter__(self):
    self.t_start = timeit.default_timer()
    return self
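These two methods together form a small timing context manager; the enclosing class is not shown in the snippet, so the sketch below assumes a class called Stopwatch (an illustrative name) with the same __enter__/__exit__ bodies:

import timeit

class Stopwatch:
    """Context manager that records elapsed wall-clock time in self.dt."""

    def __enter__(self):
        self.t_start = timeit.default_timer()
        return self

    def __exit__(self, _1, _2, _3):
        self.t_end = timeit.default_timer()
        self.dt = self.t_end - self.t_start

# Usage: the elapsed time of the with-block is available afterwards.
with Stopwatch() as sw:
    sum(i * i for i in range(100000))
print("block took %.4f s" % sw.dt)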
g.add_node(5, pos = (2,2))
g.add_node(6, pos = (3,1))
g.add_node(7, pos = (4,1))

# create the edges in the graph
g.add_edge(1,2, weight = 50)
g.add_edge(1,3, weight = 50)
g.add_edge(2,4, weight = 10)
g.add_edge(2,5, weight = 20)
g.add_edge(4,6, weight = 20)
g.add_edge(5,6, weight = 10)
g.add_edge(6,7, weight = 20)
g.add_edge(3,7, weight = 50)

if __name__ == "__main__":
    startTime = timeit.default_timer()
    path1 = bfs(g, 1, 7, 0)[0]
    cost1 = bfsPathCost(g, 1, 7)
    endTime = timeit.default_timer()
    calculations = bfs(g, 1, 7, 0)[1]

    # find the runtime of the program in microseconds
    runTime = (endTime - startTime) * 10**6

    print('The vertices found by BFS: ' + str(path1))
    print('Cost of BFS: ' + str(cost1))
    print('Runtime in microseconds: ' + str(runTime))
def process_dump( input_file, template_file, out_file, file_size, file_compress, process_count, html_safe, ): """ :param input_file: name of the wikipedia dump file; '-' to read from stdin :param template_file: optional file with template definitions. :param out_file: directory where to store extracted data, or '-' for stdout :param file_size: max size of each extracted file, or None for no max (one file) :param file_compress: whether to compress files with bzip. :param process_count: number of extraction processes to spawn. """ global knownNamespaces global templateNamespace, templatePrefix global moduleNamespace, modulePrefix urlbase = "" # This is obtained from <siteinfo> input = decode_open(input_file) # collect siteinfo for line in input: line = line # .decode('utf-8') m = tagRE.search(line) if not m: continue tag = m.group(2) if tag == "base": # discover urlbase from the xml dump file # /mediawiki/siteinfo/base base = m.group(3) urlbase = base[:base.rfind("/")] elif tag == "namespace": knownNamespaces.add(m.group(3)) if re.search('key="10"', line): templateNamespace = m.group(3) templatePrefix = templateNamespace + ":" elif re.search('key="828"', line): moduleNamespace = m.group(3) modulePrefix = moduleNamespace + ":" elif tag == "/siteinfo": break if expand_templates: # preprocess template_load_start = default_timer() if template_file and os.path.exists(template_file): logging.info( "Preprocessing '%s' to collect template definitions: this may take some time.", template_file, ) file = decode_open(template_file) templates = load_templates(file) file.close() else: if input_file == "-": # can't scan then reset stdin; must error w/ suggestion to specify template_file raise ValueError( "to use templates with stdin dump, must supply explicit template-file" ) logging.info( "Preprocessing '%s' to collect template definitions: this may take some time.", input_file, ) templates = load_templates(input, template_file) input.close() input = decode_open(input_file) template_load_elapsed = default_timer() - template_load_start logging.info("Loaded %d templates in %.1fs", templates, template_load_elapsed) if out_file == "-": output = sys.stdout if file_compress: logging.warn( "writing to stdout, so no output compression (use an external tool)" ) else: nextFile = NextFile(out_file) output = OutputSplitter(nextFile, file_size, file_compress) # process pages logging.info("Starting page extraction from %s.", input_file) extract_start = default_timer() # Parallel Map/Reduce: # - pages to be processed are dispatched to workers # - a reduce process collects the results, sort them and print them. 
maxsize = 10 * process_count # output queue output_queue = Queue(maxsize=maxsize) # Reduce job that sorts and prints output reduce = Process(target=reduce_process, args=(output_queue, output)) reduce.start() # initialize jobs queue jobs_queue = Queue(maxsize=maxsize) # start worker processes logging.info("Using %d extract processes.", process_count) workers = [] for _ in range(max(1, process_count)): extractor = Process(target=extract_process, args=(jobs_queue, output_queue, html_safe)) extractor.daemon = True # only live while parent process lives extractor.start() workers.append(extractor) # Mapper process # we collect individual lines, since str.join() is significantly faster # than concatenation page = [] id = "" revid = "" last_id = "" ordinal = 0 # page count inText = False redirect = False for line in input: if "<" not in line: # faster than doing re.search() if inText: page.append(line) continue m = tagRE.search(line) if not m: continue tag = m.group(2) if tag == "page": page = [] redirect = False elif tag == "id" and not id: id = m.group(3) elif tag == "id" and id: # <revision> <id></id> </revision> revid = m.group(3) elif tag == "title": title = m.group(3) elif tag == "redirect": redirect = True elif tag == "text": inText = True line = line[m.start(3):m.end(3)] page.append(line) if m.lastindex == 4: # open-close inText = False elif tag == "/text": if m.group(1): page.append(m.group(1)) inText = False elif inText: page.append(line) elif tag == "/page": colon = title.find(":") if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and not redirect and not title.startswith(templateNamespace)): job = (id, revid, urlbase, title, page, ordinal) jobs_queue.put(job) # goes to any available extract_process last_id = id ordinal += 1 id = "" revid = "" page = [] input.close() # signal termination for _ in workers: jobs_queue.put(None) # wait for workers to terminate for w in workers: w.join() # signal end of work to reduce process output_queue.put(None) # wait for it to finish reduce.join() if output != sys.stdout: output.close() extract_duration = default_timer() - extract_start extract_rate = ordinal / extract_duration logging.info( "Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)", process_count, ordinal, extract_duration, extract_rate, )
def sgd_optimization_mnist( learning_rate=0.013, n_epochs=100, dataset='D:\JupyterWorkspace\DeepLearningTutorial\mnist.pkl.gz', batch_size=50): """ Demonstrate stochastic gradient descent optimization of a log-linear model This is demonstrated on MNIST. :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: the path of the MNIST dataset file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ ''' datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] ''' data = sio.loadmat( 'D:\ResearchWork\Machine learning and MI\Code\MEMD-DL\subject_1_scheme8.mat' ) processedEEG = data['processedEEG'] processedERD = data['processedERD'] processedSpectrum = data['processedSpectrum'] trainLabels = data['trainLabels'] trainLabels = trainLabels.T def extractbands(processedSpectrum, trainLabels): muband = processedSpectrum[:512, :, :] muband = muband[10:60, :, :] betaband = processedSpectrum[512:1024, :, :] betaband = betaband[29:100, :, :] totfeat = muband.shape[0] + betaband.shape[0] trainData = numpy.zeros((processedSpectrum.shape[2], totfeat * 2)) labels = numpy.zeros((processedSpectrum.shape[2])) for i in range( processedSpectrum.shape[2]): #(processedSpectrum.shape[2] trainData[i, :] = numpy.concatenate( (muband[:, 0, i], muband[:, 2, i], betaband[:, 0, i], betaband[:, 2, i])) labels[i] = trainLabels[i] return trainData, labels trainData, labels = extractbands(processedSpectrum, trainLabels) indices = numpy.random.permutation(processedEEG.shape[2]) training_idx, test_idx = indices[:350], indices[350:] train_set_x, test_set_x = trainData[training_idx, :], trainData[ test_idx, :] train_set_y, test_set_y = labels[training_idx], labels[test_idx] valid_set_x = test_set_x valid_set_y = test_set_y test_set_x = theano.shared(numpy.asarray(test_set_x, dtype=theano.config.floatX), borrow=True) test_set_y = theano.shared(numpy.asarray(test_set_y, dtype=theano.config.floatX), borrow=True) test_set_y = T.cast(test_set_y, 'int32') train_set_x = theano.shared(numpy.asarray(train_set_x, dtype=theano.config.floatX), borrow=True) train_set_y = theano.shared(numpy.asarray(train_set_y, dtype=theano.config.floatX), borrow=True) train_set_y = T.cast(train_set_y, 'int32') valid_set_x = theano.shared(numpy.asarray(valid_set_x, dtype=theano.config.floatX), borrow=True) valid_set_y = theano.shared(numpy.asarray(valid_set_y, dtype=theano.config.floatX), borrow=True) valid_set_y = T.cast(valid_set_y, 'int32') # compute number of minibatches for training, validation and testing ''' n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size ''' n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size n_valid_batches = 1 n_test_batches = 1 ###################### # BUILD ACTUAL MODEL # ###################### print('... 
building the model') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # generate symbolic variables for input (x and y represent a # minibatch) x = T.matrix('x') # data, presented as rasterized images y = T.ivector('y') # labels, presented as 1D vector of [int] labels # construct the logistic regression class # Each MNIST image has size 28*28 classifier = LogisticRegression(input=x, n_in=242, n_out=2) # the cost we minimize during training is the negative log likelihood of # the model in symbolic format cost = classifier.negative_log_likelihood(y) # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch test_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # compute the gradient of cost with respect to theta = (W,b) g_W = T.grad(cost=cost, wrt=classifier.W) g_b = T.grad(cost=cost, wrt=classifier.b) # start-snippet-3 # specify how to update the parameters of the model as a list of # (variable, update expression) pairs. updates = [(classifier.W, classifier.W - learning_rate * g_W), (classifier.b, classifier.b - learning_rate * g_b)] # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-3 ############### # TRAIN MODEL # ############### print('... training the model') # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf test_score = 0. 
start_time = timeit.default_timer() done_looping = False epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of' ' best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) # save the best model with open('best_model.pkl', 'wb') as f: pickle.dump(classifier, f) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print(('Optimization complete with best validation score of %f %%,' 'with test performance %f %%') % (best_validation_loss * 100., test_score * 100.)) print('The code run for %d epochs, with %f epochs/sec' % (epoch, 1. * epoch / (end_time - start_time))) '''
        plt.axvline(i, color='k', lw=1)
    for i in spk0[1].spike_times:
        plt.axvline(i, color='r', lw=1)
    ax.set_ylabel('$w$')

    # ax = fig.add_subplot(4, 2, 8)
    # ax.plot(wt[:, 1, 1], 'r', lw=3)
    # ax.set_ylabel('$w$')

    # fig = plt.figure(figsize=(10, 5))
    # for i in spk[0].spike_times:
    #     plt.plot(wt[:, 1, 1], 'r', lw=3)
    #     plt.axvline(i, color='k', lw=1)

    plt.savefig('/tmp/%s.png' % (os.path.splitext(os.path.basename(__file__))[0]))
    plt.close()
    print('End %s:run()' % (os.path.splitext(os.path.basename(__file__))[0]))


if __name__ == '__main__':
    print('Begin %s:main()' % (os.path.splitext(os.path.basename(__file__))[0]))
    start_t = timeit.default_timer()
    setup()
    run()
    print("End %s:main() , running time: %f seconds" % (os.path.splitext(
        os.path.basename(__file__))[0], timeit.default_timer() - start_t))
filename_in = "trans-out.csv" user_f = "users-large.csv" # Read csv in_read = csv.reader(open(filename_in,"rb"), delimiter=',',quoting=csv.QUOTE_ALL) users_read = csv.reader(open(user_f,"rb"), delimiter=',',quoting=csv.QUOTE_ALL) in_read.next() users_read.next() # Create user dict # Optimization problem here if user list doesn't fit in memory users_dict = {} for user in users_read: users_dict[user[0]] = {"spending-limit" : user[1], "round-to" : user[2]} start = timeit.default_timer() for row in in_read: # row[0] == time # row[1] == account # row[2] == transaction number # row[3] == amount # See if account number is our users if row[1] in users_dict: user_info = users_dict[row[1]] if float(row[3]) <= float(user_info["spending-limit"]): change = roundUp(float(row[3]), float(user_info["round-to"])) if change != "0.0": sendToBackend(row[0], change, row[2], row[1]) print "REQUEST TRANS AMNT " + change + " FROM " + row[1] + " REFNUM: " + row[2] else:
def train(self, sentences, total_words=None, word_count=0, total_examples=None, queue_factor=2, report_delay=1.0): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the sentences are the same as those that were used to initially build the vocabulary. """ logger.info("Starting training.") self.neg_labels = [] if self.negative > 0: # precompute negative labels optimization for pure-python training self.neg_labels = zeros(self.negative + 1) self.neg_labels[0] = 1. if FAST_VERSION < 0: import warnings warnings.warn( "C extension not loaded for Word2Vec, training will be slow. " "Install a C compiler and reinstall gensim for fast training.") self.neg_labels = [] if self.negative > 0: # precompute negative labels optimization for pure-python training self.neg_labels = zeros(self.negative + 1) self.neg_labels[0] = 1. logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window) if not self.vocab: raise RuntimeError( "you must first build vocabulary before training the model") if not hasattr(self, "syn0"): raise RuntimeError( "you must first finalize vocabulary before training the model") if total_words is None and total_examples is None: if self.corpus_count: total_examples = self.corpus_count logger.info( "expecting %i sentences, matching count from corpus used for vocabulary survey", total_examples) else: raise ValueError( "you must provide either total_words or total_examples, to enable alpha and progress calculations" ) job_tally = 0 if self.iter > 1: sentences = utils.RepeatCorpusNTimes(sentences, self.iter) total_words = total_words and total_words * self.iter total_examples = total_examples and total_examples * self.iter def worker_loop(): """Train the model, lifting lists of sentences from the job_queue.""" work = matutils.zeros_aligned( self.layer1_size, dtype=REAL) # per-thread private work memory neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) jobs_processed = 0 while True: job = job_queue.get() if job is None: progress_queue.put(None) break # no more jobs => quit this worker sentences, pairwise, alpha = job tally, raw_tally = self._do_train_job(sentences, pairwise, alpha, (work, neu1)) progress_queue.put( (len(sentences), tally, raw_tally)) # report back progress jobs_processed += 1 logger.debug("worker exiting, processed %i jobs", jobs_processed) def job_producer(): """Fill jobs queue using the input `sentences` iterator.""" job_batch, batch_size = [], 0 pushed_words, pushed_examples = 0, 0 next_alpha = self.alpha if next_alpha > self.min_alpha_yet_reached: logger.warn("Effective 'alpha' higher than previous training cycles") self.min_alpha_yet_reached = next_alpha job_no = 0 for sent_idx, sentence in enumerate(sentences): sentence_length = self._raw_word_count([sentence]) # can we fit this sentence into the existing job batch? 
if batch_size + sentence_length <= self.batch_words: # yes => add it to the current job job_batch.append(sentence) batch_size += sentence_length else: # no => submit the existing job pair_idx = list( numpy.random.choice( range(len(self.pairwise_constraints)), int(batch_size * 0.2))) pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx] logger.debug( "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f", job_no, batch_size, len(job_batch), len(pairwise_samples), next_alpha) job_no += 1 job_queue.put((job_batch, pairwise_samples, next_alpha)) # update the learning rate for the next job if self.min_alpha < next_alpha: if total_examples: # examples-based decay pushed_examples += len(job_batch) progress = 1.0 * pushed_examples / total_examples else: # words-based decay pushed_words += self._raw_word_count(job_batch) progress = 1.0 * pushed_words / total_words next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress next_alpha = max(self.min_alpha, next_alpha) # add the sentence that didn't fit as the first item of a new job job_batch, batch_size = [sentence], sentence_length # add the last job too (may be significantly smaller than batch_words) if job_batch: logger.debug( "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f", job_no, batch_size, len(job_batch), len(self.pairwise_constraints), next_alpha) job_no += 1 job_queue.put((job_batch, self.pairwise_constraints, next_alpha)) if job_no == 0 and self.train_count == 0: logger.warning( "train() called with an empty iterator (if not intended, " "be sure to provide a corpus that offers restartable " "iteration = an iterable).") # give the workers heads up that they can finish -- no more work! for _ in xrange(self.workers): job_queue.put(None) logger.debug("job loop exiting, total %i jobs", job_no) # buffer ahead only a limited number of jobs.. 
this is the reason we can't simply use ThreadPool :( job_queue = Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) workers = [ threading.Thread(target=worker_loop) for _ in xrange(self.workers) ] unfinished_worker_count = len(workers) workers.append(threading.Thread(target=job_producer)) for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() example_count, trained_word_count, raw_word_count = 0, 0, word_count start, next_report = default_timer() - 0.00001, 1.0 while unfinished_worker_count > 0: report = progress_queue.get() # blocks if workers too slow if report is None: # a thread reporting that it finished unfinished_worker_count -= 1 logger.info( "worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) continue examples, trained_words, raw_words = report job_tally += 1 # update progress stats example_count += examples trained_word_count += trained_words # only words in vocab & sampled raw_word_count += raw_words # log progress once every report_delay seconds elapsed = default_timer() - start if elapsed >= next_report: if total_examples: # examples-based progress % logger.info( "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * example_count / total_examples, trained_word_count / elapsed, utils.qsize(job_queue), utils.qsize(progress_queue)) else: # words-based progress % logger.info( "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", 100.0 * raw_word_count / total_words, trained_word_count / elapsed, utils.qsize(job_queue), utils.qsize(progress_queue)) next_report = elapsed + report_delay # all done; report the final stats elapsed = default_timer() - start logger.info( "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed) if job_tally < 10 * self.workers: logger.warn( "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" ) # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: logger.warn( "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) if total_words and total_words != raw_word_count: logger.warn( "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed self.clear_sims() return trained_word_count
def main(): """Parse command line options/arguments and execute.""" try: arg_names = [ "help", "version", "quick", "strict", "debug", "stop-tag=" ] opts, args = getopt.getopt(sys.argv[1:], "hvqsdct:v", arg_names) except getopt.GetoptError: usage(2) detailed = True stop_tag = DEFAULT_STOP_TAG debug = False strict = False color = False for option, arg in opts: if option in ("-h", "--help"): usage(0) if option in ("-v", "--version"): show_version() if option in ("-q", "--quick"): detailed = False if option in ("-t", "--stop-tag"): stop_tag = arg if option in ("-s", "--strict"): strict = True if option in ("-d", "--debug"): debug = True if option in ("-c", "--color"): color = True if not args: usage(2) exif_log.setup_logger(debug, color) # output info for each file for filename in args: file_start = timeit.default_timer() try: img_file = open(str(filename), 'rb') except IOError: logger.error("'%s' is unreadable", filename) continue logger.info("Opening: %s", filename) tag_start = timeit.default_timer() # get the tags data = process_file(img_file, stop_tag=stop_tag, details=detailed, strict=strict, debug=debug) tag_stop = timeit.default_timer() if not data: logger.warning("No EXIF information found\n") continue if 'JPEGThumbnail' in data: logger.info('File has JPEG thumbnail') del data['JPEGThumbnail'] if 'TIFFThumbnail' in data: logger.info('File has TIFF thumbnail') del data['TIFFThumbnail'] tag_keys = list(data.keys()) tag_keys.sort() for i in tag_keys: try: logger.info('%s (%s): %s', i, FIELD_TYPES[data[i].field_type][2], data[i].printable) except: logger.error("%s : %s", i, str(data[i])) file_stop = timeit.default_timer() logger.debug("Tags processed in %s seconds", tag_stop - tag_start) logger.debug("File processed in %s seconds", file_stop - file_start) print("")
ds_model = ds_model.sel(nregions=cR)

# # Take mean of ensemble
# ds_model = ds_model.mean(dim='ensemble')

# Get model plotting specs
cc = E.model_color[cmod]
cl = E.model_linestyle[cmod]

# Plot Model
if i == 0:  # Control only one initialization label in legend
    no_init_label = False
else:
    no_init_label = True

import timeit
start_time = timeit.default_timer()

ice_plot.plot_reforecast(ds=ds_model, axin=ax1,
                         labelin=E.model[cmod]['model_label'],
                         color=cc, marker=None,
                         linestyle=cl,
                         no_init_label=no_init_label)
print((timeit.default_timer() - start_time), ' seconds.')

# Memory clean up
ds_model = None

cxlims = ax1.get_xlim()

# add obs and climotrend
if (cR == 99):
def transposonmapper(bamfile=bam_arg, gfffile=None, essentialfiles=None, genenamesfile=None): ''' This function is created for analysis of SATAY data using the species Saccharomyces Cerevisiae. It outputs the following files that store information regarding the location of all insertions: - .bed-file: Includes all individual basepair locations of the whole genome where at least one transposon has been mapped and the number of insertions for each locations (the number of reads) according to the Browser Extensible Data (bed) format. A distinction is made between reads that had a different reading orientation during sequencing. The number of reads are stored using the equation #reads*20+100 (e.g. 2 reads is stored as 140). - .wig-file: Includes all individual basepair locations of the whole genome where at least one transposon has been mapped and the number of insertions for each locations (the number of reads) according to the Wiggle (wig) format. In this file no distinction is made between reads that had a different reading orientation during sequencing. The number of reads are stored as the absolute count. - _pergene.txt-file: Includes all genes (currently 6600) with the total number of insertions and number of reads within the genomic region of the gene. - _peressential.txt-file: Includes all annotated essential genes (currently 1186) with the total number of insertions and number of reads within the genomic region of the gene. - _pergene_insertions.txt-file: Includes all genes with their genomic location (i.e. chromosome number, start and end position) and the locations of all insertions within the gene location. It also include the number number of reads per insertions. - _peressential_insertions.txt-file: Includes all essential genes with their genomic location (i.e. chromosome number, start and end position) and the locations of all insertions within the gene location. It also include the number number of reads per insertions. (note that in the latter two files, the genomic locations are continous, for example chromosome II does not start at 0, but at 'length chromosome I + 1' etc.). The output files are saved at the location of the input file using the same name as the input file, but with the corresponding extension. The function assumes that the reads are already aligned to a reference genome. The input data should be a .bam-file and the location where the .bam-file is stored should also contain an index file (.bam.bai-file, which for example can be created using sambamba). This function takes the following inputs: - bamfile [required]: Path to the bamfile. This location should also contain the .bam.bai index file (does not need to be input in this function). - gfffile [optional]: Path to a .gff-file including all gene information (e.g. downloaded from SGD). Default file is 'Saccharomyces_cerevisiae.R64-1-1.99.gff3'. - essentialfiles [optional]: Path to a .txt file containing a list all essential genes. Every line should consist of a single essential gene and the file should have one header line. Ideally this file is created using 'Create_EssentialGenes_list.py'. Default file is 'Cerevisiae_AllEssentialGenes_List.txt'. - genenamesfile [optional]: Path to text file that includes aliases for all genes. Default file is 'Yeast_Protein_Names.txt'. 
When the arguments for the optional files are not given, the files are used that are stored at the following location: "path_current_pythonscript/../data_files" The function uses the pysam package for handling bam files (see pysam.readthedocs.io/en/latest/index.html) and therefore this function only runs on Linux systems with SAMTools installed. ''' #%% LOADING BAM FILE if bamfile is None: path = os.path.join('/home', 'gregoryvanbeek', 'Documents', 'data_processing') filename = 'SRR062634.filt_trimmed.sorted.bam' bamfile = os.path.join(path, filename) else: filename = os.path.basename(bamfile) path = bamfile.replace(filename, '') if os.path.isfile(bamfile): print('Running: ', bamfile) else: raise ValueError('Bam file not found at: ', bamfile) #%% LOADING ADDITIONAL FILES files_path = os.path.join(dirname, '..', '..', 'data_files') #LOADING GFF-FILE if gfffile is None: gfffile = os.path.join(files_path, 'Saccharomyces_cerevisiae.R64-1-1.99.gff3') if not os.path.isfile(gfffile): raise ValueError('Path to GFF-file does not exist.') #LOADING TEXT FILES WITH ESSENTIAL GENES if essentialfiles is None: essentialfiles = os.path.join(files_path, 'Cerevisiae_AllEssentialGenes_List.txt') if not os.path.isfile(essentialfiles): raise ValueError('Following path does not exist: ' + essentialfiles) del essentialfiles #LOADING TEXT FILE WITH GENE NAME ALIASES if genenamesfile is None: genenamesfile = os.path.join(files_path, 'Yeast_Protein_Names.txt') if not os.path.isfile(genenamesfile): raise ValueError('Following path does not exist: ' + genenamesfile) #%% READ BAM FILE bam = pysam.AlignmentFile(bamfile, 'rb') #open bam formatted file for reading #%% GET NAMES OF ALL CHROMOSOMES AS STORED IN THE BAM FILE ref_tid_dict = {} # 'I' | 0, 'II' | 1, ... ref_name_list = [] # 'I', 'II', ... for i in range( bam.nreferences ): #if bam.nreferences does not work, use range(17) #16 chromosomes and the mitochondrial chromosome ref_name = bam.get_reference_name(i) ref_tid_dict[ref_name] = bam.get_tid(ref_name) ref_name_list.append(ref_name) del (ref_name, i) #%% CONVERT CHROMOSOME NAMES IN DATA FILE TO ROMAN NUMERALS ref_romannums = chromosomename_roman_to_arabic()[0] ref_tid_roman_dict = {} for key, val in ref_tid_dict.items(): ref_tid_roman_dict[ref_romannums[int(val) + 1]] = key del (key, val, ref_romannums) #%% GET SEQUENCE LENGTHS OF ALL CHROMOSOMES chr_length_dict = {} # 'I' | 230218, 'II' | 813184, ... chr_summedlength_dict = {} # 'I' | 0, 'II' | 230218, 'III' | 1043402, ... 
ref_summedlength = 0 for key in ref_tid_dict: ref_length = bam.get_reference_length(key) chr_length_dict[key] = ref_length chr_summedlength_dict[key] = ref_summedlength ref_summedlength += ref_length del (key, ref_length, ref_summedlength) #%% GET NUMBER OF MAPPED, UNMAPPED AND TOTAL AMOUNT OF READS PER CHROMOSOME # total_reads = bam.mapped stats = bam.get_index_statistics() chr_mappedreads_dict = {} # 'I' | [mapped, unmapped, total reads] for stat in stats: chr_mappedreads_dict[stat[0]] = [stat[1], stat[2], stat[3]] if stat[2] != 0: warnings.warn('Unmapped reads found in chromosome ' + stat[0]) del (stat, stats) #%% GET ALL READS WITHIN A SPECIFIED GENOMIC REGION tnnumber_dict = {} ll = 0 #Number of unique insertions in entire genome for kk in ref_name_list: timer_start = timeit.default_timer() read_counter = 0 N_reads_kk = chr_mappedreads_dict[kk][2] start_array = np.empty(shape=(N_reads_kk), dtype=int) flag_array = np.empty(shape=(N_reads_kk), dtype=int) readlength_array = np.empty(shape=(N_reads_kk), dtype=int) #RETREIVING ALL THE READS FROM THE CURRENT CHROMOSOME. print('Getting reads for chromosome %s ...' % kk) for reads in bam.fetch(kk, 0, chr_length_dict[kk], until_eof=True): read = str(reads).split('\t') start_array[read_counter] = int(read[3]) + 1 flag_array[read_counter] = int(read[1]) readlength_array[read_counter] = int(len(read[9])) read_counter += 1 #CORRECT STARTING POSITION FOR READS WITH REVERSED ORIENTATION flag0coor_array = np.where( flag_array == 0) #coordinates reads 5' -> 3' flag16coor_array = np.where( flag_array == 16) # coordinates reads 3' -> 5' startdirect_array = start_array[flag0coor_array] flagdirect_array = flag_array[flag0coor_array] startindirect_array = start_array[flag16coor_array] + readlength_array[ flag16coor_array] flagindirect_array = flag_array[flag16coor_array] start2_array = np.concatenate((startdirect_array, startindirect_array), axis=0) flag2_array = np.concatenate((flagdirect_array, flagindirect_array), axis=0) del (flag0coor_array, flag16coor_array, startdirect_array, flagdirect_array, startindirect_array, flagindirect_array) start2_sortindices = start2_array.argsort( kind='mergesort') #use mergesort for stable sorting start2_array = start2_array[start2_sortindices] flag2_array = flag2_array[start2_sortindices] del start2_sortindices #CREATE ARRAY OF START POSITION AND FLAGS OF ALL READS IN GENOME ref_tid_kk = int(ref_tid_dict[kk] + 1) if ll == 0: tncoordinates_array = np.array([]) mm = 0 # Number of unique reads per insertion jj = 1 # Number of unique reads in current chromosome (Number of transposons in current chromosome) for ii in range(1, len(start2_array)): if abs( start2_array[ii] - start2_array[ii - 1] ) <= 2 and flag2_array[ii] == flag2_array[ ii - 1]: #If two subsequent reads are within two basepairs and have the same orientation, add them together. 
mm += 1 else: avg_start_pos = abs( round(np.mean(start2_array[ii - mm - 1:ii]))) if tncoordinates_array.size == 0: #include first read tncoordinates_array = np.array([ ref_tid_kk, int(avg_start_pos), int(flag2_array[ii - 1]) ]) readnumb_list = [mm + 1] else: tncoordinates_array = np.vstack((tncoordinates_array, [ ref_tid_kk, int(avg_start_pos), int(flag2_array[ii - 1]) ])) readnumb_list.append(mm + 1) mm = 0 jj += 1 ll += 1 if ii == len(start2_array) - 1: #include last read avg_start_pos = abs( round(np.mean(start2_array[ii - mm - 1:ii]))) tncoordinates_array = np.vstack((tncoordinates_array, [ ref_tid_kk, int(avg_start_pos), int(flag2_array[ii - 1]) ])) readnumb_list.append(mm + 1) tnnumber_dict[kk] = jj del (jj, start_array, flag_array, readlength_array, flag2_array, start2_array, ref_tid_kk) timer_end = timeit.default_timer() print('Chromosome %s completed in %.3f seconds' % (kk, (timer_end - timer_start))) print('') readnumb_array = np.array(readnumb_list) del readnumb_list tncoordinatescopy_array = np.array(tncoordinates_array, copy=True) #%% GET LIST OF ALL GENES AND ALL ESSENTIAL GENES print('Getting coordinates of all genes ...') # GET POSITION GENES gff_path = os.path.join(files_path, 'Saccharomyces_cerevisiae.R64-1-1.99.gff3') genecoordinates_dict = gene_position( gff_path) #'YAL069W' | ['I', 335, 649], ... # GET ALL ANNOTATED ESSENTIAL GENES essential_path = os.path.join(files_path, 'Cerevisiae_AllEssentialGenes_List.txt') essentialcoordinates_dict = {} with open(essential_path, 'r') as f: genes = f.readlines()[1:] for gene in genes: name = gene.strip('\n') essentialcoordinates_dict[name] = genecoordinates_dict.get( name).copy() # GET ALIASES OF ALL GENES names_path = os.path.join(files_path, 'Yeast_Protein_Names.txt') aliases_designation_dict = gene_aliases(names_path)[ 0] #'YMR056C' \ ['AAC1'], ... del (gff_path, gene, genes, name, essential_path) #%% CONCATENATE ALL CHROMOSOMES #FOR EACH INSERTION LOCATION, ADD THE LENGTH OF ALL PREVIOUS CHROMOSOMES. ll = 0 for ii in range(1, len(ref_name_list)): ll += chr_length_dict[ref_name_list[ii - 1]] aa = np.where(tncoordinatescopy_array[:, 0] == ii + 1) tncoordinatescopy_array[aa, 1] = tncoordinatescopy_array[aa, 1] + ll #FOR EACH GENE LOCATION, ADD THE LENGTH OF ALL PREVIOUS CHROMOSOMES. for key in genecoordinates_dict: gene_chrom = ref_tid_roman_dict.get(genecoordinates_dict.get(key)[0]) genecoordinates_dict[key][1] = genecoordinates_dict.get( key)[1] + chr_summedlength_dict.get(gene_chrom) genecoordinates_dict[key][2] = genecoordinates_dict.get( key)[2] + chr_summedlength_dict.get(gene_chrom) #FOR EACH ESSENTIAL GENE LOCATION, ADD THE LENGTH OF ALL PREVIOUS CHROMOSOMES. 
for key in essentialcoordinates_dict: gene_chrom = ref_tid_roman_dict.get( essentialcoordinates_dict.get(key)[0]) essentialcoordinates_dict[key][1] = essentialcoordinates_dict.get( key)[1] + chr_summedlength_dict.get(gene_chrom) essentialcoordinates_dict[key][2] = essentialcoordinates_dict.get( key)[2] + chr_summedlength_dict.get(gene_chrom) del (ii, ll, aa, key, gene_chrom) #%% GET NUMBER OF TRANSPOSONS AND READS PER GENE print('Get number of insertions and reads per gene ...') #ALL GENES tnpergene_dict = {} readpergene_dict = {} tncoordinates_pergene_dict = {} # readpergenecrude_dict = {} for gene in genecoordinates_dict: xx = np.where( np.logical_and( tncoordinatescopy_array[:, 1] >= genecoordinates_dict.get(gene)[1], tncoordinatescopy_array[:, 1] <= genecoordinates_dict.get(gene) [2])) #get all insertions within range of current gene tnpergene_dict[gene] = np.size(xx) readpergene_dict[gene] = sum(readnumb_array[xx]) - max( readnumb_array[xx], default=0) #REMOVE LARGEST VALUE TO REDUCE NOISE # readpergenecrude_dict[gene] = sum(readnumb_array[xx]) if np.size(xx) > 0: tncoordinates_pergene_dict[gene] = [ genecoordinates_dict.get(gene)[0], genecoordinates_dict.get(gene)[1], genecoordinates_dict.get(gene)[2], list(tncoordinatescopy_array[xx[0][0]:xx[0][-1] + 1, 1]), list(readnumb_array[xx]) ] else: tncoordinates_pergene_dict[gene] = [ genecoordinates_dict.get(gene)[0], genecoordinates_dict.get(gene)[1], genecoordinates_dict.get(gene)[2], [], [] ] #ONLY ESSENTIAL GENES tnperessential_dict = {} readperessential_dict = {} tncoordinates_peressential_dict = {} # readperessentialcrude_dict = {} for gene in essentialcoordinates_dict: xx = np.where( np.logical_and( tncoordinatescopy_array[:, 1] >= essentialcoordinates_dict.get(gene)[1], tncoordinatescopy_array[:, 1] <= essentialcoordinates_dict.get(gene)[2])) tnperessential_dict[gene] = np.size(xx) readperessential_dict[gene] = sum(readnumb_array[xx]) - max( readnumb_array[xx], default=0) # readperessentialcrude_dict[gene] = sum(readnumb_array[xx]) if np.size(xx) > 0: tncoordinates_peressential_dict[gene] = [ essentialcoordinates_dict.get(gene)[0], essentialcoordinates_dict.get(gene)[1], essentialcoordinates_dict.get(gene)[2], list(tncoordinatescopy_array[xx[0][0]:xx[0][-1] + 1, 1]), list(readnumb_array[xx]) ] else: tncoordinates_peressential_dict[gene] = [ essentialcoordinates_dict.get(gene)[0], essentialcoordinates_dict.get(gene)[1], essentialcoordinates_dict.get(gene)[2], [], [] ] del (xx, gene) #%% CREATE BED FILE bedfile = bamfile + '.bed' print('Writing bed file at: ', bedfile) print('') with open(bedfile, 'w') as f: f.write('track name=' + filename + ' useScore=1\n') coordinates_counter = 0 for tn in tncoordinates_array: refname = [ key for key, val in ref_tid_dict.items() if val == tn[0] - 1 ][0] if refname == 'Mito': refname = 'M' f.write('chr' + refname + ' ' + str(tn[1]) + ' ' + str(tn[1] + 1) + ' . 
' + str(100 + readnumb_array[coordinates_counter] * 20) + '\n') coordinates_counter += 1 del (bedfile, coordinates_counter, refname) #%% CREATE TEXT FILE WITH TRANSPOSONS AND READS PER GENE pergenefile = bamfile + '_pergene.txt' print('Writing pergene.txt file at: ', pergenefile) print('') with open(pergenefile, 'w') as f: f.write( 'Gene name\tNumber of transposons per gene\tNumber of reads per gene\n' ) for gene in tnpergene_dict: tnpergene = tnpergene_dict[gene] readpergene = readpergene_dict[gene] if gene in aliases_designation_dict: gene_alias = aliases_designation_dict.get(gene)[0] else: gene_alias = gene f.write(gene_alias + '\t' + str(tnpergene) + '\t' + str(readpergene) + '\n') del (pergenefile, gene, gene_alias, tnpergene, readpergene) #%% CREATE TEXT FILE TRANSPOSONS AND READS PER ESSENTIAL GENE peressentialfile = bamfile + '_peressential.txt' print('Writing peressential.txt file at: ', peressentialfile) print('') with open(peressentialfile, 'w') as f: f.write( 'Gene name\tNumber of transposons per gene\tNumber of reads per gene\n' ) for essential in tnperessential_dict: tnperessential = tnperessential_dict[essential] readperessential = readperessential_dict[essential] if essential in aliases_designation_dict: essential_alias = aliases_designation_dict.get(essential)[0] else: essential_alias = essential f.write(essential_alias + '\t' + str(tnperessential) + '\t' + str(readperessential) + '\n') del (peressentialfile, essential, essential_alias, tnperessential, readperessential) #%% CREATE TEXT FILE WITH LOCATION OF INSERTIONS AND READS PER GENE pergeneinsertionsfile = bamfile + '_pergene_insertions.txt' print('Witing pergene_insertions.txt file at: ', pergeneinsertionsfile) print('') with open(pergeneinsertionsfile, 'w') as f: f.write( 'Gene name\tChromosome\tStart location\tEnd location\tInsertion locations\tReads per insertion location\n' ) for gene in tncoordinates_pergene_dict: gene_chrom = ref_tid_roman_dict.get( genecoordinates_dict.get(gene)[0]) tncoordinates = [ ins - chr_summedlength_dict.get(gene_chrom) for ins in tncoordinates_pergene_dict[gene][3] ] if gene in aliases_designation_dict: gene_alias = aliases_designation_dict.get(gene)[0] else: gene_alias = gene f.write(gene_alias + '\t' + str(tncoordinates_pergene_dict[gene][0]) + '\t' + str(tncoordinates_pergene_dict[gene][1] - chr_summedlength_dict.get(gene_chrom)) + '\t' + str(tncoordinates_pergene_dict[gene][2] - chr_summedlength_dict.get(gene_chrom)) + '\t' + str(tncoordinates) + '\t' + str(tncoordinates_pergene_dict[gene][4]) + '\n') del (gene, gene_chrom, tncoordinates, gene_alias, pergeneinsertionsfile) #%% CREATE TEXT FILE WITH LOCATION OF INSERTIONS AND READS PER ESSENTIAL GENE peressentialinsertionsfile = bamfile + '_peressential_insertions.txt' print('Writing peressential_insertions.txt file at: ', peressentialinsertionsfile) print('') with open(peressentialinsertionsfile, 'w') as f: f.write( 'Essential gene name\tChromosome\tStart location\tEnd location\tInsertion locations\tReads per insertion location\n' ) for essential in tncoordinates_peressential_dict: gene_chrom = ref_tid_roman_dict.get( genecoordinates_dict.get(essential)[0]) tncoordinates = [ ins - chr_summedlength_dict.get(gene_chrom) for ins in tncoordinates_peressential_dict[essential][3] ] if essential in aliases_designation_dict: essential_alias = aliases_designation_dict.get(essential)[0] else: essential_alias = essential f.write(essential_alias + '\t' + str(tncoordinates_peressential_dict[essential][0]) + '\t' + 
str(tncoordinates_peressential_dict[essential][1] - chr_summedlength_dict.get(gene_chrom)) + '\t' + str(tncoordinates_peressential_dict[essential][2] - chr_summedlength_dict.get(gene_chrom)) + '\t' + str(tncoordinates) + '\t' + str(tncoordinates_peressential_dict[essential][4]) + '\n') del (essential, gene_chrom, tncoordinates, essential_alias, peressentialinsertionsfile) #%% ADD INSERTIONS AT SAME LOCATION BUT WITH DIFFERENT ORIENTATIONS TOGETHER (FOR STORING IN WIG-FILE) wigfile = bamfile + '.wig' print('Writing wig file at: ', wigfile) print('') readnumbwig_array = readnumb_array.copy() unique_index_array = np.array([], dtype=int) #=cc N_uniques_perchr_list = [] ll = 0 for kk in ref_name_list: index = np.where(tncoordinates_array[:, 0] == int( ref_tid_dict[kk] + 1)) #get indices for current chromosome. unique_index = np.unique( tncoordinates_array[index][:, 1], return_index=True )[1] #get all insertion locations (in tncoordinates, all rows, column 1) unique_index_array = np.append(unique_index_array, (unique_index + ll), axis=0) ll += np.count_nonzero(tncoordinates_array[:, 0] == int(ref_tid_dict[kk] + 1)) N_uniques_perchr_list.append( ll) #total amount unique indices found untill current chromosome del (ll, kk, unique_index) duplicate_list = [] #=dd ll = 0 index_last_unique_previous_chromosome = 0 for ii in N_uniques_perchr_list: index_last_unique = np.where(unique_index_array <= ii)[0][-1] for jj in range(ll, ii): if int(jj) not in unique_index_array[ index_last_unique_previous_chromosome:index_last_unique]: duplicate_list.append(jj) index_last_unique_previous_chromosome = index_last_unique ll = ii #SUM READNUMB VALUES AT INDEX IN DUPLICATE_LIST AND DUPLICATE_LIST-1 for ii in duplicate_list: readnumbwig_array[ii - 1] = readnumbwig_array[ii - 1] + readnumbwig_array[ii] tncoordinateswig_duplicatesremoved_array = np.delete(tncoordinates_array, duplicate_list, axis=0) readnumbwig_duplicatesremoved_array = np.delete(readnumbwig_array, duplicate_list, axis=0) del (ll, ii, jj, N_uniques_perchr_list, index_last_unique, duplicate_list, readnumbwig_array) #%% CREATING WIG FILE with open(wigfile, 'w') as f: f.write('track type=wiggle_0 ,maxheightPixels=60 name=' + filename + '\n') for kk in ref_name_list: f.write('VariableStep chrom=chr' + kk + '\n') index = np.where(tncoordinateswig_duplicatesremoved_array[:, 0] == int(ref_tid_dict[kk] + 1)) #get indices for current chromosome. for ii in index[0]: f.write( str(tncoordinateswig_duplicatesremoved_array[ii][1]) + ' ' + str(readnumbwig_duplicatesremoved_array[ii]) + '\n') del (wigfile, kk, ii, index)
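# Usage sketch (illustrative, not part of the original module): transposonmapper only needs the
# path to an indexed .bam file; all other inputs fall back to the defaults in ../data_files.
# The example path below is an assumption.
if __name__ == '__main__':
    example_bam = '/data/satay/SRR062634.filt_trimmed.sorted.bam'  # hypothetical path; needs a .bam.bai next to it
    transposonmapper(bamfile=example_bam)
    # Written next to the bam: .bed, .wig, _pergene.txt, _peressential.txt,
    # _pergene_insertions.txt and _peressential_insertions.txt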
# early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0:
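# The parameter comments above describe patience-based early stopping; the snippet breaks off
# before the validation branch, so the following is a hedged, self-contained sketch of that rule
# (validate() is a stand-in for computing the validation loss; names mirror the snippet).
import timeit

def train_with_patience(validate, n_epochs, n_train_batches,
                        patience=10000, patience_increase=2, improvement_threshold=0.995):
    validation_frequency = min(n_train_batches, patience // 2)
    best_validation_loss = float('inf')
    start_time = timeit.default_timer()
    epoch, done_looping = 0, False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        for minibatch_index in range(n_train_batches):
            iteration = (epoch - 1) * n_train_batches + minibatch_index
            if (iteration + 1) % validation_frequency == 0:
                this_loss = validate()
                if this_loss < best_validation_loss:
                    # only a *significant* improvement buys extra patience
                    if this_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iteration * patience_increase)
                    best_validation_loss = this_loss
            if patience <= iteration:
                done_looping = True
                break
    return best_validation_loss, timeit.default_timer() - start_time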
def main(): time_start = timeit.default_timer() #global PATH df = pd.read_csv("/home/dummy/try/SesOutFinalUS.tsv", dtype=object, header=None, delimiter="\t", error_bad_lines=False) #df1 = df.head(200) #df1.to_csv(PATH + "/outputs/sampleOut1.tsv", index = False, header= None, sep='\t') print 'file read' #['U','SES', 'DUR', 'SC','C','URL','D','T'] df.columns = [0, 1, 2, 8, 9, 3, 4, 5] users = {} curr_user = df.iloc[0, 0] curr_session = df.iloc[0, 1] url_list = [] session_list = [] tier1 = 0 tier2 = 0 tier3 = 0 tier4 = 0 url_book = 'https://secure.celebritycruises.com/booking/paymentConfirmation' url_held = 'https://secure.celebritycruises.com/booking/courtesyHoldConfirmation' visits = [[0 for i in range(5)] for j in range(4)] #lists of lists duration_list = [[0 for i in range(5)] for j in range(4)] day_diff = [[0 for i in range(4)] for j in range(4)] div_dur = [[0 for i in range(5)] for j in range(4)] duration = get_sec(df.loc[0, 2]) temp = True count = 0 c = 0 session_f = [0, 0, 0, 0] sess_duration = [] for i in range(1, len(df.index)): if df[0][i] == curr_user: if df[3][i] not in url_list: url_list.append(df[3][i]) else: count += 1 if url_book in url_list or url_held in url_list: users[curr_user] = Node(3, df[1][i - 1]) #2 is index of tier4 tiers = 3 else: if int(df[1][i - 1]) == 1 and len(set(url_list)) in [0, 1]: tiers = 0 c += 1 elif int(df[1][i - 1]) == 1 and len(set(url_list)) >= 2: tiers = 1 #index for tier2 else: tiers = 2 #index for tier3 users[curr_user] = Node(tiers, df[1][i - 1]) session_f[tiers] += int(df[1][i - 1]) curr_user = df[0][i] del url_list[:] url_list.append(df[3][i]) print 'Total number of Users are ' + str(count) fivePlus = [0, 0, 0, 0] foTofi = [0, 0, 0, 0] for node in users.values(): if node.sessions >= 5: index = 4 fivePlus[node.tier] += (node.sessions - 4) foTofi[node.tier] += 1 else: index = node.sessions - 1 if node.tier == 0: tier1 += 1 visits[0][index] += 1 elif node.tier == 1: tier2 += 1 visits[1][index] += 1 elif node.tier == 2: tier3 += 1 visits[2][index] += 1 else: tier4 += 1 visits[3][index] += 1 print fivePlus for i in range(4): for j in range(5): div_dur[i][j] = doSum(i, j, visits) curr_session = df[1][0] temp = int(curr_session) - 1 curr_user = df[0][0] usr_page = df[3][0] r = users[curr_user].tier duration_list[r][int(curr_session)] += get_sec(df[2][0]) for index, user in enumerate(df[0][1:len(df.index)], start=1): if user != curr_user: duration_list[users[curr_user].tier][temp] += get_sec(df[2][index - 1]) curr_user = user curr_session = df[1][index] temp = getTemp(int(curr_session)) #usr_page = df[3][index] elif df[1][index] != curr_session: if temp != 4: sec = get_sec(df[5][index].strip(' GMT')) - get_sec( df[5][index - 1].strip(' GMT')) l1 = map(int, df.iloc[index, 6].split('-')) l2 = map(int, df.iloc[index - 1, 6].split('-')) d1 = date(l1[0], l1[1], l1[2]) d2 = date(l2[0], l2[1], l2[2]) d = (d1 - d2).days if sec < 0: d = d - 1 day_diff[users[curr_user].tier][temp] += d duration_list[users[curr_user].tier][temp] += get_sec(df[2][index - 1]) curr_session = df[1][index] temp = getTemp(int(curr_session)) else: pass for i in range(4): for j in range(5): if div_dur[i][j] == 0: duration_list[i][j] = 0 else: if j == 4 and fivePlus[i] != 0: temp = fivePlus[i] else: temp = div_dur[i][j] duration_list[i][j] = str( datetime.timedelta(seconds=int(duration_list[i][j] / temp))) for i in range(4): for j in range(4): if div_dur[i][j] == 0: day_diff[i][j] = 0 else: if j == 3 and foTofi[i] != 0: temp = foTofi[i] else: temp = div_dur[i][j] day_diff[i][j] = 
day_diff[i][j] / temp print tier1, tier2, tier3, tier4 print '\n' print visits[0] print visits[1] print visits[2] print visits[3] print '\n' print div_dur[0] print div_dur[1] print div_dur[2] print div_dur[3] print '\n' print session_f[0] print session_f[1] print session_f[2] print session_f[3] print '\n' print duration_list[0] print duration_list[1] print duration_list[2] print duration_list[3] print '\n' print day_diff[0] print day_diff[1] print day_diff[2] print day_diff[3] print '\n' print tier1 * 100 / (tier1 + tier2 + tier3 + tier4) time_stop = timeit.default_timer() print time_stop - time_start
def tearDown(self): '''after each test function''' pass def do(self, func): '''todo''' self.assertEqual(func("hello", "ll"), 2) self.assertEqual(func("aaaaa", "bba"), -1) self.assertEqual(func("aaaac", "aac"), 2) pass def test_func(self): self.do(s.strStr) self.assertEqual(s.make_prefix("abc"), [-1, 0, 0]) self.assertEqual(s.make_prefix("aac"), [-1, 0, 1]) self.assertEqual(s.make_prefix("ababc"), [-1, 0, 0, 1, 2]) if __name__ == "__main__": count = 100000 t = "ababcabcde" p = "abcd" utils.print_func_run_time(count, s.strStr, t = t, p = p) b = timeit.default_timer() for i in range(count): t.index(p) print(timeit.default_timer() - b) unittest.main()
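# The manual default_timer() loop above can also be written with timeit.timeit, which runs the
# callable `count` times and returns the total elapsed seconds (an equivalent measurement, shown
# here as a hedged alternative).
import timeit

count = 100000
t, p = "ababcabcde", "abcd"
print(timeit.timeit(lambda: t.index(p), number=count))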
_save_log_ = False if _save_log_: from datetime import datetime from std_logger import StdFileLoggerCtrl # save all console activity to out_log_file out_log_file = os.path.join( r'P:\Synchronize\python_script_logs\\%s_log_%s.log' % (os.path.basename(__file__), datetime.now().strftime('%Y%m%d%H%M%S'))) log_link = StdFileLoggerCtrl(out_log_file) print('#### Started on %s ####\n' % time.asctime()) START = timeit.default_timer() #========================================================================== # When in post_mortem: # 1. "where" to show the stack # 2. "up" move the stack up to an older frame # 3. "down" move the stack down to a newer frame # 4. "interact" start an interactive interpreter #========================================================================== if DEBUG_FLAG: try: main() except: import pdb
performance = get_current_performance( np.zeros(int(num_examples / update_interval)), 0) # set firing rates to zero initially for name in input_population_names: input_groups[name + 'e'].rate = 0 # initialize network j = 0 num_retries = 0 b.run(0) weights_name = 'XeAe' + '_' + ending # start recording time start_time = timeit.default_timer() while j < num_examples: # fetched rates depend on training / test phase, and whether we use the # testing dataset for the test phase if test_mode: if use_testing_set: rates = testing['x'][j % 10000, :, :] / 8. * input_intensity else: rates = training['x'][j % 60000, :, :] / 8. * input_intensity else: # ensure weights don't grow without bound normalize_weights() # get the firing rates of the next input example rates = training['x'][j % 60000, :, :] / 8. * input_intensity
from utils import * cv2.setNumThreads(0) cv2.ocl.setUseOpenCL(False) test_dir = 'test/images' pred_folder = 'pred154_loc' models_folder = 'weights' all_files = np.array(get_files()) val_idxs = train_test_split(np.arange(len(all_files)).astype(int), test_size=0.1, random_state=0)[1] all_files= all_files[val_idxs] if __name__ == '__main__': t0 = timeit.default_timer() makedirs(pred_folder, exist_ok=True) os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' os.environ["CUDA_VISIBLE_DEVICES"] = sys.argv[1] # cudnn.benchmark = True models = [] for seed in [0]: snap_to_load = 'se154_loc_{}_1_best'.format(seed) model = SeNet154_Unet_Loc().cuda() model = nn.DataParallel(model).cuda() print("=> loading checkpoint '{}'".format(snap_to_load))
x['id6'] = x['id6'].astype('category') small = fread(src_jn_y[0]).to_pandas() small['id4'] = small['id4'].astype('category') medium = fread(src_jn_y[1]).to_pandas() medium['id4'] = medium['id4'].astype('category') medium['id5'] = medium['id5'].astype('category') big = fread(src_jn_y[2]).to_pandas() big['id4'] = big['id4'].astype('category') big['id5'] = big['id5'].astype('category') big['id6'] = big['id6'].astype('category') print(len(x.index), flush=True) print(len(small.index), flush=True) print(len(medium.index), flush=True) print(len(big.index), flush=True) task_init = timeit.default_timer() print("joining...", flush=True) question = "small inner on int" # q1 gc.collect() t_start = timeit.default_timer() ans = x.merge(small, on='id1') print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = [ans['v1'].sum(), ans['v2'].sum()] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect()
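# Every question in this benchmark fragment repeats the same pattern: collect garbage, time the
# join, then time a checksum over the result, and log both. A condensed helper sketching that
# pattern (illustrative only; write_log/make_chk and their arguments stay as in the script):
import gc
import timeit

def timed_question(join, checksum):
    gc.collect()
    t_start = timeit.default_timer()
    ans = join()                              # e.g. lambda: x.merge(small, on='id1')
    t = timeit.default_timer() - t_start      # join time (time_sec)
    t_start = timeit.default_timer()
    chk = checksum(ans)                       # e.g. lambda a: [a['v1'].sum(), a['v2'].sum()]
    chkt = timeit.default_timer() - t_start   # checksum time (chk_time_sec)
    return ans, t, chk, chkt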
def main(arguments,output_filename): ''' Parse user input, query SQL database, generate pandas dataframes, export JSON for D3 and print HTML code ''' ###################################################### ### start the alert div that contains any output generated here ###################################################### algorithm_output_str = '' timing = {} start_all = timeit.default_timer() ###################################################### # generate an input overview table ###################################################### arg_names = ['Genes','Cluster by','Color by','Interaction type',\ 'Minimal number of experiments','Minimal number of publications', 'Minimal number of methods','Method types',\ 'Process','Compartment','Expression','Max. number nodes','Filter condition'] input_dict = { arg_names[i]:arguments[i].replace("_"," ") for i in range(len(arg_names)) } # does not include unique_str and excel_flag input_dict['Expression'] = input_dict['Expression'].replace('G1P','G1(P)') # brackets removed in PHP df_user_input = pd.DataFrame.from_dict(input_dict,orient='index') df_user_input = df_user_input.reindex(index = arg_names) df_user_input.columns = ['user input'] df_user_input_to_print = df_user_input.to_html(classes=['table','table-condensed','table-bordered']) ### process arguments primary_nodes,cluster_by,color_by,int_type,\ min_exp,min_pub,min_methods,method_types,\ process,compartment,expression,\ max_nodes,filter_condition,\ excel_flag,filter_flag,unique_str = arguments # make sure types are correct color_by = color_by.replace('_',' ') cluster_by = cluster_by.replace('_',' ') filter_condition = filter_condition.replace('_',' ') process = process.split(',') method_types = method_types.split(',') method_types = [x.replace('_',' ') for x in method_types] expression = expression.split(',') if 'G1P' in expression: # brackets removed in php ind = expression.index('G1P') expression[ind] = 'G1(P)' process = [x.replace("_"," ") for x in process] primary_nodes_str = primary_nodes if '_' in primary_nodes: primary_nodes = primary_nodes.split('_') else: primary_nodes = [primary_nodes] min_exp = int(min_exp) min_pub = int(min_pub) min_methods = int(min_methods) max_nodes = int(max_nodes) excel_flag = bool(int(excel_flag)) filter_flag = bool(int(filter_flag)) split_types = int_type.split(',') compartment = compartment.replace('_',' ') timing['input'] = timeit.default_timer() - start_all if excel_flag: ###################################################### # WRITE TO EXCEL ###################################################### # THIS HAS TO HAPPEN BEFORE HTML REPLACEMENTS start_excel = timeit.default_timer() write_excel_file(primary_nodes_str+'_'+unique_str) timing['excel'] = timeit.default_timer() - start_excel print(timing) return ###################################################### ### get all interactions related to the input IDs ###################################################### start_initial = timeit.default_timer() database = SCRIPT_DIR+"/data/DB_genes_and_interactions.db" conn = create_connection(database) # get all interactions in which the given genes takes part placeholders = ', '.join('?' for unused in primary_nodes) # '?, ?, ?, ...' 
# The query differs based on whether we need to subselect on the 'type' of interaction if len(split_types) == 3: query = "SELECT source,target FROM interactions WHERE ( (source IN (%s) or target IN (%s)) and num_experiments >= (%s) \ and num_publications >= (%s) and num_methods >= (%s))" % (placeholders,placeholders,min_exp,min_pub,min_methods) cursor = conn.execute(query,primary_nodes+primary_nodes) else: placeholders_type = ', '.join('?' for unused in split_types) query = "SELECT source,target FROM interactions WHERE ( (source IN (%s) or target IN (%s)) AND type IN (%s) \ AND num_experiments >= (%s) and num_publications >= (%s) and num_methods >= (%s))" % (placeholders,placeholders, \ placeholders_type,min_exp,min_pub,min_methods) cursor = conn.execute(query,primary_nodes+primary_nodes+split_types) # construct dataframe of interacting genes: the nodes node_list = list(set([x for y in cursor for x in y])) # get rid of duplicates of which there will be many if len(node_list) == 0: raise ValueError('No interactions matching these conditions.') # get the info from the database for each node to make the 'nodes' dataframe if 'No_data' in expression: query = """SELECT standard_name,systematic_name,name_desc,desc,go_term_1,go_term_2,\ GFP_abundance,GFP_localization,CYCLoPs_Excel_string,CYCLoPs_html,expression_peak_phase,\ expression_peak_time,CYCLoPs_dict FROM genes \ WHERE standard_name in (%s) AND (standard_name in (%s) OR expression_peak_phase in (%s) OR expression_peak_phase is NULL) AND (standard_name in (%s) OR go_term_1 in (%s) OR go_term_2 in (%s))""" \ % (', '.join('?' for _ in node_list), ', '.join('?' for _ in primary_nodes), ', '.join('?' for _ in expression), ', '.join('?' for _ in primary_nodes),', '.join('?' for _ in process),', '.join('?' for _ in process)) else: query = """SELECT standard_name,systematic_name,name_desc,desc,go_term_1,go_term_2,\ GFP_abundance,GFP_localization,CYCLoPs_Excel_string,CYCLoPs_html,expression_peak_phase,\ expression_peak_time,CYCLoPs_dict FROM genes \ WHERE standard_name in (%s) AND (standard_name in (%s) OR expression_peak_phase in (%s)) AND (standard_name in (%s) OR go_term_1 in (%s) OR go_term_2 in (%s))""" \ % (', '.join('?' for _ in node_list), ', '.join('?' for _ in primary_nodes), ', '.join('?' for _ in expression), ', '.join('?' for _ in primary_nodes), ', '.join('?' for _ in process),', '.join('?' for _ in process)) cursor = conn.execute(query,node_list+primary_nodes+expression+primary_nodes+process+process) data = [list(l) for l in cursor] # cursor itself is a generator, this is a list of lists nodes = pd.DataFrame(data,columns=['Standard name','Systematic name','Name description','Description', 'GO term 1','GO term 2','GFP abundance','GFP localization','CYCLoPs_Excel_string', 'CYCLoPs_html','Expression peak phase','Expression peak time','CYCLoPs_dict']) timing['Get node information from database'] = timeit.default_timer() - start_initial ### make actual dictionaries out of CYCLoPs_dict column nodes['CYCLoPs_dict'] = nodes['CYCLoPs_dict'].apply(ast.literal_eval) len_nodes_query = len(nodes) ###################################################### ### BASED ON THE COMPARTMENT FILTER: DROP NODES ###################################################### start_node_drop = timeit.default_timer() if 'GFP:' in compartment: comp_to_check = compartment.replace('GFP:','') print('Prior to compartment filtering:', len(nodes), 'nodes. 
Filtering on', comp_to_check) s = pd.Series([comp_to_check in x for x in nodes['GFP localization'].str.split(', ')]) nodes = nodes[s.values] nodes = nodes.reset_index(drop=True) print('After compartment filtering:', len(nodes), 'nodes.') elif 'CYCLoPs:' in compartment: comp_to_check = compartment.replace('CYCLoPs:','') print('Prior to compartment filtering:', len(nodes), 'nodes. Filtering on', comp_to_check) l_o_l = [[list(nodes.iloc[i]['CYCLoPs_dict'][x].keys()) for x in list(nodes.iloc[i]['CYCLoPs_dict'].keys()) ] for i in range(len(nodes)) ] s = pd.Series([comp_to_check in [v for WT in l_o_l[i] for v in WT] for i in range(len(l_o_l))]) nodes = nodes[s.values] nodes = nodes.reset_index(drop=True) print('After compartment filtering:', len(nodes), 'nodes.') else: #it is 'Any' pass ### Combine the expression columns nodes['Expression peak'] = nodes['Expression peak phase'] + " (" + nodes['Expression peak time'].map(str) + " min)" nodes['Expression peak'] = nodes['Expression peak'].mask(nodes['Expression peak'].isnull(), "No data") # alphabetize nodes = nodes.sort_values(by='Standard name',ascending=True) nodes = nodes.reset_index(drop=True) node_list = list(nodes['Standard name'].values) nodes['primary node'] = [x in primary_nodes for x in nodes['Standard name']] if len(nodes) == 0: raise ValueError("Filtering left no nodes.") timing['Node filter: compartment'] = timeit.default_timer() - start_node_drop ###################################################### # Clustering and coloring ###################################################### start = timeit.default_timer() ### Clustering part if cluster_by in ['GO term 1','GO term 2']: nodes['cluster'] = nodes[cluster_by] elif 'CYCLoPs WT' in cluster_by: WT_string = 'WT' + cluster_by[-1] # loop over all nodes find their highest expression compartment for the WT given by WT_string # NOTE: SOMETIMES A DICTIONARY WITH EXPRESSION DATA FOR A GIVEN WT IS EMPTY WE NEED TO CHECK FOR THIS # Example: GET1 in WT1 l = nodes['CYCLoPs_dict'].values l_max_comps = [ max(l[i][WT_string], key=lambda key: l[i][WT_string][key]) if (type(l[i]) != str and len(l[i][WT_string]) > 0) else 'No data' for i in range(len(nodes))] nodes['cluster'] = pd.Series(l_max_comps).values elif cluster_by == 'No clustering': nodes['cluster'] = ['No clustering' for i in range(len(nodes))] else: raise SystemExit(cluster_by,f"Unexpected value for clustering variable: {cluster_by}.") if color_by in ['GO term 1','GO term 2']: # set the color based on the color_by variable in a new column of 'nodes' DF nodes['color'] = nodes[color_by] elif 'CYCLoPs WT' in color_by: WT_string = 'WT' + color_by[-1] # loop over all nodes find their highest expression compartment for the WT given by WT_string # NOTE: SOMETIMES A DICTIONARY WITH EXPRESSION DATA FOR A GIVEN WT IS EMPTY WE NEED TO CHECK FOR THIS # Example: GET1 in WT1 l = nodes['CYCLoPs_dict'].values l_max_comps = [ max(l[i][WT_string], key=lambda key: l[i][WT_string][key]) if \ (type(l[i]) != str and len(l[i][WT_string]) > 0) else 'No data' for i in range(len(nodes))] # set the color based on the maximum compartment found above in a new column in the nodes DF nodes['color'] = pd.Series(l_max_comps).values elif color_by == "Peak expression phase": nodes['color'] = nodes['Expression peak phase'] elif color_by == 'No coloring': nodes['color'] = ["No data" for i in range(len(nodes))] else: raise SystemExit(color_by, f'Unexpected value for coloring variable: {color_by}') # now we can drop expression peak phase/time as separate fields nodes = 
nodes.drop('Expression peak phase',1) nodes = nodes.drop('Expression peak time',1) timing['Setting node cluster and color attributes'] = timeit.default_timer() - start len_nodes_filtered_comp = len(nodes) ###################################################### ### GET ALL INTERACTIONS BETWEEN ALL NODES ###################################################### start_final_sql = timeit.default_timer() max_interactions = 10000 # a too high value here seems to make the server run out of memory and this is the most time-expensive step on the server placeholders = ', '.join('?' for unused in node_list) # '?, ?, ?, ...' placeholders_primary_nodes = ', '.join('?' for unused in primary_nodes) # Multiple query options # if there are more than max_interactions satisfying the criteria then ORDEr BY: # - Pick interactions with primary_nodes first # - pick regulations/physical over genetic # - pick more over less: exp, pubs, methods # - Pick regulation over physical when equal in exp/pubs/methods because regulatory interactions are often singular in these. if len(split_types) == 3: query = "SELECT * FROM interactions \ WHERE ( (source IN (%s) AND target IN (%s)) \ AND num_experiments >= (%s) AND num_publications >= (%s) AND num_methods >= (%s)) \ ORDER BY \ CASE WHEN ((source IN (%s)) OR (target IN (%s))) THEN 1 ELSE 2 END ASC, \ CASE type WHEN 'physical' OR 'regulation' THEN 1 WHEN 'genetic' THEN 2 END ASC, \ num_experiments DESC, num_publications DESC, num_methods DESC, \ CASE type WHEN 'regulation' THEN 1 WHEN 'physical' THEN 2 WHEN 'genetic' THEN 3 END ASC \ limit (%s)" \ % (placeholders,placeholders,min_exp,min_pub,min_methods,placeholders_primary_nodes,placeholders_primary_nodes,max_interactions) interactome = pd.read_sql_query(query, conn, params=node_list+node_list+primary_nodes+primary_nodes) else: placeholders_type = ', '.join('?' for unused in split_types) query = "SELECT * FROM interactions \ WHERE ( (source IN (%s) AND target IN (%s)) AND type IN (%s) \ AND num_experiments >= (%s) and num_publications >= (%s) and num_methods >= (%s)) \ ORDER BY \ CASE WHEN ((source IN (%s)) OR (target IN (%s))) THEN 1 ELSE 2 END ASC, \ CASE type WHEN 'physical' OR 'regulation' THEN 1 WHEN 'genetic' THEN 2 END ASC, \ num_experiments DESC, num_publications DESC, num_methods DESC, \ CASE type WHEN 'regulation' THEN 1 WHEN 'physical' THEN 2 WHEN 'genetic' THEN 3 END ASC \ limit (%s)" \ % (placeholders, placeholders,placeholders_type,min_exp,min_pub,min_methods,placeholders_primary_nodes,placeholders_primary_nodes,max_interactions) interactome = pd.read_sql_query(query, conn, params=node_list+node_list+split_types+primary_nodes+primary_nodes) interactome.columns = ['source','target','type','Evidence','Evidence HTML','#Experiments',\ '#Publications','#Methods'] timing['Interactome SQL + dataframe + processing'] = timeit.default_timer() - start_final_sql ###################################################### ### BASED ON THE METHOD TYPES FILTER: DROP INTERACTIONS ###################################################### start = timeit.default_timer() to_drop = [] with open(SCRIPT_DIR+'/data/unique_experimental_methods.txt') as f: read_methods = f.read().splitlines() total_methods = len(read_methods) if len(method_types) < total_methods: # some have been deselected algorithm_output_str += '<p>' + 'We have on file: ' + str(total_methods) + ' methods. 
User queried for: ' + str(len(method_types)) + '</p>' len_before = len(interactome) interactome = interactome[interactome.apply(lambda x: find_methods_in_evidence(x['Evidence'],method_types),1)] algorithm_output_str += '<p>' + 'We dropped: ' + str(len_before - len(interactome)) + ' interactions based on the methods.' + '</p>' if len(interactome) == 0: raise ValueError('No interactions matching these conditions.') timing['Filter based on methods'] = timeit.default_timer() - start ###################################################### # Network properties with networkx: 1 ###################################################### start = timeit.default_timer() df_network = pd.Series() df_network['Number of nodes'] = len(nodes) df_network['Number of edges'] = len(interactome) # use networkx nodes, interactome, df_network, G = calc_network_props(primary_nodes, nodes, interactome, df_network, filter_condition) df_network = df_network.to_frame() df_network = df_network.transpose() timing['networkx properties calculation'] = timeit.default_timer() - start ###################################################### # Export visualized networkx graph to graph formats (GEFX) ###################################################### start = timeit.default_timer() nx.write_gexf(G, SCRIPT_DIR+'/../output/networkx/' + primary_nodes_str + "_" + unique_str + "_full.gexf") timing['networkx export'] = timeit.default_timer() - start ###################################################### # Save the full network information ###################################################### start = timeit.default_timer() nodes_full = nodes.copy() interactome_full = interactome.copy() timing['Save full network'] = timeit.default_timer() - start ###################################################### # Pickle the dataframes ###################################################### start = timeit.default_timer() filename_base = os.path.abspath(SCRIPT_DIR+'/../output/excel_files/') file_id = primary_nodes_str+'_'+unique_str df_user_input.to_pickle(filename_base+'/user_input_'+file_id) nodes_full.to_pickle(filename_base+'/nodes_'+file_id) interactome_full.to_pickle(filename_base+'/interactome_'+file_id) timing['Pickle full network'] = timeit.default_timer() - start # ###################################################### # # WRITE "FULL" NETWORK TO JSON # # this will include a filtering step for really big networks # ###################################################### start_json = timeit.default_timer() write_network_to_json(nodes_full,interactome_full,filter_condition,output_filename,G,'full',primary_nodes) timing['json_full'] = timeit.default_timer() - start_json ###################################################### # FILTER NODES TO MANAGEABLE VISUALIZATION if (filter_flag): start_filter = timeit.default_timer() len_interactome = len(interactome) # reduce nodes nodes = nodes.sort_values(by=['primary node',filter_condition],ascending=False) nodes = nodes.iloc[:max_nodes] nodes.reset_index(drop=True,inplace=True) # reduce interactions n = nodes['Standard name'].values # list of remaining node IDs interactome = interactome[ (interactome['source'].isin(n)) & (interactome['target'].isin(n)) ] interactome.reset_index(drop=True,inplace=True) # SHOW WARNING MESSAGE ABOUT FILTER STEP filter_message = "Note: this query returned {} nodes and {} interactions. We reduced the network to {} nodes based on {} resulting in {} interactions. \ All interactions and nodes are contained in the <i>full</i> Excel file. 
".format(len_nodes_filtered_comp,len_interactome,max_nodes,filter_condition,len(interactome)) s = filter_message print("<script>create_alert(\""+s+"\",\"alert-warning\");</script>") timing['filter'] = timeit.default_timer() - start_filter ###################################################### # Network properties with networkx: 2 ###################################################### start = timeit.default_timer() # df_network = pd.Series() df_network['Number of nodes'] = len(nodes) df_network['Number of edges'] = len(interactome) # use networkx nodes, interactome, df_network, G = calc_network_props(primary_nodes, nodes, interactome, df_network, filter_condition) timing['networkx properties calculation'] += timeit.default_timer() - start ###################################################### # Export full networkx graph to graph formats (GEFX) ###################################################### start = timeit.default_timer() nx.write_gexf(G, SCRIPT_DIR+'/../output/networkx/' + primary_nodes_str + "_" + unique_str + ".gexf") timing['networkx export'] += timeit.default_timer() - start ###################################################### # Nxviz image generation: matrixplot ###################################################### start = timeit.default_timer() c = nv.MatrixPlot(G) c.draw() plt.savefig(SCRIPT_DIR+'/../output/nxviz/matrix_' + unique_str + '.png') timing['nxviz matrix plot'] = timeit.default_timer() - start ###################################################### ### Write the network to json ###################################################### start_json = timeit.default_timer() write_network_to_json(nodes,interactome,filter_condition,output_filename,G) timing['json'] = timeit.default_timer() - start_json # remove the Evidence HTML column interactome = interactome.drop('Evidence',1) interactome = interactome.rename(columns={'Evidence HTML':'Evidence'}) if not excel_flag: ###################################################### ### End output text alert div ###################################################### print("</div>") ###################################################### # Generate strings for the nodes and interactome dataframes to print ###################################################### start_print = timeit.default_timer() # drop columns nodes = nodes.drop(['Description','CYCLoPs_Excel_string','CYCLoPs_dict','cluster','color'],1) # Add HTML links to database/SGD to symbols nodes['Standard name'] = nodes['Standard name'].apply(lambda x: "<a href='index.php?id=database&gene=" + x + "' target='blank'>" + x + "</a>") # change CYCLoPs column name and export html # escape makes the HTML links work nodes = nodes.rename(columns={'CYCLoPs_html':'CYCLoPs'}) nodes = nodes.to_html(escape=False,index=False,classes=['table','table-condensed','table-bordered']) nodes = nodes.replace('<table','<table id=\"proteins_table\"',1) interactome['source'] = interactome['source'].apply(lambda x: "<a href='index.php?id=database&gene=" + x + "' target='blank'>" + x + "</a>" ) interactome['target'] = interactome['target'].apply(lambda x: "<a href='index.php?id=database&gene=" + x + "' target='blank'>" + x + "</a>") # escape makes the HTML links work interactome = interactome.to_html(escape=False,index=False,classes=['table','table-condensed','table-bordered']) interactome = interactome.replace('<table','<table id=\"interactions_table\"',1) ###################################################### # PRINT COLLAPSABLE BOOTSTRAP HTML CODE WITH THE DATAFRAMES 
###################################################### # the 'in' class makes the collapse open by default: the interactions here print(""" <div class="panel-group" id="accordion"> <div class="panel panel-default"> <div class="panel-heading"> <h4 class="panel-title"> <a data-toggle="collapse" data-parent="#accordion" href="#collapse1"> User input</a> </h4> </div> <div id="collapse1" class="panel-collapse collapse"> <div class="panel-body"> <div class="table-responsive"> """) print(df_user_input_to_print) print(""" </div> </div> </div> </div> <div class="panel panel-default"> <div class="panel-heading"> <h4 class="panel-title"> <a data-toggle="collapse" data-parent="#accordion" href="#collapse2"> Network properties</a> </h4> </div> <div id="collapse2" class="panel-collapse collapse"> <div class="panel-body"> <div class="table-responsive"> """) print(df_network.to_html(classes=['table','table-condensed','table-bordered'],index=False)) print(""" </div> </div> </div> </div> <div class="panel panel-default"> <div class="panel-heading"> <h4 class="panel-title"> <a data-toggle="collapse" data-parent="#accordion" href="#collapse3"> Network nodes (proteins)</a> </h4> </div> <div id="collapse3" class="panel-collapse collapse"> <div class="panel-body"> Use the search utility to find the gene you are looking for. The table scrolls horizontally and vertically. By clicking the column headers the table will be sorted on that column. Use shift+click to sort on multiple columns. Default sorting is on number of experiments, number of publications, number of methods and alphabetical on standard name, in that order. <div class="table-responsive"> """) print(nodes) print(""" </div> </div> </div> </div> <div class="panel panel-default"> <div class="panel-heading"> <h4 class="panel-title"> <a data-toggle="collapse" data-parent="#accordion" href="#collapse4"> Interactions</a> </h4> </div> <div id="collapse4" class="panel-collapse collapse"> <div class="panel-body"> Use the search utility to find the gene you are looking for. By clicking the column headers the table will be sorted on that column. Use shift+click to sort on multiple columns. Default sorting is on number of experiments, number of publications, number of methods and alphabetical on standard name, in that order. 
<div class="table-responsive"> """) print(interactome) print(""" </div> </div> </div> </div> """) ###################################################### # Optional diagnostics ###################################################### print(""" <div class="panel panel-default"> <div class="panel-heading"> <h4 class="panel-title"> <a data-toggle="collapse" data-parent="#accordion" href="#collapse5"> Diagnostics: calculation time</a> </h4> </div> <div id="collapse5" class="panel-collapse collapse"> <div class="panel-body"> <div class="table-responsive"> """) timing['print frames'] = timeit.default_timer() - start_print timing['all'] = timeit.default_timer() - start_all df_timing = pd.Series(timing) df_timing = df_timing.to_frame() df_timing.columns = ['Time'] df_timing['Percentage'] = [v/timing['all']*100 for v in df_timing['Time'] ] print(df_timing.sort_values('Percentage').to_html(classes=['table','table-condensed','table-bordered'])) print("Accounted for:", sum([timing[k] for k in timing if k != 'all' ])/timing['all'] * 100, "percent of the time spent in Python.") print(""" </div> </div> </div> </div> </div> """) ###################################################### # Show algorithm output in an alert at the bottom of the page ###################################################### if algorithm_output_str != '': print("<div class=\"alert alert-dismissable alert-info\">") print(algorithm_output_str) print("</div>")
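# The script above times every stage with the same start/stop pair around timeit.default_timer()
# and stores the result in the timing dict. A small context-manager sketch of that pattern
# (illustrative only, not part of the original script):
import timeit
from contextlib import contextmanager

@contextmanager
def timed(timing, key):
    start = timeit.default_timer()
    try:
        yield
    finally:
        timing[key] = timeit.default_timer() - start

# usage: with timed(timing, 'excel'): write_excel_file(primary_nodes_str + '_' + unique_str)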
"""Step3. Save all the results""" if not os.path.exists(args.outputDir): os.makedirs(args.outputDir) pvalues = {} pvalues['Gpvals'] = gpvals.tolist() pvalues['clu_pvals'] = clu_pvals.tolist() pvalues['Lpvals_fdr'] = lpvals_fdr.tolist() with open(os.path.join(args.outputDir, 'pvalues.json'), 'w') as outfile: json.dump(pvalues, outfile) efit = {} efit['efitBetas'] = efit_beta.tolist() efit['efitYdesign'] = efity_design.tolist() efit['efitEtas'] = efit_eta.tolist() with open(os.path.join(args.outputDir, 'efit.json'), 'w') as outfile: json.dump(efit, outfile) if __name__ == '__main__': args = parser.parse_args() start_all = timeit.default_timer() run_script(args) stop_all = timeit.default_timer() delta_time_all = str(stop_all - start_all) print("The total elapsed time is " + delta_time_all)
def multiply_by_Z_viaMKL( self, x ): '''Multiplies the vector passed as argument by the matrix Z''' code = 'multiply_by_Z_viaMKL' start = default_timer() # Dissecting the "cspblas_dcsrgemv" name: # "c" - for "c-blas" like interface (as opposed to fortran) # Also means expects sparse arrays to use 0-based indexing, which python does # "sp" for sparse # "d" for double-precision # "csr" for compressed row format # "ge" for "general", e.g., the matrix has no special structure such as symmetry # "mv" for "matrix-vector" multiply A = self.data_obj.Z if not sparse.isspmatrix_csr(A): raise Exception("Matrix must be in csr format") (m,n) = A.shape # # The data of the matrix # data = A.data.ctypes.data_as(POINTER(c_double)) # indptr = A.indptr.ctypes.data_as(POINTER(c_int)) # indices = A.indices.ctypes.data_as(POINTER(c_int)) # Allocate output, using same conventions as input nVectors = 1 if x.ndim == 1: y = np.empty(m,dtype=np.double,order='F') if x.size != n: raise Exception("x must have n entries. x.size is %d, n is %d" % (x.size,n)) elif x.shape[1] == 1: y = np.empty((m,1),dtype=np.double,order='F') if x.shape[0] != n: raise Exception("x must have n entries. x.size is %d, n is %d" % (x.size,n)) else: nVectors = x.shape[1] y = np.empty((m,nVectors),dtype=np.double,order='F') if x.shape[0] != n: raise Exception("x must have n entries. x.size is %d, n is %d" % (x.size,n)) # Check input if x.dtype.type is not np.double: x = x.astype(np.double,copy=True) # Put it in column-major order, otherwise for nVectors > 1 this FAILS completely if not x.flags['F_CONTIGUOUS']: x = x.copy(order='F') if nVectors == 1: np_x = x.ctypes.data_as(POINTER(c_double)) np_y = y.ctypes.data_as(POINTER(c_double)) # now call MKL. This returns the answer in np_y, which links to y self.SpMV(byref(c_char(b"N")), byref(c_int(m)), self.Z_data , self.Z_indptr, self.Z_indices, np_x, np_y) else: for columns in range(nVectors): xx = x[:,columns] yy = y[:,columns] np_x = xx.ctypes.data_as(POINTER(c_double)) np_y = yy.ctypes.data_as(POINTER(c_double)) # use the same ctypes views of Z as the single-vector branch (the commented-out locals above are never defined) self.SpMV(byref(c_char(b"N")), byref(c_int(m)), self.Z_data, self.Z_indptr, self.Z_indices, np_x, np_y) end = default_timer() time_elapsed = end - start self.update_time(code, time_elapsed) return y
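# Hedged sanity check: the cspblas_dcsrgemv call above computes y = Z @ x for a CSR matrix, so
# SciPy's own CSR product can be used to verify the ctypes path (obj is a hypothetical instance
# exposing multiply_by_Z_viaMKL and data_obj.Z).
import numpy as np
from scipy import sparse

Z = sparse.random(1000, 500, density=0.01, format='csr', dtype=np.double)
x = np.random.rand(500)
y_reference = Z @ x
# y_mkl = obj.multiply_by_Z_viaMKL(x)
# np.testing.assert_allclose(y_mkl, y_reference)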
def main(): bstTimeStart = timeit.default_timer() bst(A) bstTimeEnd = timeit.default_timer() print("The BST comparison count is:", bstCount) print("The BST completion time is:", (bstTimeEnd - bstTimeStart) * 1000)
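# Timing a single call with two default_timer() reads is noisy for fast inputs; timeit.repeat
# runs the call in several rounds and the minimum is the usual estimate. A hedged alternative,
# assuming bst and A are defined as above (note: repeated calls keep incrementing bstCount).
import timeit

rounds = timeit.repeat(lambda: bst(A), repeat=5, number=10)
print("The BST completion time is:", min(rounds) / 10 * 1000, "ms per call")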
from pyspark import SparkConf from pyspark.ml import Pipeline from pyspark.ml.classification import DecisionTreeClassifier from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler from pyspark.ml.evaluation import MulticlassClassificationEvaluator from pyspark.ml.feature import PCA from pyspark.ml.linalg import Vectors from pyspark.sql import SparkSession from pyspark.sql.types import StructType, StructField, StringType, IntegerType if __name__ == "__main__": timerstart = timeit.default_timer() #Check arguments if len(sys.argv) != 3: print("Usage: train.py <csv dataset> <model output path>", file=sys.stderr) sys.exit(-1) # Create spark session spark = SparkSession\ .builder\ .appName("TrainDecisionTreeIDS-Python")\ .getOrCreate() # Define dataset schema, dataset csv generated by flowtbag https://github.com/DanielArndt/flowtbag schema = StructType([
def main(opts): adj2_ = torch.from_numpy(graph.cihp2pascal_nlp_adj).float() adj2_test = adj2_.unsqueeze(0).unsqueeze(0).expand(1, 1, 7, 20).cuda().transpose( 2, 3) adj1_ = Variable( torch.from_numpy(graph.preprocess_adj(graph.pascal_graph)).float()) adj3_test = adj1_.unsqueeze(0).unsqueeze(0).expand(1, 1, 7, 7).cuda() cihp_adj = graph.preprocess_adj(graph.cihp_graph) adj3_ = Variable(torch.from_numpy(cihp_adj).float()) adj1_test = adj3_.unsqueeze(0).unsqueeze(0).expand(1, 1, 20, 20).cuda() p = OrderedDict() # Parameters to include in report p['trainBatch'] = opts.batch # Training batch size p['nAveGrad'] = 1 # Average the gradient of several iterations p['lr'] = opts.lr # Learning rate p['lrFtr'] = 1e-5 p['lraspp'] = 1e-5 p['lrpro'] = 1e-5 p['lrdecoder'] = 1e-5 p['lrother'] = 1e-5 p['wd'] = 5e-4 # Weight decay p['momentum'] = 0.9 # Momentum p['epoch_size'] = 10 # How many epochs to change learning rate p['num_workers'] = opts.numworker backbone = 'xception' # Use xception or resnet as feature extractor, with open(opts.txt_file, 'r') as f: img_list = f.readlines() max_id = 0 save_dir_root = os.path.join(os.path.dirname(os.path.abspath(__file__))) exp_name = os.path.dirname(os.path.abspath(__file__)).split('/')[-1] runs = glob.glob(os.path.join(save_dir_root, 'run', 'run_*')) for r in runs: run_id = int(r.split('_')[-1]) if run_id >= max_id: max_id = run_id + 1 # run_id = int(runs[-1].split('_')[-1]) + 1 if runs else 0 # Network definition if backbone == 'xception': net = deeplab_xception_transfer.deeplab_xception_transfer_projection_savemem( n_classes=opts.classes, os=16, hidden_layers=opts.hidden_layers, source_classes=7, ) elif backbone == 'resnet': # net = deeplab_resnet.DeepLabv3_plus(nInputChannels=3, n_classes=7, os=16, pretrained=True) raise NotImplementedError else: raise NotImplementedError if gpu_id >= 0: net.cuda() # net load weights if not opts.loadmodel == '': x = torch.load(opts.loadmodel) net.load_source_model(x) print('load model:', opts.loadmodel) else: print('no model load !!!!!!!!') ## multi scale scale_list = [1, 0.5, 0.75, 1.25, 1.5, 1.75] testloader_list = [] testloader_flip_list = [] for pv in scale_list: composed_transforms_ts = transforms.Compose( [tr.Scale_(pv), tr.Normalize_xception_tf(), tr.ToTensor_()]) composed_transforms_ts_flip = transforms.Compose([ tr.Scale_(pv), tr.HorizontalFlip(), tr.Normalize_xception_tf(), tr.ToTensor_() ]) voc_val = cihp.VOCSegmentation(split='test', transform=composed_transforms_ts) voc_val_f = cihp.VOCSegmentation(split='test', transform=composed_transforms_ts_flip) testloader = DataLoader(voc_val, batch_size=1, shuffle=False, num_workers=p['num_workers']) testloader_flip = DataLoader(voc_val_f, batch_size=1, shuffle=False, num_workers=p['num_workers']) testloader_list.append(copy.deepcopy(testloader)) testloader_flip_list.append(copy.deepcopy(testloader_flip)) print("Eval Network") if not os.path.exists(opts.output_path + 'cihp_output_vis/'): os.makedirs(opts.output_path + 'cihp_output_vis/') if not os.path.exists(opts.output_path + 'cihp_output/'): os.makedirs(opts.output_path + 'cihp_output/') start_time = timeit.default_timer() # One testing epoch total_iou = 0.0 net.eval() for ii, large_sample_batched in enumerate( zip(*testloader_list, *testloader_flip_list)): print(ii) #1 0.5 0.75 1.25 1.5 1.75 ; flip: sample1 = large_sample_batched[:6] sample2 = large_sample_batched[6:] for iii, sample_batched in enumerate(zip(sample1, sample2)): inputs, labels = sample_batched[0]['image'], sample_batched[0][ 'label'] inputs_f, _ = 
sample_batched[1]['image'], sample_batched[1][ 'label'] inputs = torch.cat((inputs, inputs_f), dim=0) if iii == 0: _, _, h, w = inputs.size() # assert inputs.size() == inputs_f.size() # Forward pass of the mini-batch inputs, labels = Variable(inputs, requires_grad=False), Variable(labels) with torch.no_grad(): if gpu_id >= 0: inputs, labels = inputs.cuda(), labels.cuda() # outputs = net.forward(inputs) # pdb.set_trace() outputs = net.forward(inputs, adj1_test.cuda(), adj3_test.cuda(), adj2_test.cuda()) outputs = (outputs[0] + flip(flip_cihp(outputs[1]), dim=-1)) / 2 outputs = outputs.unsqueeze(0) if iii > 0: outputs = F.upsample(outputs, size=(h, w), mode='bilinear', align_corners=True) outputs_final = outputs_final + outputs else: outputs_final = outputs.clone() ################ plot pic predictions = torch.max(outputs_final, 1)[1] prob_predictions = torch.max(outputs_final, 1)[0] results = predictions.cpu().numpy() prob_results = prob_predictions.cpu().numpy() vis_res = decode_labels(results) parsing_im = Image.fromarray(vis_res[0]) parsing_im.save(opts.output_path + 'cihp_output_vis/{}.png'.format(img_list[ii][:-1])) cv2.imwrite( opts.output_path + 'cihp_output/{}.png'.format(img_list[ii][:-1]), results[0, :, :]) # np.save('../../cihp_prob_output/{}.npy'.format(img_list[ii][:-1]), prob_results[0, :, :]) # pred_list.append(predictions.cpu()) # label_list.append(labels.squeeze(1).cpu()) # loss = criterion(outputs, labels, batch_average=True) # running_loss_ts += loss.item() # total_iou += utils.get_iou(predictions, labels) end_time = timeit.default_timer() print('time use for ' + str(ii) + ' is :' + str(end_time - start_time)) # Eval pred_path = opts.output_path + 'cihp_output/' eval_(pred_path=pred_path, gt_path=opts.gt_path, classes=opts.classes, txt_file=opts.txt_file)
matrix[k - 1][j - 1] = 1 + lcs(X, Y, k - 1, j - 1, matrix) return matrix[k - 1][j - 1] else: # optimal substructure properties 2 and 3 # store it in arr to avoid further repetitive # work in future function calls matrix[k - 1][j - 1] = max(lcs(X, Y, k, j - 1, matrix), lcs(X, Y, k - 1, j, matrix)) return matrix[k - 1][j - 1] sys.setrecursionlimit(10000) n = int(input("Size: ")) # length of the strings to generate string_pool = string.ascii_uppercase # build the strings from uppercase letters only X = "" Y = "" for i in range(n): # generate random strings X += random.choice(string_pool) # pick one random character Y += random.choice(string_pool) # pick one random character print("X: ", X) # print input sequence 1 print("Y: ", Y) # print input sequence 2 n = len(Y) # length n of the input sequences dp = [[-1] * (n+1) for _ in range(n+1)] # 2D list that stores previously computed values t1 = timeit.default_timer() # LCS start time print("Length of LCS:", lcs(X, Y, n, n, dp)) # call the LCS function and print its return value t2 = timeit.default_timer() # LCS end time print("Running time: ", (t2 - t1) * 1000) # LCS running time in milliseconds
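# The memoized lcs above recurses roughly len(X) + len(Y) frames deep, which is why the recursion
# limit is raised. The same table can be filled iteratively; a bottom-up sketch under the same
# definitions of X and Y:
def lcs_bottom_up(X, Y):
    m, n = len(X), len(Y)
    dp = [[0] * (n + 1) for _ in range(m + 1)]  # dp[i][j] = LCS length of X[:i] and Y[:j]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[m][n]

t1 = timeit.default_timer()
print("Length of LCS (bottom-up):", lcs_bottom_up(X, Y))
t2 = timeit.default_timer()
print("Running time: ", (t2 - t1) * 1000)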
def __enter__(self): self._start = default_timer()
conn_spec=conn_dict) A = mmread('../ii.wmat') #rows, cols = A.nonzero() nest.Connect(A.row + NE + 1, A.col + NE + 1, syn_spec=inh_syn_dict, conn_spec=conn_dict) if (not fast): spikes = nest.Create("spike_detector", 1, [{ "label": "va-py-ex", "to_file": True }]) spikes_E = spikes[:1] nest.Connect(nodes_E[:N_rec], spikes_E) starttime = timeit.default_timer() nest.Simulate(simtime) totaltime = timeit.default_timer() - starttime print("Real Time Sim: " + str(totaltime) + "s") if (fast): f = open("timefile.dat", "w") f.write("%f" % totaltime) f.close() if (not fast): rate_iaf = nest.GetStatus(spikes)[0]["n_events"] / ( (simtime / 1000.0) * N_rec) print("Average Rate of recorded electrodes: " + str(rate_iaf) + "Hz")
def __exit__(self, typ, value, traceback): # Time can go backwards. self._gauge.set(max(default_timer() - self._start, 0))
def __exit__(self, typ, value, traceback): # Time can go backwards. self._histogram.observe(max(default_timer() - self._start, 0))
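# The __enter__/__exit__ pairs above are the usual timer context manager: record default_timer()
# on entry and hand the non-negative elapsed time to a metric on exit. A minimal self-contained
# sketch, with a plain callback standing in for the gauge/histogram object:
from timeit import default_timer

class Timer:
    def __init__(self, callback):
        self._callback = callback

    def __enter__(self):
        self._start = default_timer()
        return self

    def __exit__(self, typ, value, traceback):
        # Time can go backwards, so clamp at zero.
        self._callback(max(default_timer() - self._start, 0))

# usage: with Timer(lambda s: print('took %.3fs' % s)): do_work()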