def main(): """ Run program """ cmn.make_dir(cmn.LOG_DIR) logging.basicConfig( # filename=os.path.join(cmn.LOG_DIR, cmn.LOG_FILE_NAME % time.strftime(cmn.LOG_FILE_NAME_TIME)), handlers=[ logging.FileHandler( os.path.join( cmn.LOG_DIR, cmn.LOG_FILE_NAME % time.strftime(cmn.LOG_FILE_NAME_TIME))), logging.StreamHandler() ], level=cmn.LOG_LEVEL, format=cmn.LOG_MSG_FORMAT, datefmt=cmn.LOG_TIME_FORMAT) start_time = time.time() logging.info("Starting program ...\n") if "-hmo" in sys.argv: logging.info( "Detected 'heatmap only' command line argument, program will use pre-existing combined PTEN correlation file.\n" ) else: combine.run() heatmap.run() logging.info("Finished running program.") run_time = time.time() - start_time logging.info("Total run time: %s.\n" % time.strftime('%H:%M:%S', time.gmtime(run_time)))
def main(): """ Run program """ cmn.make_dir(cmn.LOG_DIR) logging.basicConfig( # filename=os.path.join(cmn.LOG_DIR, cmn.LOG_FILE_NAME % time.strftime(cmn.LOG_FILE_NAME_TIME)), handlers=[ logging.FileHandler(os.path.join(cmn.LOG_DIR, cmn.LOG_FILE_NAME % time.strftime(cmn.LOG_FILE_NAME_TIME))), logging.StreamHandler() ], level=cmn.LOG_LEVEL, format=cmn.LOG_MSG_FORMAT, datefmt=cmn.LOG_TIME_FORMAT ) start_time = time.time() logging.info("Starting program ...") if "-man" in sys.argv: logging.info("Detected 'manifest only' command line argument, program will only generate file manifests.\n") file_lists.run() elif "-dlo" in sys.argv: logging.info("Detected 'download only' command line argument, program will use existing manifest lists.\n") files.run() else: logging.warning("No valid command line arguments, program will ignore arguments.\n") file_lists.run() files.run() logging.info("Finished running program.") run_time = time.time() - start_time logging.info("Total run time: %s.\n" % time.strftime('%H:%M:%S', time.gmtime(run_time)))
def train(args):
    assert args.num_classes
    common.make_dir(args.checkout_dir)
    nnet = CNN(args.left_context + args.right_context + 1, args.feat_dim,
               num_maps, pooling_size, filter_size, conn_dim,
               args.num_classes)
    print(nnet)
    nnet.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = th.optim.Adam(nnet.parameters(), lr=args.learning_rate)
    train_dataset = THCHS30(root=args.data_dir,
                            data_type='train',
                            left_context=args.left_context,
                            right_context=args.right_context,
                            model_type='cnn')
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=args.min_batch,
                                   shuffle=True,
                                   num_workers=6)
    test_dataset = THCHS30(root=args.data_dir,
                           data_type='test',
                           left_context=args.left_context,
                           right_context=args.right_context,
                           model_type='cnn')
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=args.min_batch,
                                  shuffle=True,
                                  num_workers=6)
    cross_validate(-1, nnet, test_dataset, test_loader)
    for epoch in range(args.num_epochs):
        common.train_one_epoch(nnet, criterion, optimizer, train_loader)
        cross_validate(epoch, nnet, test_dataset, test_loader)
        th.save(nnet,
                common.join_path(args.checkout_dir,
                                 'cnn.{}.pkl'.format(epoch + 1)))
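# train() above reads several module-level CNN hyper-parameters. A minimal
# sketch of plausible definitions (the values are illustrative assumptions,
# not taken from the original source):
num_maps = 128     # feature maps in the convolutional layers
pooling_size = 2   # max-pooling width
filter_size = 5    # convolution filter width
conn_dim = 1024    # size of the fully connected layer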
def train(args):
    common.make_dir(args.checkout_dir)
    # nnet
    nnet = RNN((args.left_context + args.right_context + 1) * args.feat_dim,
               hidden_layer, hidden_size, args.num_classes, dropout=dropout)
    print(nnet)
    nnet.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = th.optim.Adam(nnet.parameters(), lr=args.learning_rate)
    train_dataset = THCHS30(root=args.data_dir, data_type='train')
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=args.min_batch,
                                   shuffle=True)
    test_dataset = THCHS30(root=args.data_dir, data_type='test')
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=args.min_batch,
                                  shuffle=True)
    cross_validate(-1, nnet, test_loader, test_dataset.num_frames)
    for epoch in range(args.num_epochs):
        common.train_one_epoch(nnet, criterion, optimizer, train_loader,
                               is_rnn=True)
        cross_validate(epoch, nnet, test_loader, test_dataset.num_frames)
        th.save(
            nnet,
            common.join_path(args.checkout_dir,
                             'rnn.{}.pkl'.format(epoch + 1)))
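# Module-level RNN hyper-parameters assumed by train() above (illustrative
# values, not from the original source):
hidden_layer = 3   # number of stacked recurrent layers
hidden_size = 512  # units per recurrent layer
dropout = 0.2      # dropout applied between recurrent layers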
def configure_modbus_logger(cfg, recycle_logs=True):
    """
    Configure the logger.

    Args:
        cfg (Namespace): The PUReST config namespace.
        recycle_logs (bool): If True, remove any existing log file before
            attaching the file handler.
    """
    logger = logging.getLogger("modbus_tk")
    if isinstance(cfg, dict):
        cfg = Configuration(**cfg)

    if cfg.no_modbus_log:
        logger.setLevel(logging.ERROR)
        logger.addHandler(logging.NullHandler())
    else:
        logger.setLevel(logging.DEBUG)
        fmt = ("%(asctime)s - %(levelname)s - "
               "%(module)s::%(funcName)s @ %(lineno)d - %(message)s")
        fmtr = logging.Formatter(fmt)
        if not cfg.no_modbus_console_log:
            sh = logging.StreamHandler()
            sh.setFormatter(fmtr)
            sh.setLevel(cfg.modbus_console_log_level.upper())
            logger.addHandler(sh)
        if not cfg.no_modbus_file_log:
            modbus_log = path(cfg.modbus_log)
            if recycle_logs:
                remove_file(modbus_log)
            make_dir(os.path.dirname(modbus_log))
            fh = logging.FileHandler(modbus_log)
            fh.setFormatter(fmtr)
            fh.setLevel(cfg.modbus_file_log_level.upper())
            logger.addHandler(fh)
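# A minimal usage sketch for configure_modbus_logger(), assuming
# Configuration accepts the fields the function reads as keyword arguments
# (the values here are illustrative, not from the original source):
configure_modbus_logger({
    "no_modbus_log": False,
    "no_modbus_console_log": False,
    "modbus_console_log_level": "info",
    "no_modbus_file_log": False,
    "modbus_file_log_level": "debug",
    "modbus_log": "logs/modbus.log",
})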
def __call__(self):
    import h5py
    self.logger.info('load training data: ' + self.infile)
    fin = h5py.File(self.infile, 'r')
    X_train = fin[self.xname][:]
    y_train = fin[self.yname][:]
    fin.close()
    valid_data = None
    if self.valid_file:
        self.logger.info('load validation data: ' + self.valid_file)
        fin = h5py.File(self.valid_file, 'r')
        X_valid = fin[self.valid_xname][:]
        y_valid = fin[self.valid_yname][:]
        fin.close()
        valid_data = (X_valid, y_valid)
    window_size = X_train.shape[1]

    from keras.optimizers import RMSprop
    optimizer = RMSprop(lr=self.learning_rate)
    # load model
    # the variables optimizer and loss may be overloaded by the model script
    regression = self.regression
    if self.regression:
        loss = 'mean_squared_error'
        metrics = ['mean_squared_error']
    else:
        loss = 'binary_crossentropy'
        metrics = ['accuracy']
    # the model script is expected to define a variable named 'model'
    with open(self.model_script, 'r') as f:
        exec(compile(f.read(), self.model_script, 'exec'))
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    model.summary()

    callbacks = []
    if self.tensorboard_log_dir:
        from keras.callbacks import TensorBoard
        callbacks.append(TensorBoard(log_dir=self.tensorboard_log_dir))
    if self.keras_log is not None:
        self.logger.info('open CSV log file: {}'.format(self.keras_log))
        make_dir(os.path.dirname(self.keras_log))
        callbacks.append(keras.callbacks.CSVLogger(self.keras_log))

    self.logger.info('train model')
    model.fit(X_train, y_train,
              batch_size=self.batch_size,
              epochs=self.epochs,
              callbacks=callbacks,
              verbose=self.keras_verbose,
              validation_data=valid_data)
    self.logger.info('save model: {}'.format(self.model_file))
    prepare_output_file(self.model_file)
    model.save(self.model_file)
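# Contents of a hypothetical model_script for the trainer above. The script
# runs via exec() inside __call__(), so it can read window_size and
# regression from the trainer's scope and must bind a variable named
# 'model'. The example values below stand in for those scope variables:
from keras.models import Sequential
from keras.layers import Dense, Flatten

window_size, regression = 100, False  # provided by the trainer's scope
model = Sequential()
model.add(Flatten(input_shape=(window_size, 4)))
model.add(Dense(1, activation='linear' if regression else 'sigmoid'))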
def __call__(self):
    import h5py
    self.logger.info('load model: {}'.format(self.model_file))
    if self.model_format == 'keras':
        model = keras.models.load_model(self.model_file)
    elif self.model_format == 'sklearn':
        import cPickle
        with open(self.model_file, 'rb') as f:
            model = cPickle.load(f)
    self.logger.info('load data: {}'.format(self.infile))
    fin = h5py.File(self.infile, 'r')
    X_test = fin[self.xname][:]
    y_test = fin[self.yname][:]
    fin.close()

    self.logger.info('run the model')
    if self.model_format == 'keras':
        y_pred = model.predict(X_test, batch_size=self.batch_size)
    elif self.model_format == 'sklearn':
        y_pred = model.predict(X_test)
    y_pred = np.squeeze(y_pred)
    if self.swap_labels:
        self.logger.info('swap labels')
        y_pred = 1 - y_pred
    y_pred_labels = (y_pred >= self.cutoff).astype('int32')

    # ignore NaNs in y_test
    y_test = y_test.flatten()
    y_pred = y_pred.flatten()
    y_pred_labels = y_pred_labels.flatten()
    not_nan_mask = np.logical_not(np.isnan(y_test))
    y_test = y_test[not_nan_mask]
    y_pred = y_pred[not_nan_mask]
    y_pred_labels = y_pred_labels[not_nan_mask]

    scores = {}
    for metric in self.metrics:
        # y_pred is an array of continuous scores
        scorer = get_scorer(metric)
        if metric == 'roc_auc':
            scores[metric] = scorer(y_test, y_pred)
        else:
            scores[metric] = scorer(y_test, y_pred_labels)
        self.logger.info('metric {} = {}'.format(metric, scores[metric]))
    if self.outfile is not None:
        self.logger.info('save file: {}'.format(self.outfile))
        make_dir(os.path.dirname(self.outfile))
        fout = h5py.File(self.outfile, 'w')
        fout.create_dataset('y_true', data=y_test)
        fout.create_dataset('y_pred', data=y_pred)
        fout.create_dataset('y_pred_labels', data=y_pred_labels)
        grp = fout.create_group('metrics')
        for metric in self.metrics:
            grp.create_dataset(metric, data=scores[metric])
        fout.close()
def create_channel_folder(work_dir, channel_id):
    global error_creator
    error_creator = 0
    try:
        if error_renamer == 0:
            common.make_dir(os.path.join(work_dir, channel_id))
        else:
            return
    except Exception:
        error_creator = 1
def run(): """ Run the downloading script to download files listed in previously acquired manifest lists. """ # number of file currently being requested/downloaded file_number = [1] cmn.make_dir(cmn.DL_DIR) download_files(file_number)
def __call__(self):
    model_weights = h5py.File(self.model_file, 'r')[
        '/model_weights/dense_1/dense_1_W:0'][:]
    # integer division so that reshape() receives an int
    window_size = model_weights.shape[0] // len(self.alphabet)
    model_weights = model_weights.reshape((window_size, len(self.alphabet)))
    fig, axes = plt.subplots(nrows=len(self.alphabet), ncols=1,
                             figsize=(12, 9), sharey=True)
    for i in range(len(self.alphabet)):
        ax = axes[i]
        ax.bar(np.arange(window_size), model_weights[:, i],
               color='b', edgecolor='none')
        ax.set_xticks(np.arange(window_size, step=5))
        ax.set_xlim(0, window_size)
        ax.set_title('Logistic regression weights ({})'.format(
            self.alphabet[i]))
        ax.set_ylabel('Weight')
    self.logger.info('savefig: {}'.format(self.outfile))
    make_dir(os.path.dirname(self.outfile))
    plt.tight_layout()
    plt.savefig(self.outfile, dpi=150, bbox_inches='tight')
def get_video_info(video_id):
    video_info_list = []
    items = youtube.videos().list(part="snippet,contentDetails",
                                  id=video_id).execute()["items"][0]
    resp_save_dest = "data/resp/yt/v/"
    timestamp = common.now_iso(1)  # UTC+0
    common.make_dir(resp_save_dest)
    common.write_json(resp_save_dest + timestamp + " " + video_id + ".json",
                      items)
    video_info_list.append(items["snippet"]["channelTitle"])     # 0
    video_info_list.append(items["snippet"]["channelId"])        # 1
    video_info_list.append(items["snippet"]["publishedAt"])      # 2
    video_info_list.append(items["snippet"]["title"])            # 3
    video_info_list.append(items["snippet"]["description"])      # 4
    video_info_list.append(items["contentDetails"]["duration"])  # 5
    return video_info_list  # the caller passes video_info_list on to fileproc
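# The module-level 'youtube' client used by get_video_info() can be built
# with google-api-python-client; API_KEY is a placeholder for the real
# credential, which is not shown in the original source:
from googleapiclient.discovery import build

youtube = build("youtube", "v3", developerKey=API_KEY)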
def __call__(self):
    self.treatment = read_hdf5(self.treatment_file)
    self.control = read_hdf5(self.control_file)
    name_counts = np.unique(np.concatenate(
        [self.treatment['name'], self.control['name']]),
        return_counts=True)
    common_names = name_counts[0][name_counts[1] == 2]
    if self.names is None:
        self.names = common_names[:self.max_plots]

    from matplotlib.backends.backend_pdf import PdfPages
    self.logger.info('open pdf file: {}'.format(self.outfile))
    make_dir(os.path.dirname(self.outfile))
    with PdfPages(self.outfile) as pdf:
        for seqname in self.names:
            self.logger.info('plot track: {}'.format(seqname))
            pdf.savefig(self.plot_tracks(seqname))
def analyze(self):
    self.icshape = read_hdf5(self.icshape_file)
    self.background = read_hdf5(self.background_file)
    self.target = read_hdf5(self.target_file)
    name_counts = np.unique(np.concatenate([
        self.icshape['name'], self.background['name'], self.target['name']
    ]), return_counts=True)
    common_names = name_counts[0][name_counts[1] == 3]
    if self.names is None:
        self.names = common_names[:self.max_plots]

    from matplotlib.backends.backend_pdf import PdfPages
    self.logger.info('open pdf file: {}'.format(self.outfile))
    make_dir(os.path.dirname(self.outfile))
    with PdfPages(self.outfile) as pdf:
        for seqname in self.names:
            self.logger.info('plot track: {}'.format(seqname))
            pdf.savefig(self.plot_tracks(seqname))
def main(): """ Run program """ cmn.make_dir(cmn.LOG_DIR) logging.basicConfig( # filename=os.path.join(cmn.LOG_DIR, cmn.LOG_FILE_NAME % time.strftime(cmn.LOG_FILE_NAME_TIME)), handlers=[ logging.FileHandler( os.path.join( cmn.LOG_DIR, cmn.LOG_FILE_NAME % time.strftime(cmn.LOG_FILE_NAME_TIME))), logging.StreamHandler() ], level=cmn.LOG_LEVEL, format=cmn.LOG_MSG_FORMAT, datefmt=cmn.LOG_TIME_FORMAT) start_time = time.time() logging.info("Starting program ...") r, h = False, True if "-r" in sys.argv: logging.info( "Detected 'raw data' command line argument, program will integrate raw counts (non-normalized).\n" ) r = True if "-nh" in sys.argv: logging.info( "Detected 'no headers' command line argument, program will not add headers to integrated files.\n" ) h = False else: logging.warning( "No valid command line arguments, program will ignore arguments.\n" ) integrate.run(r, h) logging.info("Finished running program.") run_time = time.time() - start_time logging.info("Total run time: %s.\n" % time.strftime('%H:%M:%S', time.gmtime(run_time)))
def run(): """ Run the file manifest-list files generating stage """ # create required directory cmn.make_dir(cmn.FILE_LIST_DIR) # generate RNA file lists n_rna = gen_file_lists("RNA") # generate miRNA file lists n_mirna = gen_file_lists("miRNA") logging.info("manifest list files generated in '%s' :" % cmn.FILE_LIST_DIR) logging.info( "%s RNA-seq manifest lists generated, with %s manifests in each list." % (n_rna, cmn.FILES_PER_LIST)) logging.info( "%s miRNA-seq manifest lists generated, with %s manifests in each list.\n" % (n_mirna, cmn.FILES_PER_LIST))
def __call__(self):
    self.logger.info('load DMS-seq scores from: {}'.format(
        self.dmsseq_file))
    dmsseq = GenomicData(self.dmsseq_file, ['dmsseq'])
    scores = dmsseq['dmsseq']
    cutoff1 = np.percentile(scores, self.percentile)
    cutoff2 = np.percentile(scores, 100 - self.percentile)
    self.logger.info('DMS-seq score cutoffs: {}-{}'.format(
        cutoff1, cutoff2))
    # binarize the scores: values at or below cutoff1 become 0, values at or
    # above cutoff2 become 1, values in between are discarded
    discard = np.logical_and(cutoff1 < scores, scores < cutoff2)
    scores[(scores <= cutoff1) & np.logical_not(discard)] = 0
    scores[(scores >= cutoff2) & np.logical_not(discard)] = 1
    fasta_f = IndexedFastaReader(self.sequence_file)

    # calculate base distribution
    self.logger.info('calculate base distribution')
    self.offsets = range(-self.max_offset, self.max_offset + 1)
    base_dist = np.zeros([len(self.offsets), 2, 4], dtype='int64')
    progress = ProgressBar(len(dmsseq.names), title='')
    for name in dmsseq.names:
        seq = np.frombuffer(fasta_f[name], dtype='S1')
        values = dmsseq.feature('dmsseq', name)
        ind_valid = np.nonzero(np.logical_not(np.isnan(values)))[0]
        ind_one_ts = np.nonzero(values == 1)[0]
        ind_zero_ts = np.nonzero(values == 0)[0]
        for i_offset, offset in enumerate(self.offsets):
            ind_one = ind_one_ts + offset
            ind_one = ind_one[(ind_one >= 0) & (ind_one < len(seq))]
            ind_zero = ind_zero_ts + offset
            ind_zero = ind_zero[(ind_zero >= 0) & (ind_zero < len(seq))]
            for i in range(len(self.alphabet)):
                if len(ind_zero) > 0:
                    base_dist[i_offset, 0, i] += (
                        seq[ind_zero] == self.alphabet[i]).sum()
                if len(ind_one) > 0:
                    base_dist[i_offset, 1, i] += (
                        seq[ind_one] == self.alphabet[i]).sum()
        progress.update()
    progress.finish()
    fasta_f.close()
    base_dist = base_dist.astype('float64')

    # plot
    fig, axes = plt.subplots(nrows=2, ncols=len(self.offsets),
                             figsize=(20, 4), sharey=True)
    fig.tight_layout()
    for i, offset in enumerate(self.offsets):
        for label in (0, 1):
            self.logger.debug('plot_base_dist: {}, {}'.format(
                label, offset))
            base_dist[i, label, :] /= base_dist[i, label, :].sum()
            ax = axes[label, i]
            ax.bar(np.arange(len(self.alphabet)),
                   base_dist[i, label, :],
                   color='k', edgecolor='none', align='center')
            ax.set_xticks(np.arange(len(self.alphabet)))
            ax.set_xticklabels(self.alphabet)
            ax.set_ylabel('Density')
            ax.set_title('({}, {})'.format(label, offset))
    self.logger.info('savefig: {}'.format(self.outfile))
    make_dir(os.path.dirname(self.outfile))
    plt.savefig(self.outfile, dpi=150, bbox_inches='tight')
def _download_files(file_list, file_number):
    """
    Download files listed in one manifest list object

    :param file_list: Object
        a json object of the list from one manifest-list file
    :param file_number: List
        a single-element list holding the current number/index of the file
        being downloaded
    """
    # 'file' here is a file manifest, i.e. the information describing a file
    for file in file_list["data"]["hits"]:
        logging.info("Processing file #%s ..." % file_number[0])
        '''
        This is where the organizational structure of the directories for
        the downloaded files is defined, using sample_manifest.json as a
        guideline. The directory path for each file is defined, then
        created if necessary.
        '''
        file_dir_path = [cmn.DL_DIR]
        try:
            file_dir_path.append(file["cases"][0]["project"]["primary_site"])
        except KeyError:
            logging.warning(
                "File manifest did not provide attribute 'primary_site' "
                "for tissue sample.")
            file_dir_path.append("unknown")
        try:
            file_dir_path.append(file["cases"][0]["demographic"]["gender"])
        except KeyError:
            logging.warning(
                "File manifest did not provide attribute 'gender' for "
                "tissue sample.")
            file_dir_path.append("unknown")
        try:
            file_dir_path.append(file["cases"][0]["samples"][0]["sample_id"])
        except KeyError:
            logging.warning(
                "File manifest did not provide attribute 'sample_id' for "
                "tissue sample.")
            file_dir_path.append("unknown")
        file_dir = os.path.join(*file_dir_path)
        cmn.make_dir(file_dir)
        file_path = os.path.join(file_dir, file["file_name"])
        gzip_archive = False
        if file_path.endswith('.gz') or file_path.endswith('.GZ'):
            logging.info("File is a gzip archive.")
            gzip_archive = True
        if (not gzip_archive and not os.path.isfile(file_path)) or (
                gzip_archive and
                not os.path.isfile('%s.txt' % file_path[:-3])):
            logging.info("Requesting file '%s'..." % file["file_name"])
            # send a GET request for this single file using its UUID
            r = requests.get("%s/%s" % (URL, file["file_id"]))
            logging.info(r)
            if r.status_code == requests.codes.ok:
                logging.info("Request ok, downloading file ...")
                logging.info("Writing file to '%s' ..." % file_path)
                # write the HTTP response to the file as a byte stream
                with open(file_path, 'wb') as file_out:
                    for chunk in r.iter_content(chunk_size=128):
                        file_out.write(chunk)
                if gzip_archive:
                    logging.info("Extracting archive to '%s' ..." %
                                 ('%s.txt' % file_path[:-3]))
                    with gzip.open(file_path, 'rb') as f_in, open(
                            '%s.txt' % file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                    logging.info(
                        "Text file extracted, removing gzip archive ...")
                    os.remove(file_path)
                    logging.info("Gzip archive removed.")
                logging.info("File downloaded.\n")
            else:
                logging.error("Request failed, skipping file.\n")
        else:
            if gzip_archive:
                logging.info("File '%s' already exists.\n" %
                             ('%s.txt' % file_path[:-3]))
            else:
                logging.info("File '%s' already exists.\n" % file_path)
        file_number[0] += 1
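# _download_files() reads a module-level URL constant. Given that the
# manifests carry GDC-style fields ('cases', 'file_id', 'file_name'), the
# GDC data endpoint is a plausible value, but treat this as an assumption
# rather than the original constant:
URL = "https://api.gdc.cancer.gov/data"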
def combine_corr_files():
    """ Combine the per-cancer-type, per-gender PTEN correlation files into
    a single table, keeping the top n genes from the gene reference file. """
    combined_file_path = os.path.join(cmn.DL_DIR, cmn.PTEN_CORR_FNAME)
    n = 30
    m = -30
    gdf = pd.read_csv(cmn.GENE_REF_FILE, sep="\t", index_col=0)
    gene_list = [l[0] for l in gdf.values[:n].tolist()]
    gene_names = [l for l in gdf.index[:n].tolist()]
    with open(combined_file_path, 'w') as out_file:
        out_file.write("\n".join(["0"] + list(gene_names)))
    cdf = pd.read_csv(combined_file_path, sep="\t", index_col=0)
    i = 0
    for cancer_type in cmn.list_dir(cmn.DL_DIR):
        if cancer_type not in cmn.SKIP_CTYPE:
            for gender in cmn.list_dir(os.path.join(cmn.DL_DIR,
                                                    cancer_type)):
                if gender not in cmn.SKIP_GENDER:
                    logging.info(
                        "Reading correlation files for '%s' > '%s' ..." %
                        (cancer_type, gender))
                    samples_path = os.path.join(cmn.DL_DIR, cancer_type,
                                                gender)
                    corr_file = cmn.CORR_FNAME % (cancer_type.lower(),
                                                  gender.lower(), "rna",
                                                  "normalized")
                    corr_file_path = os.path.join(samples_path, corr_file)
                    if os.path.isfile(corr_file_path):
                        logging.info(
                            "Found correlation file, processing ...\n")
                        df = pd.read_csv(corr_file_path, sep="\t",
                                         index_col=0)
                        # map unversioned gene code -> version suffix
                        # (index entries look like 'code.version')
                        ver_dict = {
                            key: value
                            for (key, value) in
                            [cc.split(".") for cc in df.index.values]
                        }
                        entries = [
                            abs(float(df.loc[".".join(
                                (gene_code, ver_dict[gene_code]))][0]))
                            if gene_code in ver_dict else 0.0
                            for gene_code in gene_list
                        ]
                        cdf.insert(i, "%s-%s" % (cancer_type, gender),
                                   entries)
                        i += 1
                        logging.info("Done.\n")
                    else:
                        logging.warning(
                            "No correlation file found for '%s' > '%s'.\n" %
                            (cancer_type, gender))
    logging.info("Writing file ...")
    cdf.to_csv(combined_file_path, sep="\t")
    cmn.make_dir("ref")
    cdf.to_csv(os.path.join("ref", cmn.PTEN_CORR_FNAME), sep="\t")