# Standard-library and third-party imports used by the functions below.
# Project, Resource, LINK_ROOT, email_utils, cp_, helpers, config_parser,
# write_completion_message, create_merged_counts, make_qc_report, get_files,
# get_gtf, concatenate_circ_junction_reports, make_figures, and volcano_plot
# are app-level names imported from elsewhere in this package; TEMP_DIR,
# GMAIL_CREDENTIALS, GCLOUD_PATH, GSUTIL_PATH, and the CCCB email settings
# live in the Django settings module.
import datetime
import glob
import os
import re
import shutil
import subprocess
import time

import pandas as pd
import yaml
from django.conf import settings
from google.cloud import storage


def check_completion(project_pk, code_map, bucket_name):
    '''
    Polls the genomics operations until all have finished, then registers the
    resulting VCF files with the download app and notifies the project owner.

    params
    project_pk: primary key of the Project being processed
    code_map: list of (operation_code, samplename, vcffilename, cloud_dge_dir) tuples
    bucket_name: bucket name
    '''
    project = Project.objects.get(pk=project_pk)
    while code_map:
        completed_indices = []
        for i, code_info_map in enumerate(code_map):
            code, samplename, vcffilename, cloud_dge_dir = code_info_map
            script = ' '.join(["gcloud alpha genomics operations describe",
                code,
                "--format='yaml(done, error, metadata.events)'"])
            proc = subprocess.Popen(script, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, _ = proc.communicate()
            out_map = yaml.safe_load(stdout)
            done_status = out_map["done"]
            if done_status == True:
                storage_client = storage.Client()
                bucket = storage_client.get_bucket(bucket_name)
                # the name relative to the bucket (this path must be sent to the app)
                destination = os.path.join(cloud_dge_dir, os.path.basename(vcffilename))
                vcf_blob = bucket.blob(destination)
                public_link = LINK_ROOT % (bucket.name, vcf_blob.name)
                r = Resource(project=project,
                    basename=samplename,
                    public_link=public_link,
                    resource_type='VCF files')
                r.save()
                completed_indices.append(i)
        # drop the finished operations; poll the rest again after a pause
        code_map = [m for i, m in enumerate(code_map) if i not in completed_indices]
        if code_map:
            time.sleep(900)
    project.completed = True
    project.in_progress = False
    project.has_downloads = True
    project.status_message = "Completed variant calling"
    project.save()
    message_html = """
    <html>
    <body>
    Your variant calling analysis has finished.  Log-in to download your results.
    </body>
    </html>
    """
    email_utils.send_email(message_html,
        [project.owner.email,],
        '[CCCB] Variant calling analysis completed')
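
# For reference, the YAML parsed by check_completion carries only the keys
# requested by the --format flag above.  A finished operation looks roughly
# like the sketch below; the event contents are illustrative, not taken from
# a real run, and the error key may be absent rather than null on success:
#
#   done: true
#   error: null
#   metadata:
#     events:
#     - description: start
#     - description: copied 1 file(s) to ...
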
def finalize(project_pk):
    project = Project.objects.get(pk=project_pk)
    bucket_name = project.bucket
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    all_contents = bucket.list_blobs()
    all_contents = [x for x in all_contents]  # turn iterator into list
    config_params = cp_.parse_config()

    # get the files that are in the result folder:
    outfiles = {}
    outfiles['BAM Files'] = []
    bam_pattern = '%s/.*.bam$' % config_params['output_bucket']
    outfiles['BAM Files'].extend([x for x in all_contents if re.match(bam_pattern, x.name) is not None])
    bai_pattern = '%s/.*.bam.bai$' % config_params['output_bucket']
    outfiles['BAM Files'].extend([x for x in all_contents if re.match(bai_pattern, x.name) is not None])
    outfiles['Quantification table'] = [x for x in all_contents
        if x.name == os.path.join(config_params['output_bucket'], config_params['merged_counts_filename'])]

    # grant the user read privileges on these files:
    for key, filelist in outfiles.items():
        for f in filelist:
            print 'grant ownership on %s' % f
            acl = f.acl
            entity = acl.user(project.owner.email)
            entity.grant_read()
            acl.save()

            # register the files with the download app
            public_link = LINK_ROOT % (bucket.name, f.name)
            r = Resource(project=project,
                basename=os.path.basename(f.name),
                public_link=public_link,
                resource_type=key)
            r.save()

            # change the metadata so the download does not keep the bucket path
            set_meta_cmd = 'gsutil setmeta -h "Content-Disposition: attachment; filename=%s" gs://%s/%s' % (
                os.path.basename(f.name), bucket_name, f.name)
            print 'set meta cmd: %s' % set_meta_cmd
            # stderr is merged into stdout here, so report the combined stream
            process = subprocess.Popen(set_meta_cmd, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
            stdout, _ = process.communicate()
            if process.returncode != 0:
                print 'Error while setting metadata on %s.  Output was:\n%s' % (f.name, stdout)

    print 'send notification email'
    message_html = write_completion_message(project)
    email_utils.send_email(os.path.join(settings.BASE_DIR, settings.GMAIL_CREDENTIALS),
        message_html,
        [project.owner.email,],
        'Your pooled CRISPR analysis has completed')
def deseq_call(deseq_cmd, results_dir, cloud_dge_dir, count_matrix_filename,
        annotation_filename, norm_counts_filename, contrast_name, bucket_name, project_pk):
    p = subprocess.Popen(deseq_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, _ = p.communicate()
    if p.returncode != 0:
        # stderr was merged into stdout above, so log the combined stream
        with open(os.path.join(results_dir, 'deseq_error.log'), 'w') as fout:
            fout.write('STDOUT+STDERR:\n%s\n' % stdout)
        # notify the CCCB of the failure
        email_list = [x.strip() for x in settings.CCCB_EMAIL_CSV.split(',')]
        email_utils.send_email(os.path.join(settings.BASE_DIR, settings.GMAIL_CREDENTIALS),
            "There was a problem with the deseq analysis.  Check the %s directory" % results_dir,
            email_list,
            '[CCCB] Problem with DGE script')
    else:
        project = Project.objects.get(pk=project_pk)
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        project_owner = project.owner.email

        # create cls and gct files for GSEA:
        norm_counts_filepath = os.path.join(results_dir, norm_counts_filename)
        nc_matrix = pd.read_table(norm_counts_filepath, index_col=0)
        samples = nc_matrix.columns.tolist()  # Gene is in the index

        # annotation file has two columns: sample name first, group second
        annotations = pd.read_table(os.path.join(results_dir, annotation_filename), index_col=0)
        # sort the annotation rows to match the column order of the count matrix
        group_list = annotations.loc[samples].dropna()
        unique_groups = group_list.iloc[:, 0].unique()
        # only column left is the group vector, so positional indexing is safe
        group_list_str = '\t'.join(group_list.iloc[:, 0])
        with open(os.path.join(results_dir, 'groups.cls'), 'w') as cls_outfile:
            cls_outfile.write('%d\t%d\t1\n' % (group_list.shape[0], len(unique_groups)))
            cls_outfile.write('#\t%s\t%s\n' % (unique_groups[0], unique_groups[1]))
            cls_outfile.write(group_list_str + '\n')

        nc_matrix['NAME'] = nc_matrix.index.values
        nc_matrix['Description'] = nc_matrix.index.values
        col_order = ['NAME', 'Description'] + samples
        gct_filepath = norm_counts_filepath[:-3] + 'gct'  # swap the .tsv extension for .gct
        with open(gct_filepath, 'w') as gct_out:
            intro_lines = '#1.2\n'
            # rows = genes; columns = samples (NAME and Description do not count)
            intro_lines += str(nc_matrix.shape[0]) + '\t' + str(nc_matrix.shape[1] - 2) + '\n'
            gct_out.write(intro_lines)
            nc_matrix[col_order].to_csv(gct_out, sep='\t', index=False)

        # make some plots
        for f in glob.glob(os.path.join(results_dir, '*deseq.tsv')):
            output_figure_path = f.replace('deseq.tsv', 'volcano_plot_v2.pdf')
            dge_df = pd.read_table(f, sep='\t')
            volcano_plot(dge_df, output_figure_path)

        # zip everything up
        zipfile = os.path.join(settings.TEMP_DIR,
            '%s-%s.zip' % (contrast_name, datetime.datetime.now().strftime('%H%M%S')))
        zip_cmd = 'zip -rj %s %s' % (zipfile, results_dir)
        print 'zip up using command: %s' % zip_cmd
        p = subprocess.Popen(zip_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, _ = p.communicate()
        if p.returncode != 0:
            # TODO: send email to CCCB?  Shouldn't happen and not a user error.
            pass

        # the name relative to the bucket
        destination = os.path.join(cloud_dge_dir, os.path.basename(zipfile))
        zip_blob = bucket.blob(destination)
        zip_blob.upload_from_filename(zipfile)
        acl = zip_blob.acl
        entity = acl.user(project_owner)
        entity.grant_read()
        acl.save()

        # remove the file locally
        os.remove(zipfile)

        # change the metadata so the download does not keep the bucket path
        set_meta_cmd = 'gsutil setmeta -h "Content-Disposition: attachment; filename=%s" gs://%s/%s' % (
            os.path.basename(zipfile), bucket.name, destination)
        process = subprocess.Popen(set_meta_cmd, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
        stdout, _ = process.communicate()
        if process.returncode != 0:
            print 'There was an error while setting the metadata on the zipped archive with gsutil.  Check the logs.  Output was:\n%s' % stdout
            raise Exception('Error during gsutil upload module.')

        shutil.rmtree(results_dir)

        # register the zip archive with the download app
        public_link = LINK_ROOT % (bucket.name, zip_blob.name)
        r = Resource(project=project,
            basename=os.path.basename(zipfile),
            public_link=public_link,
            resource_type='Compressed results')
        r.save()

        project.status_message = 'Completed DGE analysis'
        project.in_progress = False
        project.save()

        message_html = """
        <html>
        <body>
        Your differential expression analysis has finished.  Log-in to download your results.
        </body>
        </html>
        """
        email_utils.send_email(os.path.join(settings.BASE_DIR, settings.GMAIL_CREDENTIALS),
            message_html,
            [project_owner,],
            '[CCCB] Differential gene expression analysis completed')
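
# For reference, the two GSEA inputs written by deseq_call look like this for
# a hypothetical six-sample, two-group (WT vs. KO) contrast.  Both files are
# tab-delimited; the sample and group names are illustrative, not from a
# real project:
#
#   groups.cls:
#     6   2   1
#     #   WT  KO
#     WT  WT  WT  KO  KO  KO
#
#   <norm_counts>.gct:
#     #1.2
#     <n_genes>   6
#     NAME  Description  s1  s2  s3  s4  s5  s6
#     ...one row per gene...
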
def finish_alignment_work(project_pk):
    """
    This pulls together everything and gets it ready for download.
    """
    config_params = cp_.parse_config()
    print 'In finish_alignment_work, config params='
    print config_params
    project = Project.objects.get(pk=project_pk)
    all_samples = project.sample_set.all()
    storage_client = storage.Client()
    bucket_name = project.bucket
    bucket = storage_client.get_bucket(bucket_name)
    all_contents = bucket.list_blobs()
    all_contents = [x for x in all_contents]  # turn the original iterator into a list
    print 'all contents: %s' % all_contents

    # find all the BAM files (note regex below -- could expose a subset of the BAM files for ease)
    bam_objs = []
    for fl in config_params['default_filter_levels']:
        bam_pattern = '%s/.*%s.bam$' % (config_params['output_bucket'], fl)
        bam_objs.extend([x for x in all_contents if re.match(bam_pattern, x.name) is not None])
        # also add the .bai files:
        bai_pattern = '%s/.*%s.bam.bai$' % (config_params['output_bucket'], fl)
        bam_objs.extend([x for x in all_contents if re.match(bai_pattern, x.name) is not None])

    # grant the user read privileges on these files:
    for b in bam_objs:
        print 'grant ownership on bam %s' % b
        acl = b.acl
        entity = acl.user(project.owner.email)
        entity.grant_read()
        acl.save()

        # register the BAM files with the download app
        public_link = LINK_ROOT % (bucket.name, b.name)
        r = Resource(project=project,
            basename=os.path.basename(b.name),
            public_link=public_link,
            resource_type='BAM Files')
        r.save()

        # change the metadata so the download does not keep the bucket path
        set_meta_cmd = 'gsutil setmeta -h "Content-Disposition: attachment; filename=%s" gs://%s/%s' % (
            os.path.basename(b.name), bucket_name, b.name)
        print 'set meta cmd: %s' % set_meta_cmd
        # stderr is merged into stdout here, so report the combined stream
        process = subprocess.Popen(set_meta_cmd, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
        stdout, _ = process.communicate()
        if process.returncode != 0:
            print 'Error while setting metadata on bam %s.  Output was:\n%s' % (b.name, stdout)
            raise Exception('Error during gsutil upload module.')

    # find all the count files
    countfile_objs = []
    for fl in config_params['default_filter_levels']:
        countfiles_pattern = '%s/.*%s.counts$' % (config_params['output_bucket'], fl)
        countfile_objs.extend([x for x in all_contents if re.match(countfiles_pattern, x.name) is not None])

    # grant the user read privileges on these as well:
    for b in countfile_objs:
        acl = b.acl
        entity = acl.user(project.owner.email)
        entity.grant_read()
        acl.save()

    # concatenate count files
    local_dir = os.path.join(settings.TEMP_DIR, bucket.name)
    try:
        os.makedirs(local_dir)
    except OSError as ex:
        if ex.errno != 17:  # 17 = directory already exists, which is fine
            print ex.message
            raise ex
    raw_count_filepaths = create_merged_counts(bucket, countfile_objs, local_dir, config_params)

    # upload count files for use when performing DGE analysis
    for rc in raw_count_filepaths:
        destination = os.path.join(config_params['output_bucket'], config_params['dge_folder'], os.path.basename(rc))
        rc_blob = bucket.blob(destination)
        rc_blob.upload_from_filename(rc)

    # make some plots/QC
    star_log_pattern = '.*%s$' % config_params['star_log_suffix']
    star_logs = [x for x in all_contents if re.match(star_log_pattern, x.name) is not None]
    report_pdf_path = make_qc_report(star_logs, local_dir, config_params)

    # grab the raw count files:
    local_files_to_zip = []
    local_files_to_zip.extend(glob.glob(os.path.join(local_dir, '%s*' % config_params['raw_count_prefix'])))
    local_files_to_zip.append(report_pdf_path)

    # zip them up:
    timestamp = datetime.datetime.now().strftime('%m%d%y%H%M%S')
    zipfile = os.path.join(local_dir, 'alignment-results.%s.zip' % timestamp)
    zip_cmd = 'zip -j %s %s' % (zipfile, ' '.join(local_files_to_zip))
    print 'zip up using command: %s' % zip_cmd
    p = subprocess.Popen(zip_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, _ = p.communicate()
    if p.returncode != 0:
        # TODO: send email to CCCB?  Shouldn't happen and not a user error.
        pass

    # upload the archive and give it permissions:
    destination = os.path.join(config_params['output_bucket'], os.path.basename(zipfile))
    zip_blob = bucket.blob(destination)
    zip_blob.upload_from_filename(zipfile)
    acl = zip_blob.acl
    entity = acl.user(project.owner.email)
    entity.grant_read()
    acl.save()

    # change the metadata so the download does not keep the bucket path
    set_meta_cmd = 'gsutil setmeta -h "Content-Disposition: attachment; filename=%s" gs://%s/%s' % (
        os.path.basename(zipfile), bucket_name, destination)
    process = subprocess.Popen(set_meta_cmd, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    stdout, _ = process.communicate()
    if process.returncode != 0:
        print 'There was an error while setting the metadata on the zipped archive with gsutil.  Check the logs.  Output was:\n%s' % stdout
        raise Exception('Error during gsutil upload module.')

    shutil.rmtree(local_dir)

    # register the zip archive with the download app
    public_link = LINK_ROOT % (bucket.name, zip_blob.name)
    r = Resource(project=project,
        basename=os.path.basename(zipfile),
        public_link=public_link,
        resource_type='Compressed results')
    r.save()

    # notify the client (the third arg is a list of emails)
    print 'send notification email'
    message_html = write_completion_message(project)
    email_utils.send_email(os.path.join(settings.BASE_DIR, settings.GMAIL_CREDENTIALS),
        message_html,
        [project.owner.email,],
        '[CCCB] Your RNA-Seq analysis has completed')
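
# The gsutil setmeta pattern above is repeated verbatim for every uploaded
# object.  A minimal sketch of a shared helper the blocks above could call
# instead; the helper name and the GSUTIL_PATH fallback are assumptions, not
# part of the original module:
def _set_content_disposition(bucket_name, blob_path, download_name):
    '''Sets Content-Disposition so browsers save the object as download_name.'''
    gsutil = getattr(settings, 'GSUTIL_PATH', 'gsutil')
    cmd = '%s setmeta -h "Content-Disposition: attachment; filename=%s" gs://%s/%s' % (
        gsutil, download_name, bucket_name, blob_path)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, _ = p.communicate()
    if p.returncode != 0:
        raise Exception('gsutil setmeta failed for gs://%s/%s.  Output was:\n%s' % (
            bucket_name, blob_path, stdout))
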
def check_completion(project_pk, code_serialized, bucket_name):
    '''
    Polls the genomics operations until all have finished, then registers the
    resulting BAM and VCF files with the download app and notifies the
    project owner.

    params
    project_pk: primary key of the Project being processed
    code_serialized: backtick/pipe-delimited string encoding
        (operation_code, samplename, bamfilename, vcffilename, cloud_dge_dir)
        for each sample
    bucket_name: bucket name
    '''
    print "DEBUG project_pk:", project_pk
    print "DEBUG SERIALIZED:", code_serialized
    print "DEBUG bucket_name:", bucket_name
    project = Project.objects.get(pk=project_pk)
    code_map = [c.split('`') for c in code_serialized.split('|')]
    while code_map:
        indices_to_del = []
        for i, code_info_map in enumerate(code_map):
            code, samplename, bamfilename, vcffilename, cloud_dge_dir = code_info_map
            script = ' '.join([settings.GCLOUD_PATH,
                "alpha genomics operations describe",
                code,
                "--format='yaml(done, error, metadata.events)'"])
            proc = subprocess.Popen(script, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, _ = proc.communicate()
            print "DEBUG status stdout: %s" % stdout
            out_map = yaml.safe_load(stdout)
            print "DEBUG status yaml:", out_map
            done_status = out_map["done"]
            if done_status == True:
                storage_client = storage.Client()
                bucket = storage_client.get_bucket(bucket_name)
                # the names relative to the bucket (these paths must be sent to the app)
                vcf_destination = os.path.join(cloud_dge_dir, os.path.basename(vcffilename))
                vcf_blob = bucket.blob(vcf_destination)
                bam_destination = os.path.join(cloud_dge_dir, os.path.basename(bamfilename))
                bam_blob = bucket.blob(bam_destination)
                print "DEBUG bucket.name: ", bucket.name
                print "DEBUG vcf_blob.name: ", vcf_blob.name
                print "DEBUG LINK_ROOT: ", LINK_ROOT
                vcf_public_link = LINK_ROOT % (bucket.name, '/'.join(["GATK_HaplotypeCaller", vcf_blob.name]))
                bam_public_link = LINK_ROOT % (bucket.name, '/'.join(["GATK_HaplotypeCaller", bam_blob.name]))
                print "DEBUG public_link: ", vcf_public_link, bam_public_link
                r = Resource(project=project,
                    basename=samplename,
                    public_link=vcf_public_link,
                    resource_type='VCF files')
                r.save()
                r = Resource(project=project,
                    basename=samplename,
                    public_link=bam_public_link,
                    resource_type='BAM files')
                r.save()
                indices_to_del.append(i)
        # drop the finished operations; poll the rest again after a pause
        code_map = [j for i, j in enumerate(code_map) if i not in indices_to_del]
        if code_map:
            time.sleep(600)
    project.completed = True
    project.in_progress = False
    project.has_downloads = True
    project.status_message = "Completed variant calling"
    project.save()
    message_html = """
    <html>
    <body>
    Your variant calling analysis has finished.  Log-in to download your results.
    </body>
    </html>
    """
    email_utils.send_email(os.path.join(settings.BASE_DIR, settings.GMAIL_CREDENTIALS),
        message_html,
        [project.owner.email,],
        '[CCCB] Variant calling analysis completed')
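
# The inverse of the parsing at the top of check_completion.  A sketch
# inferred from the split('`')/split('|') logic -- this helper is not part of
# the original module -- showing how a caller would build code_serialized:
def serialize_operation_codes(op_tuples):
    '''Each tuple is (operation_code, samplename, bamfilename, vcffilename, cloud_dge_dir).'''
    return '|'.join('`'.join(t) for t in op_tuples)
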
def deseq_call(deseq_cmd, results_dir, cloud_dge_dir, contrast_name, bucket_name, project_pk):
    p = subprocess.Popen(deseq_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, _ = p.communicate()
    if p.returncode != 0:
        # stderr was merged into stdout above, so log the combined stream
        with open(os.path.join(results_dir, 'deseq_error.log'), 'w') as fout:
            fout.write('STDOUT+STDERR:\n%s\n' % stdout)
        # notify the CCCB of the failure
        email_utils.send_email("There was a problem with the deseq analysis.  Check the %s directory" % results_dir,
            settings.CCCB_EMAILS,
            '[CCCB] Problem with DGE script')
    else:
        project = Project.objects.get(pk=project_pk)
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        project_owner = project.owner.email

        # zip everything up
        zipfile = os.path.join(settings.TEMP_DIR,
            '%s-%s.zip' % (contrast_name, datetime.datetime.now().strftime('%H%M%S')))
        zip_cmd = 'zip -rj %s %s' % (zipfile, results_dir)
        print 'zip up using command: %s' % zip_cmd
        p = subprocess.Popen(zip_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, _ = p.communicate()
        if p.returncode != 0:
            # TODO: send email to CCCB?  Shouldn't happen and not a user error.
            pass

        # the name relative to the bucket
        destination = os.path.join(cloud_dge_dir, os.path.basename(zipfile))
        zip_blob = bucket.blob(destination)
        zip_blob.upload_from_filename(zipfile)
        acl = zip_blob.acl
        entity = acl.user(project_owner)
        entity.grant_read()
        acl.save()

        # remove the file locally
        os.remove(zipfile)

        # change the metadata so the download does not keep the bucket path
        set_meta_cmd = 'gsutil setmeta -h "Content-Disposition: attachment; filename=%s" gs://%s/%s' % (
            os.path.basename(zipfile), bucket.name, destination)
        process = subprocess.Popen(set_meta_cmd, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
        stdout, _ = process.communicate()
        if process.returncode != 0:
            print 'There was an error while setting the metadata on the zipped archive with gsutil.  Check the logs.  Output was:\n%s' % stdout
            raise Exception('Error during gsutil upload module.')

        shutil.rmtree(results_dir)

        # register the zip archive with the download app
        public_link = LINK_ROOT % (bucket.name, zip_blob.name)
        r = Resource(project=project,
            basename=os.path.basename(zipfile),
            public_link=public_link,
            resource_type='Compressed results')
        r.save()

        message_html = """
        <html>
        <body>
        Your differential expression analysis has finished.  Log-in to download your results.
        </body>
        </html>
        """
        email_utils.send_email(message_html,
            [project_owner,],
            '[CCCB] Differential gene expression analysis completed')
def finish_circ_rna_process(project_pk):
    print 'Do some wrap-up of circRNA pipeline'
    project = Project.objects.get(pk=project_pk)
    config_file = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'config.cfg')
    config_params = config_parser(config_file)

    # look into the bucket and pre-fetch the available objects
    storage_client = storage.Client()
    bucket_name = project.bucket
    bucket = storage_client.get_bucket(bucket_name)
    all_contents = bucket.list_blobs()
    all_contents = [x for x in all_contents]  # turn the original iterator into a list

    local_dir = os.path.join(settings.TEMP_DIR, bucket.name)
    result_dir = os.path.join(local_dir, 'results')
    try:
        os.makedirs(result_dir)
    except OSError as ex:
        if ex.errno != 17:  # 17 = directory already exists, which is fine
            print ex.message
            raise ex

    is_paired = helpers.get_paired_or_single_status(project_pk)
    all_samples = project.sample_set.all()

    # download the files to work on:
    get_files(all_contents, local_dir, is_paired)

    # download the proper GTF file for this project:
    gtf_filepath = get_gtf(storage_client, project, config_params['knife_resource_bucket'], local_dir)

    concatenated_prob_file = os.path.join(result_dir, config_params['concatenated_probability_file'])
    concatenated_df = concatenate_circ_junction_reports(all_samples, local_dir, concatenated_prob_file, is_paired)

    # upload the concatenated file:
    destination = os.path.join(config_params['output_bucket'], os.path.basename(concatenated_prob_file))
    cpf_blob = bucket.blob(destination)
    cpf_blob.upload_from_filename(concatenated_prob_file)
    acl = cpf_blob.acl
    entity = acl.user(project.owner.email)
    entity.grant_read()
    acl.save()
    public_link = LINK_ROOT % (bucket.name, cpf_blob.name)
    r = Resource(project=project,
        basename=os.path.basename(concatenated_prob_file),
        public_link=public_link,
        resource_type='circRNA quantification')
    r.save()

    # make directories for each sample to hold figures:
    count_threshold = int(config_params['count_threshold'])
    cdf_threshold = float(config_params['cdf_threshold'])
    all_sample_dirs = []
    for s in all_samples:
        sample_dir = os.path.join(result_dir, s.name)
        try:
            os.mkdir(sample_dir)
        except OSError as ex:
            if ex.errno != 17:  # 17 = directory already exists, which is fine
                print ex.message
                raise ex
        make_figures(s, concatenated_df, gtf_filepath, sample_dir, count_threshold, cdf_threshold)
        all_sample_dirs.append(sample_dir)

    # zip up the figures:
    zipfile = os.path.join(local_dir, 'circ_rna_figures.zip')
    zip_cmd = 'zip -r %s %s' % (zipfile, ' '.join(all_sample_dirs))
    print 'zip up using command: %s' % zip_cmd
    p = subprocess.Popen(zip_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stdout, _ = p.communicate()
    if p.returncode != 0:
        # TODO: send email to CCCB?  Shouldn't happen and not a user error.
        pass

    # upload the archive and give it permissions:
    destination = os.path.join(config_params['output_bucket'], os.path.basename(zipfile))
    zip_blob = bucket.blob(destination)
    zip_blob.upload_from_filename(zipfile)
    acl = zip_blob.acl
    entity = acl.user(project.owner.email)
    entity.grant_read()
    acl.save()

    # change the metadata so the download does not keep the bucket path
    set_meta_cmd = '%s setmeta -h "Content-Disposition: attachment; filename=%s" gs://%s/%s' % (
        settings.GSUTIL_PATH, os.path.basename(zipfile), bucket_name, destination)
    print 'Issue metadata command: %s' % set_meta_cmd
    # stderr is merged into stdout here, so report the combined stream
    process = subprocess.Popen(set_meta_cmd, shell=True, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    stdout, _ = process.communicate()
    if process.returncode != 0:
        print 'There was an error while setting the metadata on the zipped archive with gsutil.  Check the logs.  Output was:\n%s' % stdout
        raise Exception('Error during gsutil upload module.')

    shutil.rmtree(local_dir)

    # register the zip archive with the download app
    public_link = LINK_ROOT % (bucket.name, zip_blob.name)
    r = Resource(project=project,
        basename=os.path.basename(zipfile),
        public_link=public_link,
        resource_type='Figures')
    r.save()

    # notify the client (the third arg is a list of emails)
    print 'send notification email'
    message_html = write_completion_message(project)
    email_utils.send_email(os.path.join(settings.BASE_DIR, settings.GMAIL_CREDENTIALS),
        message_html,
        [project.owner.email,],
        '[CCCB] Your circRNA analysis has completed')
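
# The makedirs/errno-17 idiom appears in several functions above; a small
# "mkdir -p" helper sketch (the name is an assumption, not part of the
# original module) that could replace it:
import errno

def _ensure_dir(path):
    '''Create path, ignoring the 'already exists' error and re-raising the rest.'''
    try:
        os.makedirs(path)
    except OSError as ex:
        if ex.errno != errno.EEXIST:
            raise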