def create_gtf_annotation(logger, ws_client, hs_client, internal_services,
                          ws_id, genome_ref, genome_id, fasta_file, directory,
                          token):
    """Build a GTF annotation for a genome and save it to the workspace.

    The function generates the genome FASTA, rewrites its contig ids to
    sanitized names, generates the GFF, applies the same contig-id mapping
    to it, converts the GFF to GTF with ``gffread``, uploads the GTF through
    ``hs_client`` and stores a hidden ``KBaseRNASeq.GFFAnnotation`` object
    named ``<genome_id>_GTF_Annotation`` in workspace ``ws_id``.

    Args:
        logger: Object providing ``info()``; used for progress messages.
        ws_client: Client whose ``save_objects`` stores the annotation.
        hs_client: Client whose ``upload(path)`` result is stored under the
            ``"handle"`` key of the saved object.
        internal_services: Passed unchanged to ``generate_fasta`` and
            ``generate_gff``.
        ws_id: Workspace given as ``"workspace"`` to ``save_objects``.
        genome_ref: Genome reference handed to the generators and recorded
            as ``genome_id`` in the saved object data.
        genome_id: Prefix for the generated file names
            (``<genome_id>_GFF.gff``, ``<genome_id>_GTF.gtf``) and for the
            saved object name.
        fasta_file: Ignored. The value is overwritten by a freshly generated
            FASTA because the contig-id mapping is derived from that file.
            Kept for interface compatibility.
        directory: Directory in which the GFF and GTF files are written.
        token: Passed unchanged to ``generate_fasta`` and ``generate_gff``.

    Returns:
        Path of the generated GTF file.

    Raises:
        ValueError: If any step fails, including gffread producing no output
            file. The message embeds the formatted traceback.
    """
    try:
        gff_path = os.path.join(directory, genome_id + "_GFF.gff")

        # The caller-supplied fasta_file is deliberately replaced: the
        # mapping below must come from the same FASTA that gets rewritten.
        fasta_file = generate_fasta(logger, internal_services, token,
                                    genome_ref, directory, genome_id)
        logger.info("Sanitizing the fasta file to correct id names {}".format(
            datetime.datetime.utcnow()))
        mapping_filename = c_mapping.create_sanitized_contig_ids(fasta_file)
        c_mapping.replace_fasta_contig_ids(fasta_file, mapping_filename,
                                           to_modified=True)
        logger.info("Generating FASTA file completed successfully : {}".format(
            datetime.datetime.utcnow()))

        # The GFF must use the same sanitized contig ids as the FASTA.
        generate_gff(logger, internal_services, token, genome_ref, directory,
                     genome_id, gff_path)
        c_mapping.replace_gff_contig_ids(gff_path, mapping_filename,
                                         to_modified=True)

        gtf_path = os.path.join(directory, genome_id + "_GTF.gtf")
        # gffread: -T selects GTF output, -o names the output file.
        gtf_cmd = " -E {0} -T -o {1}".format(gff_path, gtf_path)
        try:
            logger.info("Executing: gffread {0}".format(gtf_cmd))
            runProgram(None, "gffread", gtf_cmd, None, directory)
        except Exception:
            raise Exception(
                "Error Converting the GFF file to GTF using gffread {0},{1}"
                .format(gtf_cmd, "".join(traceback.format_exc())))

        # The intermediate GFF file is not removed.

        # Fail with a clear message instead of reaching save_objects without
        # any handle data when gffread left no output behind.
        if not os.path.exists(gtf_path):
            raise ValueError(
                "gffread did not produce the expected GTF file {0}".format(
                    gtf_path))

        annotation_handle = hs_client.upload(gtf_path)
        a_handle = {
            "handle": annotation_handle,
            "size": os.path.getsize(gtf_path),
            'genome_id': genome_ref,
        }
        # Saving GFF/GTF annotation to the workspace
        ws_client.save_objects({
            "workspace": ws_id,
            "objects": [{
                "type": "KBaseRNASeq.GFFAnnotation",
                "data": a_handle,
                "name": genome_id + "_GTF_Annotation",
                "hidden": 1,
            }],
        })
    except Exception:
        raise ValueError(
            "Generating GTF file from Genome Annotation object Failed : {}"
            .format("".join(traceback.format_exc())))
    return gtf_path
def _create_gtf_annotation_from_genome(self, genome_ref, result_directory):
    """
    Create reference annotation file from genome

    Looks up the genome's ``contigset_ref`` (preferred) or ``assembly_ref``,
    fetches that assembly as FASTA, copies it into ``result_directory`` and
    derives a sanitized contig-id mapping from the copy. The genome is then
    exported as GFF into ``result_directory``, the mapping is applied to the
    GFF, and the file is converted with ``self._run_gffread`` unless its
    name already ends in ``.gtf``.

    :param genome_ref: reference of the genome object to export
    :param result_directory: directory receiving the FASTA copy and the
        GFF/GTF files
    :returns: path of the GTF file
    :raises ValueError: if the genome has neither reference, or if any step
        of the export fails (the message embeds the formatted traceback)
    """
    ref = self.ws.get_object_subset([{
        'ref': genome_ref,
        'included': ['contigset_ref', 'assembly_ref']
    }])
    genome_data = ref[0]['data']

    # Start from None so a genome with neither key reaches the ValueError
    # below instead of failing on an unbound local.
    contig_id = None
    if 'contigset_ref' in genome_data:
        contig_id = genome_data['contigset_ref']
    elif 'assembly_ref' in genome_data:
        contig_id = genome_data['assembly_ref']

    if contig_id is None:
        raise ValueError(
            "Genome at {0} does not have reference to the assembly object".
            format(genome_ref))

    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(contig_id)
    log("Generating GFF file from Genome")
    try:
        ret = self.au.get_assembly_as_fasta({'ref': contig_id})
        fa_output_file = ret['path']

        # Work on a copy inside result_directory.
        shutil.copy(fa_output_file, result_directory)
        fa_output_name = os.path.basename(fa_output_file)
        fa_output_file = os.path.join(result_directory, fa_output_name)

        mapping_filename = c_mapping.create_sanitized_contig_ids(
            fa_output_file)

        # get the GFF
        ret = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'target_dir': result_directory
        })
        genome_gff_file = ret['file_path']
        c_mapping.replace_gff_contig_ids(genome_gff_file,
                                         mapping_filename,
                                         to_modified=True)

        if genome_gff_file.endswith(".gtf"):
            # Already named as GTF: use the file as is.
            gtf_path = genome_gff_file
        else:
            gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
            self._run_gffread(genome_gff_file, gtf_path)
        log("gtf file : " + gtf_path)
    except Exception:
        raise ValueError(
            "Generating GTF file from Genome Annotation object Failed : {}"
            .format("".join(traceback.format_exc())))
    return gtf_path
# Check that the requested reads objects are present in the workspace.
# NOTE(review): if_ws_obj_exists presumably returns the names it found;
# confirm against script_util.
e_ws_objs = script_util.if_ws_obj_exists(None, ws_client, params['ws_id'],
                                         r_type, reads)
missing_objs = [i for i in reads if i not in e_ws_objs]
# Abort when the lookup returned a different number of objects than requested.
if len(e_ws_objs) != len(reads):
    raise Exception(
        'Missing Library objects {0} in the {1}. please copy them and run this method'
        .format(",".join(missing_objs), params['ws_id']))

### Build Hisat2 index
fasta_file = script_util.generate_fasta(logger, services, token,
                                        annotation_id, hisat2_dir,
                                        params['genome_id'])
logger.info("Sanitizing the fasta file to correct id names {}".format(
    datetime.datetime.utcnow()))
# Rewrite the FASTA contig ids using the mapping derived from the same file.
mapping_filename = c_mapping.create_sanitized_contig_ids(fasta_file)
c_mapping.replace_fasta_contig_ids(fasta_file, mapping_filename,
                                   to_modified=True)
logger.info("Generating FASTA file completed successfully : {}".format(
    datetime.datetime.utcnow()))

# hisat2-build is given "<fasta> <index base>"; the base is built from the
# name returned by get_file_with_suffix for ".fasta" in hisat2_dir.
hisat2base = os.path.join(
    hisat2_dir, handler_util.get_file_with_suffix(hisat2_dir, ".fasta"))
hisat2base_cmd = '{0} {1}'.format(fasta_file, hisat2base)
try:
    logger.info("Building Index for Hisat2 {0}".format(hisat2base_cmd))
    cmdline_output = script_util.runProgram(logger, "hisat2-build",
                                            hisat2base_cmd, None, hisat2_dir)
except Exception:
    # "except Exception, e" is Python-2-only syntax; the bound name was unused.
    raise Exception("Failed to run command {0}".format(hisat2base_cmd))
if not ret is None: logger.info("GFF Annotation Exist for Genome Annotation {0}.... Skipping step ".format(annotation_name)) gtf_obj= ws_client.get_objects([{'name' : gtf_obj_name,'workspace' : params['ws_id']}])[0] gtf_info = ws_client.get_object_info_new({"objects": [{'name': gtf_obj_name, 'workspace': params['ws_id']}]})[0] gtf_annotation_id = str(gtf_info[6]) + '/' + str(gtf_info[0]) + '/' + str(gtf_info[4]) gtf_id=gtf_obj['data']['handle']['id'] gtf_name=gtf_obj['data']['handle']['file_name'] try: script_util.download_file_from_shock(logger, shock_service_url=services['shock_service_url'], shock_id=gtf_id,filename=gtf_name, directory=diffexp_dir,token=token) gtf_file = os.path.join(diffexp_dir,gtf_name) except Exception,e: raise Exception( "Unable to download shock file, {0}".format(gtf_name)) else: fasta_file= script_util.generate_fasta(logger,services,token,annotation_id,diffexp_dir,annotation_name) logger.info("Sanitizing the fasta file to correct id names {}".format(datetime.datetime.utcnow())) mapping_filename = c_mapping.create_sanitized_contig_ids(fasta_file) c_mapping.replace_fasta_contig_ids(fasta_file, mapping_filename, to_modified=True) logger.info("Generating FASTA file completed successfully : {}".format(datetime.datetime.utcnow())) gtf_file = script_util.create_gtf_annotation(logger,ws_client,hs,services,params['ws_id'],annotation_id,gtf_obj_name,fasta_file,diffexp_dir,token) m_expr_ids = e_sample['data']['mapped_expression_ids'] m_align_exp = [] labels = [] expressions = [] counter = 0 assembly_file = os.path.join(diffexp_dir,ASSEMBLY_GTF_FN) list_file = open(assembly_file,'w') for i in m_expr_ids: for a_id ,e_id in i.items(): #print a_id + ":" + e_id files = {} a_obj,e_obj = ws_client.get_objects(