def output_pickle_file(self, results_output_dir, sample_label):
    """
    Serialize self.events into '<results_output_dir>/pickle/<sample_label>.pickle'.

    The 'pickle' subdirectory (and any missing parent directories) is
    created when it does not exist yet.

    Returns the path of the pickle file that was written.
    """
    print("Serializing a total of %d events by Pickle." % (len(self.events)))

    pickle_output_dir = os.path.join(results_output_dir, 'pickle')
    if not os.path.isdir(pickle_output_dir):
        # makedirs (rather than mkdir) so that a results directory which
        # has not been created yet does not abort the serialization.
        os.makedirs(pickle_output_dir)

    pickle_events_filename = os.path.join(pickle_output_dir,
                                          sample_label + '.pickle')
    pickle_utils.write_pickled_file(self.events, pickle_events_filename)
    return pickle_events_filename
def output_pickle_file(self, results_output_dir, sample_label):
    """
    Serialize self.events into '<results_output_dir>/pickle/<sample_label>.pickle'.

    The 'pickle' subdirectory (and any missing parent directories) is
    created when it does not exist yet.

    Returns the path of the pickle file that was written.
    """
    # Parenthesised single-argument print: emits the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("Serializing a total of %d events by Pickle." % (len(self.events)))

    pickle_output_dir = os.path.join(results_output_dir, 'pickle')
    if not os.path.isdir(pickle_output_dir):
        # makedirs (rather than mkdir) so that a results directory which
        # has not been created yet does not abort the serialization.
        os.makedirs(pickle_output_dir)

    pickle_events_filename = os.path.join(pickle_output_dir,
                                          sample_label + '.pickle')
    pickle_utils.write_pickled_file(self.events, pickle_events_filename)
    return pickle_events_filename
def serialize_genes(gff_genes, gff_filename, output_dir, compress_id=False):
    """
    Output genes into pickle files by chromosome, by gene.

    If asked, use compressed IDs (hashes) of the 'ID=' field in the GFF.

    Arguments:
      - gff_genes: mapping from gene ID to a dict holding at least the
        keys "gene_object" (whose .chrom attribute selects the chromosome
        directory) and "hierarchy".
      - gff_filename: GFF file the genes came from; its 'gene' records are
        copied into '<output_dir>/genes.gff'.
      - output_dir: directory receiving the per-chromosome directories,
        the two shelve files and 'genes.gff'. Created if missing.
      - compress_id: when true, each gene's pickle file is named after
        compress_event_name(gene_id) instead of the gene ID itself.

    Files written into output_dir:
      - chr*/<id>.pickle: one pickle per gene, containing
        {gene_id: gene record}.
      - genes_to_filenames.shelve: gene ID -> absolute pickle filename.
      - compressed_ids_to_genes.shelve: compressed ID -> gene ID
        (left empty unless compress_id is set).
      - genes.gff: the 'gene' lines of gff_filename.

    Returns None.
    """
    # Local import: only needed for the progress dots below.
    import sys

    # Make sure the output directory exists even when there are no genes
    # (otherwise the shelve files below cannot be created).
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    genes_by_chrom = defaultdict(dict)

    # Split up genes by chromosome
    for gene_id, gene_info in gff_genes.items():
        gene_obj = gene_info["gene_object"]
        gene_record = {'gene_object': gene_obj,
                       'hierarchy': gene_info["hierarchy"]}
        if compress_id:
            # Store compressed ID alongside the gene so that it ends up
            # in the pickled record as well.
            gene_record['compressed_id'] = compress_event_name(gene_id)
        genes_by_chrom[gene_obj.chrom][gene_id] = gene_record

    # Mapping from gene IDs to pickled filename
    gene_id_to_filename = {}
    # Mapping from compressed IDs (hashes) to gene IDs
    compressed_id_to_gene_id = {}

    # Serialize all the genes in each chromosome into their
    # own directory
    for chrom, chrom_genes in genes_by_chrom.items():
        if chrom.startswith("chr"):
            chrom_dir_name = chrom
        else:
            # Add chr-prefix for ease of finding directory
            # in downstream steps.
            chrom_dir_name = "chr%s" % (str(chrom))

        # Make directory for chromosome if it doesn't already exist
        chrom_dir = os.path.join(output_dir, chrom_dir_name)
        if not os.path.isdir(chrom_dir):
            os.makedirs(chrom_dir)

        # Serialize each gene into a separate file
        for gene_id, gene_record in chrom_genes.items():
            gene_compressed_id = None
            if compress_id:
                gene_compressed_id = gene_record['compressed_id']
                pickle_basename = "%s.pickle" % (gene_compressed_id)
            else:
                pickle_basename = "%s.pickle" % (gene_id)
            gene_filename = \
                os.path.abspath(os.path.join(chrom_dir, pickle_basename))

            # Write each gene/event's pickle file
            pickle_utils.write_pickled_file({gene_id: gene_record},
                                            gene_filename)
            # Record what filename was associated with this gene ID
            gene_id_to_filename[gene_id] = gene_filename
            # Record compressed ID (hash) to gene ID
            if gene_compressed_id is not None:
                compressed_id_to_gene_id[gene_compressed_id] = gene_id

        # One progress dot per chromosome; written via sys.stdout so the
        # same line is valid on both Python 2 and Python 3.
        sys.stdout.write(". ")

    # Shelve the mapping from gene ids to filenames
    _shelve_mapping(os.path.join(output_dir, "genes_to_filenames.shelve"),
                    gene_id_to_filename)

    # Shelve the mapping from compressed gene ids to gene ids
    _shelve_mapping(os.path.join(output_dir,
                                 "compressed_ids_to_genes.shelve"),
                    compressed_id_to_gene_id)

    # Output a list of genes in ordinary GFF format
    _write_gene_records(gff_filename, os.path.join(output_dir, "genes.gff"))


def _shelve_mapping(shelved_filename, mapping):
    """Store every key/value pair of mapping in a shelve file, always closing it."""
    shelved_data = shelve.open(shelved_filename)
    try:
        for key, value in mapping.items():
            shelved_data[key] = value
    finally:
        # Close even if a write fails so the handle is not leaked.
        shelved_data.close()


def _write_gene_records(gff_filename, genes_filename):
    """Copy the lines of gff_filename whose third column is 'gene' to genes_filename."""
    with open(gff_filename) as gff_in:
        with open(genes_filename, "w") as gff_out:
            for line in gff_in:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                if len(fields) < 3:
                    # Blank or truncated line: it has no record-type
                    # column, so it cannot be a gene record.
                    continue
                if fields[2] == "gene":
                    gff_out.write(line)