def load_release_set(self, assembly_id, session_id, data_release_set=None):
    """Insert a row into ``release_set`` and return the resulting release id.

    When ``data_release_set`` is not given, it is built from the default
    configuration section (shortname, description, source), today's date and
    the two ids passed in. A checksum over the mapping's values, in insertion
    order, is then stored in that same mapping under ``release_checksum``;
    a caller-supplied mapping is therefore modified in place.

    :param assembly_id: value stored (as a string) in the assembly_id column
    :param session_id: value stored (as a string) in the session_id column
    :param data_release_set: optional mapping with the keys shortname,
        description, assembly_id, release_date, session_id and source_id
    :return: the value returned by ``self.insert_data`` for the statement
    """
    if data_release_set is None:
        default_config = ConfigHandler().getInstance().get_section_config()
        data_release_set = collections.OrderedDict([
            ("shortname", default_config["shortname"]),
            ("description", default_config["description"]),
            ("assembly_id", str(assembly_id)),
            ("release_date", str(datetime.now().date())),
            ("session_id", str(session_id)),
            ("source_id", default_config["source"]),
        ])

    # The checksum must be computed before the key is added, otherwise it
    # would be part of its own input.
    data_release_set["release_checksum"] = ChecksumHandler.checksum_list(
        list(data_release_set.values()))

    # Adjacent literals are concatenated by the parser; this avoids the
    # backslash continuations inside the string, which pulled source
    # indentation into the SQL text.
    # NOTE(review): the X prefix presumably makes MySQL read the bound
    # checksum string as a hexadecimal literal (X'..') - confirm.
    insert_release_set = (
        "INSERT INTO release_set (shortname, description, assembly_id, release_date, session_id, "
        "release_checksum, source_id) VALUES "
        "(%(shortname)s, %(description)s, %(assembly_id)s, %(release_date)s, %(session_id)s, "
        "X%(release_checksum)s, %(source_id)s) "
        "ON DUPLICATE KEY UPDATE release_id=LAST_INSERT_ID(release_id)"
    )

    return self.insert_data(insert_release_set, data_release_set)
def __init__(self, db_config=None, mypool_name="mypool"):
    """Create a MySQL connection pool and keep one connection from it.

    The connection taken from the pool is stored on ``self.db_con``.

    :param db_config: mapping read with ``.get`` for the keys user, pass,
        port, host and database; when omitted, the DATABASE section of the
        configuration is used
    :param mypool_name: name given to the connection pool
    """
    if db_config is None:
        db_config = ConfigHandler().getInstance().get_section_config(
            section_name="DATABASE")

    # Lazy %-formatting instead of str concatenation: a missing "database"
    # entry is logged as None rather than raising TypeError here.
    logger.info("loading in to %s", db_config.get("database"))

    mydbconfig = {
        "user": db_config.get("user"),
        "password": db_config.get("pass"),
        "port": db_config.get("port"),
        "host": db_config.get("host"),
        "database": db_config.get("database"),
    }

    connection_pool = mysql.connector.pooling.MySQLConnectionPool(
        pool_name=mypool_name,
        pool_size=32,
        pool_reset_session=True,
        **mydbconfig)
    print(connection_pool)

    self.db_con = connection_pool.get_connection()
def populate_parent_tables(self, init_table_list=None):
    """Load the parent rows that features refer to and return their ids.

    Each name in ``init_table_list`` switches on one loading step. The steps
    run in a fixed order (session, genome, assembly, assembly_alias,
    release_source) and later steps reuse the ids of earlier ones; an id
    whose step was not requested is passed on as ``None``.

    The release set is loaded as part of the ``release_source`` step, because
    its ``source_id`` is the id of the RefSeq release source created there.

    :param init_table_list: names of the steps to run; defaults to all five
    :return: dict mapping id names (``session_id``, ``genome_id``,
        ``assembly_id``, ``assembly_alias_id``, ``release_source_ensembl``,
        ``release_source_refseq``, ``release_id``) to the ids of the steps
        that ran
    """
    if init_table_list is None:
        init_table_list = [
            "session", "genome", "assembly", "assembly_alias", "release_source"
        ]

    session_id = None
    genome_id = None
    assembly_id = None
    # Initialised like the other ids so that the release-set step can never
    # hit an unbound name when "release_source" was not requested.
    release_source_refseq = None
    parent_ids = {}

    if "session" in init_table_list:
        session_id = self.start_session("Refseq Client " + str(time.time()))
        parent_ids['session_id'] = session_id
        print(".........Popultating SESSION table.........\n")

    if "genome" in init_table_list:
        genome_data = {
            "name": "homo_sapiens",
            "tax_id": str(9606),
            "session_id": str(session_id),
        }
        genome_id = self.load_genome(genome_data)
        parent_ids['genome_id'] = genome_id
        print(".........Popultating GENOME table.........\n")

    if "assembly" in init_table_list:
        assembly_data = {
            "genome_id": str(genome_id),
            "assembly_name": "GRCh38",
            "session_id": str(session_id),
        }
        assembly_id = self.load_assembly(assembly_data)
        parent_ids['assembly_id'] = assembly_id
        logger.info(".........Popultating ASSEMBLY table.........\n")

    if "assembly_alias" in init_table_list:
        assembly_alias_data = {
            "alias": "GCA_000001405.25",
            "genome_id": str(genome_id),
            "assembly_id": str(assembly_id),
            "session_id": str(session_id),
        }
        assembly_alias_id = self.load_assembly_alias(assembly_alias_data)
        parent_ids['assembly_alias_id'] = assembly_alias_id
        logger.info(".........Popultating ASSEMBLY ALIAS table.........\n")

    if "release_source" in init_table_list:
        release_source = {
            "shortname": "Ensembl",
            "description": "Ensembl data imports from Human Core DBs",
        }
        release_source_ensembl = self.load_release_source(release_source)
        parent_ids['release_source_ensembl'] = release_source_ensembl
        logger.info(".........Popultating RELEASE SOURCE table.........\n")

        release_source = {
            "shortname": "RefSeq",
            "description": "RefSeq data imports from RefSeq GFF",
        }
        release_source_refseq = self.load_release_source(release_source)
        parent_ids['release_source_refseq'] = release_source_refseq
        logger.info(".........Popultating REFSEQ table.........\n")

    if release_source_refseq is not None:
        # load data_release_set
        today = datetime.now().date()
        default_config = ConfigHandler().getInstance().get_section_config()
        data_release_set = collections.OrderedDict()
        data_release_set["shortname"] = default_config["shortname"]
        data_release_set["description"] = default_config["description"]
        data_release_set["assembly_id"] = str(assembly_id)
        data_release_set["release_date"] = str(today)
        data_release_set["session_id"] = str(session_id)
        data_release_set["source_id"] = str(release_source_refseq)
        release_set_id = self.load_release_set(assembly_id, session_id,
                                               data_release_set)
        parent_ids['release_id'] = release_set_id

    return parent_ids
class AnnotationHandler(object):
    """Build the plain dictionaries (gene, transcript, exon, translation)
    that are later checksummed and saved, from parsed GFF features.

    All builders are classmethods; the class keeps no per-instance state.
    """

    # Both values are read from the default configuration section when the
    # class body is executed, i.e. at import time of the module.
    ASSEMBLY_ID = ConfigHandler().getInstance().get_section_config(
    )["assembly_id"]
    ASSEMBLY_NAME = ConfigHandler().getInstance().get_section_config(
    )["assembly_name"]

    @classmethod
    def get_annotated_gene(cls, chrom, gene_feature):
        """Build the gene dictionary for one gene feature.

        :param chrom: sequence region name, stored as ``loc_region``
        :param gene_feature: feature exposing ``location`` (start, end,
            strand) and ``qualifiers``
        :return: dict with location, stable id, assembly, HGNC id and the
            location and gene checksums
        """
        location = gene_feature.location
        qualifiers = gene_feature.qualifiers

        gene = {}
        # NOTE(review): transcripts and exons store start + 1, the gene start
        # is stored unshifted - confirm this difference is intended.
        gene['loc_start'] = str(location.start)
        gene['loc_end'] = str(location.end)
        gene['loc_strand'] = str(location.strand)
        gene['loc_region'] = str(chrom)
        gene['stable_id'] = cls.parse_qualifiers(qualifiers, "Dbxref", "GeneID")
        gene['stable_id_version'] = 1
        gene['assembly_id'] = cls.ASSEMBLY_ID
        gene['assembly_name'] = cls.ASSEMBLY_NAME

        # Original note: "make it none for the moment, otherwise you will get
        # integrity exception" (NOTE(review): unclear whether it refers to
        # hgnc_id or session_id).
        # parse_qualifiers returns the text following "HGNC:HGNC:"; the value
        # stored is that text prefixed with "HGNC:", or None when absent.
        hgnc_id = cls.parse_qualifiers(qualifiers, "Dbxref", "HGNC:HGNC")
        if hgnc_id is not None:
            hgnc_id = "HGNC:" + hgnc_id
        gene['hgnc_id'] = hgnc_id
        gene['session_id'] = None

        gene['loc_checksum'] = ChecksumHandler.get_location_checksum(gene)
        gene['gene_checksum'] = ChecksumHandler.get_gene_checksum(gene)
        return gene

    @classmethod
    def get_annotated_transcript(cls, sequence_handler, chrom, mRNA_feature):
        """Build the transcript dictionary for one feature.

        The ``transcript_id`` qualifier must have the form ``<id>.<version>``
        with exactly one dot; anything else raises ``ValueError`` on
        unpacking. The transcript and exon-set checksums are left as ``None``
        for the caller to fill in.

        :param sequence_handler: object providing ``get_sequence_by_id``
        :param chrom: sequence region name, stored as ``loc_region``
        :param mRNA_feature: feature exposing ``location`` and a
            ``transcript_id`` qualifier
        :return: dict with location, stable id, sequence and checksums
        """
        location = mRNA_feature.location
        full_stable_id = mRNA_feature.qualifiers['transcript_id'][0]
        (stable_id, stable_id_version) = full_stable_id.split(".")

        transcript = {}
        transcript['assembly_id'] = cls.ASSEMBLY_ID
        transcript['assembly_name'] = cls.ASSEMBLY_NAME
        # Note we have shifted one base here
        transcript['loc_start'] = str(location.start + 1)
        transcript['loc_end'] = str(location.end)
        transcript['loc_strand'] = str(location.strand)
        transcript['loc_region'] = str(chrom)
        transcript['stable_id'] = stable_id
        transcript['stable_id_version'] = stable_id_version
        transcript['session_id'] = None
        transcript['transcript_checksum'] = None
        transcript['exon_set_checksum'] = None
        transcript['loc_checksum'] = ChecksumHandler.get_location_checksum(
            transcript)
        # The sequence is looked up by the full, versioned identifier.
        transcript['sequence'] = sequence_handler.get_sequence_by_id(
            full_stable_id)
        transcript['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            transcript, 'sequence')
        return transcript

    @classmethod
    def get_annotated_exons(cls, sequence_handler, seq_region,
                            transcript_identifier, refseq_exon_list):
        """Build one exon dictionary per entry of ``refseq_exon_list``.

        :param sequence_handler: object providing
            ``get_seq_record_by_id_location``
        :param seq_region: sequence region name, stored as ``loc_region``
        :param transcript_identifier: id used to fetch the exon sequences
        :param refseq_exon_list: list of dicts with the ``exon_*`` keys read
            by ``get_annotated_exon``
        :return: list of exon dicts, or ``None`` when the number of fetched
            sequences differs from the number of exons
        """
        # The helper gets a shallow copy of the list, not the list itself.
        # NOTE(review): judging by the original variable name, the result
        # holds exon coordinates relative to the transcript - confirm.
        relative_exons = ExonUtils.compute_exon_coordinates(
            refseq_exon_list.copy())

        # Ref: BioSeqFeature
        # Note that the start and end location numbering follow Python's
        # scheme, thus a GenBank entry of 123..150 (one based counting)
        # becomes a location of [122:150] (zero based counting).
        exon_sequences = [
            str(sequence_handler.get_seq_record_by_id_location(
                transcript_identifier, exon['exon_start'], exon['exon_end'],
                int(exon['exon_strand'])))
            for exon in relative_exons
        ]

        if len(refseq_exon_list) != len(exon_sequences):
            return None

        return [
            cls.get_annotated_exon(seq_region, exon_feature, exon_sequence)
            for exon_feature, exon_sequence in zip(refseq_exon_list,
                                                   exon_sequences)
        ]

    @classmethod
    def get_annotated_exon(cls, seq_region, exon_feature, exon_sequence):
        """Build the exon dictionary for one exon.

        :param seq_region: sequence region name, stored as ``loc_region``
        :param exon_feature: dict with the keys exon_start, exon_end,
            exon_strand, exon_order, exon_stable_id and
            exon_stable_id_version
        :param exon_sequence: sequence stored as ``exon_seq``
        :return: dict with location, stable id, sequence and checksums
        """
        exon = {}
        exon['assembly_id'] = cls.ASSEMBLY_ID
        exon['assembly_name'] = cls.ASSEMBLY_NAME
        exon['loc_start'] = exon_feature["exon_start"]
        exon['loc_end'] = exon_feature["exon_end"]
        exon['loc_strand'] = exon_feature["exon_strand"]
        exon['loc_region'] = str(seq_region)
        exon['loc_checksum'] = ChecksumHandler.get_location_checksum(exon)
        exon['exon_order'] = exon_feature["exon_order"]
        exon['stable_id'] = exon_feature["exon_stable_id"]
        exon['stable_id_version'] = exon_feature["exon_stable_id_version"]
        exon['session_id'] = None
        exon['exon_seq'] = exon_sequence
        exon['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            exon, 'exon_seq')
        exon['exon_checksum'] = ChecksumHandler.get_exon_checksum(exon)
        return exon

    @classmethod
    def get_annotated_cds(cls, protein_sequence_handler, seq_region,
                          protein_id, cds_list):
        """Build the translation dictionary for the CDS parts of a transcript.

        :param protein_sequence_handler: object providing
            ``get_fasta_seq_by_id``
        :param seq_region: sequence region name, stored as ``loc_region``
        :param protein_id: ignored; the protein id of the first CDS entry is
            used instead
        :param cds_list: non-empty list of dicts with the keys cds_strand,
            protein_id, cds_start, cds_end and cds_order
        :return: dict with location, stable id, sequence and checksums
        """
        first_cds = cds_list[0]
        cds_strand = first_cds['cds_strand']
        # The argument is overridden by the id carried in the CDS itself.
        protein_id = first_cds['protein_id']
        (stable_id, stable_id_version) = protein_id.split(".")
        (translation_start,
         translation_end) = cls.get_translation_loc(cds_list)

        translation = {}
        translation['assembly_id'] = cls.ASSEMBLY_ID
        translation['assembly_name'] = cls.ASSEMBLY_NAME
        translation['stable_id'] = stable_id
        translation['stable_id_version'] = stable_id_version
        translation['loc_start'] = translation_start
        translation['loc_end'] = translation_end
        translation['loc_strand'] = cds_strand
        translation['loc_region'] = seq_region
        translation['translation_seq'] = (
            protein_sequence_handler.get_fasta_seq_by_id(protein_id))
        translation['seq_checksum'] = ChecksumHandler.get_seq_checksum(
            translation, 'translation_seq')
        translation['session_id'] = None
        translation['loc_checksum'] = ChecksumHandler.get_location_checksum(
            translation)
        translation['translation_checksum'] = (
            ChecksumHandler.get_translation_checksum(translation))
        return translation

    @classmethod
    def get_translation_loc(cls, cds_list):
        """Return ``(start, end)`` of the translation spanned by ``cds_list``.

        For strand ``'1'`` the start comes from the CDS with order 1 and the
        end from the CDS with the highest order; for strand ``'-1'`` the two
        are swapped. ``(0, 0)`` is returned when the list is empty, when the
        strand of the first entry is neither ``'1'`` nor ``'-1'``, or when no
        CDS carries the required order.

        :param cds_list: list of dicts with the keys cds_strand, cds_start,
            cds_end and cds_order
        :return: tuple ``(start, end)``
        """
        if not cds_list:
            return (0, 0)

        strand = cds_list[0]['cds_strand']
        last_order = len(cds_list)
        if strand == '1':
            start_order, end_order = 1, last_order
        elif strand == '-1':
            start_order, end_order = last_order, 1
        else:
            # Unknown strand: use the same fallback as a missing CDS instead
            # of failing on unbound start/end lists.
            return (0, 0)

        cds_start = [
            cds['cds_start'] for cds in cds_list
            if cds['cds_order'] == start_order
        ]
        cds_end = [
            cds['cds_end'] for cds in cds_list
            if cds['cds_order'] == end_order
        ]
        if cds_start and cds_end:
            return (cds_start[0], cds_end[0])
        return (0, 0)

    @classmethod
    def parse_qualifiers(cls, qualifiers, key_qualifier, attr=None):
        """Return the value of ``attr`` inside a ``key_qualifier`` entry.

        Each entry of ``qualifiers[key_qualifier]`` is matched, ignoring
        case, against ``<attr>:<value>`` from the start of the entry; the
        first non-empty value is returned as a string. ``attr`` is used as a
        regular expression as is.

        :param qualifiers: mapping of qualifier name to list of strings
        :param key_qualifier: qualifier name to look in
        :param attr: attribute prefix to look for
        :return: the matched value, or ``None`` when the qualifier is
            missing, ``attr`` is ``None`` or nothing matches
        """
        if key_qualifier not in qualifiers or attr is None:
            return None

        my_regex = attr + ":" + "(.*)"
        for cur_qualifier in qualifiers[key_qualifier]:
            matchObj = re.match(my_regex, cur_qualifier, re.M | re.I)  # @IgnorePep8
            if matchObj and matchObj.group(1):
                return str(matchObj.group(1))
        return None

    @classmethod
    def get_seq_region_from_refseq_accession(cls, refseq_accession):
        """Translate an ``NC_<number>.<version>`` accession to a region name.

        :param refseq_accession: accession string such as ``NC_000023.11``
        :return: ``"X"`` for 23, ``"Y"`` for 24, otherwise the number as an
            ``int`` (so ``NC_012920.1`` gives ``12920``); ``None`` when the
            accession does not match
        """
        matchObj = re.match(r'NC_(\d+)\.\d+', refseq_accession, re.M | re.I)  # @IgnorePep8
        if not (matchObj and matchObj.group(1)):
            return None

        chrom = int(matchObj.group(1))
        if chrom == 23:
            return "X"
        if chrom == 24:
            return "Y"
        return chrom

    @classmethod
    def add_feature_sequence(cls, fasta_handler, feature_locations,
                             feature_id, feature_type='exon'):
        """Attach a sequence to every feature in ``feature_locations``.

        The sequence is stored in the feature dict itself under
        ``<feature_type>_seq``, so the input dicts are modified.

        :param fasta_handler: object providing ``get_fasta_seq_by_id``
        :param feature_locations: list of dicts with ``<feature_type>_start``
            and ``<feature_type>_end`` keys
        :param feature_id: id passed to the sequence lookup
        :param feature_type: key prefix, ``'exon'`` by default
        :return: new list holding the same (updated) feature dicts
        """
        start_key = feature_type + '_start'
        end_key = feature_type + '_end'
        seq_key = feature_type + '_seq'

        features_with_seq = []
        for feature in feature_locations:
            feature[seq_key] = fasta_handler.get_fasta_seq_by_id(
                feature_id, feature[start_key], feature[end_key])
            features_with_seq.append(feature)
        return features_with_seq
def run(self):
    """Parse the GFF records selected by ``self.limits`` and save each gene.

    For every non-region feature a gene dictionary is built together with
    its transcripts; unless ``self.dryrun`` is set, genes that have a stable
    id are written to the database. When parsing has finished, a status file
    containing ``Done`` is written under ``<download_dir>/status_logs`` and
    its path is stored in ``self.status_file``.

    The GFF file handle and the database connection are released even when
    parsing or saving raises.
    """
    mydb_config = ConfigHandler().getInstance().get_section_config(section_name="DATABASE")
    dbh = DatabaseHandler(db_config=mydb_config,
                          mypool_name="mypool_" + str(self.seq_region))
    dbc = dbh.get_connection()

    try:
        sequence_handler = FastaHandler(self.downloaded_files['fasta'])

        print("Loading protein.....")
        print(self.downloaded_files['protein'])
        protein_sequence_handler = FastaHandler(self.downloaded_files['protein'])

        print("Working on Seq region limit " + str(self.seq_region))

        with open(self.downloaded_files['gff']) as gff_handle:
            # Chromosome seq level
            for rec in GFF.parse(gff_handle, limit_info=self.limits, target_lines=1000):
                for gene_feature in rec.features:
                    # skip regions
                    if gene_feature.type == "region":
                        continue

                    # gene level
                    annotated_gene = AnnotationHandler.get_annotated_gene(
                        self.seq_region, gene_feature)

                    annotated_transcripts = []
                    for mRNA_feature in gene_feature.sub_features:
                        annotated_transcript = self._annotate_transcript(
                            mRNA_feature, sequence_handler,
                            protein_sequence_handler)
                        if annotated_transcript is not None:
                            annotated_transcripts.append(annotated_transcript)
                    annotated_gene['transcripts'] = annotated_transcripts

                    if not self.dryrun and annotated_gene['stable_id'] is not None:
                        print("About to load gene => " + str(annotated_gene['stable_id']))
                        feature_handler = FeatureHandler(parent_ids=self.parent_ids, dbc=dbc)
                        feature_handler.save_features_to_database(
                            {"gene": annotated_gene})
    finally:
        dbc.close()

    print("About to write to the status file")
    status_dir = self.download_dir + '/' + 'status_logs'
    # exist_ok: several tasks may share this directory, so another one can
    # create it between an existence check and the makedirs call.
    os.makedirs(status_dir, exist_ok=True)

    self.status_file = status_dir + '/' + 'status_file_chr' + str(self.seq_region)
    with open(self.status_file, "w") as status_handle:
        status_handle.write("Done")


def _annotate_transcript(self, mRNA_feature, sequence_handler, protein_sequence_handler):
    """Build the annotated transcript (with exons and translation) for one
    sub-feature of a gene; return ``None`` when it has no ``transcript_id``.
    """
    if 'transcript_id' not in mRNA_feature.qualifiers:
        return None
    transcript_id = mRNA_feature.qualifiers['transcript_id'][0]

    refseq_exon_list = []
    refseq_cds_list = []
    for mRNA_sub_feature in mRNA_feature.sub_features:
        if 'exon' in mRNA_sub_feature.type:
            refseq_exon_list.append({
                'exon_stable_id': str(mRNA_sub_feature.id),
                'exon_stable_id_version': 1,  # dummy version
                # 1-based position among the exons seen so far
                'exon_order': len(refseq_exon_list) + 1,
                # note that we are shifting one base here
                'exon_start': str(mRNA_sub_feature.location.start + 1),
                'exon_end': str(mRNA_sub_feature.location.end),
                'exon_strand': str(mRNA_sub_feature.location.strand),
            })

        if 'CDS' in mRNA_sub_feature.type:
            refseq_cds_list.append({
                # 1-based position among the CDS parts seen so far
                'cds_order': len(refseq_cds_list) + 1,
                # note that we are shifting one base here
                'cds_start': str(mRNA_sub_feature.location.start + 1),
                'cds_end': str(mRNA_sub_feature.location.end),
                'cds_strand': str(mRNA_sub_feature.location.strand),
                'cds_id': str(mRNA_sub_feature.id),
                'protein_id': str(mRNA_sub_feature.qualifiers['protein_id'][0]),  # @IgnorePep8
            })

    annotated_transcript = AnnotationHandler.get_annotated_transcript(
        sequence_handler, self.seq_region, mRNA_feature)

    # add sequence and other annotations
    # 'exons' is always present; it stays empty when there are no exons or
    # when they could not be annotated.
    annotated_transcript['exons'] = []
    if refseq_exon_list:
        annotated_exons = AnnotationHandler.get_annotated_exons(
            sequence_handler, self.seq_region, transcript_id, refseq_exon_list)
        if annotated_exons:
            annotated_transcript['exon_set_checksum'] = (
                ChecksumHandler.get_exon_set_checksum(annotated_exons))
            annotated_transcript['exons'] = annotated_exons

    if refseq_cds_list:
        protein_id = refseq_cds_list[0]['protein_id']
        annotated_transcript['translation'] = AnnotationHandler.get_annotated_cds(
            protein_sequence_handler, self.seq_region, protein_id, refseq_cds_list)
    else:
        annotated_transcript['translation'] = []

    # Computed last, once exons and translation are attached.
    annotated_transcript['transcript_checksum'] = ChecksumHandler.get_transcript_checksum(annotated_transcript)  # @IgnorePep8
    return annotated_transcript
def requires(self):
    """Yield one ``ParseRecord`` task per chromosome accession to process.

    The paths of the downloaded files are derived from the file names in the
    DEFAULT configuration section with their last extension removed. Unless
    ``self.dryrun`` is set, the parent tables are populated first and the
    resulting ids are handed to every task. When ``self.limit_chr`` is set
    (one region name or a comma separated list), accessions whose region
    name is not listed are skipped.
    """
    mydefault_config = ConfigHandler().getInstance().get_section_config(section_name="DEFAULT")

    downloaded_files = {
        'gff': self.download_dir + "/" + os.path.splitext(mydefault_config['gff_file'])[0],
        'fasta': self.download_dir + "/" + os.path.splitext(mydefault_config['fasta_file'])[0],
        'protein': self.download_dir + "/" + os.path.splitext(mydefault_config['protein_file'])[0],
    }

    # load the parent tables
    parent_ids = None  # stays None in dry-run mode; use for debugging only
    if not self.dryrun:
        mydb_config = ConfigHandler().getInstance().get_section_config(section_name="DATABASE")
        dbh = DatabaseHandler(db_config=mydb_config, mypool_name="mypool_parentids")
        print(dbh)
        feature_handler = FeatureHandler(dbc=dbh.get_connection())
        parent_ids = feature_handler.populate_parent_tables()

    print(downloaded_files['gff'])

    # The accessions are hard-coded to save time. They could instead be read
    # from the file with GFF.GFFExaminer().available_limits(handle)["gff_id"].
    chromosomes = [
        ('NC_000001.11',), ('NC_000002.12',), ('NC_000003.12',), ('NC_000004.12',),
        ('NC_000005.10',), ('NC_000006.12',), ('NC_000007.14',), ('NC_000008.11',),
        ('NC_000009.12',), ('NC_000010.11',), ('NC_000011.10',), ('NC_000012.12',),
        ('NC_000013.11',), ('NC_000014.9',), ('NC_000015.10',), ('NC_000016.10',),
        ('NC_000017.11',), ('NC_000018.10',), ('NC_000019.10',), ('NC_000020.11',),
        ('NC_000021.9',), ('NC_000022.11',), ('NC_000023.11',), ('NC_000024.10',),
        ('NC_012920.1',)
    ]

    # Restrict only for filter_region. The filter does not depend on the
    # chromosome, so it is computed once; split() also covers a single name.
    filter_regions = None
    if self.limit_chr is not None:
        filter_regions = self.limit_chr.split(',')

    for chrom_tuple in chromosomes:
        chrom = chrom_tuple[0]
        if not chrom.startswith("NC_"):
            continue

        print(chrom_tuple)
        seq_region = self.get_seq_region_from_refseq_accession(chrom)

        if filter_regions is not None and str(seq_region) not in filter_regions:
            print(" Skipping " + str(seq_region))
            continue

        # A fresh dict per task: with one shared dict, a later iteration
        # would overwrite the gff_id seen by tasks that were already yielded.
        limits = {"gff_id": chrom_tuple}

        yield ParseRecord(
            download_dir=self.download_dir,
            downloaded_files=downloaded_files,
            seq_region=str(seq_region),
            parent_ids=parent_ids,
            limits=limits,
            dryrun=self.dryrun
        )
class DownloadRefSeqSourceFiles(luigi.WrapperTask):
    """
    Wrapper task that requires, for each configured RefSeq source file
    (gff, fasta, protein), one download task and one unzip task.
    """

    download_dir = luigi.Parameter()
    task_namespace = 'DownloadRefSeqSourceFiles'

    # The settings below are read from the default configuration section and
    # echoed while the class body is executed.
    assembly_id = ConfigHandler().getInstance().get_section_config()["assembly_id"]
    assembly_name = ConfigHandler().getInstance().get_section_config()["assembly_name"]
    source = ConfigHandler().getInstance().get_section_config()["source"]
    shortname = ConfigHandler().getInstance().get_section_config()["shortname"]
    description = ConfigHandler().getInstance().get_section_config()["description"]

    print("Assembly ID " + str(assembly_id))
    print("Assembly Name " + str(assembly_name))
    print("source name " + str(source))
    print("shortname " + str(shortname))
    print("description " + str(description))

    ftp_root = ConfigHandler().getInstance().get_section_config()["ftp_root"]
    gff_file = ConfigHandler().getInstance().get_section_config()["gff_file"]
    fasta_file = ConfigHandler().getInstance().get_section_config()["fasta_file"]
    protein_file = ConfigHandler().getInstance().get_section_config()["protein_file"]

    print("ftp_root {}".format(ftp_root))
    print("gff_file {}".format(gff_file))
    print("fasta_file {}".format(fasta_file))
    print("protein_file {}".format(protein_file))

    # Example values (GRCh38.p12):
    #   ftp_root     https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.38_GRCh38.p12/  # @IgnorePep8
    #   gff_file     GCF_000001405.38_GRCh38.p12_genomic.gff.gz
    #   fasta_file   GCF_000001405.38_GRCh38.p12_rna.fna.gz
    #   protein_file GCF_000001405.38_GRCh38.p12_protein.faa.gz
    files_to_download = [gff_file, fasta_file, protein_file]

    def complete(self):
        """Return True when every file exists in ``download_dir`` both under
        its configured name and under its base name without the last
        extension.
        """
        def is_downloaded_and_unzipped(file_):
            zipped_path = self.download_dir + '/' + file_
            unzipped_name = os.path.splitext(os.path.basename(file_))[0]
            unzipped_path = self.download_dir + '/' + unzipped_name
            return os.path.exists(zipped_path) and os.path.exists(unzipped_path)

        return all(is_downloaded_and_unzipped(file_)
                   for file_ in self.files_to_download)

    def requires(self):
        """Yield the download task followed by the unzip task of each file."""
        for file_ in self.files_to_download:
            task_kwargs = dict(
                download_dir=self.download_dir,
                file_to_download=file_,
                ftp_root=self.ftp_root,
            )
            yield DownloadRefSeqSourceFile(**task_kwargs)
            yield UnzipRefSeqFile(**task_kwargs)