def read_data_and_update_database(nex_session, fw):
    """Load protein sequence details from the tab-delimited ``data_file``.

    The row whose first column is ``name`` is the header; its columns from
    index 3 onwards are remembered and handed to
    ``insert_proteinsequence_detail`` together with the matching columns of
    every data row.  For a data row, column 0 is looked up as a locus
    systematic name, column 1 as a strain (via ``get_strain_taxid_mapping``)
    and column 2 as a contig ``format_name``.  The three resulting ids select
    the existing ``Proteinsequenceannotation`` row the details are attached to.

    Rows for which any lookup fails are reported on stdout and skipped.
    The session is committed once, after the whole file has been read.

    Args:
        nex_session: database session used for the lookups, inserts and commit.
        fw: log handle, passed through to ``insert_proteinsequence_detail``.
    """
    taxon_to_taxonomy_id = {
        x.taxid: x.taxonomy_id for x in nex_session.query(Taxonomy).all()
    }
    name_to_dbentity_id = {
        x.systematic_name: x.dbentity_id
        for x in nex_session.query(Locusdbentity).all()
    }
    contig_to_contig_id = {
        x.format_name: x.contig_id for x in nex_session.query(Contig).all()
    }
    key_to_annotation_id = {
        (x.dbentity_id, x.taxonomy_id, x.contig_id): x.annotation_id
        for x in nex_session.query(Proteinsequenceannotation).all()
    }
    strain_to_taxon_mapping = get_strain_taxid_mapping()

    header = None
    # ``with`` guarantees the input file is closed even if an insert raises.
    with open(data_file) as f:
        for line in f:
            pieces = line.strip().split("\t")
            if pieces[0] == 'name':
                header = pieces[3:]
                continue

            name = pieces[0]
            dbentity_id = name_to_dbentity_id.get(name)
            if dbentity_id is None:
                print(name + " is not in the database")
                # Without a locus id no annotation key can match; skip the
                # row like every other failed lookup below.
                continue

            strain = pieces[1]
            taxon = strain_to_taxon_mapping.get(strain)
            if taxon is None:
                print("The strain = " + strain +
                      " is not in the mapping module.")
                continue

            taxonomy_id = taxon_to_taxonomy_id.get(taxon)
            if taxonomy_id is None:
                print("The taxid = " + str(taxon) +
                      " is not in the database.")
                continue

            contig = pieces[2]
            contig_id = contig_to_contig_id.get(contig)
            if contig_id is None:
                print(contig + " is not in the database.")
                continue

            key = (dbentity_id, taxonomy_id, contig_id)
            annotation_id = key_to_annotation_id.get(key)
            if annotation_id is None:
                # str() is required: concatenating the tuple itself with a
                # string raises TypeError.
                print(str(key) + " is not in the database.")
                continue

            insert_proteinsequence_detail(nex_session, fw, annotation_id,
                                          pieces[3:], header)

    # nex_session.rollback()
    nex_session.commit()
def load_phenotypes(infile, logfile):
    """Load phenotype annotations and their conditions from a curation sheet.

    ``infile`` is tab-delimited.  Line 1 is a "super header" and line 2 the
    header proper; a line-2 cell that is empty or reads ``required`` /
    ``Required`` falls back to the line-1 cell in the same column, and a
    column named ``ChEBI ID`` is renamed ``chemical_name``.  Columns in
    ``[cond_start_index, cond_stop_index)`` hold experimental conditions as
    consecutive (name, value, unit) triples; every other column feeds the
    phenotype annotation itself.

    For each data row the function resolves the locus, reference, strain,
    experiment type, mutant type and phenotype against lookup tables built
    from the database, creates alleles/reporters that do not exist yet,
    inserts the annotation unless an identical one is already known, and then
    inserts the row's conditions under a group id.  A new annotation uses
    group 1.  An existing annotation uses its last known group id plus one,
    or 1 when it has no recorded conditions.
    Rows that cannot be resolved are reported on stdout and skipped.
    Everything is committed once at the end.

    Args:
        infile: path of the tab-delimited phenotype sheet.
        logfile: path of the log file handed to the ``insert_*`` helpers.
    """
    nex_session = get_session()

    # ---- lookup tables built from the database --------------------------
    name_to_locus_id = {}
    for x in nex_session.query(Locusdbentity).all():
        name_to_locus_id[x.systematic_name] = x.dbentity_id
        if x.gene_name:
            name_to_locus_id[x.gene_name] = x.dbentity_id

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    pmid_to_reference_id = {
        x.pmid: x.dbentity_id
        for x in nex_session.query(Referencedbentity).all()
    }

    experiment_to_id = {}
    mutant_to_id = {}
    for x in nex_session.query(Apo).all():
        if x.apo_namespace == 'experiment_type':
            experiment_to_id[x.display_name] = x.apo_id
        elif x.apo_namespace == 'mutant_type':
            mutant_to_id[x.display_name] = x.apo_id

    # Highest condition group id already stored for each annotation.
    annotation_id_to_last_group_id = {}
    for x in nex_session.query(PhenotypeannotationCond).all():
        last_group_id = 1
        if x.annotation_id in annotation_id_to_last_group_id:
            last_group_id = annotation_id_to_last_group_id[x.annotation_id]
        if x.group_id > last_group_id:
            last_group_id = x.group_id
        annotation_id_to_last_group_id[x.annotation_id] = last_group_id

    phenotype_to_id = {
        x.display_name: x.phenotype_id
        for x in nex_session.query(Phenotype).all()
    }
    taxid_to_taxonomy_id = {
        x.taxid: x.taxonomy_id for x in nex_session.query(Taxonomy).all()
    }
    allele_to_id = {
        x.display_name: x.allele_id for x in nex_session.query(Allele).all()
    }
    reporter_to_id = {
        x.display_name: x.reporter_id
        for x in nex_session.query(Reporter).all()
    }
    chebiid_to_name = {
        x.chebiid: x.display_name for x in nex_session.query(Chebi).all()
    }
    key_to_annotation_id = {
        (x.dbentity_id, x.taxonomy_id, x.reference_id, x.phenotype_id,
         x.experiment_id, x.mutant_id, x.allele_id, x.reporter_id,
         x.strain_name, x.details): x.annotation_id
        for x in nex_session.query(Phenotypeannotation).all()
    }

    strain_taxid_mapping = get_strain_taxid_mapping()

    # The replacement unit for temperature-like conditions is taken from
    # column 26 of ``degree_file``; the last line of that file wins.
    degree = None
    with open(degree_file) as f0:
        for line in f0:
            field = line.split("\t")
            degree = field[26]

    superheader = []
    header = []
    cond_header = []

    with open(logfile, "w") as fw, open(infile) as f:
        for line_no, line in enumerate(f, 1):
            pieces = line.strip().split("\t")

            if line_no == 1:
                superheader = pieces
                continue

            if line_no == 2:
                for j, x in enumerate(pieces):
                    if x in ['required', 'Required'] or x == '':
                        x = superheader[j]
                    if x == "ChEBI ID":
                        x = "chemical_name"
                    header.append(x)
                cond_header = header[cond_start_index:cond_stop_index]
                continue

            # strip() drops trailing empty columns; pad them back.
            # NOTE(review): this pads to column_size - 1 entries, exactly as
            # the original loop did - confirm whether column_size was meant.
            if len(pieces) < column_size:
                pieces.extend([""] * (column_size - 1 - len(pieces)))

            created_by = None
            dbentity_id = None
            reference_id = None
            taxonomy_id = None
            experiment_id = None
            mutant_id = None
            allele_id = None
            allele_comment = ""
            reporter_id = None
            reporter_comment = ""
            details = ""
            observable = ""
            qualifier = ""
            phenotype_id = None
            strain_name = ""
            bad_row = False

            conds = pieces[cond_start_index:cond_stop_index]

            for k, x in enumerate(pieces):
                field_name = header[k].strip()
                if cond_start_index <= k < cond_stop_index:
                    # condition columns are handled after the annotation
                    continue
                if x == "":
                    continue

                ## the rest is for phenotypeannotation table
                # NOTE(review): some failures below ``break`` out of the row
                # while others ``continue`` to the next column (so later
                # columns, including allele/reporter inserts, still run).
                # Kept exactly as in the original.
                if field_name.startswith('curator'):
                    created_by = x.strip()
                elif field_name == 'feature_name':
                    dbentity_id = name_to_locus_id.get(x.strip())
                    if dbentity_id is None:
                        print("The feature_name: " + x +
                              "  is not in the database.")
                        bad_row = True
                        break
                elif field_name == 'PMID':
                    reference_id = pmid_to_reference_id.get(int(x.strip()))
                    if reference_id is None:
                        print("The PMID:  " + x + "  is not in the database.")
                        bad_row = True
                        break
                elif field_name == "experiment_type":
                    experiment_id = experiment_to_id.get(
                        x.strip().replace('"', ''))
                    if experiment_id is None:
                        print("The experiment_type: " + x +
                              "  is not in the APO table.")
                        bad_row = True
                        break
                elif field_name == "mutant_type":
                    mutant_id = mutant_to_id.get(x.strip())
                    if mutant_id is None:
                        print("The mutant_type: " + x +
                              "  is not in the APO table.")
                        bad_row = True
                        continue
                elif field_name == "observable":
                    observable = x.strip()
                elif field_name == "qualifier":
                    qualifier = x.strip()
                elif field_name == "strain_background":
                    taxid = strain_taxid_mapping.get(x.strip())
                    if taxid is None:
                        print("The strain_background: " + x +
                              "  is not in the mapping.")
                        bad_row = True
                        continue
                    taxonomy_id = taxid_to_taxonomy_id.get(taxid)
                    if taxonomy_id is None:
                        print("The TAXON ID:  " + str(taxid) +
                              "  is not in the database.")
                        bad_row = True
                        continue
                elif field_name == "strain_name":
                    strain_name = x.strip()
                elif field_name == "allele_name":
                    allele_id = allele_to_id.get(x.strip())
                    if allele_id is None:
                        allele_id = insert_allele(nex_session, fw, source_id,
                                                  created_by, x.strip())
                        allele_to_id[x.strip()] = allele_id
                elif field_name == "allele_description":
                    allele_comment = x
                elif field_name == "reporter_name":
                    reporter_id = reporter_to_id.get(x.strip())
                    if reporter_id is None:
                        reporter_id = insert_reporter(nex_session, fw,
                                                      source_id, created_by,
                                                      x.strip())
                        reporter_to_id[x.strip()] = reporter_id
                elif field_name == "reporter_description":
                    reporter_comment = x
                elif field_name == "details":
                    details = x

            if bad_row:
                continue

            # Rows with neither a curator nor an observable are skipped
            # without a message.
            if created_by is None and observable == "":
                continue

            if observable == "":
                print("No observable is provided for line: " + line)
                continue
            phenotype = observable
            if qualifier != "":
                phenotype = observable + ": " + qualifier
            phenotype_id = phenotype_to_id.get(phenotype)
            if phenotype_id is None:
                print("The phenotype: " + phenotype +
                      "  is not in the database.")
                continue

            if dbentity_id is None:
                print("No feature_name is provided for line: " + line)
                continue
            if taxonomy_id is None:
                print("No strain_background is provided for line: " + line)
                continue
            if reference_id is None:
                print("No PMID is provided for line: " + line)
                continue
            if created_by is None:
                print("No curator ID is provided for line: " + line)
                continue

            key = (dbentity_id, taxonomy_id, reference_id, phenotype_id,
                   experiment_id, mutant_id, allele_id, reporter_id,
                   strain_name, details)
            annotation_id = key_to_annotation_id.get(key)

            group_id = 1
            if annotation_id is None:
                annotation_id = insert_phenotypeannotation(
                    nex_session, fw, source_id, created_by, dbentity_id,
                    taxonomy_id, reference_id, phenotype_id, experiment_id,
                    mutant_id, allele_id, allele_comment, reporter_id,
                    reporter_comment, strain_name, details)
                key_to_annotation_id[key] = annotation_id
            else:
                # Existing annotation: start a new condition group after the
                # last one recorded for it.
                group_id = annotation_id_to_last_group_id.get(annotation_id)
                if group_id is None:
                    group_id = 1
                else:
                    group_id = group_id + 1

            ## insert conditions here
            # Walk the (name, value, unit) triples; ``//`` keeps the integer
            # division the Python 2 original relied on.
            for m in range(0, 3 * (len(cond_header) // 3), 3):
                cond_name = conds[m]
                cond_value = conds[m + 1]
                cond_unit = conds[m + 2]
                # The condition class is the column name up to the first "_".
                cond_class = cond_header[m].split("_")[0]
                if cond_name == "":
                    continue

                if cond_class == "chemical":
                    # A chemical cell may list several comma-separated
                    # ChEBI ids with parallel value and unit lists.
                    chemical_names = cond_name.split(',')
                    chemical_values = cond_value.split(',')
                    chemical_units = cond_unit.split(',')
                    print("chemical_names= " + str(chemical_names))
                    print("chemical_values= " + str(chemical_values))
                    print("chemical_units= " + str(chemical_units))
                    for n, chemical_name in enumerate(chemical_names):
                        chebiid = "CHEBI:" + chemical_name
                        cond_name = chebiid_to_name.get(chebiid)
                        cond_value = chemical_values[n]
                        cond_unit = chemical_units[n]
                        print("cond_name= " + str(cond_name))
                        print("cond_value= " + str(cond_value))
                        print("cond_unit= " + str(cond_unit))
                        if cond_name is None:
                            # Report the id that failed to resolve (the
                            # original referenced an undefined name here).
                            print("The ChEBI ID " + chebiid +
                                  "  is not in the database.")
                            continue
                        insert_phenotypeannotation_cond(
                            nex_session, fw, created_by, annotation_id,
                            group_id, cond_class, cond_name, cond_value,
                            cond_unit)
                else:
                    if cond_class in ['temperature', 'treatment'
                                      ] and cond_unit.endswith('C'):
                        cond_unit = degree
                    insert_phenotypeannotation_cond(
                        nex_session, fw, created_by, annotation_id, group_id,
                        cond_class, cond_name, cond_value, cond_unit)

            annotation_id_to_last_group_id[annotation_id] = group_id

    # nex_session.rollback()
    nex_session.commit()
def create_seqs(strain):
    """Write the sequences lying between adjacent features of *strain*.

    GENOMIC ``Dnasequenceannotation`` rows for the strain's taxonomy are
    walked in (contig, start, end) order.  Only ORFs and the RNA gene types
    listed below are considered, and loci whose status is ``Deleted`` or
    ``Merged`` are ignored.  For each pair of consecutive kept features on the
    same contig, the contig residues strictly between them are written as a
    FASTA record to ``dataDir + "not_feature_" + strain + ".fsa"`` and one
    mapping line is written to ``fw_mapping``.

    For strains other than S288C the (name1, name2) pairs found in the
    deflines of ``refFile`` decide orientation: when only the reversed pair
    is present there, the names are swapped and the sequence is
    reverse-complemented.  S288C records are written in traversal order;
    records of other strains are written sorted by defline.

    Prints a message and returns early when the strain or its taxon is
    unknown.  Calls ``exit()`` when a referenced contig is missing.

    Args:
        strain: strain name as known to ``get_strain_taxid_mapping``.
    """
    nex_session = get_session()

    strain_to_taxid = get_strain_taxid_mapping()
    taxon = strain_to_taxid.get(strain)
    if taxon is None:
        print("The strain=", strain, " is not in the mapping.")
        return
    taxonomy = nex_session.query(Taxonomy).filter_by(
        taxid=taxon).one_or_none()
    if taxonomy is None:
        print("The taxon ID=", taxon, " is not in the database.")
        return
    taxonomy_id = taxonomy.taxonomy_id

    dbentity_id_to_name = {
        x.dbentity_id: (x.systematic_name, x.dbentity_status)
        for x in nex_session.query(Locusdbentity).all()
    }
    so_id_to_display_name = {
        x.so_id: x.display_name for x in nex_session.query(So).all()
    }

    outfile = dataDir + "not_feature_" + strain + ".fsa"

    # Reference ordering of feature pairs.  A set gives O(1) membership
    # tests; the original list was scanned twice for every feature.
    feature_order = set()
    if strain != 'S288C':
        with open(refFile) as f:
            for line in f:
                if line.startswith(">"):
                    seq_id = line.replace(">", "").split(' ')[0]
                    name1, name2, _ref_strain = seq_id.split('|')
                    feature_order.add((name1, name2))

    wanted_types = ('ORF', 'ncRNA gene', 'snoRNA gene', 'snRNA gene',
                    'tRNA gene', 'rRNA gene', 'telomerase RNA gene')

    found = set()
    prev_row = None
    prev_contig_id = None
    contig_id_to_seq = {}
    contig_id_to_display_name = {}
    defline_to_seq = {}

    rows = nex_session.query(Dnasequenceannotation).filter_by(
        dna_type='GENOMIC', taxonomy_id=taxonomy_id).order_by(
            Dnasequenceannotation.contig_id,
            Dnasequenceannotation.start_index,
            Dnasequenceannotation.end_index).all()

    # NOTE(review): ``fw_mapping`` is not defined in this function;
    # presumably it is a module-level file handle - confirm.
    with open(outfile, "w") as fw:
        for x in rows:
            (name, status) = dbentity_id_to_name[x.dbentity_id]
            if status in ['Deleted', 'Merged']:
                continue
            so_type = so_id_to_display_name.get(x.so_id)
            if so_type not in wanted_types:
                continue

            # First kept feature on a contig: nothing to pair it with yet.
            if prev_contig_id is None or prev_contig_id != x.contig_id:
                prev_row = (name, x.start_index, x.end_index)
                prev_contig_id = x.contig_id
                continue

            (prev_name, prev_start, prev_end) = prev_row

            # Feature nested inside the previous one: keep the outer feature
            # as the left neighbour.
            if x.start_index >= prev_start and x.end_index <= prev_end:
                continue

            start = prev_end + 1
            end = x.start_index - 1
            if end <= start:
                # Overlapping/abutting features (or a single-base gap):
                # no sequence is emitted, just advance.
                prev_row = (name, x.start_index, x.end_index)
                prev_contig_id = x.contig_id
                continue

            if x.contig_id not in contig_id_to_seq:
                contig = nex_session.query(Contig).filter_by(
                    contig_id=x.contig_id).one_or_none()
                if contig is None:
                    print("The contig_id=", x.contig_id,
                          " is not in the database.")
                    exit()
                contig_id_to_seq[x.contig_id] = contig.residues
                contig_id_to_display_name[x.contig_id] = contig.display_name

            # Coordinates are 1-based inclusive; convert for slicing.
            seq = contig_id_to_seq[x.contig_id][start - 1:end]

            seq_id = prev_name + "|" + name + "|" + strain
            if ((prev_name, name) not in feature_order
                    and (name, prev_name) in feature_order):
                # The reference lists this pair in the opposite order:
                # report it that way, on the other strand.
                seq_id = name + "|" + prev_name + "|" + strain
                (start, end) = (end, start)
                seq = reverse_complement(seq)

            if seq_id in found:
                # prev_row is deliberately left untouched here, as in the
                # original.
                print("The seqID is already in the file.", seq_id)
                continue
            found.add(seq_id)

            defline = (">" + seq_id + " " +
                       contig_id_to_display_name[x.contig_id] + " " +
                       "from " + str(start) + "-" + str(end))

            fw_mapping.write(seq_id + "\t" + str(x.contig_id) + "\t" +
                             str(start) + "\t" + str(end) + "\n")

            if strain == 'S288C':
                defline = defline + ", Genome Release 64-2-1,"

            id_parts = seq_id.split('|')
            defline = (defline + " between " + id_parts[0] + " and " +
                       id_parts[1])

            if strain == 'S288C':
                fw.write(defline + "\n")
                fw.write(seq + "\n")
            else:
                defline_to_seq[defline] = seq

            prev_row = (name, x.start_index, x.end_index)
            prev_contig_id = x.contig_id

        if strain != 'S288C':
            for defline in sorted(defline_to_seq):
                fw.write(defline + "\n")
                fw.write(defline_to_seq[defline] + "\n")

    fw_mapping.close()
def _iter_fasta_records(fasta_path):
    """Yield ``(defline, sequence)`` for each record of a FASTA file.

    The defline keeps its leading ``>``; sequence lines are stripped and
    concatenated.  Records with an empty defline or an empty sequence are
    not yielded - this includes the last record of the file, and an empty
    file yields nothing.
    """
    defline = ""
    chunks = []
    with open(fasta_path) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                seq = "".join(chunks)
                if seq and defline:
                    yield defline, seq
                defline = line
                chunks = []
            else:
                chunks.append(line)
    seq = "".join(chunks)
    if seq and defline:
        yield defline, seq


def load_data():
    """Load DNA, protein and CDS sequences plus locus aliases.

    Steps, in order:

    1. Every record of ``genomic_file``, ``coding_file`` and ``kb_file`` is
       passed to ``insert_dnasequenceannotation``.  The ``dna_type`` is
       ``CODING`` when the file name contains "coding", ``GENOMIC`` when it
       contains "genomic", and ``1KB`` otherwise.
    2. Every record of ``protein_file`` goes to
       ``insert_proteinsequenceannotation``.
    3. Every record of ``cds_file`` goes to ``insert_dnasubsequence``.
    4. For each row of the three-column, tab-delimited ``gene_file``
       (systematic name, GenBank id, UniProt id) the locus gets its
       ``has_sequence`` / ``has_protein`` / ``has_sequence_section`` flags
       set to ``'1'`` and two aliases inserted.  The header row and loci
       not in the database are skipped.

    ``log_file`` is opened for writing and handed to every insert helper.
    The session is committed once, after all files have been processed.
    """
    nex_session = get_session()

    taxid_to_taxonomy_id = {
        x.taxid: x.taxonomy_id for x in nex_session.query(Taxonomy).all()
    }
    sgd = nex_session.query(Source).filter_by(display_name='SGD').one_or_none()
    genBank = nex_session.query(Source).filter_by(
        display_name='GenBank/EMBL/DDBJ').one_or_none()
    uniprot = nex_session.query(Source).filter_by(
        display_name='UniProtKB').one_or_none()
    so_to_so_id = {
        x.display_name: x.so_id for x in nex_session.query(So).all()
    }
    name_to_locus_id = {
        x.systematic_name: x.dbentity_id
        for x in nex_session.query(Locusdbentity).all()
    }

    source_id = sgd.source_id
    genBank_src_id = genBank.source_id
    uniprot_src_id = uniprot.source_id

    strain_taxid_mapping = get_strain_taxid_mapping()

    with open(log_file, "w") as fw:

        ## DNA sequences
        for seq_file in [genomic_file, coding_file, kb_file]:
            if 'coding' in seq_file:
                dna_type = 'CODING'
            elif 'genomic' in seq_file:
                dna_type = 'GENOMIC'
            else:
                dna_type = '1KB'
            for defline, seq in _iter_fasta_records(seq_file):
                insert_dnasequenceannotation(nex_session, fw, source_id,
                                             dna_type, defline, seq,
                                             so_to_so_id,
                                             taxid_to_taxonomy_id,
                                             name_to_locus_id,
                                             strain_taxid_mapping)

        ## protein sequences
        for defline, seq in _iter_fasta_records(protein_file):
            insert_proteinsequenceannotation(nex_session, fw, source_id,
                                             defline, seq,
                                             taxid_to_taxonomy_id,
                                             name_to_locus_id,
                                             strain_taxid_mapping)

        ## cds sequences
        for defline, seq in _iter_fasta_records(cds_file):
            insert_dnasubsequence(nex_session, fw, source_id, defline, seq,
                                  taxid_to_taxonomy_id, name_to_locus_id,
                                  strain_taxid_mapping, so_to_so_id)

        ## locus_alias + locusdbentity
        with open(gene_file) as f:
            for line in f:
                if line.startswith('systematic_name'):
                    continue
                [name, genBankID, uniprotID] = line.strip().split("\t")
                locus_id = name_to_locus_id.get(name)
                if locus_id is None:
                    print(name + " is not in the database.")
                    continue
                nex_session.query(Locusdbentity).filter_by(
                    dbentity_id=locus_id).update({
                        'has_sequence': '1',
                        'has_protein': '1',
                        'has_sequence_section': '1'
                    })
                insert_locus_alias(
                    nex_session, fw, locus_id, genBankID, genBank_src_id,
                    'DNA accession ID',
                    'https://www.ncbi.nlm.nih.gov/nuccore/' + genBankID)
                insert_locus_alias(
                    nex_session, fw, locus_id, uniprotID, uniprot_src_id,
                    'UniProtKB ID',
                    'http://www.uniprot.org/uniprot/' + uniprotID)

    # nex_session.rollback()
    nex_session.commit()
import os from os import path from src.models import Locusdbentity, Dnasubsequence, Dnasequenceannotation, Taxonomy from scripts.loading.database_session import get_session from scripts.loading.variant import calculate_variant_data, aligned_sequence_to_snp_sequence, \ strain_to_id, calculate_block_data from scripts.loading.util import get_strain_taxid_mapping nex_session = get_session() strain_to_taxid = get_strain_taxid_mapping() strain_to_id = strain_to_id() taxon = strain_to_taxid['S288C'] dataDir = 'scripts/loading/variant/data/' dnaSeqAlignFile = dataDir + 'dna_sequence_alignment.txt' proteinSeqAlignFile = dataDir + 'protein_sequence_alignment.txt' dnaVariantFile = dataDir + 'dna_variant.txt' proteinVariantFile = dataDir + 'protein_variant.txt' dnaDir = dataDir + 'dna_align/' proteinDir = dataDir + 'protein_align/' def generate_protein_data(name_to_dbentity_id): fw = open(proteinSeqAlignFile, "w") fw2 = open(proteinVariantFile, "w") fw.write("sequence_name\tdbentity_id\taligned_sequence\n") fw2.write(
def load_data():
    """Load protein abundance annotations from the tab-delimited ``datafile``.

    All rows are attached to the reference identified by the module-level
    ``PMID``; if that PMID is not in the database the function prints a
    message and returns without loading anything.

    Columns used (0-based), after the literal text ``None`` has been removed
    from the line:

    ====  ==========================================================
    0     systematic name (required, must exist)
    2     PMID of the original reference (converted with ``int``)
    3     data value (``int``)
    4, 5  ECO id and EFO id (both must exist)
    6     strain (must map to a taxonomy row)
    7, 8  optional ChEBI id and GO id (must exist when given)
    9     optional time value (``int``)
    10    optional time unit, normalised to ``hr`` / ``d`` / ``min``
    11    optional concentration value (``float``)
    12    concentration unit, read only when column 11 is given
    13    optional fold change (``float``)
    14    optional median (``int``)
    15    optional MAD (``int``)
    ====  ==========================================================

    Rows failing a lookup are reported on stdout and skipped.  The session is
    committed after every 501 inserted rows and once more at the end, then
    closed.
    """
    nex_session = get_session()

    sgd = nex_session.query(Source).filter_by(display_name='SGD').one_or_none()
    source_id = sgd.source_id

    name_to_dbentity_id = {
        x.systematic_name: x.dbentity_id
        for x in nex_session.query(Locusdbentity).all()
    }
    pmid_to_reference_id = {
        x.pmid: x.dbentity_id
        for x in nex_session.query(Referencedbentity).all()
    }
    ecoid_to_eco_id = {
        x.ecoid: x.eco_id for x in nex_session.query(Eco).all()
    }
    efoid_to_efo_id = {
        x.efoid: x.efo_id for x in nex_session.query(Efo).all()
    }
    chebiid_to_chebi_id = {
        x.chebiid: x.chebi_id for x in nex_session.query(Chebi).all()
    }
    goid_to_go_id = {x.goid: x.go_id for x in nex_session.query(Go).all()}
    taxid_to_taxonomy_id = {
        x.taxid: x.taxonomy_id for x in nex_session.query(Taxonomy).all()
    }
    strain_to_taxid_mapping = get_strain_taxid_mapping()

    reference_id = pmid_to_reference_id.get(PMID)
    if reference_id is None:
        print("The PMID:", PMID, " is not in the database.")
        return

    log.info("Start loading:\n")
    log.info(str(datetime.now()) + "\n")

    # Index of the last optional column; short rows are padded up to it.
    last_column = 15

    i = 0
    # ``with`` closes both handles; the original never closed the log file.
    with open(logfile, "w") as fw, open(datafile) as f:
        for line in f:
            # Skip the header row.  The misspelt variant is what the
            # original checked for; the correct spelling is accepted too.
            if line.startswith(("SYSTEMATIC_NMAE", "SYSTEMATIC_NAME")):
                continue

            pieces = line.strip().replace("None", "").split("\t")

            dbentity_id = name_to_dbentity_id.get(pieces[0])
            if dbentity_id is None:
                print("The ORF name is not in the Locusdbentity table:",
                      pieces[0])
                continue

            # May be None when the original PMID is not in the database;
            # passed through as is.
            original_reference_id = pmid_to_reference_id.get(int(pieces[2]))
            data_value = int(pieces[3])

            eco_id = ecoid_to_eco_id.get(pieces[4])
            if eco_id is None:
                print("The ECOID:", pieces[4], " is not in the database.")
                continue
            efo_id = efoid_to_efo_id.get(pieces[5])
            if efo_id is None:
                print("The EFOID:", pieces[5], " is not in the database.")
                continue

            taxid = strain_to_taxid_mapping.get(pieces[6])
            if taxid is None:
                print("The strain:", pieces[6],
                      " is not in the mapping list.")
                continue
            taxonomy_id = taxid_to_taxonomy_id.get(taxid)
            if taxonomy_id is None:
                print("The TAXID:", taxid, " is not in the database.")
                continue

            # strip() removes trailing empty columns, so a row can end
            # anywhere after column 6.  Pad it so the optional columns can
            # be indexed safely (the original raised IndexError for rows
            # with 8 to 15 columns).
            if len(pieces) <= last_column:
                pieces.extend([""] * (last_column + 1 - len(pieces)))

            chebi_id = None
            go_id = None
            time_value = None
            time_unit = None
            conc_value = None
            conc_unit = None
            fold_change = None
            median = None
            mad = None

            if pieces[7]:
                chebi_id = chebiid_to_chebi_id.get(pieces[7])
                if chebi_id is None:
                    print("The chebiid:", pieces[7],
                          " is not in the database.")
                    continue
            if pieces[8]:
                go_id = goid_to_go_id.get(pieces[8])
                if go_id is None:
                    print("The goid:", pieces[8], " is not in the database.")
                    continue
            if pieces[9]:
                time_value = int(pieces[9])
            if pieces[10]:
                time_unit = pieces[10]
                if time_unit.startswith('hour'):
                    time_unit = 'hr'
                elif time_unit.startswith('day'):
                    time_unit = 'd'
                elif time_unit.startswith('min'):
                    time_unit = 'min'
            if pieces[11]:
                conc_value = float(pieces[11])
                conc_unit = pieces[12]
            if pieces[13]:
                fold_change = float(pieces[13])
            if pieces[14]:
                median = int(pieces[14])
            if pieces[15]:
                mad = int(pieces[15])

            insert_proteinabundanceannotation(
                nex_session, fw, dbentity_id, source_id, taxonomy_id,
                reference_id, original_reference_id, eco_id, efo_id, chebi_id,
                go_id, data_value, fold_change, time_value, time_unit,
                conc_value, conc_unit, median, mad)

            # Commit in batches to keep the open transaction small.
            i = i + 1
            if i > 500:
                # nex_session.rollback()
                nex_session.commit()
                i = 0

    # nex_session.rollback()
    nex_session.commit()
    nex_session.close()

    log.info("Done loading\n")
    log.info(str(datetime.now()) + "\n")
def read_data_and_update_database(nex_session, fw):
    """Insert protein domain annotations from ``domain_file`` that are new.

    Each tab-delimited row is interpreted as follows (0-based columns):

    * column 0 is split on ``_``; part 0 is the locus systematic name and
      part 2 the strain;
    * column 4 is the domain name (spaces replaced by ``_``), looked up by
      ``Proteindomain.format_name``;
    * columns 6 and 7 are the start and end coordinates;
    * column 10 is a ``-``-separated date whose three parts are reversed to
      form the run date.

    An annotation is identified by (dbentity_id, proteindomain_id, start,
    end, taxonomy_id).  It is inserted, with the InterPro source, and
    committed only when that key is neither already in the database nor
    already inserted during this run.  Rows with an unknown strain, taxon,
    locus or domain are reported on stdout and skipped.

    Args:
        nex_session: database session used for lookups, inserts and commits.
        fw: log handle passed through to ``insert_annotation``.
    """
    ipr = nex_session.query(Source).filter_by(
        format_name='InterPro').one_or_none()

    taxon_to_taxonomy_id = {
        x.taxid: x.taxonomy_id for x in nex_session.query(Taxonomy).all()
    }
    name_to_dbentity_id = {
        x.systematic_name: x.dbentity_id
        for x in nex_session.query(Locusdbentity).all()
    }
    format_name_to_id = {
        x.format_name: x.proteindomain_id
        for x in nex_session.query(Proteindomain).all()
    }

    source_id = ipr.source_id

    # Keys of annotations already stored; only membership is ever needed.
    existing_keys = {
        (x.dbentity_id, x.proteindomain_id, x.start_index, x.end_index,
         x.taxonomy_id)
        for x in nex_session.query(Proteindomainannotation).all()
    }

    strain_to_taxon_mapping = get_strain_taxid_mapping()

    # Keys inserted during this run, so duplicate rows in the file are
    # inserted only once.
    inserted_keys = set()

    # ``with`` closes the input file even if an insert or commit raises.
    with open(domain_file) as f:
        for line in f:
            items = line.strip().split("\t")
            id_parts = items[0].split('_')
            name = id_parts[0]
            strain = id_parts[2]

            taxon = strain_to_taxon_mapping.get(strain)
            if taxon is None:
                print("The strain = " + strain +
                      " is not in the mapping module.")
                continue
            taxonomy_id = taxon_to_taxonomy_id.get(taxon)
            if taxonomy_id is None:
                print("The taxid = " + taxon + " is not in the database.")
                continue

            dbentity_id = name_to_dbentity_id.get(name)
            if dbentity_id is None:
                print("The systematic_name ", name,
                      " is not in the LOCUSDBENTITY table.")
                continue

            domain_name = items[4].replace(' ', '_')
            proteindomain_id = format_name_to_id.get(domain_name)
            if proteindomain_id is None:
                print("The domain name:", domain_name,
                      " is not in the PROTEINDOMAIN table.")
                continue

            start = int(items[6])
            end = int(items[7])

            # Reverse the three date parts (third-second-first).
            run_time = items[10].split('-')
            run_date = run_time[2] + '-' + run_time[1] + '-' + run_time[0]

            key = (dbentity_id, proteindomain_id, start, end, taxonomy_id)
            if key not in existing_keys and key not in inserted_keys:
                insert_annotation(nex_session, fw, dbentity_id,
                                  proteindomain_id, source_id, taxonomy_id,
                                  start, end, run_date)
                nex_session.commit()
                inserted_keys.add(key)

    nex_session.commit()