def map_sequence_key(sequence_key):
    """Map domain regions of a parent sequence onto a fold sequence.

    For every FilesystemOutfile row whose ``sequence_key`` equals the given
    key, the fold sequence is located inside its parent sequence, the
    resulting span is compared with each DomainRegion belonging to the
    parent's domains via ``percent_coverage``, and a DomainFoldableMap is
    added to the session for every pair that is not reported as
    non-overlapping.

    Progress is printed to stdout.

    :param sequence_key: value matched against
        ``FilesystemOutfile.sequence_key``.

    NOTE(review): the new objects are only added to the session; no commit
    or flush happens in this function. Presumably the session autocommits
    or the caller commits -- confirm.
    """
    fold_alias = aliased(Sequence)
    parent_alias = aliased(Sequence)
    session = Session()
    rows = session.query(FilesystemOutfile, fold_alias, parent_alias).join(
        (fold_alias, FilesystemOutfile.sequence),
        (parent_alias, FilesystemOutfile.parent_sequence)
    ).filter(FilesystemOutfile.sequence_key == sequence_key).all()

    # Coverage locations for which no mapping entry is stored.
    nonoverlapping = ("fold_domain_nonoverlapping",
                      "domain_fold_nonoverlapping")

    # Single-argument print(...) is used throughout: it produces the same
    # output under the Python 2 print statement this file relies on.
    for row in rows:
        print(row)
        outfile, fold_seq, parent_seq = row[0], row[1], row[2]

        # str.find returns -1 when the fold sequence is not a substring of
        # the parent; without this guard fold_start would silently become 0
        # and bogus coordinates would be stored.
        offset = parent_seq.sequence.find(fold_seq.sequence)
        if offset < 0:
            print("fold sequence not found in parent sequence; "
                  "skipping outfile %s" % (outfile.id,))
            continue

        # Convert the 0-based offset to a 1-based start position.
        fold_start = offset + 1
        # NOTE(review): start + length is one past the last residue in
        # 1-based numbering; kept as in the original -- confirm this is the
        # convention percent_coverage expects.
        fold_stop = fold_start + len(fold_seq.sequence)
        print("%s %s" % (fold_start, fold_stop))

        domain_regions = session.query(Domain, DomainRegion).join(
            DomainRegion.domain
        ).filter(
            Domain.parent_sequence_key == outfile.parent_sequence_key
        ).all()

        for dreg in domain_regions:
            print(dreg)
            domain, region = dreg[0], dreg[1]
            domain_start = region.start
            domain_stop = region.stop

            loc_dict = {"fold_start": fold_start,
                        "fold_stop": fold_stop,
                        "domain_start": domain_start,
                        "domain_stop": domain_stop}
            coverage = percent_coverage(loc_dict)
            print(coverage)

            #kdrew: do not add entries which are nonoverlapping
            if coverage['location'] not in nonoverlapping:
                dfm = DomainFoldableMap(
                    parent_sequence_key=outfile.parent_sequence_key,
                    fold_sequence_key=outfile.sequence_key,
                    domain_sequence_key=domain.domain_sequence_key,
                    domain_key=domain.id,
                    outfile_key=outfile.id,
                    fold_start=fold_start,
                    fold_stop=fold_stop,
                    domain_start=domain_start,
                    domain_stop=domain_stop,
                    fold_coverage=coverage['fold_coverage'],
                    domain_coverage=coverage['domain_coverage'])
                print("dfm:  %s" % (dfm,))
                session.add(dfm)
def domain_fasta(id=None):
    """Write domain sequences to a FASTA file.

    Selects the distinct Sequence rows joined through
    DomainSCCS/Domain/Protein/Experiment whose ``DomainSCCS.domain_type`` is
    neither ``'psiblast'`` nor ``'fold_recognition'``, converts them with
    ``YRCRecordFactory`` and writes them in FASTA format.

    :param id: experiment id. When truthy, only sequences of that experiment
        are written, to ``/Users/kdrew/tmp/exp_<id>.fasta``; otherwise all
        matching sequences are written to
        ``/Users/kdrew/tmp/hpf_denovo.fasta``. Note that ``0`` is treated
        like ``None``.
    """
    # Use one session instance for every query. The original mixed
    # Session.query(...) with Session().query(...); calling the factory
    # matches map_sequence_key and does not depend on Session exposing
    # query() directly.
    session = Session()

    # Single-argument print(...) produces the same output under the
    # Python 2 print statement this file relies on.
    query = session.query(Sequence).join(
        DomainSCCS.domain
    ).join(
        Domain.sequence
    ).join(
        Domain.proteins
    ).join(
        Protein.experiment
    ).filter(
        not_(DomainSCCS.domain_type.in_(('psiblast', 'fold_recognition')))
    )

    if id:
        outfile = "/Users/kdrew/tmp/exp_%d.fasta" % (id,)
        # .one() raises if the experiment does not exist (or is ambiguous),
        # before any file is created.
        experiment = session.query(Experiment).filter(
            Experiment.id == id
        ).one()
        species = experiment.species()
        print(species)
        query = query.filter(Experiment.id == id)
    else:
        print("all denovo")
        outfile = "/Users/kdrew/tmp/hpf_denovo.fasta"

    sequences = query.distinct().all()
    print(len(sequences))

    bio_records = YRCRecordFactory().create(*sequences)
    # The with-block closes the handle; no explicit close() is needed.
    with open(outfile, "w") as handle:
        SeqIO.write(bio_records, handle, "fasta")