def member_info(self, member):
    """Collect structure-level details for an IFE member.

    Parameters
    ----------
    member : dict
        Must contain an 'id' key with the ife id.

    Returns
    -------
    info : dict
        The member dict extended with 'pdb', 'model', 'chains' and
        'sym_op' keys.

    Raises
    ------
    core.InvalidState
        If no chains can be found for the member.
    """
    with self.session() as session:
        info = session.query(mod.IfeInfo.pdb_id.label('pdb'),
                             mod.IfeInfo.model).\
            filter_by(ife_id=member['id']).\
            one()
        info = row2dict(info)
        info.update(member)

    with self.session() as session:
        query = session.query(mod.ChainInfo.chain_name,
                              mod.IfeChains.is_structured,
                              ).\
            join(mod.IfeChains,
                 mod.IfeChains.chain_id == mod.ChainInfo.chain_id).\
            filter_by(ife_id=member['id'])

        if not query.count():
            raise core.InvalidState("Could not find chains for %s" % member)

        all_chains = [row2dict(c) for c in query]
        valid = op.itemgetter('is_structured')
        # Prefer structured chains; fall back to all chains when none are
        # flagged as structured.
        chains = [c['chain_name'] for c in all_chains if valid(c)]
        if not chains:
            chains = [c['chain_name'] for c in all_chains]

    info['chains'] = chains
    loader = self._create(IfeLoader)
    info['sym_op'] = loader.sym_op(info['pdb'])
    return info
def position_info(self, unit):
    """Get the information about a position in an experimental sequence
    using a unit id.

    Parameters
    ----------
    unit : str
        The unit id to look up.

    Returns
    -------
    result : dict or None
        A dict with 'index', 'exp_seq_id', 'chain', 'model' and 'sym_op'
        keys, or None when no position could be found even among
        alternate ids.
    """
    self.logger.debug("Finding position for %s", unit)
    try:
        with self.session() as session:
            pos = mod.ExpSeqPosition
            mapping = mod.ExpSeqUnitMapping
            result = session.query(pos.index,
                                   pos.exp_seq_id,
                                   mod.UnitInfo.chain,
                                   mod.UnitInfo.model,
                                   mod.UnitInfo.sym_op,
                                   ).\
                join(mapping,
                     mapping.exp_seq_position_id == pos.exp_seq_position_id).\
                join(mod.UnitInfo,
                     mod.UnitInfo.unit_id == mapping.unit_id).\
                filter(mapping.unit_id == unit).\
                one()
            return row2dict(result)
    # BUG FIX: was a bare ``except:`` which also swallowed
    # KeyboardInterrupt/SystemExit; narrowed to Exception.
    except Exception:
        # Handle the case where the unit id in the database table ends
        # with ||A or ||B but that is not being stored in unit.
        # Not sure why not.
        self.logger.info('Looking up sequence position of alternates of ' +
                         unit)
        newunit = '%' + unit + '%'
        with self.session() as session:
            pos = mod.ExpSeqPosition
            mapping = mod.ExpSeqUnitMapping
            result = session.query(pos.index,
                                   pos.exp_seq_id,
                                   mod.UnitInfo.chain,
                                   mod.UnitInfo.model,
                                   mod.UnitInfo.sym_op,
                                   ).\
                join(mapping,
                     mapping.exp_seq_position_id == pos.exp_seq_position_id).\
                join(mod.UnitInfo,
                     mod.UnitInfo.unit_id == mapping.unit_id).\
                filter(mapping.unit_id.like(newunit)).\
                first()
            if not result:
                self.logger.info('No experimental sequence position for ' +
                                 unit)
                # BUG FIX: previously fell through to row2dict(None) and
                # raised a confusing TypeError; return None explicitly.
                return None
            return row2dict(result)
def unit_mapping(self, pdb):
    """Build a mapping from `as_key` keys to database unit ids.

    Every unit in the structure is indexed twice: once under a
    model-agnostic key and once under its model-specific key.

    Parameters
    ----------
    pdb : str
        The pdb id to look up a mapping for

    Returns
    -------
    mapping : dict
        Maps keys produced by `as_key` to sets of unit ids.
    """
    unit_ids = coll.defaultdict(set)
    with self.session() as session:
        rows = session.query(mod.UnitInfo).\
            filter_by(pdb_id=pdb)
        for row in rows:
            info = ut.row2dict(row)
            for key in (as_key(info, ignore_model=True), as_key(info)):
                unit_ids[key].add(row.unit_id)
    return unit_ids
def load_nr_classes(self, release, resolution):
    """Load all NR class members for a release at a resolution cutoff.

    Parameters
    ----------
    release : str
        The nr release id to load classes from.
    resolution : str
        The resolution cutoff of the classes.

    Returns
    -------
    classes : list
        A list of lists, one per class, of member dicts with 'index',
        'id', 'pdb_id', 'length', 'chain_id', 'name' and 'rep' keys.
    """
    with self.session() as session:
        query = session.query(
            mod.NrChains.rank.label('index'),
            mod.NrChains.ife_id.label('id'),
            mod.IfeInfo.pdb_id,
            mod.IfeInfo.length,
            mod.IfeChains.chain_id,
            mod.NrClasses.name,
        ).\
            join(mod.IfeInfo,
                 mod.IfeInfo.ife_id == mod.NrChains.ife_id).\
            join(mod.IfeChains,
                 mod.IfeChains.ife_id == mod.IfeInfo.ife_id).\
            join(mod.NrClasses,
                 mod.NrClasses.nr_class_id == mod.NrChains.nr_class_id).\
            filter(mod.NrClasses.nr_release_id == release).\
            filter(mod.NrClasses.resolution == resolution).\
            filter(mod.IfeChains.index == 0)

        data = coll.defaultdict(list)
        for result in query:
            entry = row2dict(result)
            # Rank 0 marks the representative of the class.
            entry['rep'] = (entry['index'] == 0)
            nr = entry['name']
            data[nr].append(entry)
        return data.values()
def normalized_mapping(self, pdb_id):
    """This produces a dictonary that can be used to correct bad unit ids.
    Some of the loops stored after we migrated the database have incorrect
    unit ids. The errors appear to be of 2 kinds, incorrect model number
    and possibly bad alt ids. By producing this mapping we try to correct
    the issue by finding the correct unit id.

    :param str pdb_id: The pdb id to get units for.
    :returns: A dictonary with Unit keys mapping to the unit id.
    """
    with self.session() as session:
        # Column labels match the field names of the Unit tuple so the
        # row dict can be splatted directly into its constructor.
        query = session.query(mod.UnitInfo.unit_id,
                              mod.UnitInfo.pdb_id.label('pdb'),
                              mod.UnitInfo.model,
                              mod.UnitInfo.chain,
                              mod.UnitInfo.number.label('component_number'),
                              mod.UnitInfo.ins_code.label('insertion_code'),
                              mod.UnitInfo.alt_id,
                              mod.UnitInfo.sym_op.label('symmetry'),
                              ).\
            filter(mod.UnitInfo.pdb_id == pdb_id)

        if not query.count():
            raise core.InvalidState("No units in %s" % pdb_id)

        mapping = {}
        for result in query:
            data = row2dict(result)
            unit_id = data.pop('unit_id')
            key = Unit(**data)
            # Two distinct unit ids sharing a key would make correction
            # ambiguous, so fail loudly.
            if key in mapping:
                raise core.InvalidState("Non unique mapping")
            mapping[key] = unit_id
        return mapping
def incomplete(self, pdb):
    """Load all incomplete nucleotides from the database.

    This will query the unit_incomplete for all incomplete data.

    Parameters
    ----------
    pdb : str
        The pdb id to use.

    Returns
    -------
    incomplete : set
        A set of unit ids that are incomplete.
    """
    inc = mod.UnitIncomplete
    columns = (inc.pdb_id, inc.model, inc.chain, inc.number,
               inc.unit, inc.alt_id, inc.ins_code)
    with self.session() as session:
        rows = session.query(*columns).filter_by(pdb_id=pdb)
        entries = set()
        for row in rows:
            entries.add(Entry(**row2dict(row)))
        return entries
def interactions(self, pdb):
    """Lookup all interactions for the given structure. This gets all
    interaction entries. If there are none this returns an empty list. The
    entries in the list are dictonaries with the same names as in
    `Exporter.headers`.

    Parameters
    ----------
    pdb : str
        The PDB id to look up interactions for

    Returns
    -------
    interactions : list
        A list of all interactions.
    """
    with self.session() as session:
        # Columns are labeled with the export header names so row2dict
        # yields dicts keyed the way the exporter expects.
        query = session.query(
            mod.UnitPairsInteractions.unit_id_1.label(self.headers[0]),
            mod.UnitPairsInteractions.unit_id_2.label(self.headers[1]),
            mod.UnitPairsInteractions.f_lwbp.label(self.headers[2]),
            mod.UnitPairsInteractions.f_stacks.label(self.headers[3]),
            mod.UnitPairsInteractions.f_bphs.label(
                self.headers[4])).filter_by(pdb_id=pdb)

        count = query.count()
        if not count:
            self.logger.warning("No interactions found for %s", pdb)
        else:
            self.logger.info("Found %s interactions for %s", count, pdb)

        return [row2dict(result) for result in query]
def interactions(self, pdb):
    """Lookup all interactions for the given structure. This gets all
    interaction entries. If there are none this returns an empty list. The
    entries in the list are dictonaries with the same names as in
    `Exporter.headers`.

    Parameters
    ----------
    pdb : str
        The PDB id to look up interactions for

    Returns
    -------
    interactions : list
        A list of all interactions.
    """
    table = mod.UnitPairsInteractions
    columns = (table.unit_id_1, table.unit_id_2, table.f_lwbp,
               table.f_stacks, table.f_bphs)
    # Label each column with the matching export header name.
    labeled = [col.label(name) for col, name in zip(columns, self.headers)]
    with self.session() as session:
        query = session.query(*labeled).filter_by(pdb_id=pdb)

        count = query.count()
        if count:
            self.logger.info("Found %s interactions for %s", count, pdb)
        else:
            self.logger.warning("No interactions found for %s", pdb)

        return [row2dict(row) for row in query]
def mapping(self, pdb):
    """Create a dictionary that maps from data produced by `as_key` to
    unit ids that are in the database. This will lookup all unit ids in
    the database and create the required mapping.

    Parameters
    ----------
    pdb : str
        The pdb id to look up a mapping for

    Returns
    -------
    mapping : dict
        The mapping dictionary to use.
    """
    keyed = coll.defaultdict(list)
    with self.session() as session:
        rows = session.query(mod.UnitInfo).\
            filter_by(pdb_id=pdb)
        for row in rows:
            keyed[as_key(ut.row2dict(row))].append(row.unit_id)
    return keyed
def load_quality(self, members):
    """Attach quality measures to each member dict.

    Parameters
    ----------
    members : list
        A list of dicts each containing a 'pdb' key.

    Returns
    -------
    members : list
        The same list with a 'quality' dict added to each member.
    """

    def as_quality(data):
        # 'has' records which measures are actually present; missing
        # measures fall back to large sentinel values (presumably
        # worst-case penalties — TODO confirm against downstream scoring).
        return {
            'has': {key for key, value in data.items() if value},
            'rsrz': data.get('rsrz') or 100,
            'backbone': data.get('backbone') or 100,
            'clashscore': data.get('clashscore') or 500,
        }

    known = {m['pdb'] for m in members}
    with self.session() as session:
        query = session.query(mod.PdbQuality.pdb_id,
                              mod.PdbQuality.percent_rsrz_outliers.
                              label('rsrz'),
                              mod.PdbQuality.clashscore,
                              mod.PdbQuality.percent_rota_outliers.
                              label('backbone'),
                              ).\
            filter(mod.PdbQuality.pdb_id.in_(known))

        measures = {}
        for result in query:
            result = row2dict(result)
            pdb_id = result.pop('pdb_id')
            measures[pdb_id] = as_quality(result)

    for member in members:
        pdb_id = member['pdb']
        # Members without any quality row get the all-defaults entry.
        member['quality'] = measures.get(pdb_id, as_quality({}))
    return members
def chain_status(self, reps, release, resolution, **kwargs):
    """Build per-IFE status rows for a release/resolution.

    Parameters
    ----------
    reps : dict
        Maps ife ids to {method: bool} representative flags.
    release : str
        The nr release id.
    resolution : str
        The resolution cutoff.

    Returns
    -------
    data : list
        A list of dicts with Group/Release/IFE/BP/PDB/NT columns plus
        'Current', 'Ratio' and one column per method in `reps`.
    """
    with self.session() as session:
        query = session.query(mod.NrClasses.name.label('Group'),
                              mod.NrClasses.nr_release_id.label('Release'),
                              mod.NrChains.ife_id.label('IFE'),
                              mod.IfeInfo.bp_count.label('BP'),
                              mod.IfeInfo.pdb_id.label('PDB'),
                              mod.IfeInfo.length.label('NT'),
                              mod.NrChains.rep,
                              ).\
            join(mod.NrChains,
                 mod.NrChains.nr_class_id == mod.NrClasses.nr_class_id).\
            join(mod.IfeInfo,
                 mod.IfeInfo.ife_id == mod.NrChains.ife_id).\
            filter(mod.NrClasses.nr_release_id == release).\
            filter(mod.NrClasses.resolution == resolution)

        data = []
        for result in query:
            entry = row2dict(result)
            rep = entry.pop('rep')
            entry['Current'] = 'Member'
            # NOTE(review): raises ZeroDivisionError if NT is 0 —
            # presumably every IFE has a positive length; confirm.
            entry['Ratio'] = round(float(entry['BP']) / entry['NT'], 4)
            if rep:
                entry['Current'] = 'Representative'
            # One extra column per representative-selection method.
            for method, status in reps[entry['IFE']].items():
                entry[method] = 'Member'
                if status:
                    entry[method] = 'Representative'
            data.append(entry)
        return data
def chains(self, release_id, resolution):
    """Load all chains in a release at a resolution, grouped by class.

    Parameters
    ----------
    release_id : str
        The nr release id.
    resolution : str
        The resolution cutoff.

    Returns
    -------
    grouped : list
        A list of lists of chain dicts, one inner list per class name.
    """
    with self.session() as session:
        chains = mod.NrChains
        ife = mod.IfeInfo
        pdbs = mod.PdbInfo
        classes = mod.NrClasses
        query = session.query(chains.nr_release_id,
                              classes.name,
                              classes.handle,
                              classes.version,
                              ife.ife_id.label('id'),
                              ife.bp_count.label('bp'),
                              ife.length,
                              pdbs.resolution,
                              pdbs.experimental_technique.label('method'),
                              ).\
            join(ife, ife.ife_id == chains.ife_id).\
            join(classes, classes.nr_class_id == chains.nr_class_id).\
            join(pdbs, pdbs.pdb_id == ife.pdb_id).\
            filter(classes.nr_release_id == release_id).\
            filter(classes.resolution == resolution).\
            order_by(classes.name)

        found = [row2dict(r) for r in query]
        # groupby only merges adjacent rows, so the order_by above is
        # required for correct grouping.
        grouped = it.groupby(found, op.itemgetter('name'))
        return [list(g) for n, g in grouped]
def pdb_info(self, ifes):
    """Load PDB-level report data for the given IFEs.

    Parameters
    ----------
    ifes : list
        The ifes to look up pdb info for; pdb ids are extracted via
        ``self.class_property``.

    Returns
    -------
    data : dict
        Maps pdb id to a dict with 'PDB', 'Resolution', 'Method' and
        'Title' keys. Experimental techniques are shortened to report
        labels; unknown techniques pass through unchanged.
    """
    # Mapping from database technique names to short report labels.
    # Replaces a long if/elif chain that ended in a no-op
    # ``else: method = method`` branch.
    method_names = {
        'X-RAY DIFFRACTION': 'x-ray',
        'SOLUTION NMR': 'nmr',
        'ELECTRON MICROSCOPY': 'cryo-em',
        'FIBER DIFFRACTION': 'fib-dif',
        'FLUORESCENCE TRANSFER': 'fluo-trans',
        'SOLID-STATE NMR': 'nmr-sld-sta',
        'SOLUTION NMR, SOLUTION SCATTERING': 'nmr-sol-scat',
        'SOLUTION NMR, THEORETICAL MODEL': 'nmr-sol-theo',
    }

    pdb_ids = self.class_property(ifes, 'pdb_id')
    with self.session() as session:
        query = session.query(
            mod.PdbInfo.pdb_id.label('PDB'),
            mod.PdbInfo.resolution.label('Resolution'),
            mod.PdbInfo.experimental_technique.label('Method'),
            mod.PdbInfo.title.label('Title'),
        ).filter(mod.PdbInfo.pdb_id.in_(pdb_ids))

        data = {}
        for result in query:
            entry = row2dict(result)
            method = entry.pop('Method')
            # Unknown techniques are kept verbatim, matching the old
            # behavior.
            entry['Method'] = method_names.get(method, method)
            data[entry['PDB']] = entry
        return data
def units_between(self, unit1, unit2):
    """Get a list of all units between two units. This assumes they are
    on the same chain and have the same symmetry operator.

    :param str unit1: The unit id at the start of the range.
    :param str unit2: The unit id at the end of the range.
    :returns: A list of Entry tuples ordered by sequence position.
    """
    start = self.position_info(unit1)
    stop = self.position_info(unit2)
    with self.session() as session:
        units = mod.UnitInfo
        mapping = mod.ExpSeqUnitMapping
        pos = mod.ExpSeqPosition
        query = session.query(units.pdb_id,
                              units.model,
                              units.chain,
                              units.number,
                              units.unit,
                              units.alt_id,
                              units.ins_code,
                              ).\
            join(mapping, mapping.unit_id == units.unit_id).\
            join(pos,
                 mapping.exp_seq_position_id == pos.exp_seq_position_id).\
            filter(pos.exp_seq_id == start['exp_seq_id']).\
            filter(pos.index >= start['index']).\
            filter(pos.index <= stop['index']).\
            filter(units.chain == start['chain']).\
            filter(units.model == start['model']).\
            filter(units.sym_op == start['sym_op']).\
            distinct().\
            order_by(asc(pos.index))

        return [Entry(**row2dict(r)) for r in query]
def positions(self, pdb, chain):
    """Load the experimental sequence positions for a chain.

    Parameters
    ----------
    pdb : str
        The pdb id.
    chain : str
        The chain name.

    Returns
    -------
    positions : list
        A list of dicts with 'unit_id', 'index' (1-based), 'unit' and
        'observed' keys.

    Raises
    ------
    core.InvalidState
        If no positions could be loaded for the chain.
    """
    # Looking up the experimental sequence also validates the pdb/chain
    # pair; the value itself is not used here.
    exp_seq = self.exp_seq(pdb, chain)
    with self.session() as session:
        esum = mod.ExpSeqUnitMapping
        esp = mod.ExpSeqPosition
        escm = mod.ExpSeqChainMapping
        ci = mod.ChainInfo
        query = session.query(
            esum.unit_id,
            esp.index,
            esp.unit,
        ).join(esp,
               esp.exp_seq_position_id == esum.exp_seq_position_id).\
            join(escm,
                 escm.exp_seq_chain_mapping_id ==
                 esum.exp_seq_chain_mapping_id).\
            join(ci, ci.chain_id == escm.chain_id).\
            filter(ci.pdb_id == pdb).\
            filter(ci.chain_name == chain)

        if not query.count():
            # BUG FIX: previously referenced an undefined name ``pdb_id``
            # and raised a NameError instead of the intended InvalidState.
            raise core.InvalidState(
                "Could not load positions for %s|1|%s" % (pdb, chain))

        positions = []
        for result in query:
            entry = row2dict(result)
            # A null unit id means the position was not observed.
            entry['observed'] = int(result.unit_id is not None)
            # Convert from 0-based to 1-based indexing.
            entry['index'] = entry['index'] + 1
            positions.append(entry)
        return positions
def pairs(self, pdb):
    """Collect all pairwise interactions in a structure.

    Parameters
    ----------
    pdb : str
        The pdb id to load interactions for.

    Returns
    -------
    pairs : dict
        Maps unit id 1 to a dict of 'Pairs', 'Stacks' and 'Basephosphate'
        sets of partner unit ids.
    """
    pairs = coll.defaultdict(lambda: {
        'Pairs': set(),
        'Stacks': set(),
        'Basephosphate': set()
    })
    with self.session() as session:
        interactions = mod.UnitPairsInteractions
        query = session.query(interactions.unit_id_1,
                              interactions.unit_id_2,
                              interactions.f_lwbp.label('Pairs'),
                              interactions.f_stacks.label('Stacks'),
                              interactions.f_bphs.label('Basephosphate'),
                              ).\
            filter(interactions.pdb_id == pdb)

        for result in query:
            data = row2dict(result)
            unit1 = data.pop('unit_id_1')
            unit2 = data.pop('unit_id_2')
            # Self 0BPh base-phosphate annotations are ignored.
            if unit1 == unit2 and data['Basephosphate'] and \
                    '0BPh' in data['Basephosphate']:
                data['Basephosphate'] = None
            for name, value in data.items():
                # Values starting with 'n' are skipped — presumably
                # "near" interaction annotations; confirm.
                if value and not value.startswith('n'):
                    pairs[unit1][name].add(unit2)
    return pairs
def revised_chain_info(self, ifes):
    """Aggregate chain-level data for IFEs via group_concat.

    NOTE(review): ``.op('SEPARATOR')`` builds a MySQL-specific
    GROUP_CONCAT separator clause — this query is not portable to other
    database backends; confirm the deployment only targets MySQL.

    Parameters
    ----------
    ifes : list
        The ifes to load data for; ids extracted via
        ``self.class_property``.

    Returns
    -------
    data : dict
        Maps ife id to a dict of aggregated chain columns.
    """
    self.logger.debug('ifes: %s' % ifes)

    ife_ids = self.class_property(ifes, 'id')

    with self.session() as session:
        query = session.query(
            mod.IfeInfo.ife_id,
            func.sum(mod.ChainInfo.chain_length).label('Exp Length (CI)'),
            func.group_concat(mod.ChainInfo.sequence.op('SEPARATOR')('+')).label('Exp Sequence (CI)'),
            func.group_concat(mod.ChainInfo.compound.op('SEPARATOR')(' + ')).label('Nucleic Acid Compound'),
            func.group_concat(mod.SpeciesMapping.species_name.op('SEPARATOR')(' / ')).label('RNA Species'),
        ).\
            join(mod.IfeChains,
                 mod.IfeChains.ife_id == mod.IfeInfo.ife_id).\
            join(mod.ChainInfo,
                 mod.ChainInfo.chain_id == mod.IfeChains.chain_id).\
            join(mod.ChainSpecies,
                 mod.ChainSpecies.chain_id == mod.ChainInfo.chain_id).\
            outerjoin(mod.SpeciesMapping,
                      mod.SpeciesMapping.species_mapping_id ==
                      mod.ChainSpecies.species_id).\
            filter(mod.IfeInfo.ife_id.in_(ife_ids)).\
            group_by(mod.IfeInfo.ife_id)

        data = {}
        for result in query:
            entry = row2dict(result)
            ife_id = entry.pop('ife_id')
            data[ife_id] = entry
    return data
def positions(self, pdb, chain):
    """Load the experimental sequence positions for a chain.

    Parameters
    ----------
    pdb : str
        The pdb id.
    chain : str
        The chain name.

    Returns
    -------
    positions : list
        A list of dicts with 'unit_id', 'index' (1-based), 'unit' and
        'observed' keys.

    Raises
    ------
    core.InvalidState
        If no positions could be loaded for the chain.
    """
    # Looking up the experimental sequence also validates the pdb/chain
    # pair; the value itself is not used here.
    exp_seq = self.exp_seq(pdb, chain)
    with self.session() as session:
        esum = mod.ExpSeqUnitMapping
        esp = mod.ExpSeqPosition
        escm = mod.ExpSeqChainMapping
        ci = mod.ChainInfo
        query = session.query(
            esum.unit_id,
            esp.index,
            esp.unit,
        ).join(esp,
               esp.exp_seq_position_id == esum.exp_seq_position_id).\
            join(escm,
                 escm.exp_seq_chain_mapping_id ==
                 esum.exp_seq_chain_mapping_id).\
            join(ci, ci.chain_id == escm.chain_id).\
            filter(ci.pdb_id == pdb).\
            filter(ci.chain_name == chain)

        if not query.count():
            # BUG FIX: previously referenced an undefined name ``pdb_id``
            # and raised a NameError instead of the intended InvalidState.
            raise core.InvalidState("Could not load positions for %s|1|%s" %
                                    (pdb, chain))

        positions = []
        for result in query:
            entry = row2dict(result)
            # A null unit id means the position was not observed.
            entry['observed'] = int(result.unit_id is not None)
            # Convert from 0-based to 1-based indexing.
            entry['index'] = entry['index'] + 1
            positions.append(entry)
        return positions
def test_computes_both_discrepancies(self):
    """Both directions of a chain-chain comparison should be produced,
    with symmetric discrepancy values."""
    c1 = self.chain_id('1X8W', 'D')
    c2 = self.chain_id('1GRZ', 'B')
    corr_id = self.corr_id(c1, c2)
    val = [row2dict(d) for d in self.loader.data((c1, c2))]
    assert len(val) == 2
    # Remove discrepancy since it needs a different method
    assert_almost_equal(val[0].pop('discrepancy'), 0.227388, decimal=6)
    assert_almost_equal(val[1].pop('discrepancy'), 0.227388, decimal=6)
    # Auto-generated primary keys are unpredictable; drop before compare.
    del val[0]['chain_chain_similarity_id']
    del val[1]['chain_chain_similarity_id']
    assert val[0] == {
        'chain_id_1': c1,
        'chain_id_2': c2,
        'model_1': 1,
        'model_2': 1,
        'correspondence_id': corr_id,
        'num_nucleotides': 242
    }
    assert val[1] == {
        'chain_id_1': c2,
        'chain_id_2': c1,
        'model_1': 1,
        'model_2': 1,
        'correspondence_id': corr_id,
        'num_nucleotides': 242
    }
def dump(filename, **kwargs):
    """Dump chain chain comparison data to a file.

    This will dump all chain chain comparison data to a file for later
    import. The data is pickled for easy reading and writing in python.

    Parameters
    ----------
    filename : str
        Name of the file to write to.
    """
    session = setup(**kwargs)
    with session() as sess:
        # Two aliases of ChainInfo are needed to resolve both sides of
        # each comparison in a single query.
        chain1 = aliased(mod.ChainInfo)
        chain2 = aliased(mod.ChainInfo)
        query = sess.query(mod.ChainChainSimilarity.discrepancy,
                           mod.ChainChainSimilarity.num_nucleotides,
                           mod.ChainChainSimilarity.model_1,
                           mod.ChainChainSimilarity.model_2,
                           chain1.pdb_id.label('pdb_id1'),
                           chain1.chain_name.label('chain_name1'),
                           chain2.pdb_id.label('pdb_id2'),
                           chain2.chain_name.label('chain_name2'),
                           ).\
            join(chain1,
                 chain1.chain_id == mod.ChainChainSimilarity.chain_id_1).\
            join(chain2,
                 chain2.chain_id == mod.ChainChainSimilarity.chain_id_2)
        results = [row2dict(r) for r in query]

    # Binary mode is required for pickle.
    with open(filename, 'wb') as out:
        pickle.dump(results, out)
def current(self, corr_id):
    """Fetch the stored data for a correspondence by primary key.

    :param corr_id: The correspondence id to load.
    :returns: A dict of the correspondence row.
    """
    with self.session() as session:
        row = session.query(mod.CorrespondenceInfo).get(corr_id)
        return utils.row2dict(row)
def info(self, chain_id):
    """Load the required information about a chain. Since we want to use
    the results of this loader for the NR stages we use the same data as
    was in the IFE's the given chain is a part of.

    Parameters
    ----------
    chain_id : int
        The chain id to look up.

    Returns
    -------
    ife_info : dict
        A dict with a 'chain_name', 'chain_id', `pdb`, `model`, `ife_id`,
        `sym_op`, and `name` keys.
    """
    with self.session() as session:
        query = session.query(mod.ChainInfo.chain_name,
                              mod.ChainInfo.chain_id,
                              mod.IfeInfo.pdb_id.label('pdb'),
                              mod.IfeInfo.model,
                              mod.IfeInfo.ife_id,
                              ).\
            join(mod.IfeChains,
                 mod.IfeChains.chain_id == mod.ChainInfo.chain_id).\
            join(mod.IfeInfo,
                 mod.IfeInfo.ife_id == mod.IfeChains.ife_id).\
            filter(mod.IfeInfo.new_style == 1).\
            filter(mod.ChainInfo.chain_id == chain_id)

        if not query.count():
            raise core.InvalidState("Could not load chain with id %s" %
                                    chain_id)
        ife = ut.row2dict(query.first())

    with self.session() as session:
        # Only units that have centers and rotations count here, hence
        # the joins against UnitCenters and UnitRotations.
        query = session.query(mod.UnitInfo.sym_op,
                              mod.UnitInfo.alt_id,
                              ).\
            join(mod.ChainInfo,
                 (mod.ChainInfo.pdb_id == mod.UnitInfo.pdb_id) &
                 (mod.ChainInfo.chain_name == mod.UnitInfo.chain)).\
            join(mod.UnitCenters,
                 mod.UnitCenters.unit_id == mod.UnitInfo.unit_id).\
            join(mod.UnitRotations,
                 mod.UnitRotations.unit_id == mod.UnitInfo.unit_id).\
            filter(mod.ChainInfo.chain_id == chain_id).\
            distinct()

        if not query.count():
            raise core.InvalidState("Could not get info for chain %s" %
                                    chain_id)

        # Choose a preferred symmetry operator and alt id; `pick`
        # presumably selects the first preferred value present — confirm.
        ife['sym_op'] = pick(['1_555', 'P_1'], 'sym_op', query)
        ife['alt_id'] = pick([None, 'A', 'B'], 'alt_id', query)
        ife['name'] = ife['ife_id'] + '+' + ife['sym_op']
        return ife
def correspondence_id_mapping(session, data, ignore_missing=False):
    """Create a mapping from compared chain chain to correspondence ids.
    This will fail if not all chains in the input data could be mapped, if
    ignore_missing is False (the default behavior), otherwise it will only
    log the error.

    Parameters
    ----------
    session : pymotifs.core.Session
        The sesson to use
    data : list
        A list of dictionaries with pdb_id1, chain_name1, pdb_id2,
        chain_name2 entries.
    ignore_missing : bool, optional
        A flag to make this ignore missing chains. In this case errors are
        only logged.

    Returns
    -------
    mapping : dict
        A dictionary mapping (chain_id, chain_id) to correspondence id.
    """
    entries = {(chain1(e), chain2(e)) for e in data}
    # BUG FIX: capture the total before entries are removed below, so the
    # "Found x/y" log line reports found/total rather than
    # found/still-missing.
    total = len(entries)
    with session() as sess:
        corr = mod.CorrespondencePdbs
        query = sess.query(
            corr.correspondence_id,
            corr.pdb_id_1.label('pdb_id1'),
            corr.pdb_id_2.label('pdb_id2'),
            corr.chain_name_1.label('chain_name1'),
            corr.chain_name_2.label('chain_name2'),
            corr.chain_id_1,
            corr.chain_id_2,
        )
        mapping = {}
        for result in query:
            result = row2dict(result)
            ids = (chain1(result), chain2(result))
            if ids not in entries:
                continue
            key = (result['chain_id_1'], result['chain_id_2'])
            if key in mapping:
                raise ValueError("Duplicate mapping found %s" % ids)
            mapping[key] = result['correspondence_id']
            entries.remove(ids)

    logger.info("Found %i/%i correspondences", len(mapping), total)
    if entries:
        logger.error("Could not map all correspondences %s", str(entries))
        if not ignore_missing:
            raise ValueError("Could not map all correspondences %s" %
                             str(entries))

    return mapping
def test_it_assigns_valid_data(self):
    """The first loaded motif should carry the expected column values."""
    assert row2dict(self.motifs[0]) == {
        'motif_id': 'IL_85752.1',
        'ml_release_id': '0.1',
        'type': 'IL',
        'handle': '85752',
        'version': 1,
        'comment': 'New id, no parents',
    }
def correspondence_id_mapping(session, data, ignore_missing=False):
    """Create a mapping from compared chain chain to correspondence ids.
    This will fail if not all chains in the input data could be mapped, if
    ignore_missing is False (the default behavior), otherwise it will only
    log the error.

    Parameters
    ----------
    session : pymotifs.core.Session
        The sesson to use
    data : list
        A list of dictionaries with pdb_id1, chain_name1, pdb_id2,
        chain_name2 entries.
    ignore_missing : bool, optional
        A flag to make this ignore missing chains. In this case errors are
        only logged.

    Returns
    -------
    mapping : dict
        A dictionary mapping (chain_id, chain_id) to correspondence id.
    """
    entries = {(chain1(e), chain2(e)) for e in data}
    # BUG FIX: capture the total before entries are removed below, so the
    # "Found x/y" log line reports found/total rather than
    # found/still-missing.
    total = len(entries)
    with session() as sess:
        corr = mod.CorrespondencePdbs
        query = sess.query(corr.correspondence_id,
                           corr.pdb_id_1.label('pdb_id1'),
                           corr.pdb_id_2.label('pdb_id2'),
                           corr.chain_name_1.label('chain_name1'),
                           corr.chain_name_2.label('chain_name2'),
                           corr.chain_id_1,
                           corr.chain_id_2,
                           )
        mapping = {}
        for result in query:
            result = row2dict(result)
            ids = (chain1(result), chain2(result))
            if ids not in entries:
                continue
            key = (result['chain_id_1'], result['chain_id_2'])
            if key in mapping:
                raise ValueError("Duplicate mapping found %s" % ids)
            mapping[key] = result['correspondence_id']
            entries.remove(ids)

    logger.info("Found %i/%i correspondences", len(mapping), total)
    if entries:
        logger.error("Could not map all correspondences %s", str(entries))
        if not ignore_missing:
            raise ValueError("Could not map all correspondences %s" %
                             str(entries))

    return mapping
def known(self):
    """Load all known annotations keyed by motif id.

    Rows are read in ascending date order and empty-string values are
    normalized to None.

    :returns: A dict mapping motif id to its annotation dict.
    """
    annotations = {}
    with self.session() as session:
        rows = session.query(self.table).\
            order_by(self.table.date.asc())
        for row in rows:
            raw = row2dict(row)
            cleaned = {k: (None if v == '' else v) for k, v in raw.items()}
            annotations[row.motif_id] = cleaned
    return annotations
def known(self, pdb):
    """Load the known loop positions for a structure.

    :param str pdb: The pdb id to load loop positions for.
    :returns: A dict mapping (loop_id, position) to the position row dict.
    """
    mapping = {}
    with self.session() as session:
        rows = session.query(mod.LoopPositions).\
            join(mod.LoopInfo,
                 mod.LoopInfo.loop_id == mod.LoopPositions.loop_id).\
            filter(mod.LoopInfo.pdb_id == pdb)
        for row in rows:
            mapping[(row.loop_id, row.position)] = utils.row2dict(row)
    return mapping
def possible_classes(self, start, stop, resolution):
    """Load all classes in releases between start and stop (inclusive) at
    the given resolution.

    Parameters
    ----------
    start : str
        The first nr release id.
    stop : str
        The last nr release id.
    resolution : str
        The resolution cutoff.

    Returns
    -------
    classes : list
        A list of dicts with 'handle', 'nr_release_id' and 'nr_class_id'
        keys, ordered by release.
    """
    # Release ids are not ordered lexically, so compare by release index.
    start_index = self.release_index(start)
    stop_index = self.release_index(stop)
    rel = mod.NrReleases
    with self.session() as session:
        query = session.query(mod.NrClasses.handle,
                              mod.NrClasses.nr_release_id,
                              mod.NrClasses.nr_class_id,
                              ).\
            join(rel,
                 mod.NrClasses.nr_release_id == rel.nr_release_id).\
            filter(rel.index >= start_index).\
            filter(rel.index <= stop_index).\
            filter(mod.NrClasses.resolution == resolution).\
            order_by(mod.NrClasses.nr_release_id)
        return [row2dict(r) for r in query]
def ife_info(self, nr_class):
    """Load IFE-level report data for a class.

    Parameters
    ----------
    nr_class : list
        The class members; ife ids extracted via ``self.class_property``.

    Returns
    -------
    data : dict
        Maps ife id to a dict containing only a 'Chains' key — since the
        query selects just ife_id and it is popped, nothing else remains
        in the entry.
    """
    ife_id = self.class_property(nr_class, 'id')
    with self.session() as session:
        data = {}
        query = session.query(mod.IfeInfo.ife_id).\
            filter(mod.IfeInfo.ife_id == ife_id)
        for result in query:
            entry = row2dict(result)
            ife_id = entry.pop('ife_id')
            # IFE ids join chain ids with '+'; each chain id's last
            # '|'-separated field is the chain name.
            chain_ids = ife_id.split('+')
            chains = [p.split('|')[-1] for p in chain_ids]
            entry['Chains'] = ', '.join(chains)
            data[ife_id] = entry
        return data
def interactions(self, pdb, chain):
    """Load all interactions within a single chain of a structure.

    NOTE(review): the joins use ``c1.id`` / ``c2.id`` while other queries
    in this codebase join UnitInfo on ``unit_id`` — confirm UnitInfo
    actually exposes an ``id`` attribute here.

    Parameters
    ----------
    pdb : str
        The pdb id.
    chain : str
        The chain name; only interactions with both units in this chain
        are returned.

    Returns
    -------
    interactions : list
        A list of interaction dicts with an added integer 'id' key.
    """
    c1 = aliased(mod.UnitInfo)
    c2 = aliased(mod.UnitInfo)
    interactions = []
    with self.session() as session:
        query = session.query(mod.UnitPairsInteractions).\
            join(c1, c1.id == mod.UnitPairsInteractions.unit_id_1).\
            join(c2, c2.id == mod.UnitPairsInteractions.unit_id_2).\
            filter(mod.UnitPairsInteractions.pdb_id == pdb).\
            filter(c1.chain == c2.chain, c1.chain == chain)

        for result in query:
            data = ut.row2dict(result)
            data['id'] = int(data['unit_pairs_interactions_id'])
            interactions.append(data)
    return interactions
def load_ife_cqs_data(self, ife_list, nr_name):
    """Load CQS (composite quality score) data for a list of IFEs.

    Parameters
    ----------
    ife_list : list
        The ife ids to load data for.
    nr_name : str
        The nr class name to attach to each entry.

    Returns
    -------
    values : list
        Lists of entry dicts, one list per ife that had data; the first
        entry of each list is annotated with 'max_exp_len',
        'fraction_unobserved', 'percent_observed' and 'compscore'.
    """
    with self.session() as session:
        query = session.query(
            mod.IfeCqs.ife_id,
            mod.IfeCqs.obs_length,
            mod.IfeCqs.clashscore,
            mod.IfeCqs.average_rsr,
            mod.IfeCqs.average_rscc,
            mod.IfeCqs.percent_clash,
            mod.IfeCqs.rfree,
            mod.IfeCqs.resolution,
        ).\
            filter(mod.IfeCqs.ife_id.in_(ife_list))

        data = coll.defaultdict(list)
        max_exp_len = 0

        for result in query:
            entry = row2dict(result)
            ii = entry['ife_id']
            entry['nr_name'] = nr_name
            data[ii].append(entry)
            # result[1] is obs_length; track the maximum over the class.
            if result[1] > max_exp_len:
                max_exp_len = result[1]

        for ife in ife_list:
            # BUG FIX: the old ``if data[ife]:`` membership test on a
            # defaultdict inserted an empty list for every missing ife,
            # which then leaked into the returned values.
            if ife not in data:
                self.logger.warning("NQL: data: LICD: no data for %s" % ife)
                continue

            ife_data = data[ife]
            obs_length = ife_data[0]['obs_length']
            ife_data[0]['max_exp_len'] = max_exp_len

            truth, fraction_unobserved = self.fraction_unobserved(
                obs_length, max_exp_len)
            percent_observed = (1 - fraction_unobserved)

            ife_data[0]['fraction_unobserved'] = fraction_unobserved
            ife_data[0]['percent_observed'] = percent_observed

            compscore = self.compscore(ife_data)
            ife_data[0]['compscore'] = compscore

        return data.values()
def load_ife_cqs_data(self, ife_list, nr_name):
    """Load CQS (composite quality score) data for a list of IFEs.

    Parameters
    ----------
    ife_list : list
        The ife ids to load data for.
    nr_name : str
        The nr class name to attach to each entry.

    Returns
    -------
    values : list
        Lists of entry dicts, one list per ife that had data; the first
        entry of each list is annotated with 'max_exp_len',
        'fraction_unobserved', 'percent_observed' and 'compscore'.
    """
    with self.session() as session:
        query = session.query(
            mod.IfeCqs.ife_id,
            mod.IfeCqs.obs_length,
            mod.IfeCqs.clashscore,
            mod.IfeCqs.average_rsr,
            mod.IfeCqs.average_rscc,
            mod.IfeCqs.percent_clash,
            mod.IfeCqs.rfree,
            mod.IfeCqs.resolution,
        ).\
            filter(mod.IfeCqs.ife_id.in_(ife_list))

        data = coll.defaultdict(list)
        max_exp_len = 0

        for result in query:
            entry = row2dict(result)
            ii = entry['ife_id']
            entry['nr_name'] = nr_name
            data[ii].append(entry)
            # result[1] is obs_length; track the maximum over the class.
            if result[1] > max_exp_len:
                max_exp_len = result[1]

        for ife in ife_list:
            # BUG FIX: the old ``if data[ife]:`` membership test on a
            # defaultdict inserted an empty list for every missing ife,
            # which then leaked into the returned values.
            if ife not in data:
                self.logger.warning("NQL: data: LICD: no data for %s" % ife)
                continue

            ife_data = data[ife]
            obs_length = ife_data[0]['obs_length']
            ife_data[0]['max_exp_len'] = max_exp_len

            truth, fraction_unobserved = self.fraction_unobserved(obs_length,
                                                                  max_exp_len)
            percent_observed = (1 - fraction_unobserved)

            ife_data[0]['fraction_unobserved'] = fraction_unobserved
            ife_data[0]['percent_observed'] = percent_observed

            compscore = self.compscore(ife_data)
            ife_data[0]['compscore'] = compscore

        return data.values()
def loop_quality(self, loop_release, pdb, loop_ids, **kwargs):
    """Load per-nucleotide quality data for the given loops.

    Parameters
    ----------
    loop_release : str
        The loop release id to filter QA status on.
    pdb : str
        The pdb id containing the loops.
    loop_ids : list
        The loop ids to load data for; must be non-empty.
    kwargs :
        Must contain 'motif_release' for the motif assignment outerjoin.

    Returns
    -------
    quality : list
        One processed result dict per query row (via ``self.as_result``).

    Raises
    ------
    core.InvalidState
        If loop_ids is empty or any requested loop has no data.
    """
    if not loop_ids:
        raise core.InvalidState("No loops to get data for")

    pairs = self.pairs(pdb)
    known_positions = self.positions(pdb)
    with self.session() as session:
        info = mod.LoopInfo
        positions = mod.LoopPositions
        quality = mod.UnitQuality
        motifs = mod.MlLoops
        status = mod.LoopQa
        query = session.query(info.loop_id.label('Loop'),
                              info.pdb_id.label('Pdb'),
                              info.type.label('Type'),
                              motifs.motif_id.label('Motif'),
                              positions.unit_id.label('Nt'),
                              status.status.label('Status'),
                              quality.real_space_r.label('RSR'),
                              quality.z_score.label('RSRZ'),
                              ).\
            join(positions, positions.loop_id == info.loop_id).\
            join(status, status.loop_id == info.loop_id).\
            outerjoin(motifs,
                      (motifs.loop_id == info.loop_id) &
                      (motifs.ml_release_id == kwargs['motif_release'])).\
            outerjoin(quality,
                      quality.unit_id == positions.unit_id).\
            filter(info.pdb_id == pdb).\
            filter(status.loop_release_id == loop_release).\
            filter(info.loop_id.in_(loop_ids))

        as_result = ft.partial(self.as_result, pairs, known_positions)
        # NOTE: `quality` is rebound here from the UnitQuality alias to
        # the result list; the alias is no longer needed at this point.
        quality = [as_result(row2dict(r)) for r in query]
        found = {q['Loop'] for q in quality}
        required = set(loop_ids)
        if found != required:
            missing = required - found
            self.logger.error("Missing data for %s", missing)
            raise core.InvalidState("Did not find data on all loops")
        return quality
def chains(self, class_id):
    """Load chain-level data for every member of an NR class.

    :param int class_id: The nr class id to load members for.
    :returns: A list of dicts, one per chain in the class.
    """
    with self.session() as session:
        nrc = mod.NrChains
        ife = mod.IfeInfo
        pdbs = mod.PdbInfo
        ncls = mod.NrClasses
        rows = session.query(nrc.nr_release_id,
                             ncls.name,
                             ncls.handle,
                             ncls.version,
                             ife.ife_id.label('id'),
                             ife.bp_count.label('bp'),
                             ife.length,
                             pdbs.resolution,
                             pdbs.experimental_technique.label('method'),
                             ).\
            join(ife, ife.ife_id == nrc.ife_id).\
            join(ncls, ncls.nr_class_id == nrc.nr_class_id).\
            join(pdbs, pdbs.pdb_id == ife.pdb_id).\
            filter(ncls.nr_class_id == class_id)
        return [row2dict(row) for row in rows]
def chain_info(self, ifes):
    """Load chain-level report data for the given IFEs.

    Parameters
    ----------
    ifes : list
        The ifes; chain ids extracted via ``self.class_property``.

    Returns
    -------
    data : dict
        Maps chain id to a dict with 'Exp Sequence (CI)', 'Exp Length
        (CI)', 'Nucleic Acid Compound' and 'RNA Species' keys.
    """
    chain_ids = self.class_property(ifes, 'chain_id')
    with self.session() as session:
        query = session.query(
            mod.ChainInfo.chain_id,
            mod.ChainInfo.sequence.label('Exp Sequence (CI)'),
            mod.ChainInfo.compound.label('Nucleic Acid Compound'),
            mod.SpeciesMapping.species_name.label('RNA Species'),
        ).\
            join(mod.ChainSpecies,
                 mod.ChainSpecies.chain_id == mod.ChainInfo.chain_id).\
            join(mod.SpeciesMapping,
                 mod.SpeciesMapping.species_id ==
                 mod.ChainSpecies.species_id).\
            filter(mod.ChainInfo.chain_id.in_(chain_ids))

        data = {}
        for result in query:
            entry = row2dict(result)
            # The experimental length is derived from the sequence itself.
            entry['Exp Length (CI)'] = len(entry['Exp Sequence (CI)'])
            chain_id = entry.pop('chain_id')
            data[chain_id] = entry
        return data
def ife_info(self, ifes):
    """Load IFE-level report data for the given IFEs.

    Parameters
    ----------
    ifes : list
        The ifes; ids extracted via ``self.class_property``.

    Returns
    -------
    data : dict
        Maps ife id to a dict with 'Obs Length (II)', 'BP/NT' and
        'Chains' keys.
    """
    ife_ids = self.class_property(ifes, 'id')
    with self.session() as session:
        query = session.query(
            mod.IfeInfo.ife_id,
            mod.IfeInfo.bp_count,
            mod.IfeInfo.length.label('Obs Length (II)')
        ).\
            filter(mod.IfeInfo.ife_id.in_(ife_ids))

        data = {}
        for result in query:
            entry = row2dict(result)
            nt = entry['Obs Length (II)']
            bp = entry.pop('bp_count')
            ife_id = entry.pop('ife_id')
            # NOTE(review): raises ZeroDivisionError when length is 0 —
            # presumably every IFE has a positive length; confirm.
            entry['BP/NT'] = float(bp) / float(nt)
            # IFE ids join chain ids with '+'; each chain id's last
            # '|'-separated field is the chain name.
            chain_ids = ife_id.split('+')
            chains = [p.split('|')[-1] for p in chain_ids]
            entry['Chains'] = ', '.join(chains)
            data[ife_id] = entry
        return data
def position_info(self, unit):
    """Get the information about a position in an experimental sequence
    using a unit id.

    :param str unit: The unit id to look up.
    :returns: A dict with 'index', 'exp_seq_id', 'chain', 'model' and
        'sym_op' keys. Raises (via ``.one()``) if the unit id does not
        map to exactly one position.
    """
    self.logger.debug("Finding position for %s", unit)
    with self.session() as session:
        pos = mod.ExpSeqPosition
        mapping = mod.ExpSeqUnitMapping
        result = session.query(pos.index,
                               pos.exp_seq_id,
                               mod.UnitInfo.chain,
                               mod.UnitInfo.model,
                               mod.UnitInfo.sym_op,
                               ).\
            join(mapping,
                 mapping.exp_seq_position_id == pos.exp_seq_position_id).\
            join(mod.UnitInfo,
                 mod.UnitInfo.unit_id == mapping.unit_id).\
            filter(mapping.unit_id == unit).\
            one()
        return row2dict(result)
def pairs(self, pdb):
    """Collect all pairwise interactions for a structure.

    Returns a dict mapping the first unit id to a dict of 'Pairs',
    'Stacks' and 'Basephosphate' partner-id sets. Self 0BPh annotations
    and values beginning with 'n' are excluded.
    """

    def empty_entry():
        return {'Pairs': set(), 'Stacks': set(), 'Basephosphate': set()}

    pairs = coll.defaultdict(empty_entry)
    with self.session() as session:
        table = mod.UnitPairsInteractions
        rows = session.query(table.unit_id_1,
                             table.unit_id_2,
                             table.f_lwbp.label('Pairs'),
                             table.f_stacks.label('Stacks'),
                             table.f_bphs.label('Basephosphate'),
                             ).\
            filter(table.pdb_id == pdb)

        for row in rows:
            annotations = row2dict(row)
            first = annotations.pop('unit_id_1')
            second = annotations.pop('unit_id_2')
            bph = annotations['Basephosphate']
            # Drop self 0BPh base-phosphate annotations.
            if first == second and bph and '0BPh' in bph:
                annotations['Basephosphate'] = None
            for kind, value in annotations.items():
                if not value or value.startswith('n'):
                    continue
                pairs[first][kind].add(second)
    return pairs
def loops(self, pdb):
    """Get all loops in the current structure. If the loop is part of the
    current motif atlas release we will fetch the motif assignment as
    well.

    Parameters
    ----------
    pdb : str
        The pdb id to look up structures for.

    Returns
    -------
    loops : list
        A list of loop dictonaries that contain an 'id', 'pdb', 'nts' and
        'motif_id' column.
    """
    current_ml_release = self.current_ml_release()

    with self.session() as session:
        # outerjoin so loops without a motif assignment in the current
        # release still appear (motif_id is None for those).
        query = session.query(mod.LoopInfo.loop_id.label('id'),
                              mod.LoopInfo.pdb_id.label('pdb'),
                              mod.LoopInfo.unit_ids.label('nts'),
                              mod.MlLoops.motif_id.label('motif_id')
                              ).\
            outerjoin(mod.MlLoops,
                      (mod.MlLoops.loop_id == mod.LoopInfo.loop_id) &
                      (mod.MlLoops.ml_release_id == current_ml_release)).\
            filter(mod.LoopInfo.pdb_id == pdb).\
            order_by(mod.LoopInfo.loop_id)

        count = query.count()
        if not count:
            self.logger.info("No loops found for %s", pdb)
        else:
            self.logger.info("Found %s loops for %s", count, pdb)

        return [row2dict(result) for result in query]
def lookup_sequences(self, pdb):
    """Return all exp_seq_ids for the given pdb. This only assign the
    species id from the given pdb.

    :param str pdb: The pdb id to get all sequences for.
    :returns: A list of dictionaries of unique sequences.
    """
    with self.session() as session:
        # outerjoin on species so sequences without a species mapping are
        # still returned (species is None for those).
        query = session.query(ExpSeqPdb.exp_seq_id.label('id'),
                              ExpSeqInfo.normalized_length.label('length'),
                              ChainSpecies.species_id.label('species')).\
            join(ExpSeqInfo,
                 ExpSeqInfo.exp_seq_id == ExpSeqPdb.exp_seq_id).\
            outerjoin(ChainSpecies,
                      ChainSpecies.chain_id == ExpSeqPdb.chain_id).\
            filter(ExpSeqPdb.pdb_id == pdb).\
            filter(ExpSeqInfo.was_normalized).\
            distinct()

        if not query.count():
            self.logger.warning("No sequences for %s" % pdb)

        return [ut.row2dict(result) for result in query]
def reference(self, pdb):
    """Get all correlated reference structures for a pdb.

    :param str pdb: The pdb id, matched against the pdb2 column.
    :returns: A list of correspondence info dicts.
    """
    with self.session() as session:
        rows = session.query(mod.CorrespondenceInfo).filter_by(pdb2=pdb)
        return [ut.row2dict(row) for row in rows]
def from_dict(cls, result):
    """Build an instance from a database row.

    :param result: A row object convertible by ``row2dict``.
    :returns: A new instance populated from the row's columns.
    """
    values = row2dict(result)
    return cls(**values)