def write(infile, ftype, indata, ck=False):
    """Dispatch a file operation according to the requested file type.

    With ``ck=True`` (and any type other than 'xml') ``indata`` is handed to
    ``ckio.write``. Otherwise ``infile`` is parsed with the parser that
    matches ``ftype`` and the parsed content is returned.

    :param infile: Path to the file.
    :type infile: str
    :param ftype: One of the types returned by ``_ftypelist()``, e.g.
        'psicov', 'ccmpred', 'fasta', 'pdb', 'a3m', 'jones', 'xml'.
        Matching is case-insensitive.
    :type ftype: str
    :param indata: Object passed to ``ckio.write`` as ``hierarchy``; only
        used when ``ck`` is True and ``ftype`` is not 'xml'.
    :param ck: Use the conkit I/O route instead of the default one,
        defaults to False.
    :type ck: bool, optional
    :raises ValueError: If ``ftype`` is not a string or not a known type.
    :return: Result of the selected backend call (for 'pdb', a pair whose
        second element is, per the original documentation, a list of
        filenames).
    :rtype: One or two of list[str],
        :class:`~crops.elements.sequences.sequence`,
        :class:`~conkit.core.sequence.Sequence`
    """
    # The type check must come first: calling .lower() on a non-string would
    # raise AttributeError instead of the documented ValueError.
    if not isinstance(ftype, str) or ftype.lower() not in _ftypelist():
        msg = 'Specified type not valid.'
        logging.critical(msg)
        raise ValueError(msg)

    fmt = ftype.lower()

    if ck is True and fmt != 'xml':
        return ckio.write(infile, fmt, hierarchy=indata)

    if fmt == 'fasta':
        return cps.parseseqfile(infile)
    if fmt == 'pdb':
        output1, output2 = cps.parsestrfile(infile)
        return output1, output2
    if fmt == 'xml':
        return ET.parse(infile)

    # Every remaining type ('psicov', 'ccmpred', 'a3m', 'jones', ...) is
    # read through conkit. 'ccmpred' previously ended in UnboundLocalError.
    return ckio.read(infile, fmt)
def main():
    """Crop sequence and structure files according to an interval database.

    Remove a number of residues from sequence and structure files in
    agreement to the intervals and other details supplied on the command
    line. The structure(s) are first renumbered against the parsed
    sequences; each renumbered sequence is then cropped and dumped, and two
    cropped structures are written per identifier (``original_id=True`` and
    ``original_id=False``).

    :raises ValueError: For wrong argument values (UniProt threshold outside
        the accepted range, or no chains imported from the sequence file).
    """
    # INITIALISE AND PARSE ARGUMENTS FROM COMMAND LINE
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = ccl.crops_logger(level="info")
    logger.info(ccl._welcome())

    inseq = check_path(args.input_seqpath[0], 'file')
    indb = check_path(args.input_database[0], 'file')
    instr = check_path(args.input_strpath[0])

    if args.uniprot_threshold is not None:
        if args.uniprot_threshold[1] != 'server-only':
            insprot = check_path(args.uniprot_threshold[1])
        else:
            insprot = 'server-only'
        minlen = float(args.uniprot_threshold[0])
        if minlen < 0.0 or minlen > 100.0:
            msg = ('The UniProt threshold is a percentage and, therefore, '
                   'it must fulfil 0 < threshold < 100.')
            logger.critical(msg)
            raise ValueError(msg)
    else:
        insprot = None
        minlen = 0.0

    # The UniProt threshold only applies when a source was given and the
    # threshold is strictly positive; neither value changes after this point.
    use_threshold = insprot is not None and minlen > 0.0

    targetlbl = ctg.target_format(indb, terms=args.terminals, th=minlen)
    infixlbl = ctg.infix_gen(indb, terms=args.terminals)

    if args.outdir is None:
        outdir = check_path(os.path.dirname(inseq), 'dir')
    else:
        outdir = check_path(os.path.join(args.outdir[0], ''), 'dir')

    seq_ext = os.path.splitext(os.path.basename(inseq))[1]
    str_ext = os.path.splitext(instr)[1]

    # PARSE INPUT FILES
    logger.info('Parsing sequence file ' + inseq)
    subset = set(args.preselect) if args.preselect is not None else None
    seqset = cin.parseseqfile(seq_input=inseq, inset=subset)
    logger.info('Done')

    logger.info('Parsing structure file ' + instr)
    strset, fileset = cin.parsestrfile(instr)
    logger.info('Done')

    logger.info('Parsing interval database file ' + indb)
    if len(seqset) > 0:
        intervals = cin.import_db(indb, pdb_in=seqset)
    else:
        msg = 'No chains were imported from sequence file.'
        logger.critical(msg)
        raise ValueError(msg)
    logger.info('Done' + os.linesep)

    if use_threshold:
        logger.info('Parsing uniprot sequence file: ' + insprot)
        uniprotset = set()
        for seqncid, seqnc in seqset.items():
            # Entries missing from the database are reported as warnings by
            # the cropping stage below, so they must not abort this scan.
            if seqncid not in intervals:
                continue
            for monomerid in seqnc.chainlist():
                if monomerid not in intervals[seqncid]:
                    continue
                tags = intervals[seqncid][monomerid].tags
                if 'uniprot' in tags:
                    uniprotset.update(code.upper() for code in tags['uniprot'])
        uniprotset = cin.parseseqfile(seq_input=insprot,
                                      inset=uniprotset,
                                      use_UPserver=(insprot == 'server-only'))
        logger.info('Done' + os.linesep)

    # MAIN OPERATION / PRINT OUT RESULTS WITHIN
    gseqset = {}
    strset2 = {}
    logger.info('Renumbering structure(s)...')
    for key, structure in strset.items():
        found = False
        for seqname in seqset:
            if ((seqname in key)
                    or (len(seqset) == 1 and len(strset) == 1)):
                finalid = seqname
                newstructure, gseqset[seqname] = cop.renumber_pdb(
                    seqset[seqname], structure, seqback=True)
                fout = finalid + infixlbl["renumber"] + str_ext
                outstr = outpathgen(outdir, subdir=finalid,
                                    filename=fout, mksubdir=True)
                newstructure.write_minimal_pdb(outstr)
                strset2[finalid] = structure
                found = True
        if found is False:
            logger.warning("Identifier '" + key
                           + "' not found in sequence input.")
    logger.info('Done' + os.linesep)

    logger.info('Cropping renumbered structure(s)...')
    for key, S in gseqset.items():
        newS = S.deepcopy()
        if key in intervals:
            if use_threshold:
                newinterval = {}
            for key2, monomer in S.imer.items():
                # Each monomer is cropped at most once, using the first of
                # its chains that is found in the database.
                cropped_seq = False
                for key3 in monomer.chains:
                    if key3 in intervals[key]:
                        if use_threshold:
                            dbinterval = intervals[key][key3]
                            newinterval[key3] = dbinterval.deepcopy()
                            newinterval[key3].tags['description'] += (
                                ' - Uniprot threshold')
                            newinterval[key3].subint = []
                            unilbl = ' uniprot chains included: '
                            # NOTE(review): unlike the pre-scan above, this
                            # assumes the 'uniprot' tag is always present.
                            for upcode, uniintervals in (
                                    dbinterval.tags['uniprot'].items()):
                                uniseq = uniprotset[upcode].imer['1']
                                coverage = (100 * uniintervals.n_elements()
                                            / uniseq.length())
                                if coverage >= minlen:
                                    newinterval[key3] = (
                                        newinterval[key3].union(
                                            dbinterval.intersection(
                                                uniintervals)))
                                    unilbl += upcode + '|'
                            if cropped_seq is False:
                                monomer = cop.crop_seq(monomer,
                                                       newinterval[key3],
                                                       targetlbl + unilbl,
                                                       terms=args.terminals)
                                cropped_seq = True
                        else:
                            if cropped_seq is False:
                                monomer = cop.crop_seq(monomer,
                                                       intervals[key][key3],
                                                       targetlbl,
                                                       terms=args.terminals)
                                cropped_seq = True
                        if newS.imer[key2] != monomer:
                            newS.imer[key2] = monomer.deepcopy()
                    else:
                        logger.warning(
                            'Chain-name ' + key + '_' + str(key3)
                            + ' not found in database. Cropping not performed.')
                monomer.update_cropsheader()
                hf = '_' + key2 if args.individual is True else ''
                ifx = infixlbl["croprenum"] if cropped_seq is True else ''
                fout = key + hf + ifx + seq_ext
                outseq = outpathgen(outdir, subdir=key,
                                    filename=fout, mksubdir=True)
                monomer.dump(outseq)
                if monomer.cropmap is not None:
                    fout = key + hf + infixlbl["croprenum"] + '.cropmap'
                    outmap = outpathgen(outdir, subdir=key, filename=fout)
                    monomer.dumpmap(outmap)

            cropped_str = cop.crop_pdb(strset2[key], newS, original_id=True)
            fout = key + infixlbl["crop"] + str_ext
            outstr = outpathgen(outdir, subdir=key,
                                filename=fout, mksubdir=True)
            cropped_str.write_minimal_pdb(outstr)

            cropped_str2 = cop.crop_pdb(strset2[key], newS, original_id=False)
            fout = key + infixlbl["croprenum"] + str_ext
            outstr = outpathgen(outdir, subdir=key,
                                filename=fout, mksubdir=True)
            cropped_str2.write_minimal_pdb(outstr)
        else:
            logger.warning('PDB-ID ' + key.upper()
                           + ' not found in database. Cropping not performed.')
            for key2, monomer in newS.imer.items():
                hf = '_' + key2 if args.individual is True else ''
                fout = key + hf + seq_ext
                outseq = outpathgen(outdir, subdir=key,
                                    filename=fout, mksubdir=True)
                monomer.dump(outseq)

    # FINISH
    logger.info('Done' + os.linesep)

    return
def main():
    """Renumber structure residues to follow the sequence file positions.

    Each parsed structure is paired with every sequence whose identifier is
    contained in the structure identifier (or with the only sequence when
    exactly one sequence and one structure were parsed), renumbered, and
    written out as a minimal PDB file. Non-polymer elements are numbered
    starting right after the final (TER) residue.

    IMPORTANT: If the input sequence and the input structure files are not
    from the same source (e.g. RCSB PDB) a source conflict might occur making
    the renumbering operation unsuccessful even if the program does not crash.
    """
    # INITIALISE AND PARSE ARGUMENTS FROM COMMAND LINE
    args = create_argument_parser().parse_args()

    global logger
    logger = ccl.crops_logger(level="info")
    logger.info(ccl._welcome())

    inseq = check_path(args.input_seqpath[0], 'file')
    instr = check_path(args.input_strpath[0])

    outroot = (os.path.dirname(inseq) if args.outdir is None
               else os.path.join(args.outdir[0], ''))
    outdir = check_path(outroot, 'dir')
    infixlbl = ".crops.seq"

    # PARSE INPUT FILES
    logger.info('Parsing sequence file ' + inseq)
    subset = None if args.preselect is None else set(args.preselect)
    seqset = cin.parseseqfile(seq_input=inseq, inset=subset)
    logger.info('Done')

    logger.info('Parsing structure file ' + instr)
    strset, _fileset = cin.parsestrfile(instr)
    logger.info('Done')

    # MAIN OPERATION / PRINT OUT RESULTS WITHIN
    logger.info('Renumbering structure(s)...')
    for pdbid, structure in strset.items():
        matched = False
        for seqname in seqset:
            # Skip sequences that neither match this structure by name nor
            # form the trivial one-sequence/one-structure pairing.
            if (seqname not in pdbid
                    and not (len(seqset) == 1 and len(strset) == 1)):
                continue

            try:
                renumbered = cop.renumber_pdb(seqset[seqname], structure)
            except (AttributeError, IndexError) as e:
                logger.warning(
                    'Something has gone wrong during renumbering:\n{}'.
                    format(e))
                if not args.force_alignment:
                    logger.critical(
                        'Unable to renumber the structure, exiting now. '
                        'Try again with -f option to force the alignment.')
                    return
                logger.info('Attempting Needleman-Wunsch...')
                renumbered = cop.renumber_pdb_needleman(
                    seqset[seqname], structure)

            target_name = seqname + infixlbl + os.path.splitext(instr)[1]
            target_path = outpathgen(outdir, subdir=seqname,
                                     filename=target_name, mksubdir=True)
            renumbered.write_minimal_pdb(target_path)
            matched = True

        if not matched:
            logger.warning("Identifier '" + pdbid
                           + "' not found in sequence input.")

    # FINISH
    logger.info('Done' + os.linesep)
def main():
    """Run the PISACov command-line pipeline for one sequence/structure pair.

    Stages, in order:

    1. Parse command-line arguments and fill the ``invals`` configuration
       dictionary (input paths, HHblits parameters, UniProt threshold,
       output root and the two output CSV paths).
    2. Parse the sequence and structure files and pick a single ``pdbid``.
    3. Unless ``--skip_conpred`` was given, crop (``psc.runcrops``) or
       renumber (``psc.renumcrops``) the inputs and dump the per-monomer
       sequences.
    4. Unless skipped, run HHblits, DeepMetaPSICOV and PISA.

    :raises ValueError: If the UniProt threshold is not a valid number.
    :raises Exception: If a single pdbid cannot be determined from the
        sequence and structure sets.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    # PARSE CONFIGURATION FILE:
    invals = pco._initialise_inputs()

    invals['INSEQ'] = None
    invals['INSTR'] = None
    invals['ALTDB'] = None
    invals['OUTROOT'] = None
    invals['OUTCSVPATH'] = None
    invals['UPTHRESHOLD'] = None

    # READ INPUT ARGUMENTS
    invals['INSEQ'] = ppaths.check_path(args.seqpath[0], 'file')
    invals['INSTR'] = ppaths.check_path(args.crystalpath[0], 'file')

    if args.hhblits_arguments is not None:
        invals['HHBLITS_PARAMETERS'] = pco._check_hhparams(
            args.hhblits_arguments)

    if args.uniprot_threshold is not None:
        try:
            invals['UPTHRESHOLD'] = float(args.uniprot_threshold[0])
        except ValueError:
            # A critical error must stop the run; carrying on would silently
            # ignore the threshold the user asked for.
            logger.critical('Uniprot threshold given not valid.')
            raise
        if invals['UNICLUST_FASTA_PATH'] is None:
            invals['UNICLUST_FASTA_PATH'] = pco._uniurl

    if args.skip_conpred is True:
        skipexec = True
        if (args.hhblits_arguments is not None
                or args.uniprot_threshold is not None):
            logger.info(
                'HHblits, UniProt threshold parameters given bypassed by --skip_conpred'
            )
    else:
        skipexec = False

    cropping = args.remove_insertions

    if args.outdir is None:
        invals['OUTROOT'] = ppaths.check_path(
            os.path.dirname(invals['INSEQ']))
    else:
        invals['OUTROOT'] = ppaths.check_path(
            os.path.join(args.outdir[0], ''))
    ppaths.mdir(invals['OUTROOT'])

    # OUTCSVPATH always holds two entries: [cropped csv, full csv].
    invals['OUTCSVPATH'] = []
    if args.collection_file is None:
        invals['OUTCSVPATH'].append(
            ppaths.check_path(
                os.path.join(invals['OUTROOT'],
                             ("evcovsignal" + os.extsep + "cropped"
                              + os.extsep + "pisacov" + os.extsep + "csv"))))
        invals['OUTCSVPATH'].append(
            ppaths.check_path(
                os.path.join(invals['OUTROOT'],
                             ("evcovsignal" + os.extsep + "full"
                              + os.extsep + "pisacov" + os.extsep + "csv"))))
    else:
        if cropping is True:
            invals['OUTCSVPATH'].append(
                ppaths.check_path(args.collection_file[0]))
            # splitext() keeps the leading dot in the extension, so no extra
            # separator is needed in front of it.
            csvroot, csvext = os.path.splitext(args.collection_file[0])
            invals['OUTCSVPATH'].append(
                ppaths.check_path(csvroot + os.extsep + 'full' + csvext))
        else:
            invals['OUTCSVPATH'].append(None)
            invals['OUTCSVPATH'].append(
                ppaths.check_path(args.collection_file[0]))

    # Define formats used
    sources = pco._sources()

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seqs = cps.parseseqfile(invals['INSEQ'])

    logger.info('Parsing structure file...')
    strs, filestrs = cps.parsestrfile(invals['INSTR'])

    pdbid = None
    if len(seqs) == 1 or len(strs) == 1:
        if len(seqs) == 1:
            for key in seqs:
                pdbid = key
        elif len(seqs) > 1 and len(strs) == 1:
            # Match the single structure id against the sequence ids,
            # first by equality, then by containment; the last match wins.
            for key in strs:
                for key2 in seqs:
                    if key.upper() == key2.upper():
                        pdbid = key.upper()
                    elif key2.upper() in key.upper():
                        pdbid = key2.upper()
    else:
        raise Exception(
            'More than one pdbid in sequence and/or structure set.')
    if pdbid is None:
        # Fail with an explicit error rather than a NameError further down.
        raise Exception(
            'No pdbid could be matched between sequence and structure set.')

    seq = seqs[pdbid]

    # CROPPING AND RENUMBERING
    outpdbdir = os.path.join(invals['OUTROOT'], pdbid, "")
    instrc = os.path.join(invals['OUTROOT'], pdbid,
                          os.path.basename(invals['INSTR']))
    fseq = {}
    fmsa = {}
    if skipexec is False:
        if cropping is True:
            logger.info('Cropping and renumbering sequences, '
                        + 'structures according to SIFTS database.')
            logger.info(pcl.running('CROPS-cropstr'))
            itime = datetime.datetime.now()
            psc.runcrops(invals['INSEQ'],
                         invals['INSTR'],
                         invals['SIFTS_PATH'],
                         invals['UPTHRESHOLD'],
                         invals['UNICLUST_FASTA_PATH'],
                         invals['OUTROOT'])
            logger.info(pcl.running('CROPS-cropstr', done=itime))
        else:
            logger.info('Renumbering structure '
                        + 'according to position in sequence.')
            logger.info(pcl.running('CROPS-renumber'))
            itime = datetime.datetime.now()
            psc.renumcrops(invals['INSEQ'],
                           invals['INSTR'],
                           invals['OUTROOT'])
            logger.info(pcl.running('CROPS-renumber', done=itime))

        # NOTE(review): the nesting of the next three statements under
        # `skipexec is False` is inferred - confirm against the original.
        ppaths.mdir(outpdbdir)
        if cropping is False:
            psc.splitseqs(invals['INSEQ'], outpdbdir)
        copyfile(invals['INSTR'], instrc)

    for i, iseq in seq.imer.items():
        fiseq = pdbid + '_' + i + '.fasta'
        fseq[i] = os.path.join(invals['OUTROOT'], pdbid, fiseq)
        fiseq = pdbid + '_' + i + '.msa.aln'
        fmsa[i] = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', fiseq)
        if skipexec is False:
            iseq.dump(fseq[i])

    # Parse cropped sequences and maps
    if cropping is True:
        amap = {}
        fcropseq = {}
        fcropmsa = {}
        for i, iseq in seq.imer.items():
            fprefix = pdbid + '_' + i + '.crops.to_uniprot'
            fmap = os.path.join(invals['OUTROOT'], pdbid,
                                fprefix + os.extsep + 'cropmap')
            amap.update(cps.parsemapfile(fmap)[pdbid])
            fcropseq[i] = os.path.join(invals['OUTROOT'], pdbid,
                                       fprefix + os.extsep + 'fasta')
            fcropmsa[i] = os.path.join(
                invals['OUTROOT'], pdbid, 'hhblits',
                (fprefix + os.extsep + 'msa' + os.extsep + 'aln'))
        seq.set_cropmaps(amap, cropmain=True)

    # EXECUTION OF EXTERNAL PROGRAMS
    hhdir = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', '')
    dmpdir = os.path.join(invals['OUTROOT'], pdbid, 'dmp', '')
    pisadir = os.path.join(invals['OUTROOT'], pdbid, 'pisa', '')
    # NOTE(review): unlike fcropstr, this path has no pdbid subdirectory and
    # a fixed 'pdb' extension - confirm it matches where the renumbered
    # structure is actually written.
    fstr = os.path.join(
        invals['OUTROOT'],
        (pdbid + os.extsep + 'crops' + os.extsep + 'seq'
         + os.extsep + 'pdb'))
    if cropping:
        fcropstr = os.path.join(
            invals['OUTROOT'], pdbid,
            (pdbid + os.extsep + 'crops' + os.extsep + 'oldids'
             + os.extsep + 'to_uniprot'
             + os.path.splitext(invals['INSTR'])[1]))

    if skipexec is False:
        # MSA GENERATOR
        ppaths.mdir(hhdir)
        if invals['HHBLITS_PARAMETERS'] == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif invals['HHBLITS_PARAMETERS'] == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )
        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            logger.info(pcl.running('HHBlits'))
            itime = datetime.datetime.now()
            themsa = psm.runhhblits(sfile, invals['HHBLITS_PARAMETERS'],
                                    hhdir)
            logger.info(pcl.running('HHBlits', done=itime))
            if cropping is True:
                iseq.cropmsa = themsa
                if iseq.ncrops() == 0:
                    # Nothing was cropped: the full-sequence MSA is the same.
                    iseq.msa = iseq.cropmsa
                    logger.info(' Cropped sequence '
                                + iseq.oligomer_id + '_' + iseq.name
                                + ' is identical to original sequence.')
            else:
                iseq.msa = themsa

    # DEEP META PSICOV RUN
    ppaths.mdir(dmpdir)
    if skipexec is False:
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            afile = fcropmsa[i] if cropping is True else fmsa[i]
            nsfile = os.path.join(dmpdir, os.path.basename(sfile))
            if sfile != nsfile:
                copyfile(sfile, nsfile)
            logger.info(pcl.running('DeepMetaPSICOV'))
            itime = datetime.datetime.now()
            psd.rundmp(nsfile, afile, dmpdir)
            logger.info(pcl.running('DeepMetaPSICOV', done=itime))

    # INTERFACE GENERATION, PISA
    ppaths.mdir(pisadir)
    if skipexec is False:
        logger.info('Generating interface files via PISA...')
        sfile = fcropstr if cropping is True else fstr
        logger.info(pcl.running('PISA'))
        itime = datetime.datetime.now()
        iflist = psp.runpisa(sfile, pisadir, sessionid=pdbid)
        logger.info(pcl.running('PISA', done=itime))

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return