def main(args):
    """Entry point for blastnfilter.

    Parses the command line, configures logging, and invokes the run
    module.  Exits the process with 0 on success or 2 when run.run()
    raises any exception (the exception is logged first).

    :param args: sys.argv style list; args[0] is the program name
    """
    desc = """ Blastnfilter """
    opts = _parse_arguments(desc, args[1:])
    opts.program = args[0]
    opts.version = d3r.__version__
    util.setup_logging(opts)
    logger.debug('Starting run ')
    exit_code = 0
    try:
        run.run(opts)
    except Exception:
        logger.exception("Error caught exception")
        exit_code = 2
    sys.exit(exit_code)
def test_setup_logging(self):
    """Verify util.setup_logging() honors every supported log level.

    For each level name, after calling util.setup_logging() the
    'd3r.celpp.task' logger's effective level and
    theargs.numericloglevel must both equal the matching logging
    module constant.  (Original version repeated the same four-line
    check five times; collapsed into a loop over the level names.)
    """
    logger = logging.getLogger('funlogger')
    theargs = D3RParameters()
    for level_name in ('INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL'):
        # logging.INFO, logging.DEBUG, ... looked up by name
        expected = getattr(logging, level_name)
        theargs.loglevel = level_name
        util.setup_logging(theargs)
        self.assertEqual(
            logging.getLogger('d3r.celpp.task').getEffectiveLevel(),
            expected)
        self.assertEqual(theargs.numericloglevel, expected)
    # smoke call on an unrelated logger, as in the original test
    logger.debug('test')
def main():
    """Main entry point for celpprunner.

    Builds the (very long) command line help text by instantiating one
    task object per pipeline stage so that directory names, stage
    numbers and class constants can be interpolated into the text,
    then parses the command line, configures logging and invokes
    run_stages().  Exits the process with code 2 if run_stages()
    raises any exception.
    """
    # Dummy task instances created only so get_dir_name()/get_stage()
    # and class constants can be pulled into the help text below; they
    # perform no work here.
    blasttask = BlastNFilterTask('', p)
    dataimport = DataImportTask('', p)
    challenge = ChallengeDataTask('', p)
    glide = GlideTask('', p)
    makedb = MakeBlastDBTask('', p)
    prot = ProteinLigPrepTask('', p)
    vina = AutoDockVinaTask('', p)
    chimeraprep = ChimeraProteinLigPrepTask('', p)
    # NOTE(review): text says "9 stages" but lists 10 items (makedb,
    # import, blast, challengedata, proteinligprep, chimeraprep,
    # extsubmission, glide, vina, evaluation) -- confirm intended count.
    desc = """
              Version {version}

              Runs the 9 stages (makedb, import, blast, challengedata,
              proteinligprep, {chimeraprep}, extsubmission, glide, vina,
              & evaluation) of CELPP processing pipeline
              (http://www.drugdesigndata.org)

              CELPP processing pipeline relies on a set of directories
              with specific structure. The pipeline runs a set of stages
              Each stage has a numerical value and a name. The numerical
              value denotes order and the stage name identifies
              separate tasks to run in the stage.

              The filesystem structure of the stage is:

              stage.<stage number>.<task name>

              The stage(s) run are defined via the required --stage flag.

              To run multiple stages serially just pass a comma
              delimited list to the --stage flag. Example:
              --stage import,blast

              NOTE: When running multiple stages serially the program
              will not run subsequent stages if a task in a stage fails.
              Also note order matters, ie putting blast,import will
              cause celpprunner.py to run blast stage first.

              This program drops a pid lockfile
              (celpprunner.<stage>.lockpid) in celppdir to prevent
              duplicate invocation.

              When run, this program will examine the stage and see
              if work can be done.  If stage is complete or previous
              steps have not completed, the program will exit silently.
              If previous steps have failed or current stage already
              exists in an error or uncomplete state then program will
              report the error via email using addresses set in
              --email flag. Errors will also be reported via stderr/stdout.
              The program will also exit with nonzero exit code.

              This program utilizes simple token files to denote stage
              completion.  If within the stage directory there is a:

              '{complete}' file - then stage is done and no other
                                  checking is done.

              'error' file - then stage failed.

              'start' file - then stage is running.

              Notification of stage start and end will be sent to
              addresses set via --email flag.

              Unless --customweekdir is set, this program will
              examine the 'celppdir' (last argument passed on
              commandline) to find the latest directory with this path:

              <year>/dataset.week.#

              The program will find the latest <year> and within
              that year the dataset.week.# with highest #.
              The output directories created will be put within this
              directory.  Setting --customweekdir will cause program
              to use 'celppdir' path.

              Setting the --createweekdir flag will instruct this
              program to create a new directory for the current celpp
              week/year before running any stage processing.

              NOTE: CELPP weeks start on Friday and end on Thursday
              and week # follows ISO8601 rules so week numbers at the
              end and start of the year are a bit wonky.

              Breakdown of behavior of program is defined by
              value passed with --stage flag:

              If --stage '{createchallenge}'

              This is NOT a stage, but has the same effect as calling
              --stage makedb,import,blast,challengedata
              The four stages that need to run to generate the
              challenge data package.

              If --stage 'makedb'

              In this stage the file {pdb_seqres} is downloaded from
              an ftp site set by --pdbsequrl.  This file is then
              gunzipped and NCBI makeblastdb (set by --makeblastdb)
              is run on it to create a blast database.  The files
              are stored in {makeblastdb_dirname}

              If --stage 'import'

              In this stage 4 files are downloaded from urls specified
              by --compinchi and --pdbfileurl flags on the commandline
              into {dataimport_dirname} directory.

              The tsv files are (--pdbfileurl flag sets url to
              download these files from):

              {nonpolymer_tsv}
              {sequence_tsv}
              {crystal_tsv}

              The Components ich file is (--compinchi flag sets base
              url to download this file from):

              {compinchi_ich}

              This stage will just wait and retry if any of the tsv
              files have NOT been updated since the start of the
              current celpp week as determined by a HEAD request.
              To bypass this delay add --skipimportwait flag.
              --importsleep denotes the time to wait before
              re-examining the update time of the tsv files and
              --importretry sets number of times to retry before
              giving up.

              If --stage 'blast'

              Verifies {dataimport_dirname} exists and has
              '{complete}' file.  Also verifies {makeblastdb_dirname}
              exists and has '{complete}' file.  If both conditions
              are met then the 'blast' stage is run which invokes
              script set by --blastnfilter flag and output stored
              in {blast_dirname}.  Requires --pdbdb to be set to a
              directory with valid PDB database files.
              Note: --blastnfilter script is killed after time set
              with --blastnfiltertimeout flag.

              If --stage 'challengedata'

              Verifies {blast_dirname} exists and has '{complete}'
              file.  If complete, this stage runs which invokes
              program set in --genchallenge flag to create a challenge
              dataset file.  The --pdbdb flag must also be set when
              calling this stage.  If --ftpconfig is set with
              {challengepath} field then this stage will also upload
              the challenge dataset tarfile to the ftp server with
              path set by {challengepath}.  The code will also upload
              a {latest_txt} file containing name of the tarfile to
              the same destination overwriting any {latest_txt} file
              that already exists.

              Example file for --ftpconfig:

              {host} some.ftp.com
              {user} bob
              {passn} mypass
              {path} /celpp
              {challengepath} /challenge
              {submissionpath} /submissions

              If --stage '{chimeraprep}'

              Verifies {challenge_dirname} exists and has
              '{complete}' file.  If complete, this stage runs which
              invokes program set in --chimeraprep flag to prepare
              pdb and inchi files storing output in
              {chimeraprep_dirname}.  --pdbdb flag must also be set
              when calling this stage.

              If --stage 'proteinligprep'

              Verifies {challenge_dirname} exists and has
              '{complete}' file.  If complete, this stage runs which
              invokes program set in --proteinligprep flag to prepare
              pdb and inchi files storing output in
              {proteinligprep_dirname}.  --pdbdb flag must also be set
              when calling this stage.

              If --stage 'extsubmission'

              Connects to server specified by --ftpconfig and
              downloads external docking submissions from
              {submissionpath} on remote server.  Submissions should
              be named:

              celpp_weekXX_YYYY_dockedresults_ZZZZ.tar.gz

              as documented here:
              https://github.com/drugdata/d3r/wiki/Proposed-challenge-docked\
-results-file-structure

              For each submission a directory named
              stage.X.ZZZZ.extsubmission will be created and
              uncompressed contents of package will be stored in
              that directory.  If data does not conform properly
              'error' file will be placed in directory denoting
              failure

              If --stage 'glide'

              Verifies {proteinligprep_dirname} exists and has a
              '{complete}' file within it.  If complete, this stage
              runs which invokes program set in --glide flag to
              perform docking via glide storing output in
              {glide_dirname}

              If --stage 'vina'

              Verifies {proteinligprep_dirname} exists and has a
              '{complete}' file within it.  If complete, this stage
              runs which invokes program set in --vina flag to
              perform docking via AutoDock Vina storing output in
              {vina_dirname}

              If --stage 'evaluation'

              Finds all stage.{dockstage}.<algo> directories with
              '{complete}' files in them which do not end in name
              '{webdata}' and runs script set via --evaluation
              parameter storing the result of the script into
              stage.{evalstage}.<algo>.evaluation.  --pdbdb flag
              must also be set when calling this stage.
              """.format(makeblastdb_dirname=makedb.get_dir_name(),
                         dataimport_dirname=dataimport.get_dir_name(),
                         blast_dirname=blasttask.get_dir_name(),
                         challenge_dirname=challenge.get_dir_name(),
                         createchallenge=CREATE_CHALLENGE,
                         proteinligprep_dirname=prot.get_dir_name(),
                         glide_dirname=glide.get_dir_name(),
                         vina_dirname=vina.get_dir_name(),
                         dockstage=str(glide.get_stage()),
                         # evaluation stage number is docking stage + 1
                         evalstage=str(glide.get_stage() + 1),
                         complete=blasttask.COMPLETE_FILE,
                         chimeraprep_dirname=chimeraprep.get_dir_name(),
                         chimeraprep=CHIMERA_PREP,
                         compinchi_ich=DataImportTask.COMPINCHI_ICH,
                         pdb_seqres=MakeBlastDBTask.PDB_SEQRES_TXT_GZ,
                         nonpolymer_tsv=DataImportTask.NONPOLYMER_TSV,
                         sequence_tsv=DataImportTask.SEQUENCE_TSV,
                         crystal_tsv=DataImportTask.CRYSTALPH_TSV,
                         webdata=EvaluationTaskFactory.WEB_DATA_SUFFIX,
                         latest_txt=ChallengeDataTask.LATEST_TXT,
                         host=FtpFileTransfer.HOST,
                         user=FtpFileTransfer.USER,
                         passn=FtpFileTransfer.PASS,
                         path=FtpFileTransfer.PATH,
                         challengepath=FtpFileTransfer.CHALLENGEPATH,
                         submissionpath=FtpFileTransfer.SUBMISSIONPATH,
                         version=d3r.__version__)
    theargs = _parse_arguments(desc, sys.argv[1:])
    theargs.program = sys.argv[0]
    theargs.version = d3r.__version__
    # reconfigure logging per the --log flag (replaces bootstrap config)
    util.setup_logging(theargs)
    try:
        run_stages(theargs)
    except Exception:
        # top-level boundary: log full traceback then exit nonzero
        logger.exception("Error caught exception")
        sys.exit(2)
import os
import argparse
import psutil
import logging
from datetime import date

import d3r
from d3r.celpp import util
from d3r.celpp.task import D3RParameters

# create logger
logger = logging.getLogger('d3r.celpprunner')

# Log level used at import time, before command line flags are parsed
DEFAULT_LOG_LEVEL = 'ERROR'
p = D3RParameters()
p.loglevel = DEFAULT_LOG_LEVEL
# Bootstrap logging configuration before importing the task modules
# below -- presumably so loggers those modules create at import time
# pick up this configuration (NOTE(review): this appears to be why
# the imports are deliberately placed after setup_logging; confirm
# before regrouping imports to the top).
util.setup_logging(p)
from d3r.celpp.blastnfilter import BlastNFilterTask
from d3r.celpp.proteinligprep import ProteinLigPrepTask
from d3r.celpp.dataimport import DataImportTask
from d3r.celpp.glide import GlideTask
from d3r.celpp.evaluation import EvaluationTaskFactory
from d3r.celpp.makeblastdb import MakeBlastDBTask
from d3r.celpp.vina import AutoDockVinaTask
from d3r.celpp.challengedata import ChallengeDataTask
from d3r.celpp.chimeraprep import ChimeraProteinLigPrepTask
from d3r.celpp.filetransfer import FtpFileTransfer
from d3r.celpp.extsubmission import ExternalDataSubmissionFactory
from lockfile.pidlockfile import PIDLockFile
def main(args):
    """Main entry into genmoleculedb.

    Parses the command line, configures logging and dispatches to the
    requested mode ({genmoleculedb} or {validate}).  Returns the exit
    code produced by the mode handler, 3 when --outputfile is missing
    in database generation mode, or 2 when any exception is raised
    (logged first).

    :param args: should be set to sys.argv which is a list of
                 arguments starting with script name as the first
                 argument
    """
    desc = """
              Version {version}

              Performs mol file validation on files with {mol_suffix}
              extension in gzipped tar files.

              This script runs in two modes:

              {genmol_mode} & {validate_mode}

              These modes are set via the first argument passed into
              this script.  In general '{genmol_mode}' mode is run
              first and '{validate_mode}' mode is run multiple times
              to perform the validation.

              '{genmol_mode}' mode takes a directory of {mol_suffix}
              files or a CSV file with SMILES strings and generates a
              molecule database.  This database is a pickle file and
              is used to validate the mol files.  The output database
              is specified by the --{output} flag.  This database
              basically is a dictionary of Ligand names as parsed
              from the mol file name XXX-####-XXX.mol where the
              ligand name is expected to be the value between first
              and second - character.  Any problems found are output
              to standard out/err and a non 0 exit code is returned.

              '{validate_mode}' mode takes the molecule database from
              {genmol_mode} (which is passed in via --{moldb} flag)
              and validates all mol files found in the tarfile
              specified by --{usersubmission} flag.  It is assumed
              all mol files have a file name format like this:
              XXX-####-XXX.mol where #### between 1st and second -
              is considered to be the Ligand ID.  Validation is done
              by comparing number and atomic weight of non hydrogen
              atoms against the database.  Any problems found are
              output to standard out/err and a non 0 exit code is
              returned.

              For more information visit: http://www.drugdesigndata.org
              """.format(version=d3r.__version__,
                         mol_suffix=MOL_SUFFIX,
                         output=OUTFILE,
                         genmol_mode=GENMOLECULEDB_MODE,
                         validate_mode=VALIDATE_MODE,
                         moldb=MOLDB,
                         usersubmission=USER_SUBMISSION)
    parsed = _parse_arguments(desc, args[1:])
    parsed.program = args[0]
    parsed.version = d3r.__version__
    util.setup_logging(parsed)
    try:
        mol_factory = D3RMoleculeFromMolFileViaOpeneyeFactory()
        # Validation mode is fully handled by _run_validation()
        if parsed.mode == VALIDATE_MODE:
            logger.info('Running in ' + VALIDATE_MODE +
                        ' validation mode')
            return _run_validation(parsed, mol_factory)
        # Anything other than the two known modes is an error
        if parsed.mode != GENMOLECULEDB_MODE:
            raise ValueError('Unsupported mode: ' + str(parsed.mode))
        logger.info('Running in ' + GENMOLECULEDB_MODE +
                    ' database generation mode')
        if parsed.outputfile is None:
            logger.error('--outputfile must be set to generate the '
                         'molecule database')
            return 3
        if parsed.moldir is not None:
            return _generate_molecule_database_frommolfiles(parsed,
                                                            mol_factory)
        if parsed.molcsv is None:
            raise ValueError('Either --moldir or --molcsv must be '
                             'set to generate molecule database')
        smile_factory = D3RMoleculeFromSmileViaOpeneyeFactory()
        return _generate_molecule_database_fromcsv(parsed,
                                                   smile_factory)
    except Exception:
        # top-level boundary: log the traceback, report failure code
        logger.exception("Error caught exception")
        return 2