def test_run_all_compinchi_fail(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        params.pdbfileurl = 'file://' + temp_dir
        params.compinchi = 'file://' + temp_dir
        make_blast = MakeBlastDBTask(temp_dir, params)
        make_blast.create_dir()
        open(os.path.join(make_blast.get_dir(),
                          D3RTask.COMPLETE_FILE), 'a').close()
        task = DataImportTask(temp_dir, params)
        task._retrysleep = 0
        open(os.path.join(temp_dir, task.NONPOLYMER_TSV), 'a').close()
        open(os.path.join(temp_dir, task.SEQUENCE_TSV), 'a').close()
        open(os.path.join(temp_dir, task.OLDSEQUENCE_TSV), 'a').close()
        open(os.path.join(temp_dir, task.CRYSTALPH_TSV), 'a').close()
        task.run()
        self.assertEqual(task.get_error(),
                         'Unable to download file from ' +
                         params.compinchi + ' to ' +
                         task.get_components_inchi_file())
    finally:
        shutil.rmtree(temp_dir)
def test_get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres_w_hits(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = DataImportTask(temp_dir, params)
        task.create_dir()
        f = open(task.get_crystalph_tsv(), 'w')
        f.write('PDB_ID _exptl_crystal_grow.pH\n')
        f.write('4X09\t6.5\n')
        f.write('4rfr\t8\n')
        f.write('4XET\t6.2\n')
        f.write('4XF1\t6.2\n')
        f.write('4XF3\t6.2\n')
        f.flush()
        f.close()
        makeblast = MakeBlastDBTask(temp_dir, params)
        makeblast.create_dir()
        f = open(makeblast.get_pdb_seqres_txt(), 'w')
        f.write('>4rfr_A mol:protein length:154 MYOGLOBIN\n')
        f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVK'
                'HLKTEAEMKASEDLKKHG\n')
        f.write('>102l_A mol:protein length:165 T4 LYSOZYME\n')
        f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKA'
                'IGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAAL'
                'INMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRV'
                'ITTFRTGTWDAYKNL\n')
        f.flush()
        f.close()
        pdbset = task.get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres()
        self.assertEqual(len(pdbset), 1)
        self.assertTrue('4RFR' in pdbset)
    finally:
        shutil.rmtree(temp_dir)
def __init__(self, path, args):
    super(DataImportTask, self).__init__(path, args)
    self.set_name(DataImportTask.TASK_NAME)

    # stage is always one past the makeblastdb stage
    makeblast = MakeBlastDBTask('', args)
    self.set_stage(makeblast.get_stage() + 1)

    self.set_status(D3RTask.UNKNOWN_STATUS)
    self._maxretries = 3
    self._retrysleep = 1
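# Editor's sketch (not in the source): MakeBlastDBTask is stage 1 (see the
# makeblastdb constructor test below), so a DataImportTask built this way
# should report stage 2 and, following the stage.<stage number>.<task name>
# convention, a directory name of 'stage.2.dataimport':
#
#     task = DataImportTask('/foo', D3RParameters())
#     task.get_stage()      # 2
#     task.get_dir_name()   # 'stage.2.dataimport'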
def test_get_set_of_pbdid_from_pdb_seqres_txt_no_file(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()
        pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(pdbset), 0)
    finally:
        shutil.rmtree(temp_dir)
def test_run_where_pdbsequrl_is_not_set(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.run()
        self.assertEqual(task.get_error(),
                         'cannot download files cause pdbsequrl not set')
    finally:
        shutil.rmtree(temp_dir)
def test_run_where_makeblastdb_is_not_set(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        params.pdbsequrl = 'pdbsequrl'
        task = MakeBlastDBTask(temp_dir, params)
        task.run()
        self.assertEqual(task.get_error(),
                         'cannot make blast database cause '
                         'makeblastdb not set')
    finally:
        shutil.rmtree(temp_dir)
def test_run_all_success(self):
    temp_dir = tempfile.mkdtemp()
    try:
        fakeftp = FtpFileTransfer(None)
        mftp = D3RParameters()
        fakeftp.set_connection(mftp)
        fakeftp.set_remote_dir('/foo2')
        mftp.get = Mock()
        params = D3RParameters()
        params.pdbfileurl = 'file://' + temp_dir
        params.compinchi = 'file://' + temp_dir
        make_blast = MakeBlastDBTask(temp_dir, params)
        make_blast.create_dir()
        open(os.path.join(make_blast.get_dir(),
                          D3RTask.COMPLETE_FILE), 'a').close()
        task = DataImportTask(temp_dir, params)
        task.set_file_transfer(fakeftp)
        task._retrysleep = 0
        open(os.path.join(temp_dir, task.NONPOLYMER_TSV), 'a').close()
        open(os.path.join(temp_dir, task.SEQUENCE_TSV), 'a').close()
        open(os.path.join(temp_dir, task.OLDSEQUENCE_TSV), 'a').close()
        open(os.path.join(temp_dir, task.CRYSTALPH_TSV), 'a').close()
        open(os.path.join(temp_dir, task.COMPINCHI_ICH), 'a').close()
        task.run()
        self.assertEqual(task.get_error(), None)
        # check line count is 1 now which indicates
        # standard was added
        self.assertEqual(
            util.get_file_line_count(task.get_nonpolymer_tsv()), 1)
        self.assertEqual(
            util.get_file_line_count(task.get_sequence_tsv()), 1)
        self.assertEqual(
            util.get_file_line_count(task.get_oldsequence_tsv()), 1)
        self.assertEqual(
            util.get_file_line_count(task.get_crystalph_tsv()), 1)
        mftp.get.assert_called_with(
            '/foo2/' + DataImportTask.PARTICIPANT_LIST_CSV,
            local=task.get_participant_list_csv())
    finally:
        shutil.rmtree(temp_dir)
def can_run(self):
    """Determines if task can actually run

       This method first verifies the `MakeBlastDBTask` and
       `DataImportTask` tasks have `D3RTask.COMPLETE_STATUS` for
       status.  The method then verifies a `BlastNFilterTask` does
       not already exist.  If either check fails, self.set_error()
       is set with information about the issue

       :return: True if can run otherwise False
    """
    self._can_run = False
    self._error = None

    # check make blast db
    make_blastdb = MakeBlastDBTask(self._path, self._args)
    make_blastdb.update_status_from_filesystem()
    if make_blastdb.get_status() != D3RTask.COMPLETE_STATUS:
        logger.info('Cannot run ' + self.get_name() + ' task ' +
                    'because ' + make_blastdb.get_name() + ' task ' +
                    'has a status of ' + make_blastdb.get_status())
        self.set_error(make_blastdb.get_name() + ' task has ' +
                       make_blastdb.get_status() + ' status')
        return False

    # check data import
    data_import = DataImportTask(self._path, self._args)
    data_import.update_status_from_filesystem()
    if data_import.get_status() != D3RTask.COMPLETE_STATUS:
        logger.info('Cannot run ' + self.get_name() + ' task ' +
                    'because ' + data_import.get_name() + ' task ' +
                    'has a status of ' + data_import.get_status())
        self.set_error(data_import.get_name() + ' task has ' +
                       data_import.get_status() + ' status')
        return False

    # check this task is not already complete and was not
    # previously attempted
    self.update_status_from_filesystem()
    if self.get_status() == D3RTask.COMPLETE_STATUS:
        logger.debug("No work needed for " + self.get_name() +
                     " task")
        return False
    if self.get_status() != D3RTask.NOTFOUND_STATUS:
        logger.warning(self.get_name() + " task was already " +
                       "attempted, but there was a problem")
        self.set_error(self.get_dir_name() + ' already exists and ' +
                       'status is ' + self.get_status())
        return False
    self._can_run = True
    return True
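# Minimal usage sketch (illustrative, not from the source): can_run() is
# expected to be called before run(); run() re-checks the _can_run flag and
# returns without doing work when it is False.  The params attributes below
# are the ones run() reads; the script paths are hypothetical.
#
#     params = D3RParameters()
#     params.blastnfilter = '/usr/local/bin/blastnfilter.py'  # hypothetical
#     params.postanalysis = '/usr/local/bin/postanalysis.py'  # hypothetical
#     params.pdbdb = '/pdbdb'
#     task = BlastNFilterTask('/celppdir/2016/dataset.week.40', params)
#     if task.can_run():
#         task.run()
#     else:
#         logger.info('Skipping: ' + str(task.get_error()))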
def test_constructor(self):
    params = D3RParameters()
    task = MakeBlastDBTask('/foo', params)
    self.assertEqual(task.get_name(), 'makeblastdb')
    self.assertEqual(task.get_stage(), 1)
    self.assertEqual(task.get_status(), D3RTask.UNKNOWN_STATUS)
    self.assertEqual(task.get_path(), '/foo')
    self.assertEqual(task.get_dir_name(), 'stage.1.makeblastdb')
    test_task.try_update_status_from_filesystem(self, task)
def test_run_all_nonpolymer_fail(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        params.pdbfileurl = 'file://' + temp_dir
        params.compinchi = 'file://' + temp_dir
        make_blast = MakeBlastDBTask(temp_dir, params)
        make_blast.create_dir()
        open(os.path.join(make_blast.get_dir(),
                          D3RTask.COMPLETE_FILE), 'a').close()
        task = DataImportTask(temp_dir, params)
        task._retrysleep = 0
        task.run()
        self.assertEqual(task.get_error(),
                         'Unable to download file from ' +
                         params.pdbfileurl + ' to ' +
                         task.get_nonpolymer_tsv())
    finally:
        shutil.rmtree(temp_dir)
def test_get_set_of_pbdid_from_pdb_seqres_txt_with_400k_file(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()
        f = open(task.get_pdb_seqres_txt(), 'w')
        perms = itertools.permutations(
            string.ascii_lowercase + '123456789', 4)
        counter = 0
        try:
            limit = 400000
            while counter < limit:
                f.write('>' + ''.join(next(perms)) +
                        '_A mol:protein length:165 T4 LYSOZYME\n')
                f.write('MVLSEGEWQLVLH\n')
                counter += 1
        except StopIteration:
            pass
        f.flush()
        f.close()
        pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(pdbset), counter)
    finally:
        shutil.rmtree(temp_dir)
def test_run_where_can_run_is_false(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()
        open(os.path.join(task.get_dir(), 'error'), 'a').close()
        task.run()
        self.assertFalse(task._can_run)
    finally:
        shutil.rmtree(temp_dir)
def test_run_all_success_except_participant_download_fails(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        params.pdbfileurl = 'file://' + temp_dir
        params.compinchi = 'file://' + temp_dir
        make_blast = MakeBlastDBTask(temp_dir, params)
        make_blast.create_dir()
        open(os.path.join(make_blast.get_dir(),
                          D3RTask.COMPLETE_FILE), 'a').close()
        task = DataImportTask(temp_dir, params)
        task._retrysleep = 0
        open(os.path.join(temp_dir, task.NONPOLYMER_TSV), 'a').close()
        open(os.path.join(temp_dir, task.SEQUENCE_TSV), 'a').close()
        open(os.path.join(temp_dir, task.OLDSEQUENCE_TSV), 'a').close()
        open(os.path.join(temp_dir, task.CRYSTALPH_TSV), 'a').close()
        open(os.path.join(temp_dir, task.COMPINCHI_ICH), 'a').close()
        task.run()
        self.assertEqual(task.get_error(), None)
        # check line count is 1 now which indicates
        # standard was added
        self.assertEqual(
            util.get_file_line_count(task.get_nonpolymer_tsv()), 1)
        self.assertEqual(
            util.get_file_line_count(task.get_sequence_tsv()), 1)
        self.assertEqual(
            util.get_file_line_count(task.get_oldsequence_tsv()), 1)
        self.assertEqual(
            util.get_file_line_count(task.get_crystalph_tsv()), 1)
        self.assertTrue(task.get_email_log().startswith(
            '\nWARNING: Unable to download'))
    finally:
        shutil.rmtree(temp_dir)
def test_get_set_of_pbdid_from_pdb_seqres_txt_empty_file(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()
        open(task.get_pdb_seqres_txt(), 'a').close()
        self.assertTrue(os.path.isfile(task.get_pdb_seqres_txt()))
        pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(pdbset), 0)
    finally:
        shutil.rmtree(temp_dir)
def test_get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres_empty_seq(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = DataImportTask(temp_dir, params)
        task.create_dir()
        f = open(task.get_crystalph_tsv(), 'w')
        f.write('PDB_ID _exptl_crystal_grow.pH\n')
        f.write('4X09\t6.5\n')
        f.write('4rfr\t8\n')
        f.write('4XET\t6.2\n')
        f.write('4XF1\t6.2\n')
        f.write('4XF3\t6.2\n')
        f.flush()
        f.close()
        makeblast = MakeBlastDBTask(temp_dir, params)
        makeblast.create_dir()
        open(makeblast.get_pdb_seqres_txt(), 'a').close()
        pdbset = task.get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres()
        self.assertEqual(len(pdbset), 0)
    finally:
        shutil.rmtree(temp_dir)
def test_get_sequence_count_file_has_zero_size(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()
        open(task.get_pdb_seqres_txt(), 'a').close()
        self.assertEqual(task._get_sequence_count_message(),
                         '# sequence(s): 0')
    finally:
        shutil.rmtree(temp_dir)
def test_run_where_download_fails(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        params.pdbsequrl = 'file://doesnotexist'
        params.makeblastdb = 'makeblastdb'
        task = MakeBlastDBTask(temp_dir, params)
        task._retrysleep = 0
        task._maxretries = 1
        task.run()
        self.assertEqual(task.get_error(),
                         'Unable to download file: file://doesnotexist')
    finally:
        shutil.rmtree(temp_dir)
def test_get_set_of_pbdid_from_pdb_seqres_txt_file_no_seqs(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()
        f = open(task.get_pdb_seqres_txt(), 'w')
        f.write('hi\nhow\nare\nyou')
        f.flush()
        f.close()
        self.assertTrue(os.path.isfile(task.get_pdb_seqres_txt()))
        pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(pdbset), 0)
    finally:
        shutil.rmtree(temp_dir)
def test_get_sequence_count_file_has_multiple_seqs(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()
        f = open(task.get_pdb_seqres_txt(), 'w')
        f.write('>hi\n>seq\n>are\n')
        f.flush()
        f.close()
        self.assertEqual(task._get_sequence_count_message(),
                         '# sequence(s): 3')
    finally:
        shutil.rmtree(temp_dir)
def test_run_where_gunzip_fails(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        # fake.gz holds plain text, so the gunzip step will fail
        fakegz = os.path.join(temp_dir, 'fake.gz')
        f = open(fakegz, 'w')
        f.write('hello\n')
        f.flush()
        f.close()
        params.pdbsequrl = 'file://' + fakegz
        params.makeblastdb = 'makeblastdb'
        task = MakeBlastDBTask(temp_dir, params)
        task._retrysleep = 0
        task._maxretries = 1
        task.run()
        self.assertEqual(task.get_error(),
                         'Unable to uncompress file: ' +
                         task.get_pdb_seqres_txt())
    finally:
        shutil.rmtree(temp_dir)
def test_get_set_of_pbdid_from_pdb_seqres_txt_with_seqs(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()
        f = open(task.get_pdb_seqres_txt(), 'w')
        f.write('>101m_A mol:protein length:154 MYOGLOBIN\n')
        f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVK'
                'HLKTEAEMKASEDLKKHG\n')
        f.write('>102l_A mol:protein length:165 T4 LYSOZYME\n')
        f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKA'
                'IGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAAL'
                'INMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRV'
                'ITTFRTGTWDAYKNL\n')
        # duplicate 102l id should only be counted once
        f.write('>102l_A mol:protein length:154 MYOGLOBIN\n')
        f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHL'
                'KTEAEMKASEDLKKAGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKI'
                'PIKYLEFISEAIIHVLHSRHPGNFGADAQGAMNKALELFRKDIAAKYKELGYQG\n')
        f.write('>103l_A mol:protein length:167 T4 LYSOZYME\n')
        f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAKSELD'
                'KAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRA'
                'ALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAK'
                'RVITTFRTGTWDAYKNL\n')
        # 10jj3m is not a valid 4 character id and should be skipped
        f.write('>10jj3m_A mol:protein length:154 MYOGLOBIN\n')
        f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF\n')
        f.write('>104l_A mol:protein length:166 T4 LYSOZYME\n')
        f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAAE\n')
        f.write('>104l_B mol:protein length:166 T4 LYSOZYME\n')
        f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAKNL\n')
        f.flush()
        f.close()
        self.assertTrue(os.path.isfile(task.get_pdb_seqres_txt()))
        pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(pdbset), 4)
        self.assertTrue('101M' in pdbset)
        self.assertTrue('102L' in pdbset)
        self.assertTrue('103L' in pdbset)
        self.assertTrue('104L' in pdbset)
    finally:
        shutil.rmtree(temp_dir)
def get_set_of_pdbid_in_crystalph_tsv_and_pdb_seqres(self):
    """Gets set of PDBIDs that are in both tsv and sequence file

       Examines `DataImportTask.CRYSTALPH_TSV` and
       `MakeBlastDBTask.PDB_SEQRES_TXT` and returns a set of
       PDBIDs that are in both files

       :returns: set of uppercase PDBIDs that are in both files above
    """
    make_blastdb = MakeBlastDBTask(self._path, self._args)
    if not os.path.isfile(make_blastdb.get_pdb_seqres_txt()):
        logger.warning('No ' + make_blastdb.get_pdb_seqres_txt() +
                       ' file found')
        return set()

    c_pdbid_set = self.get_set_of_pdbid_from_crystalph_tsv()
    if len(c_pdbid_set) == 0:
        logger.warning('No PDBIDs found in ' + self.get_crystalph_tsv())
        return set()

    seq_pdbid_set = make_blastdb.get_set_of_pbdid_from_pdb_seqres_txt()
    if len(seq_pdbid_set) == 0:
        logger.warning('No PDBIDs found in ' +
                       make_blastdb.get_pdb_seqres_txt())
        return set()

    common_pdbid = set()
    # iterate through the tsv pdb ids and keep any that are also
    # found in the sequence pdb id set
    for pdbid in c_pdbid_set:
        if pdbid in seq_pdbid_set:
            common_pdbid.add(pdbid)

    logger.debug('Found ' + str(len(common_pdbid)) + ' PDBIDs in ' +
                 self.get_crystalph_tsv() + ' and ' +
                 make_blastdb.get_pdb_seqres_txt())
    return common_pdbid
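# Editor's note (sketch, not in the source): since both inputs are sets of
# uppercase PDBIDs, the membership loop above computes a plain set
# intersection and could equivalently be written as:
#
#     common_pdbid = c_pdbid_set & seq_pdbid_set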
def test_can_run_does_not_exist_or_error(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = DataImportTask(temp_dir, params)

        # no make blast db
        self.assertFalse(task.can_run())
        self.assertEqual(task.get_error(),
                         'makeblastdb task has notfound status')
        self.assertFalse(task._can_run)

        make_blast = MakeBlastDBTask(temp_dir, params)
        make_blast.create_dir()

        # make blast db failed
        err_file = os.path.join(make_blast.get_dir(),
                                D3RTask.ERROR_FILE)
        open(err_file, 'a').close()
        self.assertFalse(task.can_run())
        self.assertEqual(task.get_error(),
                         'makeblastdb task has error status')
        self.assertFalse(task._can_run)
        os.remove(err_file)

        # make blast db success
        open(os.path.join(make_blast.get_dir(),
                          D3RTask.COMPLETE_FILE), 'a').close()
        self.assertTrue(task.can_run())
        self.assertEqual(task.get_error(), None)
        self.assertTrue(task._can_run)

        task.create_dir()
        open(os.path.join(task.get_dir(), D3RTask.ERROR_FILE),
             'a').close()
        self.assertFalse(task.can_run())
        self.assertFalse(task._can_run)
        self.assertEqual(task.get_error(),
                         task.get_dir_name() + ' already exists and ' +
                         'status is ' + D3RTask.ERROR_STATUS)
    finally:
        shutil.rmtree(temp_dir)
def test_get_set_of_pbdid_from_pdb_seqres_txt_wrong_len_pdbids(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        task = MakeBlastDBTask(temp_dir, params)
        task.create_dir()
        f = open(task.get_pdb_seqres_txt(), 'w')
        f.write('>1m_A mol:protein length:154 MYOGLOBIN\n')
        f.write('MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVK'
                'HLKTEAEMKASEDLKKHG\n')
        f.write('>abcdel_A mol:protein length:165 T4 LYSOZYME\n')
        f.write('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSELDKA'
                'IGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAAL'
                'INMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRV'
                'ITTFRTGTWDAYKNL\n')
        f.flush()
        f.close()
        self.assertTrue(os.path.isfile(task.get_pdb_seqres_txt()))
        # both ids ('1m' and 'abcdel') have the wrong length for a
        # PDBID so neither should be returned
        pdbset = task.get_set_of_pbdid_from_pdb_seqres_txt()
        self.assertEqual(len(pdbset), 0)
    finally:
        shutil.rmtree(temp_dir)
def test_run_where_makeblastdb_fails(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        fakegz = os.path.join(temp_dir, 'fake.gz')
        f = gzip.open(fakegz, 'wb')
        f.write(b'hello\n')
        f.flush()
        f.close()
        params.pdbsequrl = 'file://' + fakegz
        # 'false' stand-in makes the makeblastdb step exit nonzero
        params.makeblastdb = 'false'
        task = MakeBlastDBTask(temp_dir, params)
        task._retrysleep = 0
        task._maxretries = 1
        task.run()
        self.assertEqual(task.get_error(),
                         'Non zero exit code: 1 received. '
                         'Standard out: Standard error: ')
    finally:
        shutil.rmtree(temp_dir)
def test_run_with_blast_success_useoldseq_and_postanalysis_fail(self):
    temp_dir = tempfile.mkdtemp()
    try:
        params = D3RParameters()
        params.blastnfilter = '/bin/echo'
        params.postanalysis = os.path.join(temp_dir, 'foo.py')
        params.pdbdb = '/pdbdb'
        blasttask = BlastNFilterTask(temp_dir, params)
        blasttask._can_run = True
        txt_file = os.path.join(blasttask.get_dir(), 'summary.txt')
        txt_contents = ('INPUT SUMMARY\\n' +
                        ' sequences: 177\\n' +
                        ' complexes: 149\\n')

        # create fake postanalysis script that writes a summary.txt file
        f = open(params.postanalysis, 'w')
        f.write('#! /usr/bin/env python\n\n')
        f.write('f = open(\'' + txt_file + '\', \'w\')\n')
        f.write('f.write(\'' + txt_contents + '\\n\')\n')
        f.write('f.flush()\nf.close()\n')
        f.flush()
        f.close()
        os.chmod(params.postanalysis, stat.S_IRWXU)

        blasttask.run()
        self.assertEqual(blasttask.get_status(),
                         D3RTask.COMPLETE_STATUS)
        self.assertEqual(blasttask.get_error(), None)
        complete_file = os.path.join(blasttask.get_dir(),
                                     D3RTask.COMPLETE_FILE)
        self.assertTrue(os.path.isfile(complete_file))
        std_err_file = os.path.join(blasttask.get_dir(),
                                    'echo.stderr')
        self.assertTrue(os.path.isfile(std_err_file))
        std_out_file = os.path.join(blasttask.get_dir(),
                                    'echo.stdout')

        dataimport = DataImportTask(temp_dir, params)
        makeblast = MakeBlastDBTask(temp_dir, params)
        f = open(std_out_file, 'r')
        echo_out = f.read().replace('\n', '')
        echo_out.index('--nonpolymertsv ' +
                       os.path.join(temp_dir,
                                    dataimport.get_dir_name(),
                                    DataImportTask.NONPOLYMER_TSV))
        echo_out.index(' --sequencetsv ' +
                       os.path.join(temp_dir,
                                    dataimport.get_dir_name(),
                                    DataImportTask.OLDSEQUENCE_TSV))
        echo_out.index(' --pdbblastdb ' +
                       os.path.join(temp_dir,
                                    makeblast.get_dir_name()))
        echo_out.index(' --compinchi ' +
                       os.path.join(temp_dir,
                                    dataimport.get_dir_name(),
                                    DataImportTask.COMPINCHI_ICH))
        echo_out.index(' --outdir ' +
                       os.path.join(temp_dir,
                                    blasttask.get_dir_name()))
        echo_out.index(' --crystalpH ' +
                       os.path.join(temp_dir,
                                    dataimport.get_dir_name(),
                                    DataImportTask.CRYSTALPH_TSV))
        echo_out.index(' --pdbdb /pdbdb ')
        f.close()

        self.assertTrue(os.path.isfile(std_out_file))
        self.assertEqual(blasttask.get_status(),
                         D3RTask.COMPLETE_STATUS)
        self.assertTrue(
            os.path.exists(os.path.join(blasttask.get_dir(),
                                        'foo.py.stderr')))
        self.assertTrue(
            os.path.exists(os.path.join(blasttask.get_dir(),
                                        'foo.py.stdout')))

        res = blasttask.get_email_log().rstrip('\n')
        res.index('/bin/echo')
        res.index('# txt files found: 0')
        res.index('Output from summary.txt')
        res.index(' sequences: 177')
        res.index(' complexes: 149')
        res.index(dataimport.get_sequence_tsv() +
                  ' file not found falling back to ' +
                  dataimport.get_oldsequence_tsv())
    finally:
        shutil.rmtree(temp_dir)
def test_can_run(self):
    tempDir = tempfile.mkdtemp()
    try:
        # try where makeblastdb is not complete
        params = D3RParameters()
        blastTask = BlastNFilterTask(tempDir, params)
        self.assertFalse(blastTask.can_run())

        # try where makeblastdb failed
        blastDb = MakeBlastDBTask(tempDir, params)
        blastDb.create_dir()
        errorFile = os.path.join(blastDb.get_path(),
                                 blastDb.get_dir_name(),
                                 D3RTask.ERROR_FILE)
        open(errorFile, 'a').close()
        self.assertFalse(blastTask.can_run())
        self.assertEqual(blastTask.get_error(),
                         'makeblastdb task has error status')

        # try where data import is not complete
        completeFile = os.path.join(blastDb.get_path(),
                                    blastDb.get_dir_name(),
                                    D3RTask.COMPLETE_FILE)
        open(completeFile, 'a').close()
        self.assertFalse(blastTask.can_run())
        self.assertEqual(blastTask.get_error(),
                         'dataimport task has notfound status')

        # try where data import failed
        dataImport = DataImportTask(tempDir, params)
        dataImport.create_dir()
        errorFile = os.path.join(dataImport.get_path(),
                                 dataImport.get_dir_name(),
                                 D3RTask.ERROR_FILE)
        open(errorFile, 'a').close()
        self.assertFalse(blastTask.can_run())
        self.assertEqual(blastTask.get_error(),
                         'dataimport task has error status')

        # try where blast can run
        os.remove(errorFile)
        completeFile = os.path.join(dataImport.get_dir(),
                                    D3RTask.COMPLETE_FILE)
        open(completeFile, 'a').close()
        self.assertTrue(blastTask.can_run())
        self.assertEqual(blastTask.get_error(), None)

        # try where blast exists
        blastTask.create_dir()
        self.assertFalse(blastTask.can_run())
        self.assertEqual(blastTask.get_error(),
                         blastTask.get_dir_name() +
                         ' already exists and status is unknown')

        # try where blast is complete
        completeFile = os.path.join(blastTask.get_path(),
                                    blastTask.get_dir_name(),
                                    D3RTask.COMPLETE_FILE)
        open(completeFile, 'a').close()
        self.assertFalse(blastTask.can_run())
        self.assertEqual(blastTask.get_error(), None)
    finally:
        shutil.rmtree(tempDir)
def main():
    p = D3RParameters()
    blasttask = BlastNFilterTask('', p)
    dataimport = DataImportTask('', p)
    challenge = ChallengeDataTask('', p)
    glide = GlideTask('', p)
    makedb = MakeBlastDBTask('', p)
    prot = ProteinLigPrepTask('', p)
    vina = AutoDockVinaTask('', p)
    chimeraprep = ChimeraProteinLigPrepTask('', p)
    desc = """
    Version {version}

    Runs the 10 stages (makedb, import, blast, challengedata,
    proteinligprep, {chimeraprep}, extsubmission, glide, vina, &
    evaluation) of the CELPP processing pipeline
    (http://www.drugdesigndata.org)

    The CELPP processing pipeline relies on a set of directories with a
    specific structure.  The pipeline runs a set of stages.  Each stage
    has a numerical value and a name.  The numerical value denotes order
    and the stage name identifies separate tasks to run in the stage.

    The filesystem structure of a stage is:

    stage.<stage number>.<task name>

    The stage(s) run are defined via the required --stage flag.  To run
    multiple stages serially just pass a comma delimited list to the
    --stage flag.  Example: --stage import,blast

    NOTE: When running multiple stages serially the program will not run
    subsequent stages if a task in a stage fails.  Also note order
    matters, ie putting blast,import will cause celpprunner.py to run
    the blast stage first.

    This program drops a pid lockfile (celpprunner.<stage>.lockpid) in
    celppdir to prevent duplicate invocation.

    When run, this program will examine the stage and see if work can be
    done.  If the stage is complete or previous steps have not completed,
    the program will exit silently.  If previous steps have failed or the
    current stage already exists in an error or incomplete state, then
    the program will report the error via email using addresses set in
    the --email flag.  Errors will also be reported via stderr/stdout.
    The program will also exit with a nonzero exit code.

    This program utilizes simple token files to denote stage completion.
    If within the stage directory there is a:

    '{complete}' file - then the stage is done and no other checking is
                        done.
    'error' file - then the stage failed.
    'start' file - then the stage is running.

    Notification of stage start and end will be sent to addresses set
    via the --email flag.

    Unless --customweekdir is set, this program will examine the
    'celppdir' (last argument passed on the commandline) to find the
    latest directory with this path:

    <year>/dataset.week.#

    The program will find the latest <year> and within that year the
    dataset.week.# with the highest #.  The output directories created
    will be put within this directory.  Setting --customweekdir will
    cause the program to use the 'celppdir' path directly.

    Setting the --createweekdir flag will instruct this program to
    create a new directory for the current celpp week/year before
    running any stage processing.  NOTE: CELPP weeks start on Friday and
    end on Thursday and the week # follows ISO8601 rules, so week
    numbers at the end and start of the year are a bit wonky.

    Breakdown of the behavior of the program by the value passed with
    the --stage flag:

    If --stage '{createchallenge}'

    This is NOT a stage, but has the same effect as calling
    --stage makedb,import,blast,challengedata
    which are the four stages that need to run to generate the challenge
    data package.

    If --stage 'makedb'

    In this stage the file {pdb_seqres} is downloaded from an ftp site
    set by --pdbsequrl.  This file is then gunzipped and NCBI
    makeblastdb (set by --makeblastdb) is run on it to create a blast
    database.  The files are stored in {makeblastdb_dirname}

    If --stage 'import'

    In this stage 4 files are downloaded from urls specified by the
    --compinchi and --pdbfileurl flags on the commandline into the
    {dataimport_dirname} directory.

    The tsv files are (--pdbfileurl flag sets the url to download these
    files from):

    {nonpolymer_tsv}
    {sequence_tsv}
    {crystal_tsv}

    The Components ich file is (--compinchi flag sets the base url to
    download this file from):

    {compinchi_ich}

    This stage will just wait and retry if any of the tsv files have NOT
    been updated since the start of the current celpp week, as
    determined by a HEAD request.  To bypass this delay add the
    --skipimportwait flag.  --importsleep denotes the time to wait
    before re-examining the update time of the tsv files and
    --importretry sets the number of times to retry before giving up.

    If --stage 'blast'

    Verifies {dataimport_dirname} exists and has a '{complete}' file.
    Also verifies {makeblastdb_dirname} exists and has a '{complete}'
    file.  If both conditions are met then the 'blast' stage is run,
    which invokes the script set by the --blastnfilter flag and stores
    output in {blast_dirname}.  Requires --pdbdb to be set to a
    directory with valid PDB database files.  Note: the --blastnfilter
    script is killed after the time set with the --blastnfiltertimeout
    flag.

    If --stage 'challengedata'

    Verifies {blast_dirname} exists and has a '{complete}' file.  If
    complete, this stage runs, invoking the program set in the
    --genchallenge flag to create a challenge dataset file.  The --pdbdb
    flag must also be set when calling this stage.  If --ftpconfig is
    set with the {challengepath} field then this stage will also upload
    the challenge dataset tarfile to the ftp server with the path set by
    {challengepath}.  The code will also upload a {latest_txt} file
    containing the name of the tarfile to the same destination,
    overwriting any {latest_txt} file that already exists.

    Example file for --ftpconfig:

    {host} some.ftp.com
    {user} bob
    {passn} mypass
    {path} /celpp
    {challengepath} /challenge
    {submissionpath} /submissions

    If --stage '{chimeraprep}'

    Verifies {challenge_dirname} exists and has a '{complete}' file.  If
    complete, this stage runs, invoking the program set in the
    --chimeraprep flag to prepare pdb and inchi files, storing output in
    {chimeraprep_dirname}.  The --pdbdb flag must also be set when
    calling this stage.

    If --stage 'proteinligprep'

    Verifies {challenge_dirname} exists and has a '{complete}' file.  If
    complete, this stage runs, invoking the program set in the
    --proteinligprep flag to prepare pdb and inchi files, storing output
    in {proteinligprep_dirname}.  The --pdbdb flag must also be set when
    calling this stage.

    If --stage 'extsubmission'

    Connects to the server specified by --ftpconfig and downloads
    external docking submissions from {submissionpath} on the remote
    server.  Submissions should be named:

    celpp_weekXX_YYYY_dockedresults_ZZZZ.tar.gz

    as documented here:
    https://github.com/drugdata/d3r/wiki/Proposed-challenge-docked-results-file-structure

    For each submission a directory named stage.X.ZZZZ.extsubmission
    will be created and the uncompressed contents of the package will be
    stored in that directory.  If the data does not conform properly an
    'error' file will be placed in the directory denoting failure.

    If --stage 'glide'

    Verifies {proteinligprep_dirname} exists and has a '{complete}' file
    within it.  If complete, this stage runs, invoking the program set
    in the --glide flag to perform docking via glide, storing output in
    {glide_dirname}

    If --stage 'vina'

    Verifies {proteinligprep_dirname} exists and has a '{complete}' file
    within it.  If complete, this stage runs, invoking the program set
    in the --vina flag to perform docking via AutoDock Vina, storing
    output in {vina_dirname}

    If --stage 'evaluation'

    Finds all stage.{dockstage}.<algo> directories with '{complete}'
    files in them which do not end in the name '{webdata}' and runs the
    script set via the --evaluation parameter, storing the result of the
    script into stage.{evalstage}.<algo>.evaluation.  The --pdbdb flag
    must also be set when calling this stage.
    """.format(makeblastdb_dirname=makedb.get_dir_name(),
               dataimport_dirname=dataimport.get_dir_name(),
               blast_dirname=blasttask.get_dir_name(),
               challenge_dirname=challenge.get_dir_name(),
               createchallenge=CREATE_CHALLENGE,
               proteinligprep_dirname=prot.get_dir_name(),
               glide_dirname=glide.get_dir_name(),
               vina_dirname=vina.get_dir_name(),
               dockstage=str(glide.get_stage()),
               evalstage=str(glide.get_stage() + 1),
               complete=blasttask.COMPLETE_FILE,
               chimeraprep_dirname=chimeraprep.get_dir_name(),
               chimeraprep=CHIMERA_PREP,
               compinchi_ich=DataImportTask.COMPINCHI_ICH,
               pdb_seqres=MakeBlastDBTask.PDB_SEQRES_TXT_GZ,
               nonpolymer_tsv=DataImportTask.NONPOLYMER_TSV,
               sequence_tsv=DataImportTask.SEQUENCE_TSV,
               crystal_tsv=DataImportTask.CRYSTALPH_TSV,
               webdata=EvaluationTaskFactory.WEB_DATA_SUFFIX,
               latest_txt=ChallengeDataTask.LATEST_TXT,
               host=FtpFileTransfer.HOST,
               user=FtpFileTransfer.USER,
               passn=FtpFileTransfer.PASS,
               path=FtpFileTransfer.PATH,
               challengepath=FtpFileTransfer.CHALLENGEPATH,
               submissionpath=FtpFileTransfer.SUBMISSIONPATH,
               version=d3r.__version__)
    theargs = _parse_arguments(desc, sys.argv[1:])
    theargs.program = sys.argv[0]
    theargs.version = d3r.__version__
    util.setup_logging(theargs)
    try:
        run_stages(theargs)
    except Exception:
        logger.exception("Error caught exception")
        sys.exit(2)
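# Illustrative invocation (editor's sketch; every path, url, and address
# below is a placeholder, not from the source, and stages may need extra
# flags such as --blastnfiltertimeout).  This runs the four stages that
# build the challenge data package, creating the week directory first:
#
#   celpprunner.py --createweekdir \
#       --stage makedb,import,blast,challengedata \
#       --pdbsequrl <url of pdb_seqres file> \
#       --makeblastdb /path/to/makeblastdb \
#       --pdbfileurl <url serving the tsv files> \
#       --compinchi <base url of the Components inchi file> \
#       --blastnfilter /path/to/blastnfilter.py \
#       --genchallenge /path/to/genchallenge/script \
#       --pdbdb /path/to/pdb \
#       --email admin@example.com \
#       /celppdir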
def run(self):
    """Runs blastnfilter task after verifying dataimport was good

       This method requires can_run() to be invoked beforehand with a
       successful outcome; if the _can_run flag is False this method
       returns without doing any work.  Otherwise D3RTask.start is
       invoked, the blastnfilter script and the postanalysis script are
       run, the results are analyzed, success or error status is set
       appropriately, and D3RTask.end is invoked
    """
    super(BlastNFilterTask, self).run()

    if self._can_run is False:
        logger.debug(self.get_dir_name() +
                     ' cannot run cause _can_run flag is False')
        return

    data_import = DataImportTask(self._path, self._args)
    make_blastdb = MakeBlastDBTask(self._path, self._args)

    try:
        loglevel = self.get_args().loglevel
    except AttributeError:
        logger.debug('No log level set in arguments using WARNING')
        loglevel = 'WARNING'

    # verify sequence.tsv file exists on filesystem.
    # if not fall back to oldsequence.tsv file
    sequencetsv = data_import.get_sequence_tsv()
    if not os.path.isfile(sequencetsv):
        logger.warning(sequencetsv + ' file not found. falling '
                       'back to old file')
        self.append_to_email_log('\n ' + sequencetsv +
                                 ' file not found falling back to ' +
                                 data_import.get_oldsequence_tsv() +
                                 '\n')
        sequencetsv = data_import.get_oldsequence_tsv()

    cmd_to_run = (self.get_args().blastnfilter + ' --nonpolymertsv ' +
                  data_import.get_nonpolymer_tsv() +
                  ' --sequencetsv ' + sequencetsv +
                  ' --pdbblastdb ' + make_blastdb.get_dir() +
                  ' --compinchi ' +
                  data_import.get_components_inchi_file() +
                  ' --crystalpH ' + data_import.get_crystalph_tsv() +
                  ' --pdbdb ' + self.get_args().pdbdb +
                  ' --log ' + loglevel +
                  ' --outdir ' + self.get_dir())

    blastnfilter_name = os.path.basename(self.get_args().blastnfilter)
    self.run_external_command(blastnfilter_name, cmd_to_run, False)

    self.set_status(D3RTask.COMPLETE_STATUS)

    cmd_to_run = (self.get_args().postanalysis + ' --compinchi ' +
                  data_import.get_components_inchi_file() + ' ' +
                  self.get_dir())

    postanalysis_name = os.path.basename(self.get_args().postanalysis)
    self.run_external_command(postanalysis_name, cmd_to_run, False)

    try:
        # examine output to get candidate hit count DR-12
        hit_stats = self._parse_blastnfilter_output_for_hit_stats()
        if hit_stats is not None:
            self.append_to_email_log(hit_stats)
    except Exception:
        logger.exception("Error caught exception")

    # assess the result
    self.end()
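# Shape of the command assembled above (editor's sketch; <week> and the
# <...> file names stand in for real values, and the stage directory names
# assume the stage.<stage number>.<task name> convention used by get_dir()):
#
#   <blastnfilter> --nonpolymertsv <week>/stage.2.dataimport/<NONPOLYMER_TSV>
#       --sequencetsv <week>/stage.2.dataimport/<SEQUENCE_TSV or OLDSEQUENCE_TSV>
#       --pdbblastdb <week>/stage.1.makeblastdb
#       --compinchi <week>/stage.2.dataimport/<COMPINCHI_ICH>
#       --crystalpH <week>/stage.2.dataimport/<CRYSTALPH_TSV>
#       --pdbdb <pdbdb> --log <loglevel> --outdir <this task's directory>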
def get_task_list_for_stage(theargs, stage_name):
    """Factory method that generates a list of tasks for given stage

       Using stage_name get the list of tasks that need to be run.

       :param theargs: parameters set via commandline along with
                       ``theargs.latest_weekly`` which should be set to
                       the base directory where stages will be run
       :param stage_name: Name of stage to run
    """
    if stage_name is None:
        raise NotImplementedError('stage_name is None')

    task_list = []
    logger.debug('Getting task list for ' + stage_name)

    if stage_name == CREATE_CHALLENGE:
        task_list.append(MakeBlastDBTask(theargs.latest_weekly, theargs))
        task_list.append(DataImportTask(theargs.latest_weekly, theargs))
        task_list.append(BlastNFilterTask(theargs.latest_weekly, theargs))
        task_list.append(ChallengeDataTask(theargs.latest_weekly,
                                           theargs))

    if stage_name == 'makedb':
        task_list.append(MakeBlastDBTask(theargs.latest_weekly, theargs))

    if stage_name == 'import':
        task_list.append(DataImportTask(theargs.latest_weekly, theargs))

    if stage_name == 'blast':
        task_list.append(BlastNFilterTask(theargs.latest_weekly, theargs))

    if stage_name == 'challengedata':
        task_list.append(ChallengeDataTask(theargs.latest_weekly,
                                           theargs))

    if stage_name == 'proteinligprep':
        task_list.append(ProteinLigPrepTask(theargs.latest_weekly,
                                            theargs))

    if stage_name == 'glide':
        task_list.append(GlideTask(theargs.latest_weekly, theargs))

    if stage_name == 'vina':
        task_list.append(AutoDockVinaTask(theargs.latest_weekly, theargs))

    if stage_name == CHIMERA_PREP:
        task_list.append(
            ChimeraProteinLigPrepTask(theargs.latest_weekly, theargs))

    if stage_name == 'extsubmission':
        extfac = ExternalDataSubmissionFactory(theargs.latest_weekly,
                                               theargs)
        task_list.extend(extfac.get_external_data_submissions())

    if stage_name == 'evaluation':
        # use the factory to get all evaluation tasks and
        # append them to the task_list
        eval_task_factory = EvaluationTaskFactory(theargs.latest_weekly,
                                                  theargs)
        task_list.extend(eval_task_factory.get_evaluation_tasks())

    if len(task_list) == 0:
        raise NotImplementedError('uh oh no tasks for ' + stage_name +
                                  ' stage')
    return task_list
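# Minimal usage sketch (illustrative; the week directory value is made up
# and some stages, such as 'evaluation', need additional commandline
# parameters on theargs):
#
#     theargs = D3RParameters()
#     theargs.latest_weekly = '/celppdir/2016/dataset.week.40'
#     for task in get_task_list_for_stage(theargs, 'blast'):
#         if task.can_run():
#             task.run()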