Пример #1
0
def recreate_project_from_db(analysis_top_dir, project_name, project_id):
    """Rebuild a project object tree from the records stored in Charon.

    Queries Charon for the samples of ``project_id``, then for the libpreps
    of every sample and the seqruns of every libprep, adding one object to
    the project for each record returned. Every sample/libprep/seqrun
    object gets a ``status`` attribute copied from its record ("unknown"
    when the record has no "status" key).

    :param str analysis_top_dir: Used as the ``base_path`` of the project
    :param str project_name: Used as both name and dirname of the project
    :param str project_id: The Charon project id to query

    :returns: The populated project object
    :rtype: NGIProject

    :raises RuntimeError: If Charon raises a CharonError for any query
    """
    project_obj = NGIProject(name=project_name,
                             dirname=project_name,
                             project_id=project_id,
                             base_path=analysis_top_dir)
    charon_session = CharonSession()
    try:
        samples = charon_session.project_get_samples(project_id)["samples"]
    except CharonError as e:
        raise RuntimeError("Could not access samples for project {}: {}".format(project_id, e))
    for sample in samples:
        sample_id = sample.get("sampleid")
        sample_obj = project_obj.add_sample(name=sample_id, dirname=sample_id)
        sample_obj.status = sample.get("status", "unknown")
        try:
            libpreps = charon_session.sample_get_libpreps(project_id, sample_id)["libpreps"]
        except CharonError as e:
            raise RuntimeError("Could not access libpreps for project {} / sample {}: {}".format(project_id,sample_id, e))
        for libprep in libpreps:
            libprep_id = libprep.get("libprepid")
            libprep_obj = sample_obj.add_libprep(name=libprep_id, dirname=libprep_id)
            libprep_obj.status = libprep.get("status", "unknown")
            try:
                seqruns = charon_session.libprep_get_seqruns(project_id, sample_id, libprep_id)["seqruns"]
            except CharonError as e:
                raise RuntimeError("Could not access seqruns for project {} / sample {} / "
                                   "libprep {}: {}".format(project_id, sample_id, libprep_id, e))
            for seqrun in seqruns:
                # e.g. 140528_D00415_0049_BC423WACXX
                seqrun_id = seqrun.get("seqrunid")
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_id, dirname=seqrun_id)
                seqrun_obj.status = seqrun.get("status", "unknown")
    return project_obj
Пример #2
0
def recreate_project_from_db(analysis_top_dir, project_name, project_id):
    """Rebuild a project object tree from the records stored in Charon.

    Queries Charon for the samples of ``project_id``, then for the libpreps
    of every sample and the seqruns of every libprep, adding one object to
    the project for each record returned. Every sample/libprep/seqrun
    object gets a ``status`` attribute copied from its record ("unknown"
    when the record has no "status" key).

    :param str analysis_top_dir: Used as the ``base_path`` of the project
    :param str project_name: Used as both name and dirname of the project
    :param str project_id: The Charon project id to query

    :returns: The populated project object
    :rtype: NGIProject

    :raises RuntimeError: If Charon raises a CharonError for any query
    """
    project_obj = NGIProject(name=project_name,
                             dirname=project_name,
                             project_id=project_id,
                             base_path=analysis_top_dir)
    charon_session = CharonSession()
    try:
        samples = charon_session.project_get_samples(project_id)["samples"]
    except CharonError as e:
        raise RuntimeError("Could not access samples for project {}: {}".format(project_id, e))
    for sample in samples:
        sample_id = sample.get("sampleid")
        sample_obj = project_obj.add_sample(name=sample_id, dirname=sample_id)
        sample_obj.status = sample.get("status", "unknown")
        try:
            libpreps = charon_session.sample_get_libpreps(project_id, sample_id)["libpreps"]
        except CharonError as e:
            raise RuntimeError("Could not access libpreps for project {} / sample {}: {}".format(project_id,sample_id, e))
        for libprep in libpreps:
            libprep_id = libprep.get("libprepid")
            libprep_obj = sample_obj.add_libprep(name=libprep_id, dirname=libprep_id)
            libprep_obj.status = libprep.get("status", "unknown")
            try:
                seqruns = charon_session.libprep_get_seqruns(project_id, sample_id, libprep_id)["seqruns"]
            except CharonError as e:
                raise RuntimeError("Could not access seqruns for project {} / sample {} / "
                                   "libprep {}: {}".format(project_id, sample_id, libprep_id, e))
            for seqrun in seqruns:
                # e.g. 140528_D00415_0049_BC423WACXX
                seqrun_id = seqrun.get("seqrunid")
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_id, dirname=seqrun_id)
                seqrun_obj.status = seqrun.get("status", "unknown")
    return project_obj
Пример #3
0
    def setUpClass(self):
        """Create a scratch directory and the project/sample fixtures shared by the tests."""
        # The temporary directory doubles as the project base path.
        scratch_dir = tempfile.mkdtemp()
        self.tmp_dir = scratch_dir
        self.project_base_path = scratch_dir

        # Identifiers used throughout the tests.
        self.workflow_subtask = 'subtask'
        self.project_name = 'S.One_20_12'
        self.project_id = 'P123'
        self.sample_id = 'P123_1001'
        self.libprep_id = 'A'
        self.seqrun_id = 'seqrun'

        # The project id is used as the project's directory name as well.
        self.project_obj = NGIProject(name=self.project_name,
                                      dirname=self.project_id,
                                      project_id=self.project_id,
                                      base_path=self.project_base_path)
        self.sample_obj = self.project_obj.add_sample(name=self.sample_id,
                                                      dirname=self.sample_id)
Пример #4
0
 def setUpClass(cls):
     """Prepare the shared project fixtures and a mock that stands in for Charon."""
     # Identifiers
     cls.proj_name = "Y.Mom_15_01"
     cls.proj_id = "P1155"
     cls.libprep_id = "P1155_prepA"
     cls.seqrun_id = "P1155_seqrunA"
     cls.sample_name = "P1155_101"
     cls.engine_name = "piper_ngi"

     # Scratch base path and the file paths derived from it
     base_path = tempfile.mkdtemp()
     cls.proj_basepath = base_path
     cls.workflow_name = "merge_process_variantcall"
     cls.xml_path = os.path.join(base_path, "some_config.xml")
     cls.exit_file = os.path.join(base_path, "some_file.exit")

     config_path = locate_ngi_config()
     cls.config = load_yaml_config(config_path)

     # Project with a single sample
     project = NGIProject(name=cls.proj_name,
                          dirname=cls.proj_name,
                          project_id=cls.proj_id,
                          base_path=base_path)
     cls.project_obj = project
     cls.sample_obj = project.add_sample(name=cls.sample_name,
                                         dirname=cls.sample_name)

     # Canned replies for a mock that can replace calls to charon
     libpreps_reply = {'libpreps': [{'qc': 'PASSED',
                                     'libprepid': cls.libprep_id}]}
     seqruns_reply = {'seqruns': [{'seqrunid': cls.seqrun_id}]}
     seqrun_reply = {'alignment_status': ''}
     project_reply = {'sequencing_facility': 'Unknown'}

     charon = mock.Mock()
     charon.sample_get_libpreps = mock.Mock(return_value=libpreps_reply)
     charon.libprep_get_seqruns = mock.Mock(return_value=seqruns_reply)
     charon.seqrun_get = mock.Mock(return_value=seqrun_reply)
     charon.project_get = mock.Mock(return_value=project_reply)
     cls.charon_mock = charon
Пример #5
0
def test_launch_analysis(mock_analyze, mock_update, mock_get_engine):
    """Launching one project should result in exactly one call to mock_analyze."""
    engine_info = dict(best_practice_analysis='wgs_germline', status='OPEN')
    mock_get_engine.return_value = engine_info

    projects = [NGIProject(name='S.One_20_01',
                           dirname='dir_P123',
                           project_id='P123',
                           base_path='/some/path')]
    launch_analysis(projects)

    mock_analyze.assert_called_once()
Пример #6
0
    def test_create_charon_entries_from_project(self):
        """Create Charon entries for a minimal project tree, then delete the project."""
        # Build project -> sample -> libprep -> seqrun in one chain; only the
        # project object is needed afterwards.
        project_obj = NGIProject(name=self.p_name,
                                 dirname=self.p_name,
                                 project_id=self.p_id,
                                 base_path=self.p_bp)
        (project_obj
         .add_sample(name=self.s_id, dirname=self.s_id)
         .add_libprep(name=self.l_id, dirname=self.l_id)
         .add_seqrun(name=self.sr_id, dirname=self.sr_id))

        try:
            # Create them in the db
            create_charon_entries_from_project(project_obj)
        finally:
            # Always remove the project again, whether or not creation worked.
            CharonSession().project_delete(project_obj.project_id)
Пример #7
0
def create_project_obj_from_analysis_log(project_name, project_id,
                                         project_base_path, sample_id, workflow):
    """Using the log of seqruns used for a sample analysis, recreate a project
    object with relevant sample, libprep, and seqrun objects.

    The log is read from
    ``<project_base_path>/ANALYSIS/<project_id>/piper_ngi/logs/<project_id>-<sample_id>-<workflow>.files``
    and is indexed as ``log[project_id][sample_id]``, a mapping of libprep
    name to a mapping keyed by seqrun name.

    :param str project_name: Name given to the recreated project
    :param str project_id: Project id; also used as the project dirname
    :param str project_base_path: Base path containing the ANALYSIS directory
    :param str sample_id: Sample whose analysis log should be read
    :param str workflow: Workflow name used in the log file name

    :returns: A project holding the one sample with its libpreps and seqruns
    :rtype: NGIProject

    :raises IOError: If the log file cannot be opened
    :raises KeyError: If the log has no entry for the project/sample
    """
    analysis_log_filename = "{}-{}-{}.files".format(project_id, sample_id, workflow)
    analysis_log_path = os.path.join(project_base_path, "ANALYSIS",
                                     project_id, "piper_ngi", "logs", analysis_log_filename)
    with open(analysis_log_path, 'r') as f:
        # safe_load: a bare yaml.load(f) can construct arbitrary objects and
        # is rejected (missing Loader argument) by PyYAML >= 6.
        analysis_dict = yaml.safe_load(f)
    project_obj = NGIProject(name=project_name, dirname=project_id,
                             project_id=project_id, base_path=project_base_path)
    sample_obj = project_obj.add_sample(sample_id, sample_id)
    for libprep_name, seqrun_dict in analysis_dict[project_id][sample_id].items():
        libprep_obj = sample_obj.add_libprep(libprep_name, libprep_name)
        for seqrun_name in seqrun_dict:
            libprep_obj.add_seqrun(seqrun_name, seqrun_name)
    return project_obj
Пример #8
0
    def test_get_engine_for_bp(self, mock_load, mock_charon):
        """The engine is loaded for the best-practice analysis reported by the mock."""
        best_practice = 'some_BP'
        expected_engine = 'some_engine'
        conf = {'dummy': 'conf'}
        mock_charon.return_value = {'best_practice_analysis': best_practice}
        mock_load.return_value = expected_engine

        project = NGIProject(name='S.One_20_01',
                             dirname='dir_P123',
                             project_id='P123',
                             base_path='/some/path')
        got_engine = get_engine_for_bp(project, config=conf)

        mock_load.assert_called_once_with(best_practice, conf)
        self.assertEqual(got_engine, expected_engine)
Пример #9
0
    def test_create_charon_entries_from_project(self):
        """Create Charon entries for a minimal project tree, then delete the project."""
        # Create the NGIObjects; the sample/libprep/seqrun objects are only
        # needed as part of the project tree, so they are not kept.
        project_obj = NGIProject(name=self.p_name,
                                 dirname=self.p_name,
                                 project_id=self.p_id,
                                 base_path=self.p_bp)
        sample = project_obj.add_sample(name=self.s_id, dirname=self.s_id)
        libprep = sample.add_libprep(name=self.l_id, dirname=self.l_id)
        libprep.add_seqrun(name=self.sr_id, dirname=self.sr_id)

        try:
            # Create them in the db
            create_charon_entries_from_project(project_obj)
        finally:
            # Clean up the project regardless of the outcome above.
            session = CharonSession()
            session.project_delete(project_obj.project_id)
Пример #10
0
    def setUp(self):
        """Build a project with one sample, one libprep and one seqrun for each test."""
        # Details
        project_id, project_name = 'P100001', 'S.One_20_02'
        sample_id, libprep_id = 'P100001_101', 'A'
        seqrun_id = '201030_A00187_0332_AHFCFLDSXX'

        self.project_id = project_id
        self.project_name = project_name
        self.project_path = '/some/path'
        self.sample_id = sample_id
        self.libprep_id = libprep_id
        self.seqrun_id = seqrun_id

        # Objects, created top-down so each level hangs off its parent
        project = NGIProject(name=project_name,
                             dirname=project_name,
                             project_id=project_id,
                             base_path=self.project_path)
        sample = project.add_sample(name=sample_id, dirname=sample_id)
        libprep = sample.add_libprep(name=libprep_id, dirname=libprep_id)
        seqrun = libprep.add_seqrun(name=seqrun_id, dirname=seqrun_id)

        self.project_obj = project
        self.sample_obj = sample
        self.libprep_obj = libprep
        self.seqrun_obj = seqrun
Пример #11
0
    def _project_from_fastq_file_paths(fastq_file_paths):
        """
        Rebuild a project object from a list of fastq file paths.

        :param fastq_file_paths: list of fastq file paths, expected to be arranged in subfolders according to
        [/]path/to/project name/sample name/libprep name/seqrun name/fastq_file_name.fastq.gz

        :return: a ngi_pipeline.conductor.classes.NGIProject object recreated from the directory tree and fastq
        files, or None if the list is empty
        """
        project_obj = None
        for fastq_file_path in fastq_file_paths:
            # Peel the five trailing components off the path, innermost first:
            # fastq file, seqrun, libprep, sample, project.
            remainder = fastq_file_path
            components = []
            for _ in range(5):
                remainder, tail = os.path.split(remainder)
                components.append(tail)
            (fastq_file_name, seqrun_name, libprep_name,
             sample_name, project_name) = components
            # What is left is the directory holding the project directory;
            # the base path is one level above that.
            project_base_path = os.path.dirname(remainder)

            # The project is created from the first path and reused afterwards;
            # its name serves as name, dirname and project id.
            project_obj = project_obj or NGIProject(
                project_name, project_name, project_name, project_base_path)
            seqrun_obj = (project_obj
                          .add_sample(sample_name, sample_name)
                          .add_libprep(libprep_name, libprep_name)
                          .add_seqrun(seqrun_name, seqrun_name))
            seqrun_obj.add_fastq_files(fastq_file_name)
        return project_obj
Пример #12
0
def recreate_project_from_filesystem(project_dir,
                                     restrict_to_samples=None,
                                     restrict_to_libpreps=None,
                                     restrict_to_seqruns=None,
                                     force_create_project=False,
                                     config=None, config_file_path=None):
    """Recreates the full project/sample/libprep/seqrun set of
    NGIObjects using the directory tree structure.

    The project id is the basename of the real (symlink-resolved) project
    directory. The project name is the basename of a symlink pointing at
    that directory -- either ``project_dir`` itself or a sibling entry in
    its parent directory -- and falls back to the project id when there is
    no such symlink.

    :param str project_dir: Project directory (or a symlink to it)
    :param list restrict_to_samples: If non-empty, only these sample names are included
    :param list restrict_to_libpreps: If non-empty, only these libprep names are included
    :param list restrict_to_seqruns: If non-empty, only these seqrun names are included
    :param bool force_create_project: Not used in this function body
    :param dict config: Must provide ``config["analysis"]["top_dir"]``
        (presumably injected by the caller or a decorator -- TODO confirm)
    :param str config_file_path: Not used in this function body

    :returns: The populated project object
    :rtype: NGIProject
    """
    # NOTE(review): neither name is referenced below; the imports are kept
    # unchanged in case something relies on them being performed here.
    from ngi_pipeline.database.classes import CharonError
    from ngi_pipeline.database.communicate import get_project_id_from_name

    if not restrict_to_samples: restrict_to_samples = []
    if not restrict_to_libpreps: restrict_to_libpreps = []
    if not restrict_to_seqruns: restrict_to_seqruns = []

    if os.path.islink(os.path.abspath(project_dir)):
        real_project_dir = os.path.realpath(project_dir)
        syml_project_dir = os.path.abspath(project_dir)
    else:
        real_project_dir = os.path.abspath(project_dir)
        # Look among the siblings for a symlink that resolves to this directory
        search_dir = os.path.join(os.path.dirname(project_dir), "*")
        sym_files = [path for path in glob.glob(search_dir) if os.path.islink(path)]
        for sym_file in sym_files:
            if os.path.realpath(sym_file) == os.path.realpath(real_project_dir):
                syml_project_dir = os.path.abspath(sym_file)
                break
        else:
            syml_project_dir = None
    project_id = os.path.split(real_project_dir)[1]
    if syml_project_dir:
        project_name = os.path.split(syml_project_dir)[1]
    else: # project name is the same as project id (Uppsala perhaps)
        project_name = project_id
    LOG.info('Setting up project "{}"'.format(project_id))
    project_obj = NGIProject(name=project_name,
                             dirname=project_id,
                             project_id=project_id,
                             base_path=config["analysis"]["top_dir"])
    # The directory listings are built as real lists: on Python 3 filter()
    # returns an iterator that is always truthy, which would silence the
    # "nothing found" warnings below.
    samples_pattern = os.path.join(real_project_dir, "*")
    samples = [path for path in glob.glob(samples_pattern) if os.path.isdir(path)]
    if not samples:
        LOG.warn('No samples found for project "{}"'.format(project_obj))
    for sample_dir in samples:
        sample_name = os.path.basename(sample_dir)
        if restrict_to_samples and sample_name not in restrict_to_samples:
            LOG.debug('Skipping sample "{}": not in specified samples "{}"'.format(sample_name, ', '.join(restrict_to_samples)))
            continue
        LOG.info('Setting up sample "{}"'.format(sample_name))
        sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)

        libpreps_pattern = os.path.join(sample_dir, "*")
        libpreps = [path for path in glob.glob(libpreps_pattern) if os.path.isdir(path)]
        if not libpreps:
            LOG.warn('No libpreps found for sample "{}"'.format(sample_obj))
        for libprep_dir in libpreps:
            libprep_name = os.path.basename(libprep_dir)
            if restrict_to_libpreps and libprep_name not in restrict_to_libpreps:
                LOG.debug('Skipping libprep "{}": not in specified libpreps "{}"'.format(libprep_name, ', '.join(restrict_to_libpreps)))
                continue
            LOG.info('Setting up libprep "{}"'.format(libprep_name))
            libprep_obj = sample_obj.add_libprep(name=libprep_name,
                                                 dirname=libprep_name)

            # Only directories whose name has four underscore-separated
            # parts are treated as seqruns.
            seqruns_pattern = os.path.join(libprep_dir, "*_*_*_*")
            seqruns = [path for path in glob.glob(seqruns_pattern) if os.path.isdir(path)]
            if not seqruns:
                LOG.warn('No seqruns found for libprep "{}"'.format(libprep_obj))
            for seqrun_dir in seqruns:
                seqrun_name = os.path.basename(seqrun_dir)
                if restrict_to_seqruns and seqrun_name not in restrict_to_seqruns:
                    LOG.debug('Skipping seqrun "{}": not in specified seqruns "{}"'.format(seqrun_name, ', '.join(restrict_to_seqruns)))
                    continue
                LOG.info('Setting up seqrun "{}"'.format(seqrun_name))
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_name,
                                                    dirname=seqrun_name)
                for fq_file in fastq_files_under_dir(seqrun_dir):
                    fq_name = os.path.basename(fq_file)
                    LOG.info('Adding fastq file "{}" to seqrun "{}"'.format(fq_name, seqrun_obj))
                    seqrun_obj.add_fastq_files([fq_name])
    return project_obj
Пример #13
0
 def test_ngi_proj(self):
     """Two projects built from the same arguments compare equal."""
     init_args = ('S.One_20_01', 'dir_P123', 'P123', '/some/path')
     first = NGIProject(*init_args)
     second = NGIProject(*init_args)
     self.assertEqual(first, second)
Пример #14
0
def collect_files_for_sample_analysis(project_obj,
                                      sample_obj,
                                      restart_finished_jobs=False):
    """Gather the fastq, alignment and alignment-qc files for one sample.

    A fresh project object containing only this sample is built; the
    objects passed in are left untouched. Fastq files found under the
    sample's DATA directory are added to it when their libprep/seqrun pair
    is among those returned by get_valid_seqruns_for_sample. Alignment and
    qc files are then located by file-name pattern under the project's
    ANALYSIS directory.

    :param project_obj: The project the sample belongs to
    :param sample_obj: The sample to collect files for
    :param bool restart_finished_jobs: Passed on as ``include_done_seqruns``

    :returns: (new project object, alignment file paths, qc file paths)
    :rtype: tuple
    """
    ### FASTQ
    # Ask Charon which libpreps/seqruns are valid for this sample, i.e.
    # libpreps for which               'qc' != "FAILED"
    # and seqruns  for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs)
    if not valid_libprep_seqruns:
        LOG.error("Notify user or whatever. I don't know.")

    # Find the fastq files present on disk; they are checked against the
    # valid libprep/seqrun combinations below.
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory,
                                                      realpath=False)
    if not fastq_files_on_filesystem:
        LOG.error("TODO raise an error or something")

    # Work on a new project/sample pair (the old one could still be in use
    # elsewhere).
    new_project = NGIProject(project_obj.name, project_obj.dirname,
                             project_obj.project_id, project_obj.base_path)
    new_sample = new_project.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        parent_dir, fastq = os.path.split(fastq_path)
        if not fastq:
            # Handles trailing slash
            parent_dir, fastq = os.path.split(parent_dir)
        parent_dir, fs_seqrun_name = os.path.split(parent_dir)
        fs_libprep_name = os.path.basename(parent_dir)
        # Keep the file only if both its libprep and its seqrun are valid.
        if (fs_libprep_name in valid_libprep_seqruns.keys()
                and fs_seqrun_name in valid_libprep_seqruns.get(fs_libprep_name, [])):
            libprep_obj = new_sample.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
            seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                                dirname=fs_seqrun_name)
            seqrun_obj.add_fastq_files(fastq)

    ### BAM / ALIGNMENT QC
    # Collect existing alignment (bam) and alignment-qc files for the sample
    # so that they can be included in the new analysis.
    project_analysis_dir = os.path.join(project_obj.base_path, "ANALYSIS",
                                        project_obj.dirname)
    sample_analysis_file_pattern = "{sample_name}.*.{sample_name}.*".format(
        sample_name=new_sample.name)
    aln_pattern = os.path.join(project_analysis_dir, "01_raw_alignments",
                               sample_analysis_file_pattern)
    alnqc_pattern = os.path.join(project_analysis_dir,
                                 "02_preliminary_alignment_qc",
                                 sample_analysis_file_pattern)
    aln_files_to_copy = glob.glob(aln_pattern)
    qc_files_to_copy = glob.glob(alnqc_pattern)

    return (new_project, aln_files_to_copy, qc_files_to_copy)
Пример #15
0
class TestPiperUtils(unittest.TestCase):
    """Tests for the helpers in ngi_pipeline.engines.piper_ngi.utils."""

    @classmethod
    def setUpClass(cls):
        """Create a scratch directory and the project/sample fixtures shared by all tests."""
        scratch_dir = tempfile.mkdtemp()
        cls.tmp_dir = scratch_dir
        cls.project_base_path = scratch_dir

        cls.workflow_subtask = 'subtask'
        cls.project_name = 'S.One_20_12'
        cls.project_id = 'P123'
        cls.sample_id = 'P123_1001'
        cls.libprep_id = 'A'
        cls.seqrun_id = 'seqrun'

        # The project id is used as the project's directory name as well.
        cls.project_obj = NGIProject(name=cls.project_name,
                                     dirname=cls.project_id,
                                     project_id=cls.project_id,
                                     base_path=cls.project_base_path)
        cls.sample_obj = cls.project_obj.add_sample(name=cls.sample_id,
                                                    dirname=cls.sample_id)

    @classmethod
    def tearDownClass(cls):
        """Remove the scratch directory and everything the tests created in it."""
        shutil.rmtree(cls.tmp_dir)

    def test_find_previous_genotype_analyses(self):
        """The result is falsy with only the .gtc file present and truthy once the .done marker exists."""
        genotype_dir = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                    'piper_ngi', '01_genotype_concordance')
        os.makedirs(genotype_dir)
        with open(os.path.join(genotype_dir, 'P123_1001.gtc'), 'w'):
            pass

        found_before_marker = utils.find_previous_genotype_analyses(
            self.project_obj, self.sample_obj)
        self.assertFalse(found_before_marker)

        with open(os.path.join(genotype_dir, '.P123_1001.gtc.done'), 'w'):
            pass

        found_after_marker = utils.find_previous_genotype_analyses(
            self.project_obj, self.sample_obj)
        self.assertTrue(found_after_marker)
        # Remove dir or it will interfere with other tests
        shutil.rmtree(genotype_dir)

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.os.remove')
    def test_remove_previous_genotype_analyses(self, mock_remove):
        """The .gtc file in the genotype directory is passed to os.remove exactly once."""
        genotype_dir = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                    'piper_ngi', '02_genotype_concordance')
        os.makedirs(genotype_dir)
        gtc_path = os.path.join(genotype_dir, 'P123-1001.gtc')
        with open(gtc_path, 'w'):
            pass

        utils.remove_previous_genotype_analyses(self.project_obj)

        mock_remove.assert_called_once_with(gtc_path)

    @mock.patch(
        'ngi_pipeline.engines.piper_ngi.utils.find_previous_sample_analyses')
    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.os.remove')
    def test_remove_previous_sample_analyses(self, mock_remove, mock_find):
        """The file reported by the patched finder is passed to os.remove exactly once."""
        stale_file = os.path.join(self.tmp_dir, 'a_file')
        with open(stale_file, 'w'):
            pass
        mock_find.return_value = [stale_file]

        utils.remove_previous_sample_analyses(self.project_obj)

        mock_remove.assert_called_once_with(stale_file)

    def test_find_previous_sample_analyses(self):
        """A sample output file placed in the analysis tree is returned as the only result."""
        files_dir = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                 'piper_ngi', '01_files')
        os.makedirs(files_dir)
        out_path = os.path.join(files_dir, 'P123_1001.out')
        with open(out_path, 'w'):
            pass

        found = utils.find_previous_sample_analyses(self.project_obj)

        self.assertEqual(found, [out_path])

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.datetime')
    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.shutil.move')
    def test_rotate_previous_analysis(self, mock_move, mock_datetime):
        """The alignment file is moved into a previous_analyses directory named after the (mocked) timestamp."""
        now_mock = mock_datetime.datetime.now()
        now_mock.strftime.return_value = '2020-11-13_09:30:12:640314'
        alignments_dir = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                      'piper_ngi', '03_raw_alignments')
        os.makedirs(alignments_dir)
        bam_path = os.path.join(alignments_dir, 'P123-1001.bam')
        with open(bam_path, 'w'):
            pass

        utils.rotate_previous_analysis(self.project_obj)

        expected_target = '{}/ANALYSIS/P123/piper_ngi/previous_analyses/2020-11-13_09:30:12:640314/03_raw_alignments'.format(
            self.tmp_dir)
        mock_move.assert_called_once_with(bam_path, expected_target)

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.CharonSession')
    def test_get_finished_seqruns_for_sample(self, mock_charon):
        """A seqrun whose alignment_status is DONE is reported under its libprep."""
        session = mock_charon.return_value
        session.sample_get_libpreps.return_value = {
            'libpreps': [{'qc': 'PASS', 'libprepid': 'A'}]
        }
        session.libprep_get_seqruns.return_value = {
            'seqruns': [{'seqrunid': 'B'}]
        }
        session.seqrun_get.return_value = {'alignment_status': 'DONE'}

        finished = utils.get_finished_seqruns_for_sample(self.project_id,
                                                         self.sample_id)

        self.assertEqual(finished, {'A': ['B']})

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.CharonSession')
    def test_get_valid_seqruns_for_sample(self, mock_charon):
        """A seqrun belonging to a libprep with qc PASS is reported under that libprep."""
        session = mock_charon.return_value
        session.sample_get_libpreps.return_value = {
            'libpreps': [{'qc': 'PASS', 'libprepid': 'A'}]
        }
        session.libprep_get_seqruns.return_value = {
            'seqruns': [{'seqrunid': 'B'}]
        }

        valid = utils.get_valid_seqruns_for_sample(self.project_id,
                                                   self.sample_id)

        self.assertEqual(valid, {'A': ['B']})

    def test_record_analysis_details(self):
        """The written .files log maps the project id to its sample with no libpreps."""
        job_identifier = 'job_id'
        utils.record_analysis_details(self.project_obj, job_identifier)

        log_path = os.path.join(self.tmp_dir, 'ANALYSIS', 'P123',
                                'piper_ngi', 'logs', 'job_id.files')
        with open(log_path, 'r') as log_handle:
            recorded = yaml.load(log_handle, Loader=yaml.FullLoader)

        self.assertEqual(recorded, {'P123': {'P123_1001': {}}})

    def test_create_project_obj_from_analysis_log(self):
        """A log listing only the sample yields a project equal to the fixture project."""
        logs_dir = os.path.join(self.project_base_path, 'ANALYSIS',
                                self.project_id, 'piper_ngi', 'logs')
        os.makedirs(logs_dir)
        log_file = os.path.join(logs_dir, 'P123-P123_1001-workflow.files')
        with open(log_file, 'w') as log_handle:
            log_handle.write('{P123: {P123_1001: {}}}')

        rebuilt_project = utils.create_project_obj_from_analysis_log(
            self.project_name, self.project_id, self.project_base_path,
            self.sample_id, 'workflow')

        self.assertEqual(rebuilt_project, self.project_obj)

    @mock.patch('ngi_pipeline.engines.piper_ngi.utils.CharonSession')
    def test_check_for_preexisting_sample_runs(self, mock_charon):
        """A RUNNING seqrun raises RuntimeError when neither restart flag is set."""
        session = mock_charon.return_value
        session.sample_get_libpreps.return_value = {
            'libpreps': [{'libprepid': 'A'}]
        }
        session.libprep_get_seqruns.return_value = {
            'seqruns': [{'seqrunid': 'B'}]
        }
        session.seqrun_get.return_value = {'alignment_status': 'RUNNING'}

        restart_running_jobs = False
        restart_finished_jobs = False

        self.assertRaises(RuntimeError,
                          utils.check_for_preexisting_sample_runs,
                          self.project_obj,
                          self.sample_obj,
                          restart_running_jobs,
                          restart_finished_jobs)

    def test_create_sbatch_header(self):
        """The header lists every slurm setting; the numeric argument 17 is expected to show up as `-n 16`."""
        # NOTE(review): presumably the helper limits the value written after
        # -n -- confirm against utils.create_sbatch_header.
        header_args = ('slurm_project_id', 'slurm_queue', 17, 'slurm_time',
                       'job_name', 'slurm_out_log', 'slurm_err_log')
        got_header = utils.create_sbatch_header(*header_args)
        expected_header = """#!/bin/bash -l

#SBATCH -A slurm_project_id
#SBATCH -p slurm_queue
#SBATCH -n 16
#SBATCH -t slurm_time
#SBATCH -J job_name
#SBATCH -o slurm_out_log
#SBATCH -e slurm_err_log
"""
        self.assertEqual(got_header, expected_header)

    def test_add_exit_code_recording(self):
        """The command list is joined and followed by a redirect of $? to the given path."""
        command = ['echo', 'Hello!']
        exit_code_path = '/some/path'

        got_cl = utils.add_exit_code_recording(command, exit_code_path)

        self.assertEqual(got_cl, 'echo Hello!; echo $? > /some/path')

    def test_create_log_file_path(self):
        """The log path sits in the project's piper_ngi logs directory and ends in .log."""
        ids = dict(sample_id=self.sample_id,
                   libprep_id=self.libprep_id,
                   seqrun_id=self.seqrun_id)
        got_path = utils.create_log_file_path(self.workflow_subtask,
                                              self.project_base_path,
                                              self.project_name,
                                              self.project_id,
                                              **ids)
        expected_path = '{}/ANALYSIS/P123/piper_ngi/logs/P123-P123_1001-A-seqrun-subtask.log'.format(
            self.tmp_dir)

        self.assertEqual(got_path, expected_path)

    def test_create_exit_code_file_path(self):
        """The exit-code path sits in the project's piper_ngi logs directory and ends in .exit."""
        ids = dict(sample_id=self.sample_id,
                   libprep_id=self.libprep_id,
                   seqrun_id=self.seqrun_id)
        got_path = utils.create_exit_code_file_path(self.workflow_subtask,
                                                    self.project_base_path,
                                                    self.project_name,
                                                    self.project_id,
                                                    **ids)
        expected_path = '{}/ANALYSIS/P123/piper_ngi/logs/P123-P123_1001-A-seqrun-subtask.exit'.format(
            self.tmp_dir)

        self.assertEqual(got_path, expected_path)

    def test__create_generic_output_file_path(self):
        """The generic output path is the same logs-directory path without any extension."""
        ids = dict(sample_id=self.sample_id,
                   libprep_id=self.libprep_id,
                   seqrun_id=self.seqrun_id)
        got_path = utils._create_generic_output_file_path(
            self.workflow_subtask,
            self.project_base_path,
            self.project_name,
            self.project_id,
            **ids)
        expected_path = '{}/ANALYSIS/P123/piper_ngi/logs/P123-P123_1001-A-seqrun-subtask'.format(
            self.tmp_dir)

        self.assertEqual(got_path, expected_path)
Пример #16
0
 def get_NGIProject(n):
     """Return a project named "NGIProject_<n>" after TestLaunchers.add_samples has been applied to it."""
     # The same generated string serves as name, dirname and project id.
     name = "{}_{}".format(NGIProject.__name__, n)
     base_path = os.path.join("/path", "to", name, "base")
     project = NGIProject(name, name, name, base_path)
     TestLaunchers.add_samples(project)
     return project
Пример #17
0
def collect_files_for_sample_analysis(project_obj,
                                      sample_obj,
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """Select the fastq files and previous analysis files for one sample.

    The libprep/seqrun combinations accepted by get_valid_seqruns_for_sample
    are matched against the fastq files found below the sample's DATA
    directory. The input objects are left untouched: a fresh NGIProject
    (with a single sample) is built and returned.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Include jobs marked as "DONE" (default False)
    :param str status_field: Which Charon status field to check (alignment, genotype)

    :returns: The new NGIProject object and whatever
              find_previous_sample_analyses reports for it
    :rtype: tuple

    :raises ValueError: If there are no valid libpreps/seqruns or no fastq files
    """
    ### FASTQ
    # Per the original notes, this lookup goes to Charon and keeps
    # libpreps whose 'qc' is not "FAILED" and seqruns whose status is not
    # "DONE" (unless finished jobs are being restarted).
    accepted_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs,
        status_field=status_field)
    if not accepted_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Discover the fastq files actually present for this sample.
    sample_data_directory = os.path.join(project_obj.base_path,
                                         "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    discovered_fastqs = fastq_files_under_dir(sample_data_directory,
                                              realpath=False)
    if not discovered_fastqs:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Build a new object tree (the old one could still be in use elsewhere).
    new_project = NGIProject(project_obj.name, project_obj.dirname,
                             project_obj.project_id, project_obj.base_path)
    new_sample = new_project.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in discovered_fastqs:
        # Peel the path apart from the right: .../<libprep>/<seqrun>/<fastq>
        parent, fastq_name = os.path.split(fastq_path)
        if not fastq_name:
            # Handles trailing slash
            parent, fastq_name = os.path.split(parent)
        parent, seqrun_dirname = os.path.split(parent)
        libprep_dirname = os.path.basename(parent)
        # An unknown libprep yields an empty list, so one membership test
        # rejects both invalid libpreps and invalid seqruns.
        if seqrun_dirname not in accepted_seqruns.get(libprep_dirname, []):
            continue
        new_libprep = new_sample.add_libprep(name=libprep_dirname,
                                             dirname=libprep_dirname)
        new_seqrun = new_libprep.add_seqrun(name=seqrun_dirname,
                                            dirname=seqrun_dirname)
        new_seqrun.add_fastq_files(fastq_name)

    ### EXISTING DATA
    # Anything still present at this point is copied over; data that was
    # meant to be scrapped would have been deleted already.
    previous_files = find_previous_sample_analyses(new_project, new_sample)

    return (new_project, previous_files)
Пример #18
0
def recreate_project_from_filesystem(project_dir,
                                     restrict_to_samples=None,
                                     restrict_to_libpreps=None,
                                     restrict_to_seqruns=None):
    """Recreates the full project/sample/libprep/seqrun set of
    NGIObjects using the directory tree structure.

    The expected layout is <project_dir>/<sample>/<libprep>/<seqrun>/<fastq>,
    where seqrun directories match the glob "*_*_*_*" and fastq files end in
    .fastq/.fq, optionally followed by .gz/.gzip/.bz2.

    :param str project_dir: Path to the project directory (a trailing slash is tolerated)
    :param list restrict_to_samples: If non-empty, only these sample names are included
    :param list restrict_to_libpreps: If non-empty, only these libprep names are included
    :param list restrict_to_seqruns: If non-empty, only these seqrun names are included

    :returns: The project object populated from the directory tree
    :rtype: NGIProject

    :raises CharonError: If the project id lookup fails (CharonError,
                         ValueError and Timeout are all re-raised as CharonError)
    """
    if not restrict_to_samples: restrict_to_samples = []
    if not restrict_to_libpreps: restrict_to_libpreps = []
    if not restrict_to_seqruns: restrict_to_seqruns = []

    base_path, project_name = os.path.split(project_dir)
    if not project_name:
        # project_dir ended with a slash; split once more to get the name
        base_path, project_name = os.path.split(base_path)
    LOG.info('Setting up project "{}"'.format(project_name))
    try:
        # This requires Charon access -- maps e.g. "Y.Mom_14_01" to "P123"
        project_id = get_project_id_from_name(project_name)
    # Should handle requests.exceptions.Timeout in Charon classes
    except (CharonError, ValueError, Timeout) as e:
        error_msg = ('Cannot proceed with project "{}" due to '
                     'Charon-related error: {}'.format(project_name, e))
        raise CharonError(error_msg)
    project_obj = NGIProject(name=project_name,
                             dirname=project_name,
                             project_id=project_id,
                             base_path=base_path)

    # Compiled once up front; raw string avoids invalid-escape warnings.
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")

    # Directory listings are materialised as lists: a Python 3 filter()
    # object is always truthy, which would silence the "none found" warnings.
    samples_pattern = os.path.join(project_dir, "*")
    samples = [path for path in glob.glob(samples_pattern)
               if os.path.isdir(path)]
    if not samples:
        LOG.warning('No samples found for project "{}"'.format(project_obj))
    for sample_dir in samples:
        sample_name = os.path.basename(sample_dir)
        if restrict_to_samples and sample_name not in restrict_to_samples:
            LOG.debug('Skipping sample "{}": not in specified samples "{}"'.format(sample_name, ', '.join(restrict_to_samples)))
            continue
        LOG.info('Setting up sample "{}"'.format(sample_name))
        sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)

        libpreps_pattern = os.path.join(sample_dir, "*")
        libpreps = [path for path in glob.glob(libpreps_pattern)
                    if os.path.isdir(path)]
        if not libpreps:
            LOG.warning('No libpreps found for sample "{}"'.format(sample_obj))
        for libprep_dir in libpreps:
            libprep_name = os.path.basename(libprep_dir)
            if restrict_to_libpreps and libprep_name not in restrict_to_libpreps:
                LOG.debug('Skipping libprep "{}": not in specified libpreps "{}"'.format(libprep_name, ', '.join(restrict_to_libpreps)))
                continue
            LOG.info('Setting up libprep "{}"'.format(libprep_name))
            libprep_obj = sample_obj.add_libprep(name=libprep_name,
                                                 dirname=libprep_name)

            # Seqrun directories look like e.g. 140528_D00415_0049_BC423WACXX
            seqruns_pattern = os.path.join(libprep_dir, "*_*_*_*")
            seqruns = [path for path in glob.glob(seqruns_pattern)
                       if os.path.isdir(path)]
            if not seqruns:
                LOG.warning('No seqruns found for libprep "{}"'.format(libprep_obj))
            for seqrun_dir in seqruns:
                seqrun_name = os.path.basename(seqrun_dir)
                if restrict_to_seqruns and seqrun_name not in restrict_to_seqruns:
                    LOG.debug('Skipping seqrun "{}": not in specified seqruns "{}"'.format(seqrun_name, ', '.join(restrict_to_seqruns)))
                    continue
                LOG.info('Setting up seqrun "{}"'.format(seqrun_name))
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_name,
                                                    dirname=seqrun_name)
                all_files = glob.glob(os.path.join(seqrun_dir, "*"))
                fastq_files = [path for path in all_files
                               if fastq_pattern.match(path)
                               and os.path.isfile(path)]
                for fq_file in fastq_files:
                    fq_name = os.path.basename(fq_file)
                    LOG.info('Adding fastq file "{}" to seqrun "{}"'.format(fq_name, seqrun_obj))
                    seqrun_obj.add_fastq_files([fq_name])
    return project_obj
Пример #19
0
 def setUpClass(self):
     """Create a temporary directory plus the NGIProject / NGISample fixtures.

     NOTE(review): unittest expects setUpClass to be a classmethod; no
     decorator is visible here -- confirm it exists in the full file.
     """
     self.tmp_dir = tempfile.mkdtemp()
     # The project uses the temporary directory as its base path.
     self.project = NGIProject(name='S.One_20_01',
                               dirname='dir_P123',
                               project_id='P123',
                               base_path=self.tmp_dir)
     self.sample = NGISample('P123_1001', 'dir_P123_1001')
Пример #20
0
class TestCharonFunctions(unittest.TestCase):
    """Checks the CharonSession calls made by create_charon_entries_from_project."""

    def setUp(self):
        """Build a project containing one sample, one libprep and one seqrun."""
        # Identifiers
        self.project_id = 'P100001'
        self.project_name = 'S.One_20_02'
        self.project_path = '/some/path'
        self.sample_id = 'P100001_101'
        self.libprep_id = 'A'
        self.seqrun_id = '201030_A00187_0332_AHFCFLDSXX'

        # Object hierarchy: project -> sample -> libprep -> seqrun
        project = NGIProject(name=self.project_name,
                             dirname=self.project_name,
                             project_id=self.project_id,
                             base_path=self.project_path)
        sample = project.add_sample(name=self.sample_id,
                                    dirname=self.sample_id)
        libprep = sample.add_libprep(name=self.libprep_id,
                                     dirname=self.libprep_id)
        seqrun = libprep.add_seqrun(name=self.seqrun_id,
                                    dirname=self.seqrun_id)
        self.project_obj = project
        self.sample_obj = sample
        self.libprep_obj = libprep
        self.seqrun_obj = seqrun

    @mock.patch.dict(os.environ, {'CHARON_BASE_URL': 'charon-url',
                                  'CHARON_API_TOKEN': 'token'})
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.project_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.sample_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.libprep_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.seqrun_create')
    def test_create_charon_entries_from_project(self, mock_seqrun,
                                                mock_libprep, mock_sample,
                                                mock_proj):
        """Each *_create method is called exactly once with the fixture's values."""
        create_charon_entries_from_project(self.project_obj)

        # (patched method, keyword arguments it must have received),
        # checked in project -> sample -> libprep -> seqrun order.
        expectations = (
            (mock_proj, {'best_practice_analysis': 'whole_genome_reseq',
                         'name': 'S.One_20_02',
                         'projectid': 'P100001',
                         'sequencing_facility': 'NGI-S',
                         'status': 'OPEN'}),
            (mock_sample, {'analysis_status': 'TO_ANALYZE',
                           'projectid': 'P100001',
                           'sampleid': 'P100001_101'}),
            (mock_libprep, {'libprepid': 'A',
                            'projectid': 'P100001',
                            'qc': 'PASSED',
                            'sampleid': 'P100001_101'}),
            (mock_seqrun, {'alignment_status': 'NOT_RUNNING',
                           'libprepid': 'A',
                           'mean_autosomal_coverage': 0,
                           'projectid': 'P100001',
                           'sampleid': 'P100001_101',
                           'seqrunid': '201030_A00187_0332_AHFCFLDSXX',
                           'total_reads': 0}),
        )
        for patched, expected_kwargs in expectations:
            patched.assert_called_once_with(**expected_kwargs)

    @mock.patch.dict(os.environ, {'CHARON_BASE_URL': 'charon-url',
                                  'CHARON_API_TOKEN': 'token'})
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.project_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.project_update')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.sample_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.sample_update')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.libprep_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.libprep_update')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.seqrun_create')
    @mock.patch('ngi_pipeline.database.filesystem.CharonSession.seqrun_update')
    def test_create_charon_entries_from_project_update(
            self, mock_seqrun_ud, mock_seqrun_cr, mock_libprep_ud,
            mock_libprep_cr, mock_sample_ud, mock_sample_cr, mock_project_ud,
            mock_project_cr):
        """With force_overwrite=True, failed creates fall through to the *_update methods."""
        # Not the neatest of tests but gets the job done...
        # Every create call fails with a status-400 CharonError.
        for creator in (mock_project_cr, mock_sample_cr,
                        mock_libprep_cr, mock_seqrun_cr):
            creator.side_effect = CharonError('Error', status_code=400)

        create_charon_entries_from_project(self.project_obj,
                                           force_overwrite=True)

        # (patched method, keyword arguments it must have received),
        # checked in project -> sample -> libprep -> seqrun order.
        expectations = (
            (mock_project_ud, {'best_practice_analysis': 'whole_genome_reseq',
                               'name': 'S.One_20_02',
                               'projectid': 'P100001',
                               'sequencing_facility': 'NGI-S',
                               'status': 'OPEN'}),
            (mock_sample_ud, {'analysis_status': 'TO_ANALYZE',
                              'projectid': 'P100001',
                              'sampleid': 'P100001_101',
                              'status': 'STALE'}),
            (mock_libprep_ud, {'libprepid': 'A',
                               'projectid': 'P100001',
                               'qc': 'PASSED',
                               'sampleid': 'P100001_101'}),
            (mock_seqrun_ud, {'alignment_status': 'NOT_RUNNING',
                              'libprepid': 'A',
                              'mean_autosomal_coverage': 0,
                              'projectid': 'P100001',
                              'sampleid': 'P100001_101',
                              'seqrunid': '201030_A00187_0332_AHFCFLDSXX',
                              'total_reads': 0}),
        )
        for patched, expected_kwargs in expectations:
            patched.assert_called_once_with(**expected_kwargs)
Пример #21
0
def collect_files_for_sample_analysis(project_obj, sample_obj, 
                                      restart_finished_jobs=False):
    """This function finds all data files relating to a sample and
    follows a preset decision path to decide which of them to include in
    a sample-level analysis. This can include fastq files, bam files, and
    alignment-qc-level files.
    Doesn't modify the project or sample objects passed in; a new
    NGIProject (holding a single sample) is built and returned.

    Missing libpreps/seqruns or missing fastq files are only logged as
    errors here; the function still returns (with an empty sample).

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Include jobs marked as "DONE" (default False)

    :returns: A new NGIProject object, the alignment files found under
              "01_raw_alignments" and the qc files found under
              "02_preliminary_alignment_qc" for this sample
    :rtype: tuple(NGIProject, list, list)
    """
    ### FASTQ
    # Access the filesystem to determine what fastq files are available
    # For each file, validate it.

    # Per the original notes, this function goes into Charon and finds all
    # valid libpreps and seqruns,
    # i.e. libpreps for which               'qc' != "FAILED"
    # and seqruns  for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = get_valid_seqruns_for_sample(project_id=project_obj.project_id,
                                                         sample_id=sample_obj.name,
                                                         include_failed_libpreps=False,
                                                         include_done_seqruns=restart_finished_jobs)
    if not valid_libprep_seqruns:
        LOG.error('No valid libpreps/seqruns found for project/sample '
                  '"{}/{}"'.format(project_obj, sample_obj))

    # Now we find all fastq files that are available and validate them against
    # the group compiled in the previous step (get_valid_seqruns_for_sample)
    # We're going to recreate NGIProject/NGISample/NGILibraryPrep/NGISeqrun objects here
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname, sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not fastq_files_on_filesystem:
        LOG.error('No valid fastq files found for project/sample '
                  '"{}/{}"'.format(project_obj, sample_obj))

    # Create a new NGIProject object (the old one could still be in use elsewhere)
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    new_sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        # Peel the path apart from the right: .../<libprep>/<seqrun>/<fastq>
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            base_path, fastq = os.path.split(base_path) # Handles trailing slash
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        # An unknown libprep yields an empty list, so this single test skips
        # both invalid library preps and invalid seqruns.
        if fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        libprep_obj = new_sample_obj.add_libprep(name=fs_libprep_name, dirname=fs_libprep_name)
        seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name, dirname=fs_seqrun_name)
        seqrun_obj.add_fastq_files(fastq)

    ### BAM / ALIGNMENT QC
    # Access the filesystem to determine which alignment (bam) files are available.
    # If there are any, add them to the list of files to include in the new analysis.
    # Include alignment qc files.
    project_analysis_dir = os.path.join(project_obj.base_path, "ANALYSIS", project_obj.dirname)
    project_aln_dir = os.path.join(project_analysis_dir, "01_raw_alignments")
    project_alnqc_dir = os.path.join(project_analysis_dir, "02_preliminary_alignment_qc")
    sample_analysis_file_pattern = "{sample_name}.*.{sample_name}.*".format(sample_name=new_sample_obj.name)
    aln_files_to_copy = glob.glob(os.path.join(project_aln_dir, sample_analysis_file_pattern))
    qc_files_to_copy = glob.glob(os.path.join(project_alnqc_dir, sample_analysis_file_pattern))

    return (proj_obj, aln_files_to_copy, qc_files_to_copy)
Пример #22
0
def setup_analysis_directory_structure(fc_dir,
                                       projects_to_analyze,
                                       restrict_to_projects=None,
                                       restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None,
                                       config_file_path=None):
    """
    Copy and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict projects_to_analyze: Mapping of project DATA directory path to
        NGIProject object (may be empty); updated in place and returned
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param bool quiet: Stored in config["quiet"]; when true no error mails are sent
    :param dict config: The parsed configuration file. Its "analysis" section
        must provide "sthlm_root", "upps_root", "base_root" and "top_dir".
        NOTE(review): defaults to None but is indexed unconditionally --
        presumably filled in by a decorator or the caller; confirm.
    :param str config_file_path: Not used in this function body

    :returns: The updated projects_to_analyze mapping; an empty list if the
        flowcell directory does not exist or cannot be parsed
    :rtype: dict or list

    :raises KeyError: If a required configuration key is not available.
    :raises RuntimeError: If fc_dir contains neither the sthlm nor the upps root
    """
    LOG.info(
        "Setting up analysis for demultiplexed data in source folder \"{}\"".
        format(fc_dir))
    if not restrict_to_projects: restrict_to_projects = []
    if not restrict_to_samples: restrict_to_samples = []
    config["quiet"] = quiet  # Hack because I enter here from a script sometimes
    # Checks flowcell path to establish which group owns it
    owner_pattern = r".+({}|{})\/.+".format(config["analysis"]["sthlm_root"],
                                            config["analysis"]["upps_root"])
    matches = re.match(owner_pattern, fc_dir)
    if not matches:
        error_text = (
            "cannot guess which project (sthlm/uppsala) the flowcell {} belongs to"
            .format(fc_dir))
        LOG.error(error_text)
        raise RuntimeError(error_text)
    flowcell_uppnexid = matches.group(1)

    analysis_top_dir = os.path.abspath(
        os.path.join(config["analysis"]["base_root"], flowcell_uppnexid,
                     config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError as e:
        # Best effort: the failure is reported but processing continues.
        LOG.error(
            'Error: Analysis top directory {} does not exist and could not '
            'be created: {}'.format(analysis_top_dir, e))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(
        analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(
            fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning(
            "No projects found in specified flowcell directory \"{}\"".format(
                fc_dir))

    # Compiled once for all samples; raw string avoids invalid-escape warnings
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")

    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")

        # parse the samplesheet and get the expected sample numbers assigned by bcl2fastq
        samplesheet_sample_numbers = get_sample_numbers_from_samplesheet(
            samplesheet_path) if samplesheet_path else None

        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning(
                'Could not retrieve project id from Charon (record missing?). '
                'Using project name ("{}") as project id '
                '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                                    project_id not in restrict_to_projects:
            LOG.debug(
                "Skipping project {} (not in restrict_to_projects)".format(
                    project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory. Directories are named after the
        # project id; symlinks named after the project name point to them.
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                            project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                               project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            if project_dir != project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if project_analysis_dir != project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        # Reuse the project object from an earlier call, if there is one
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name,
                                     dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name,
                                      ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files: safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name,
                                                dirname=sample_name)
            # Keep only the fastq files listed for this sample
            fastq_files = list(
                filter(fastq_pattern.match, sample.get('files', [])))
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to use assignment from SampleSheet
                samplesheet_sample = match_fastq_sample_number_to_samplesheet(
                    fq_file, samplesheet_sample_numbers, project_id)
                # NOTE(review): index 6 is used as the library prep name --
                # confirm against match_fastq_sample_number_to_samplesheet
                if samplesheet_sample is not None and \
                        samplesheet_sample[6] is not None:
                    libprep_name = samplesheet_sample[6]
                else:
                    LOG.debug(
                        'Unable to determine library prep from sample sheet file; try to determine from Charon'
                    )
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(
                            project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(
                            libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        libpreps = charon_session.sample_get_libpreps(
                            project_id, sample_name).get('libpreps')
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but only one '
                                'library prep is present in Charon ("{}"). Using '
                                'this as the library prep.'.format(
                                    project_name, sample_name, fc_full_id,
                                    fq_file, libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but a fallback '
                                'libprep value of "{}" was supplied -- using this '
                                'value.'.format(project_name, sample_name,
                                                fc_full_id, fq_file,
                                                libprep_name))
                        else:
                            error_text = (
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon. Skipping '
                                'analysis.'.format(project_name, sample_name,
                                                   fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            # No libprep could be determined: skip this fastq file
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files: safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files: safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [
                            os.path.join(src_sample_dir, fastq_file)
                            for fastq_file in seqrun_obj.fastq_files
                        ]
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info(
                            "Symlinking fastq files from {} to {}...".format(
                                src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            # Report the failure and move on to the next seqrun
                            error_text = (
                                'Could not symlink files for project/sample/'
                                'libprep/seqrun {}/{}/{}/{}'.format(
                                    project_obj, sample_obj, libprep_obj,
                                    seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
    return projects_to_analyze
Пример #23
0
def recreate_project_from_filesystem(project_dir,
                                     restrict_to_samples=None,
                                     restrict_to_libpreps=None,
                                     restrict_to_seqruns=None,
                                     force_create_project=False,
                                     config=None, config_file_path=None):
    """Recreates the full project/sample/libprep/seqrun set of
    NGIObjects using the directory tree structure.

    The tree walked is <project>/<sample>/<libprep>/<seqrun>/, where only
    seqrun directories matching the glob "*_*_*_*" are considered. Fastq files
    are located with fastq_files_under_dir() and stored by basename only.

    Project name and id are derived from the directory names: the basename of
    the real (non-symlink) directory is used as the project id, and if
    project_dir is itself a symlink -- or a symlink to it exists in the same
    parent directory -- the basename of that symlink is used as the project
    name. Otherwise the name equals the id.

    :param str project_dir: The project directory (passed through locate_project)
    :param list restrict_to_samples: If non-empty, only these sample names are included
    :param list restrict_to_libpreps: If non-empty, only these libprep names are included
    :param list restrict_to_seqruns: If non-empty, only these seqrun names are included
    :param bool force_create_project: Not read in this function body
    :param dict config: Not read in this function body
    :param str config_file_path: Not read in this function body

    :returns: The project object populated with samples, libpreps, seqruns and fastq names
    :rtype: NGIProject
    """
    restrict_to_samples = restrict_to_samples or []
    restrict_to_libpreps = restrict_to_libpreps or []
    restrict_to_seqruns = restrict_to_seqruns or []

    def _subdirs(pattern):
        # Return a real list: a lazy filter() object is always truthy on
        # Python 3, which would silence the "nothing found" warnings below.
        return [path for path in glob.glob(pattern) if os.path.isdir(path)]

    project_dir = locate_project(project_dir)

    if os.path.islink(os.path.abspath(project_dir)):
        real_project_dir = os.path.realpath(project_dir)
        syml_project_dir = os.path.abspath(project_dir)
    else:
        real_project_dir = os.path.abspath(project_dir)
        syml_project_dir = None
        resolved_project_dir = os.path.realpath(real_project_dir)
        # Look among the siblings for a symlink pointing at this directory
        search_dir = os.path.join(os.path.dirname(project_dir), "*")
        for candidate in glob.glob(search_dir):
            if os.path.islink(candidate) and \
               os.path.realpath(candidate) == resolved_project_dir:
                syml_project_dir = os.path.abspath(candidate)
                break
    project_base_path, project_id = os.path.split(real_project_dir)
    if syml_project_dir:
        project_base_path, project_name = os.path.split(syml_project_dir)
    else: # project name is the same as project id (Uppsala perhaps)
        project_name = project_id
    # The base path is the directory containing "DATA", not "DATA" itself
    if os.path.split(project_base_path)[1] == "DATA":
        project_base_path = os.path.split(project_base_path)[0]
    LOG.info('Setting up project "{}"'.format(project_id))
    project_obj = NGIProject(name=project_name,
                             dirname=project_id,
                             project_id=project_id,
                             base_path=project_base_path)
    samples = _subdirs(os.path.join(real_project_dir, "*"))
    if not samples:
        LOG.warn('No samples found for project "{}"'.format(project_obj))
    for sample_dir in samples:
        sample_name = os.path.basename(sample_dir)
        if restrict_to_samples and sample_name not in restrict_to_samples:
            LOG.debug('Skipping sample "{}": not in specified samples '
                      '"{}"'.format(sample_name, ', '.join(restrict_to_samples)))
            continue
        LOG.info('Setting up sample "{}"'.format(sample_name))
        sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)

        libpreps = _subdirs(os.path.join(sample_dir, "*"))
        if not libpreps:
            LOG.warn('No libpreps found for sample "{}"'.format(sample_obj))
        for libprep_dir in libpreps:
            libprep_name = os.path.basename(libprep_dir)
            if restrict_to_libpreps and libprep_name not in restrict_to_libpreps:
                LOG.debug('Skipping libprep "{}": not in specified libpreps '
                          '"{}"'.format(libprep_name, ', '.join(restrict_to_libpreps)))
                continue
            LOG.info('Setting up libprep "{}"'.format(libprep_name))
            libprep_obj = sample_obj.add_libprep(name=libprep_name,
                                                 dirname=libprep_name)

            # Seqrun directories have four underscore-separated fields,
            # e.g. 140528_D00415_0049_BC423WACXX
            seqruns = _subdirs(os.path.join(libprep_dir, "*_*_*_*"))
            if not seqruns:
                LOG.warn('No seqruns found for libprep "{}"'.format(libprep_obj))
            for seqrun_dir in seqruns:
                seqrun_name = os.path.basename(seqrun_dir)
                if restrict_to_seqruns and seqrun_name not in restrict_to_seqruns:
                    LOG.debug('Skipping seqrun "{}": not in specified seqruns '
                              '"{}"'.format(seqrun_name, ', '.join(restrict_to_seqruns)))
                    continue
                LOG.info('Setting up seqrun "{}"'.format(seqrun_name))
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_name,
                                                    dirname=seqrun_name)
                for fq_file in fastq_files_under_dir(seqrun_dir, realpath=False):
                    fq_name = os.path.basename(fq_file)
                    LOG.info('Adding fastq file "{}" to seqrun "{}"'.format(fq_name, seqrun_obj))
                    seqrun_obj.add_fastq_files([fq_name])
    return project_obj
Пример #24
0
def setup_analysis_directory_structure(fc_dir, projects_to_analyze,
                                       restrict_to_projects=None, restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None, config_file_path=None):
    """
    Symlink and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    The library prep for each fastq file is looked up in the sample sheet
    first, then in Charon by flowcell id; failing that, the sample's only
    libprep in Charon (if there is exactly one) or fallback_libprep is used.
    Fastq files for which no libprep can be determined are skipped.

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict projects_to_analyze: Maps project directory path to NGIProject;
                                     updated in place (may be empty)
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param bool quiet: Stored as config["quiet"]; when true no error mails are sent
    :param dict config: The parsed configuration file.
    :param str config_file_path: Not read in this function body

    :returns: projects_to_analyze with the projects of this flowcell added, or
              an empty list if the flowcell directory is missing or cannot be parsed
    :rtype: dict (list on the early-exit paths)

    :raises RuntimeError: If fc_dir is not located below one of the configured roots
    :raises KeyError: If a required configuration key is not available.
    """
    LOG.info("Setting up analysis for demultiplexed data in source folder \"{}\"".format(fc_dir))
    if not restrict_to_projects: restrict_to_projects = []
    if not restrict_to_samples: restrict_to_samples = []
    # NOTE(review): config must already be a dict here even though it defaults
    # to None -- presumably it is filled in before this body runs; confirm.
    config["quiet"] = quiet # Hack because I enter here from a script sometimes
    root_pattern = r"(.+(?:{}|{}))\/.+".format(config["analysis"]["sthlm_root"],
                                              config["analysis"]["upps_root"])
    matches = re.match(root_pattern, fc_dir)
    if not matches:
        error_text = "cannot guess which project the flowcell {} belongs to".format(fc_dir)
        LOG.error(error_text)
        raise RuntimeError(error_text)
    flowcell_root = matches.group(1)

    analysis_top_dir = os.path.abspath(os.path.join(flowcell_root, config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError:
        LOG.error('Error: Analysis top directory {} does not exist and could not '
                  'be created.'.format(analysis_top_dir))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warn("No projects found in specified flowcell directory \"{}\"".format(fc_dir))
    samplesheet_path = fc_dir_structure.get("samplesheet_path")
    # Compiled once; both are used for every sample / fastq file below
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
    lane_pattern = re.compile(r'[\w-]+_L\d{2}(\d)_\w+')
    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warn('Could not retrieve project id from Charon (record missing?). '
                     'Using project name ("{}") as project id '
                     '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                                    project_id not in restrict_to_projects:
            LOG.debug("Skipping project {} (not in restrict_to_projects)".format(project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            # Make the project reachable by name as well as by id
            if not project_dir == project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name, dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name, ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files: safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)
            # A list (not a lazy filter object): it is iterated below and then
            # tested for emptiness before symlinking
            fastq_files = [fq for fq in sample.get('files', []) if fastq_pattern.match(fq)]
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to parse from SampleSheet
                try:
                    if not samplesheet_path:
                        raise ValueError("no sample sheet available")
                    lane_match = lane_pattern.match(fq_file)
                    if not lane_match:
                        # Raise ValueError so that the Charon fallback below is tried
                        raise ValueError('cannot parse lane number from fastq file '
                                         'name "{}"'.format(fq_file))
                    lane_num = lane_match.group(1)
                    libprep_name = determine_library_prep_from_samplesheet(samplesheet_path,
                                                                           project_original_name,
                                                                           sample_name,
                                                                           lane_num)
                except (IndexError, ValueError) as e:
                    LOG.debug('Unable to determine library prep from sample sheet file '
                              '("{}"); try to determine from Charon'.format(e))
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        # A missing "libpreps" entry is treated like an empty one
                        libpreps = charon_session.sample_get_libpreps(project_id,
                                                                      sample_name).get('libpreps') or []
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warn('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                     'has no libprep information in Charon, but only one '
                                     'library prep is present in Charon ("{}"). Using '
                                     'this as the library prep.'.format(project_name,
                                                                        sample_name,
                                                                        fc_full_id,
                                                                        fq_file,
                                                                        libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warn('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                     'has no libprep information in Charon, but a fallback '
                                     'libprep value of "{}" was supplied -- using this '
                                     'value.'.format(project_name,
                                                     sample_name,
                                                     fc_full_id,
                                                     fq_file,
                                                     libprep_name))
                        else:
                            error_text = ('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                          'has no libprep information in Charon. Skipping '
                                          'analysis.'.format(project_name, sample_name,
                                                             fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files: safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files: safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [os.path.join(src_sample_dir, fastq_file) for
                                           fastq_file in seqrun_obj.fastq_files]
                        # Destination is derived from the objects being iterated,
                        # not from whatever seqrun directory the loop above
                        # happened to create last
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info("Symlinking fastq files from {} to {}...".format(src_sample_dir,
                                                                                  seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = ('Could not symlink files for project/sample/'
                                          'libprep/seqrun {}/{}/{}/{}'.format(project_obj,
                                                                              sample_obj,
                                                                              libprep_obj,
                                                                              seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
    return projects_to_analyze
Пример #25
0
def collect_files_for_sample_analysis(project_obj, sample_obj,
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """Decide which data files belong in a sample-level analysis.

    Charon is asked (via get_valid_seqruns_for_sample) which libprep/seqrun
    combinations are eligible; failed libpreps are always excluded. The fastq
    files found under the sample's DATA directory are then matched against
    that answer using the two directory levels directly above each file
    (.../<libprep>/<seqrun>/<fastq>).

    The objects passed in are left untouched; a fresh NGIProject holding only
    the accepted libpreps/seqruns/fastq files is built and returned.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Passed on as include_done_seqruns (default False)
    :param str status_field: Which Charon status field to check (default "alignment_status")

    :returns: The new project object and the result of
              find_previous_sample_analyses for it
    :rtype: tuple

    :raises ValueError: If Charon reports no valid libpreps/seqruns, or no
                        fastq files are found on the filesystem
    """
    # Step 1: which libpreps/seqruns does Charon consider usable?
    eligible_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs,
        status_field=status_field)
    if not eligible_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Step 2: which fastq files actually exist for this sample?
    sample_data_directory = os.path.join(project_obj.base_path,
                                         "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    available_fastqs = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not available_fastqs:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Step 3: build fresh objects (the originals may be in use elsewhere) and
    # attach every fastq file whose libprep and seqrun were both accepted
    proj_obj = NGIProject(project_obj.name,
                          project_obj.dirname,
                          project_obj.project_id,
                          project_obj.base_path)
    sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in available_fastqs:
        parent_dir, fastq_name = os.path.split(fastq_path)
        if not fastq_name:
            # Path ended in a separator; split once more to get the last component
            parent_dir, fastq_name = os.path.split(parent_dir)
        parent_dir, seqrun_name = os.path.split(parent_dir)
        libprep_name = os.path.basename(parent_dir)
        if libprep_name in eligible_seqruns.keys() and \
           seqrun_name in eligible_seqruns.get(libprep_name, []):
            new_libprep = sample_obj.add_libprep(name=libprep_name,
                                                 dirname=libprep_name)
            new_seqrun = new_libprep.add_seqrun(name=seqrun_name,
                                                dirname=seqrun_name)
            new_seqrun.add_fastq_files(fastq_name)

    # Step 4: any earlier analysis results still present are handed back so
    # the caller can carry them over
    return (proj_obj, find_previous_sample_analyses(proj_obj, sample_obj))