Example #1

def calculate_relatedness(dataset, args):
    if not (args.samples or args.projects):
        raise AnalysisDriverError('Either --samples or --projects must be set')
    os.makedirs(os.path.join(cfg['jobs_dir'], dataset.name), exist_ok=True)
    all_gvcfs = []
    sample_ids = []
    if args.samples:
        for sample_id in args.samples:
            sample_ids.append(sample_id)
            s = get_document(sample_id)
            project_id = s.get('project_id')
            gvcf_file = s.get('user_sample_id') + '.g.vcf.gz'
            gvcf = util.find_file(cfg['sample']['input_dir'], project_id, sample_id, gvcf_file)
            all_gvcfs.append(gvcf)
    elif args.projects:
        for project_id in args.projects:
            samples_for_project = clarity.get_sample_names_from_project(project_id)
            sample_ids.extend(samples_for_project)
            project_folder = util.find_file(cfg['project']['input_dir'], project_id)
            all_gvcfs.extend(get_all_project_gvcfs(project_folder))

    # validate the requested method before running the expensive genotyping step
    if args.method not in ('peddy', 'relatedness'):
        raise AnalysisDriverError('Choose either "peddy" or "relatedness" as method')

    g = qc.GenotypeGVCFs(dataset=dataset, gVCFs=all_gvcfs, reference=args.reference)
    g.run()
    if args.method == 'peddy':
        p = qc.Peddy(dataset=dataset, ids=sample_ids)
        p.run()
        o = qc.ParseRelatedness(dataset=dataset, parse_method='parse_peddy', ids=sample_ids)
        o.run()
    elif args.method == 'relatedness':
        r = qc.Relatedness(dataset=dataset)
        r.run()
        o = qc.ParseRelatedness(dataset=dataset, parse_method='parse_relatedness', ids=sample_ids)
        o.run()
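
calculate_relatedness reads four attributes off args: samples, projects, method and reference. As a rough sketch, a matching argparse setup could look like the following (hypothetical; the real driver defines its own CLI options):

import argparse

# Hypothetical parser mirroring the attributes calculate_relatedness reads;
# option help strings are illustrative, not from the real driver.
parser = argparse.ArgumentParser()
parser.add_argument('--samples', nargs='+', help='sample IDs to genotype together')
parser.add_argument('--projects', nargs='+', help='project IDs to collect gVCFs from')
parser.add_argument('--method', choices=('peddy', 'relatedness'), required=True)
parser.add_argument('--reference', help='reference genome passed to GenotypeGVCFs')
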
Example #2

def test_find_file(mocked_log):
    assert util.find_file(TestEGCG.assets_path,
                          'ftest.txt') == join(TestEGCG.assets_path,
                                               'ftest.txt')
    assert util.find_file(TestEGCG.assets_path, 'ftest_.txt') is None
    assert util.find_file(TestEGCG.assets_path, 'ftest*.txt') is None
    mocked_log.assert_called_with(
        'Searched pattern %s for one file, but got %s',
        (TestEGCG.assets_path, 'ftest*.txt'), 2)
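
From this test, util.find_file appears to join its arguments into a glob pattern and return the single match, with None for zero or multiple matches and a warning when the pattern is ambiguous. A minimal sketch under those assumptions (not the actual EGCG implementation; app_logger stands in for the module's real logger):

import glob
import logging
import os.path

app_logger = logging.getLogger(__name__)

def find_file(*parts):
    """Glob-based sketch: join the path fragments, return the single match or None."""
    matches = glob.glob(os.path.join(*parts))
    if len(matches) == 1:
        return matches[0]
    if matches:
        # ambiguous pattern: the mocked_log assertion above expects this warning
        app_logger.warning('Searched pattern %s for one file, but got %s', parts, len(matches))
    return None
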
Example #3

    def _run(self):
        """Detect gender of the sample based on the %het on the X chromosome."""
        name, ext = os.path.splitext(util.find_file(self.vcf_file))
        if ext == '.gz':
            file_opener = 'zcat'
            name, gz = os.path.splitext(name)
        else:
            file_opener = 'cat'

        gender_call_file = name + '.sex'

        command = util.str_join(
            '%s %s' % (file_opener, self.vcf_file),
            "grep -P '^chrX|^X'",
            "awk '{split($10,a,\":\"); count[a[1]]++; total++} END{for (g in count){print g\" \"count[g]/total}}'",
            "grep '0/1'",
            "awk '{if ($2>.35){gender=\"FEMALE\"}else{if ($2<.15){gender=\"MALE\"}else{gender=\"UNKNOWN\"}} print gender, $2}'",
            separator=' | '
        ) + ' > ' + gender_call_file
        self.info(command)

        return executor.execute(
            command,
            job_name='sex_detection',
            working_dir=self.job_dir,
            walltime=6,
            cpus=1,
            mem=2,
            log_commands=False
        ).join()
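
The final awk stage makes the actual call; the same thresholds, restated in Python for readability:

def classify_sex(het_fraction):
    # fraction of 0/1 genotype calls on the X chromosome, as computed by the pipeline above
    if het_fraction > 0.35:
        return 'FEMALE'
    if het_fraction < 0.15:
        return 'MALE'
    return 'UNKNOWN'
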
Example #4

def create_output_links(input_dir, output_cfg, link_dir, **kwargs):
    exit_status = 0
    links = []

    for output_record in output_cfg.content.values():
        src_pattern = os.path.join(
            input_dir,
            os.path.join(*output_record.get('location', [''])),
            output_record['basename']
        ).format(**kwargs)

        source = util.find_file(src_pattern)
        if source:
            link_file = os.path.join(
                link_dir,
                output_record.get('new_name', os.path.basename(source))
            ).format(**kwargs)
            if os.path.islink(link_file):
                os.unlink(link_file)
            os.symlink(source, link_file)
            links.append(link_file)
        else:
            app_logger.warning('No file found for pattern ' + src_pattern)
            if output_record.get('required', True):
                exit_status += 1
    if exit_status == 0:
        return links
    else:
        raise AnalysisDriverError('link creation failed with exit status ' + str(exit_status))
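
Each record in output_cfg.content is read for up to four keys. A hypothetical record (key names taken from the code above, values purely illustrative):

output_record = {
    'location': ['{sample_id}', 'vcfs'],       # optional sub-path under input_dir
    'basename': '{user_sample_id}.g.vcf.gz',   # filename pattern, .format()-ed with **kwargs
    'new_name': '{user_sample_id}.g.vcf.gz',   # optional rename for the symlink
    'required': True                           # if True, a missing file bumps exit_status
}
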
Example #5

    def _move_dir_exists(self):
        frm = join(self.test_dir, 'from')
        to = join(self.test_dir, 'exists')
        md5_from1 = self._md5(join(frm, 'ftest.txt'))
        md5_from2 = self._md5(join(frm, 'subdir', 'ftest.txt'))

        assert util.find_file(frm, 'ftest.txt')
        assert util.find_file(to, 'ftest.txt')
        assert md5_from1 != self._md5(join(to, 'ftest.txt'))
        assert md5_from2 != self._md5(join(to, 'subdir', 'ftest.txt'))

        util.move_dir(frm, to)

        assert not util.find_file(frm, 'ftest.txt')
        assert util.find_file(to, 'ftest.txt')
        assert md5_from1 == self._md5(join(to, 'ftest.txt'))
        assert md5_from2 == self._md5(join(to, 'subdir', 'ftest.txt'))
Example #6

    def _filter_bam(self):
        # use only chromosome 22 for speed
        return executor.execute(
            toolset['samtools'] + ' view -b %s chr22 > %s' % (util.find_file(self.bam_file), self.filtered_bam),
            job_name='filter_bam22',
            working_dir=self.job_dir,
            cpus=1,
            mem=2,
            log_commands=False
        ).join()
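
samtools view can only extract a region such as chr22 when an index sits alongside the BAM. A hypothetical pre-flight check (helper name is illustrative, and the .bai-next-to-BAM convention is only the most common layout):

import os

def ensure_bam_index(bam_path):
    # region queries like 'chr22' need an index; check the common .bai location
    if not os.path.isfile(bam_path + '.bai'):
        raise FileNotFoundError('%s.bai missing: run samtools index first' % bam_path)
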
Example #7

    def get_output_file(self, outfile_id):
        if self.post_pipeline:
            fp = self.output_cfg.output_dir_file(outfile_id)
        else:
            fp = self.output_cfg.job_dir_file(outfile_id)

        self.debug('Searching for %s in %s/%s' % (outfile_id, self.data_dir, fp))
        if fp:
            return util.find_file(
                self.data_dir,
                fp.format(sample_id=self.sample_id, user_sample_id=self.user_sample_id)
            )
Example #8

    def _populate_barcode_info_from_phix_read_names(self, run_dir):
        for run_element_id, barcode_info in self.barcodes_info.items():
            if ELEMENT_BARCODE in barcode_info and barcode_info[ELEMENT_BARCODE] == 'unknown':
                read_name_file = util.find_file(
                    run_dir, 'Undetermined_S0_L00%s_phix_read_name.list' % barcode_info[ELEMENT_LANE]
                )
            else:
                read_name_file = util.find_file(
                    run_dir,
                    barcode_info[ELEMENT_PROJECT_ID],
                    barcode_info[ELEMENT_SAMPLE_INTERNAL_ID],
                    '*_S*_L00%s_phix_read_name.list' % barcode_info[ELEMENT_LANE]
                )
            if read_name_file:
                with open(read_name_file) as open_file:
                    barcode_info[ELEMENT_NB_READS_PHIX] = sum(1 for _ in open_file)
            elif barcode_info[ELEMENT_NB_READS_PASS_FILTER] == 0:
                self.info('No reads for %s, not expecting PhiX filtered file', run_element_id)
            else:
                # TODO: not mandatory for now, as there will be lots of old runs without it
                self.warning('No PhiX read_name file found in %s for %s', run_dir, run_element_id)
Example #9

    def _run(self):
        vcf = util.find_file(self.vcf_file)
        if not vcf:
            return 1

        name, ext = os.path.splitext(vcf)
        stats_file = name + '.stats'
        return executor.execute(
            '%s vcfstats %s > %s' % (toolset['rtg'], vcf, stats_file),
            job_name='rtg_vcfstats',
            working_dir=self.job_dir,
            cpus=4,
            mem=32
        ).join()
Example #10

    def _run(self):
        self.info('Generating depth file: %s', self.samtools_depth_out_file)
        return executor.execute(
            bash_commands.samtools_depth_command(
                self.job_dir,
                find_file(self.bam_file),
                self.samtools_depth_out_file
            ),
            job_name='samtoolsdepth',
            working_dir=self.job_dir,
            cpus=1,
            mem=6,
            log_commands=False
        ).join()
Example #11

    def _populate_barcode_info_from_fastq_filterer_files(self, run_dir):
        for run_element_id in self.barcodes_info:
            barcode_info = self.barcodes_info.get(run_element_id)
            if ELEMENT_BARCODE in barcode_info and barcode_info[ELEMENT_BARCODE] == 'unknown':
                fastqfilter_stats_file = util.find_file(
                    run_dir, 'Undetermined_S0_L00%s_fastqfilterer.stats' % barcode_info[ELEMENT_LANE]
                )
            else:
                fastqfilter_stats_file = util.find_file(
                    run_dir,
                    barcode_info[ELEMENT_PROJECT_ID],
                    barcode_info[ELEMENT_SAMPLE_INTERNAL_ID],
                    '*_S*_L00%s_fastqfilterer.stats' % barcode_info[ELEMENT_LANE]
                )
            if fastqfilter_stats_file:
                stats = dm.parse_fastq_filterer_stats(fastqfilter_stats_file)
                # use .get so these stats stay nullable if the run is redone without filtering
                barcode_info[ELEMENT_TILES_FILTERED] = stats.get('remove_tiles')
                barcode_info[ELEMENT_TRIM_R1_LENGTH] = stats.get('trim_r1')
                barcode_info[ELEMENT_TRIM_R2_LENGTH] = stats.get('trim_r2')
            elif barcode_info[ELEMENT_NB_READS_PASS_FILTER] == 0:
                self.info('No reads for %s, not expecting fastqfilter file', run_element_id)
            else:
                raise PipelineError('Cannot find fastqfilter file in %s for %s' % (run_dir, run_element_id))
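
The three .get calls imply the shape of what dm.parse_fastq_filterer_stats returns. A hypothetical return value (keys from the code above; values are illustrative, and each may be None if the run was redone without filtering):

stats = {
    'remove_tiles': [1101, 1102],  # tiles filtered out, if any
    'trim_r1': 147,                # R1 trim length
    'trim_r2': 147                 # R2 trim length
}
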
Example #12

    def test_move_dir(self):
        frm = join(self.test_dir, 'from')
        to = join(self.test_dir, 'to')
        md5_from = self._md5(join(frm, 'ftest.txt'))
        assert util.find_file(frm, 'ftest.txt')
        assert not util.find_file(to)

        assert util.move_dir(frm, to) == 0

        assert not util.find_file(frm, 'ftest.txt')
        assert util.find_file(to, 'ftest.txt')
        assert util.find_file(to, 'subdir', 'ftest.txt')
        assert md5_from == self._md5(join(to, 'ftest.txt'))

        assert util.find_file(to, 'external_renamed.txt')
Example #13

    def _populate_from_gc_bias_metrics(self, run_dir):
        for k, run_element in self.barcodes_info.items():
            if run_element.get('barcode') == 'unknown' or run_element[ELEMENT_NB_READS_PASS_FILTER] == 0:
                self.info('No reads for %s, not expecting GC bias data', run_element['run_element_id'])
                continue

            metrics_file = util.find_file(
                run_dir,
                run_element['project_id'],
                run_element['sample_id'],
                '*_S*_L00%s_gc_bias.metrics' % run_element['lane']
            )

            with open(metrics_file) as f:
                header = ''
                while not header.startswith('ACCUMULATION_LEVEL'):
                    header = f.readline()

                reader = csv.DictReader(f, header.split('\t'), delimiter='\t')
                lines = list(reader)

                # gc slope
                data_points = [float(l['NORMALIZED_COVERAGE']) for l in lines if 20 <= int(l['GC']) <= 80]
                gc_slope = theilslopes(data_points)
                self.info('Calculated a GC slope of %s from %s data points', gc_slope, len(data_points))

                # deviation from normal
                total_windows = sum([int(l['WINDOWS']) for l in lines])
                # total_windows * 0.0004 gives approximately the same number of data points as 20 <= GC <= 80
                threshold = total_windows * 0.0004
                diffs = [abs(1 - float(l['NORMALIZED_COVERAGE'])) for l in lines if int(l['WINDOWS']) > threshold]
                normal_dev = sum(diffs) / len(diffs)
                self.info('Calculated a normal deviation of %s from %s data points', normal_dev, len(diffs))

                run_element['gc_bias'] = {
                    'slope': gc_slope[0],
                    'mean_deviation': normal_dev
                }
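
theilslopes here is presumably scipy.stats.theilslopes, which returns (slope, intercept, low_slope, high_slope); that is why only gc_slope[0] is stored. For example:

from scipy.stats import theilslopes

# slope of normalised coverage against GC content (x defaults to 0..n-1)
slope, intercept, lo, hi = theilslopes([1.02, 0.98, 1.05, 0.95, 1.01])
print(slope)
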
Example #14

def build_pipeline(dataset):
    sample_ids = [sample['sample_id'] for sample in dataset.samples_processed]
    project_source = os.path.join(cfg.query('project', 'input_dir'), dataset.name)
    gvcf_files = []
    for sample in dataset.samples_processed:
        # Only check for a gVCF when the sample has been through human processing, which generates one
        if query_dict(sample, 'aggregated.most_recent_proc.pipeline_used.name') == 'bcbio':
            gvcf_file = find_file(project_source, sample['sample_id'], sample['user_sample_id'] + '.g.vcf.gz')
            if not gvcf_file:
                raise PipelineError('Unable to find gVCF file for sample %s in %s' % (sample['sample_id'], project_source))
            gvcf_files.append(gvcf_file)

    if len(gvcf_files) < 2:
        # No need to run as there are not enough gvcfs to process
        cleanup = Cleanup(dataset=dataset)
    else:
        genotype_gvcfs = GenotypeGVCFs(dataset=dataset, gVCFs=gvcf_files)
        relatedness = Relatedness(dataset=dataset, previous_stages=[genotype_gvcfs])
        peddy = Peddy(dataset=dataset, ids=sample_ids, previous_stages=[genotype_gvcfs])
        parse = ParseRelatedness(dataset=dataset, ids=sample_ids, parse_method='parse_both', previous_stages=[relatedness, peddy])
        md5 = MD5Sum(dataset=dataset, previous_stages=[parse])
        output = Output(dataset=dataset, previous_stages=[md5])
        cleanup = Cleanup(dataset=dataset, previous_stages=[output])
    return cleanup
Example #15

    def gatk_genotype_gvcfs_cmd(self, memory, number_threads):
        gvcf_variants = ' '.join('--variant ' + util.find_file(i) for i in self.gVCFs)
        return java_command(memory=memory, tmp_dir=self.job_dir, jar=toolset['gatk']) + \
            '-T GenotypeGVCFs -nt %s -R %s %s -o %s' % (
                number_threads, self.dataset.reference_genome, gvcf_variants, self.gatk_outfile
            )
Example #16

    def tearDown(self):
        # NB: find_file returns None for a missing directory, which makes rmtree raise;
        # Example #19 below shows a guarded version of this teardown
        rmtree(util.find_file(self.test_dir, 'to'))
        rmtree(util.find_file(self.test_dir, 'from'))
        rmtree(util.find_file(self.test_dir, 'exists'))
        rmtree(util.find_file(self.test_dir, 'external'))
Example #17

def test_find_file():
    assert util.find_file(TestEGCG.assets_path, 'ftest.txt') == join(TestEGCG.assets_path, 'ftest.txt')
Example #18

    def sample_fastq_command(self):
        return 'set -o pipefail; {seqtk} sample {fastq} {nb_reads} | {seqtk} seq -a > {fasta}'.format(
            nb_reads=self.nb_reads, seqtk=toolset['seqtk'], fastq=util.find_file(self.fastq_file),
            fasta=self.fasta_outfile
        )
Example #19

    def tearDown(self):
        for base in ('to', 'from', 'exists', 'external'):
            f = util.find_file(self.test_dir, base)
            if f:
                rmtree(f)
Example #20

    def samtools_depth_out_file(self):
        # accessed without parentheses in Example #10, so likely an @property in the full class
        return os.path.splitext(find_file(self.bam_file))[0] + '.depth'