Exemplo n.º 1
0
    def _run_mpileup(cls, bam, ref, outfile):
        '''Run "samtools mpileup" and redirect its output into outfile.

        The command is assembled as one string (it uses ">" redirection)
        and handed to utils.syscall.

        bam: alignment file given as the positional input
        ref: reference FASTA, passed with -f
        outfile: file that receives the command's stdout
        '''
        mpileup_opts = 'samtools mpileup --skip-indels -d 500 -t INFO/AD,INFO/ADF,INFO/ADR -C50 -uv'
        input_args = ['-f', ref, bam]
        redirect = ['>', outfile]
        utils.syscall(' '.join([mpileup_opts] + input_args + redirect))
Exemplo n.º 2
0
    def _run_mccortex_view_kmers(self):
        '''Dump kmer counts from the cleaned cortex graph to self.kmer_counts_file.

        Expects exactly one directory whose name starts with "k" inside
        <self.cortex_outdir>/binaries/cleaned, and exactly one .ctx file
        inside that directory. Example of the file being looked for:
        cortex.out/binaries/cleaned/k31/sample_name.kmer31.q5cleaned_1.ctx

        The ctx file is given to "<self.mccortex> view --kmers"; the first
        two fields of each output line are kept (awk) and gzipped into
        self.kmer_counts_file.

        If the expected layout is not found, a message is printed to stderr
        and the method returns without running anything.
        '''
        cleaned_dir = os.path.join(self.cortex_outdir, 'binaries', 'cleaned')
        if not os.path.exists(cleaned_dir):
            print('Cleaned directory not found ' + cleaned_dir +
                  ' ... cannot run mccortex view kmers',
                  file=sys.stderr)
            return

        kmer_dirs = [x for x in os.listdir(cleaned_dir) if x.startswith('k')]
        if len(kmer_dirs) != 1:
            print('Error finding kmers directory inside ' + cleaned_dir +
                  ' ... cannot run mccortex view kmers',
                  file=sys.stderr)
            # Stop here: with no match kmer_dirs[0] would raise IndexError,
            # and with several there is no way to know which one is right.
            return

        kmer_dir = os.path.join(cleaned_dir, kmer_dirs[0])
        ctx_files = [x for x in os.listdir(kmer_dir) if x.endswith('.ctx')]

        if len(ctx_files) != 1:
            print('Error finding ctx file inside ' + kmer_dir +
                  ' ... cannot run mccortex view kmers',
                  file=sys.stderr)
            # Same reasoning as above: do not guess and do not crash.
            return

        ctx_file = os.path.join(kmer_dir, ctx_files[0])
        assert os.path.exists(ctx_file)
        command = ' '.join([
            self.mccortex, 'view', '--kmers', ctx_file,
            r'''| awk '{print $1,$2}' | gzip -9 > ''', self.kmer_counts_file
        ])
        utils.syscall(command)
Exemplo n.º 3
0
def validate(filenames):
    '''Run "fqtools validate" on one or two files.

    filenames: sequence of 1 or 2 file names; they are joined with spaces
        and appended to the command line.

    Raises Error (chained to the original exception) if the command fails.
    '''
    assert 1 <= len(filenames) <= 2
    cmd = 'fqtools validate ' + ' '.join(filenames)
    try:
        utils.syscall(cmd)
    except Exception as err:
        # Deliberately not a bare "except:", so that KeyboardInterrupt and
        # SystemExit are not converted into Error.
        raise Error('Error running ' + cmd) from err
Exemplo n.º 4
0
def validate(filenames):
    """Run "fqtools validate" on one or two files.

    filenames: sequence of 1 or 2 file names; they are joined with spaces
        and appended to the command line.

    Raises Error (chained to the original exception) if the command fails.
    """
    assert 1 <= len(filenames) <= 2
    cmd = "fqtools validate " + " ".join(filenames)
    try:
        utils.syscall(cmd)
    except Exception as err:
        # Deliberately not a bare "except:", so that KeyboardInterrupt and
        # SystemExit are not converted into Error.
        raise Error("Error running " + cmd) from err
Exemplo n.º 5
0
    def test_map_reads(self):
        '''test map_reads'''
        sam_out = 'tmp.test_map_reads.sam'
        stats_out = sam_out + '.stats'
        # Make sure the output seen below was written by this run
        if os.path.exists(sam_out):
            os.unlink(sam_out)

        read_map.map_reads(
            os.path.join(data_dir, 'ref.fa'),
            os.path.join(data_dir, 'reads.1.fq'),
            os.path.join(data_dir, 'reads.2.fq'),
            sam_out,
            read_group=('1', 'GROUP_NAME'),
        )
        self.assertTrue(os.path.exists(sam_out))

        # The flagstat summary of the new SAM file must equal the stored one
        utils.syscall(' '.join(['samtools flagstat', sam_out, '>', stats_out]))
        self.assertTrue(
            filecmp.cmp(os.path.join(data_dir, 'flagstat'),
                        stats_out,
                        shallow=False))

        # The read group given above must show up as an @RG line
        with open(sam_out) as sam_handle:
            has_rg_line = any(
                line == '@RG\tLB:LIB\tID:1\tSM:GROUP_NAME\n'
                for line in sam_handle)
        self.assertTrue(has_rg_line)

        for leftover in (sam_out, stats_out):
            os.unlink(leftover)
Exemplo n.º 6
0
def submit_xml_files(ini_file,
                     outfile,
                     files=None,
                     use_test_server=False,
                     unit_test=None,
                     unit_test_obj_type=None):
    '''Submit files to the ENA drop-box with curl, saving the response in outfile.

    ini_file: config file given to parse_config_file to get username and password
    outfile: file that receives curl's output (or the dummy receipt)
    files: optional dict; each key/value pair becomes a curl -F "key=@value" option
    use_test_server: if true, use the www-test.ebi.ac.uk URL instead of www.ebi.ac.uk
    unit_test: None to really run curl; 'success' or 'fail' to write a dummy
        receipt to outfile instead of running the command
    unit_test_obj_type: forwarded to _make_dummy_success_receipt

    Raises Error if unit_test is anything other than None, 'success' or 'fail'.
    '''
    username, password = parse_config_file(ini_file)
    if files is None:
        files_command = None
    else:
        files_command = ' '.join(
            '-F "' + key + '=@' + value + '"' for key, value in files.items())

    if use_test_server:
        url = 'https://www-test.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA%20'
    else:
        url = 'https://www.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA%20'

    # NOTE(review): "-k" turns off TLS certificate verification and the
    # credentials are embedded in the URL; left as-is, but worth revisiting.
    command_list = [
        'curl -k', files_command,
        '"' + url + username + '%20' + password + '"', '>', outfile
    ]

    # files_command is None when there is nothing to upload: drop it
    command = ' '.join([x for x in command_list if x is not None])
    if unit_test is None:
        utils.syscall(command)
    elif unit_test == 'success':
        _make_dummy_success_receipt(outfile, unit_test_obj_type)
    elif unit_test == 'fail':
        _make_dummy_fail_receipt(outfile)
    else:
        # str() so that a non-string value (e.g. True) is reported through
        # Error rather than failing with TypeError while building the message
        raise Error('unit_test must be None, success, or fail. Got: ' +
                    str(unit_test))
Exemplo n.º 7
0
    def test_nextflow_qc_using_fastq_input(self):
        '''test nextflow_qc using fastq input'''
        reads1 = os.path.join(data_dir, 'Reads', 'reads.1.1.fq.gz')
        reads2 = os.path.join(data_dir, 'Reads', 'reads.1.2.fq.gz')
        ref_fasta = os.path.join(data_dir, 'Reference', 'ref.fa')
        output_dir = 'tmp.test_nextflow_qc_using_fastq_input'
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'qc.nf')
        nextflow_helper.write_config_file()
        work_dir = 'tmp.nextflow_qc.work'
        dag_file = 'nextflow.qc.dag.no_db.pdf'
        # Best-effort removal of any existing DAG file before the run. Only
        # filesystem errors are ignored: a bare "except:" would also swallow
        # KeyboardInterrupt and SystemExit.
        try:
            os.unlink(dag_file)
        except OSError:
            pass

        command = ' '.join([
            'nextflow run',
            '--reads_in1', reads1,
            '--reads_in2', reads2,
            '--output_dir', output_dir,
            '--ref_fasta', ref_fasta,
            '-with-dag', dag_file,
            '-c', nextflow_helper.config_file,
            '-w', work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # Each QC method must have produced a non-empty output directory
        self.assertTrue(os.path.exists(output_dir))
        for method in ['fastqc', 'samtools_qc']:
            qc_dir = os.path.join(output_dir, method)
            self.assertTrue(os.path.exists(qc_dir))
            self.assertTrue(len(os.listdir(qc_dir)) >= 1)

        shutil.rmtree(output_dir)
        nextflow_helper.clean_files()
Exemplo n.º 8
0
def mark_duplicates(sorted_bam_in, bam_out, xmx=2):
    """Run Picard MarkDuplicates on sorted_bam_in, writing the result to bam_out.

    sorted_bam_in: input BAM file (passed as INPUT=)
    bam_out: output BAM file (passed as OUTPUT=)
    xmx: Java maximum heap size in gigabytes (passed as -Xmx<xmx>g)

    The picard jar file in the singularity container is
    /bioinf-tools/picard.jar. If we're not using the container, then the
    environment variable CLOCKWORK_PICARD_JAR must be set instead.

    The metrics file required by MarkDuplicates is written into a temporary
    directory created next to bam_out; that directory is removed whether or
    not the command succeeds.

    Raises Error if the jar file is not found or if the command fails.
    """
    picard_jar = os.environ.get("CLOCKWORK_PICARD_JAR",
                                "/bioinf-tools/picard.jar")
    if not os.path.exists(picard_jar):
        raise Error(
            "Picard jar file not found. Please set environment variable CLOCKWORK_PICARD_JAR, or put it here: /bioinf-tools/picard.jar"
        )

    # mkdtemp joins prefix onto dir, so the prefix must be a bare file name.
    # Using the whole of bam_out as prefix only worked for absolute paths or
    # plain file names; "subdir/out.bam" ended up as "subdir/subdir/out.bam...".
    out_dir, out_name = os.path.split(os.path.abspath(bam_out))
    tmpdir = tempfile.mkdtemp(prefix=out_name + ".tmp.markdups.", dir=out_dir)
    m_file = os.path.join(tmpdir, "covfefe")

    cmd = " ".join([
        "java",
        "-Xmx" + str(xmx) + "g",
        "-jar",
        picard_jar,
        "MarkDuplicates",
        "VALIDATION_STRINGENCY=LENIENT",
        "INPUT=" + sorted_bam_in,
        "OUTPUT=" + bam_out,
        "M=" + m_file,
    ])

    try:
        utils.syscall(cmd)
    except Exception as err:
        # Not a bare "except:", so KeyboardInterrupt/SystemExit pass through
        raise Error("Error runnin mark_duplicates: " + cmd) from err
    finally:
        # Single cleanup point for both the success and the failure path
        shutil.rmtree(tmpdir)
Exemplo n.º 9
0
def mark_duplicates(sorted_bam_in, bam_out, xmx=2):
    '''Run Picard MarkDuplicates on sorted_bam_in, writing the result to bam_out.

    sorted_bam_in: input BAM file (passed as INPUT=)
    bam_out: output BAM file (passed as OUTPUT=)
    xmx: Java maximum heap size in gigabytes (passed as -Xmx<xmx>g)

    The picard jar file in the singularity container is
    /bioinf-tools/picard.jar. If we're not using the container, then the
    environment variable CLOCKWORK_PICARD_JAR must be set instead.

    The metrics file required by MarkDuplicates is written into a temporary
    directory created next to bam_out; that directory is removed whether or
    not the command succeeds.

    Raises Error if the jar file is not found or if the command fails.
    '''
    picard_jar = os.environ.get('CLOCKWORK_PICARD_JAR',
                                '/bioinf-tools/picard.jar')
    if not os.path.exists(picard_jar):
        raise Error(
            'Picard jar file not found. Please set environment variable CLOCKWORK_PICARD_JAR, or put it here: /bioinf-tools/picard.jar'
        )

    # mkdtemp joins prefix onto dir, so the prefix must be a bare file name.
    # Using the whole of bam_out as prefix only worked for absolute paths or
    # plain file names; "subdir/out.bam" ended up as "subdir/subdir/out.bam...".
    out_dir, out_name = os.path.split(os.path.abspath(bam_out))
    tmpdir = tempfile.mkdtemp(prefix=out_name + '.tmp.markdups.', dir=out_dir)
    m_file = os.path.join(tmpdir, 'covfefe')

    cmd = ' '.join([
        'java', '-Xmx' + str(xmx) + 'g', '-jar', picard_jar, 'MarkDuplicates',
        'VALIDATION_STRINGENCY=LENIENT', 'INPUT=' + sorted_bam_in,
        'OUTPUT=' + bam_out, 'M=' + m_file
    ])

    try:
        utils.syscall(cmd)
    except Exception as err:
        # Not a bare "except:", so KeyboardInterrupt/SystemExit pass through
        raise Error('Error runnin mark_duplicates: ' + cmd) from err
    finally:
        # Single cleanup point for both the success and the failure path
        shutil.rmtree(tmpdir)
Exemplo n.º 10
0
    def run(self):
        '''Make the input files, run the cortex run_calls script, then tidy up.

        The genome size given to the script is the total_length value that
        pyfastaq reports for <self.ref_dir>/ref.fa.fai.
        '''
        self._make_input_files()
        fai_stats = pyfastaq.tasks.stats_from_fai(
            os.path.join(self.ref_dir, 'ref.fa.fai'))
        genome_size = fai_stats['total_length']

        # (option, value) pairs, in the order they go on the command line
        options = (
            ('--fastaq_index', self.cortex_reads_index),
            ('--auto_cleaning', 'yes'),
            ('--first_kmer', '31'),
            ('--bc', 'yes'),
            ('--pd', 'no'),
            ('--outdir', self.cortex_outdir),
            ('--outvcf', 'cortex'),
            ('--ploidy', '2'),
            ('--stampy_hash', os.path.join(self.ref_dir, 'ref.stampy')),
            ('--stampy_bin', self.stampy_script),
            ('--list_ref_fasta', self.cortex_ref_fofn),
            ('--refbindir', self.ref_dir),
            ('--genome_size', str(genome_size)),
            ('--qthresh', '5'),
            ('--mem_height', str(self.mem_height)),
            ('--mem_width', '100'),
            ('--vcftools_dir', self.vcftools_dir),
            ('--do_union', 'yes'),
            ('--ref', 'CoordinatesAndInCalling'),
            ('--workflow', 'independent'),
            ('--logfile', self.cortex_log),
        )
        words = [self.cortex_run_calls]
        for option, value in options:
            words.extend((option, value))

        utils.syscall(' '.join(words))
        self._tidy_files()
Exemplo n.º 11
0
    def test_map_reads(self):
        """test map_reads"""
        sam_out = "tmp.test_map_reads.sam"
        stats_out = sam_out + ".stats"
        # Make sure the output seen below was written by this run
        if os.path.exists(sam_out):
            os.unlink(sam_out)

        read_map.map_reads(
            os.path.join(data_dir, "ref.fa"),
            os.path.join(data_dir, "reads.1.fq"),
            os.path.join(data_dir, "reads.2.fq"),
            sam_out,
            read_group=("1", "GROUP_NAME"),
        )
        self.assertTrue(os.path.exists(sam_out))

        # Lines containing "primary" are filtered out of the flagstat output
        # before it is compared with the stored expectation
        flagstat_cmd = "samtools flagstat {} | grep -v primary > {}".format(
            sam_out, stats_out)
        utils.syscall(flagstat_cmd)
        self.assertTrue(
            filecmp.cmp(os.path.join(data_dir, "flagstat"),
                        stats_out,
                        shallow=False))

        # The read group given above must show up as an @RG line
        with open(sam_out) as sam_handle:
            has_rg_line = any(
                line == "@RG\tLB:LIB\tID:1\tSM:GROUP_NAME\n"
                for line in sam_handle)
        self.assertTrue(has_rg_line)

        for leftover in (sam_out, stats_out):
            os.unlink(leftover)
Exemplo n.º 12
0
    def test_nextflow_variant_call_using_fastq_input(self):
        """test nextflow_variant_call using fastq input"""
        reads1 = os.path.join(data_dir, "Reads", "reads.1.1.fq.gz")
        reads2 = os.path.join(data_dir, "Reads", "reads.1.2.fq.gz")
        outdir = os.path.abspath(
            "tmp.test_nextflow_variant_call_fastq_input.out")
        # The pipeline is pointed at a copy of the test data, not the original
        tmp_data_dir = "tmp.nextflow_variant_call_fastq_input.data"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     "variant_call.nf")
        nextflow_helper.write_config_file()
        work_dir = "tmp.nextflow_variant_call_fastq_input.work"
        sample_name = "test_sample_name"
        dag_file = "nextflow.variant_call.dag.no_db.pdf"
        # Best-effort removal of any existing DAG file before the run. Only
        # filesystem errors are ignored: a bare "except:" would also swallow
        # KeyboardInterrupt and SystemExit.
        try:
            os.unlink(dag_file)
        except OSError:
            pass

        command = " ".join([
            "nextflow run",
            "--reads_in1",
            reads1,
            "--reads_in2",
            reads2,
            "--output_dir",
            outdir,
            "--ref_dir",
            os.path.join(tmp_data_dir, "Reference"),
            "--sample_name",
            sample_name,
            "--cortex_mem_height 17",
            "--gvcf",
            "--testing",
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        self._files_are_present_and_correct(outdir,
                                            sample_name,
                                            expect_rmdup_bam=True,
                                            expect_ref_check_files=False)
        # --gvcf was given, so both gvcf outputs must exist under minos/
        for gvcf_name in ("gvcf.fasta", "gvcf.vcf"):
            self.assertTrue(
                os.path.exists(os.path.join(outdir, "minos", gvcf_name)))
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)
        shutil.rmtree(tmp_data_dir)
        shutil.rmtree(outdir)
        nextflow_helper.clean_files()
Exemplo n.º 13
0
def _download_run(run_id, outdir):
    '''Fetch one run with enaDataGet (fastq format).

    The command line is printed to stdout before it is executed.

    run_id: identifier given as the last argument of enaDataGet
    outdir: directory given to enaDataGet with -d
    '''
    command_line = ' '.join(('enaDataGet', '-f fastq', '-d', outdir, run_id))
    print(command_line)
    utils.syscall(command_line)
Exemplo n.º 14
0
def _download_sample(sample_id, outdir):
    '''Fetch one sample with enaGroupGet (fastq format).

    The command line is printed to stdout before it is executed.

    sample_id: identifier given as the last argument of enaGroupGet
    outdir: directory given to enaGroupGet with -d
    '''
    command_line = ' '.join(('enaGroupGet', '-f fastq', '-d', outdir, sample_id))
    print(command_line)
    utils.syscall(command_line)
Exemplo n.º 15
0
    def _run_fastqc(outdir, infiles):
        '''Create outdir and run fastqc on infiles, with results extracted into outdir.

        outdir: directory to create; it must not already exist
        infiles: list of input file names

        Raises Error (chained to the OSError) if outdir cannot be created.
        '''
        assert isinstance(infiles, list)
        try:
            os.mkdir(outdir)
        except OSError as err:
            # Only filesystem failures are wrapped; a bare "except:" would
            # also have converted KeyboardInterrupt into Error.
            raise Error('Error mkdir ' + outdir) from err

        command = 'fastqc --threads 1 --extract -o ' + outdir + ' ' + ' '.join(infiles)
        utils.syscall(command)
Exemplo n.º 16
0
    def _run_fastqc(outdir, infiles):
        """Create outdir and run fastqc on infiles, with results extracted into outdir.

        outdir: directory to create; it must not already exist
        infiles: list of input file names

        Raises Exception (chained to the OSError) if outdir cannot be created.
        """
        assert isinstance(infiles, list)
        try:
            os.mkdir(outdir)
        except OSError as err:
            # Only filesystem failures are wrapped; a bare "except:" would
            # also have converted KeyboardInterrupt into a plain Exception.
            raise Exception("Error mkdir " + outdir) from err

        command = "fastqc --threads 1 --extract -o " + outdir + " " + " ".join(
            infiles)
        utils.syscall(command)
Exemplo n.º 17
0
    def _run_mpileup(cls, bam, ref, outfile):
        """Run "samtools mpileup" and redirect its output into outfile.

        The command is assembled as one string (it uses ">" redirection)
        and handed to utils.syscall.

        bam: alignment file given as the positional input
        ref: reference FASTA, passed with -f
        outfile: file that receives the command's stdout
        """
        words = [
            "samtools mpileup --skip-indels -d 500 -t INFO/AD,INFO/ADF,INFO/ADR -C50 -uv"
        ]
        words += ["-f", ref, bam]
        words += [">", outfile]
        utils.syscall(" ".join(words))
Exemplo n.º 18
0
    def _run_mpileup(cls, bam, ref, outfile):
        """Run "bcftools mpileup" and redirect its output into outfile.

        The command is assembled as one string (it uses ">" redirection)
        and handed to utils.syscall.

        bam: alignment file given as the positional input
        ref: reference FASTA, passed with -f
        outfile: file that receives the command's stdout
        """
        words = [
            "bcftools mpileup --skip-indels -d 500 -a INFO/AD,INFO/ADF,INFO/ADR -C50 --output-type v"
        ]
        words += ["-f", ref, bam]
        words += [">", outfile]
        utils.syscall(" ".join(words))
Exemplo n.º 19
0
def upload_file_to_ena_ftp(ini_file, filename, uploaded_name):
    '''Upload filename to ftp://webin.ebi.ac.uk/<uploaded_name> using curl.

    ini_file: config file given to parse_config_file to get username and password
    filename: local file to upload (curl -T)
    uploaded_name: name of the file on the FTP server
    '''
    import shlex

    # paranoid about passwords and running ps? Looks like curl is ok:
    # https://unix.stackexchange.com/questions/385339/how-does-curl-protect-a-password-from-appearing-in-ps-output
    # "wipe the next argument out so that the username:password isn't
    # displayed in the system process list"
    # NOTE(review): if utils.syscall runs this string through a shell, the
    # shell's own command line may still expose the password -- confirm.
    username, password = parse_config_file(ini_file)
    # Every variable piece is quoted so that spaces or shell metacharacters
    # in the password or in a path cannot break or alter the command.
    cmd = ' '.join([
        'curl -T', shlex.quote(filename),
        shlex.quote('ftp://webin.ebi.ac.uk/' + uploaded_name),
        '--user', shlex.quote(username + ':' + password)
    ])
    utils.syscall(cmd)
Exemplo n.º 20
0
def run_trimmomatic(
    reads1,
    reads2,
    out1,
    out2,
    trimmo_root=None,
    adapters="TruSeq3-PE-2.fa",
    minlen=50,
    verbose=0,
    threads=1,
    qual_trim="",
    adapters_included=True,
    quality_encoding="phred33",
):
    """Run Trimmomatic in paired-end (PE) mode on reads1/reads2.

    reads1, reads2: input reads files
    out1, out2: output files for the paired reads; the two unpaired outputs
        are sent to /dev/null
    trimmo_root: directory that contains exactly one .jar file. If None, it
        is taken from the CLOCKWORK_TRIMMO_DIR environment variable, falling
        back to /bioinf-tools/Trimmomatic-0.36
    adapters: adapters file, used in the ILLUMINACLIP step
    minlen: value for the MINLEN step
    verbose: if true, print the command before running it
    threads: value for -threads
    qual_trim: extra text placed between the ILLUMINACLIP and MINLEN steps
    adapters_included: if true, adapters is looked up inside
        <trimmo_root>/adapters; otherwise it is used as given
    quality_encoding: appended to the command with a leading "-"

    Raises Exception if the jar file or the adapters file cannot be found.
    """
    if trimmo_root is None:
        root = os.environ.get("CLOCKWORK_TRIMMO_DIR",
                              "/bioinf-tools/Trimmomatic-0.36")
    else:
        root = trimmo_root
    root = os.path.abspath(root)

    jars = [entry for entry in os.listdir(root) if entry.endswith(".jar")]
    if len(jars) != 1:
        raise Exception(
            'Error finding Trimmoatic jar file in directory "{}". Found {} jar files. Cannot continue'
            .format(root, len(jars)))

    adapters_path = (os.path.join(root, "adapters", adapters)
                     if adapters_included else adapters)
    if not os.path.exists(adapters_path):
        raise Exception('Cannot find adapters file "' + adapters_path + '".')

    java_part = ["java -Xmx1000m -jar", os.path.join(root, jars[0])]
    # Trimmomatic PE output order is paired1, unpaired1, paired2, unpaired2
    io_part = ["PE", "-threads", str(threads), reads1, reads2,
               out1, "/dev/null", out2, "/dev/null"]
    steps_part = [
        "ILLUMINACLIP:" + os.path.abspath(adapters_path) + ":2:30:10",
        qual_trim,
        "MINLEN:" + str(minlen),
        "-" + quality_encoding,
    ]
    command = " ".join(java_part + io_part + steps_part)

    if verbose:
        print("Run trimmomatic:", command)
    utils.syscall(command)
Exemplo n.º 21
0
    def make_index_files(self,
                         fasta_in,
                         genome_is_big,
                         using_cortex,
                         cortex_mem_height=22):
        """Create self.directory and build the reference index files inside it.

        fasta_in: input FASTA file; it is reformatted with seqtk into
            self.ref_fasta
        genome_is_big: if true, bwa index is run with "-a bwtsw"
        using_cortex: if true, also call cortex.make_run_calls_index_files
        cortex_mem_height: passed to cortex as mem_height

        Raises Error if fasta_in does not exist or self.directory cannot be
        created.
        """
        # seqtk just hangs if the input file doesn't exist, so check
        # it first instead
        if not os.path.exists(fasta_in):
            raise Error("File not found: " + fasta_in)

        try:
            os.makedirs(self.directory)
        except OSError as err:
            # Only filesystem failures are wrapped; a bare "except:" would
            # also have converted KeyboardInterrupt into Error.
            raise Error("Error mkdir " + self.directory) from err

        # ensure sequence lines are 60 nt long, and remove comments from
        # header lines
        utils.syscall("seqtk seq -C -l 60 " + fasta_in + " > " +
                      self.ref_fasta)
        utils.syscall("samtools faidx " + self.ref_fasta)

        if genome_is_big:
            utils.syscall("bwa index -a bwtsw " + self.ref_fasta)
        else:
            utils.syscall("bwa index " + self.ref_fasta)

        if using_cortex:
            cortex.make_run_calls_index_files(self.ref_fasta,
                                              self.ref_fasta_prefix,
                                              mem_height=cortex_mem_height)
Exemplo n.º 22
0
    def _make_stats_and_plots(cls, samfile, ref_fasta, outprefix):
        """Run samtools stats and plot-bamstats, then delete .gp and .html files.

        samfile: alignment file given to samtools stats
        ref_fasta: reference given to samtools stats with -r
        outprefix: stats go to <outprefix>.stats and plots use the prefix
            <outprefix>.plot. Afterwards every file matching <outprefix>*
            that ends in .gp or .html is removed.
        """
        stats_file = outprefix + ".stats"
        utils.syscall("samtools stats -r " + ref_fasta + " " + samfile +
                      " > " + stats_file)
        utils.syscall("plot-bamstats -p " + outprefix + ".plot" + " " +
                      stats_file)

        unwanted = [
            path for path in glob.glob(outprefix + "*")
            if path.endswith((".gp", ".html"))
        ]
        for path in unwanted:
            os.unlink(path)
Exemplo n.º 23
0
    def _make_stats_and_plots(cls, samfile, ref_fasta, outprefix):
        '''Run samtools stats and plot-bamstats, then delete .gp and .html files.

        samfile: alignment file given to samtools stats
        ref_fasta: reference given to samtools stats with -r
        outprefix: stats go to <outprefix>.stats and plots use the prefix
            <outprefix>.plot. Afterwards every file matching <outprefix>*
            that ends in .gp or .html is removed.
        '''
        stats_file = outprefix + '.stats'
        utils.syscall('samtools stats -r ' + ref_fasta + ' ' + samfile +
                      ' > ' + stats_file)
        utils.syscall('plot-bamstats -p ' + outprefix + '.plot' + ' ' +
                      stats_file)

        unwanted = [
            path for path in glob.glob(outprefix + '*')
            if path.endswith(('.gp', '.html'))
        ]
        for path in unwanted:
            os.unlink(path)
    def test_nextflow_remove_contam_using_fastq_input(self):
        '''test nextflow_remove_contam using fastq input'''
        reads1 = os.path.join(data_dir, 'Reads', 'reads.1.1.fq.gz')
        reads2 = os.path.join(data_dir, 'Reads', 'reads.1.2.fq.gz')
        outprefix = 'tmp.test_nextflow_remove_contam_using_fastq_input'
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'remove_contam.nf')
        nextflow_helper.write_config_file()
        work_dir = 'tmp.nextflow_remove_contam.work'
        dag_file = 'nextflow.remove_contam.dag.no_db.pdf'
        # Best-effort removal of any existing DAG file before the run. Only
        # filesystem errors are ignored: a bare "except:" would also swallow
        # KeyboardInterrupt and SystemExit.
        try:
            os.unlink(dag_file)
        except OSError:
            pass

        command = ' '.join([
            'nextflow run',
            '--reads_in1', reads1,
            '--reads_in2', reads2,
            '--outprefix', outprefix,
            '--ref_metadata_tsv',
            os.path.join(data_dir, 'Reference', 'remove_contam_metadata.tsv'),
            '--ref_fasta',
            os.path.join(data_dir, 'Reference', 'ref.fa'),
            '--testing',
            '-with-dag', dag_file,
            '-c', nextflow_helper.config_file,
            '-w', work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # Four reads files are expected: contam and remove_contam, mates 1 and 2
        for reads_type in ('contam', 'remove_contam'):
            for i in ('1', '2'):
                filename = outprefix + '.' + reads_type + '.' + i + '.fq.gz'
                self.assertTrue(os.path.exists(filename))
                os.unlink(filename)

        expected_counts_lines = [
            'Name\tIs_contam\tReads\n',
            'contam\t1\t40\n',
            'ref\t0\t132\n',
            'Unmapped\t0\t26\n',
            'Reads_kept_after_remove_contam\t0\t158\n',
        ]

        counts_tsv = outprefix + '.counts.tsv'
        with open(counts_tsv) as f:
            got_counts_lines = f.readlines()
        self.assertEqual(expected_counts_lines, got_counts_lines)
        os.unlink(counts_tsv)

        nextflow_helper.clean_files()
Exemplo n.º 25
0
 def test_map_reads_markdup(self):
     """test map_reads markdup"""
     sam_out = "tmp.test_map_reads.sam"
     stats_out = sam_out + ".stats"
     # Make sure the output seen below was written by this run
     if os.path.exists(sam_out):
         os.unlink(sam_out)

     read_map.map_reads(
         os.path.join(data_dir, "ref.fa"),
         os.path.join(data_dir, "reads.1.fq"),
         os.path.join(data_dir, "reads.2.fq"),
         sam_out,
         markdup=True,
     )
     self.assertTrue(os.path.exists(sam_out))

     # The flagstat summary of the new SAM file must equal the stored one
     utils.syscall(" ".join(["samtools flagstat", sam_out, ">", stats_out]))
     self.assertTrue(
         filecmp.cmp(os.path.join(data_dir, "markdup.flagstat"),
                     stats_out,
                     shallow=False))

     for leftover in (sam_out, stats_out):
         os.unlink(leftover)
Exemplo n.º 26
0
 def test_map_reads_markdup(self):
     '''test map_reads markdup'''
     sam_out = 'tmp.test_map_reads.sam'
     stats_out = sam_out + '.stats'
     # Make sure the output seen below was written by this run
     if os.path.exists(sam_out):
         os.unlink(sam_out)

     read_map.map_reads(
         os.path.join(data_dir, 'ref.fa'),
         os.path.join(data_dir, 'reads.1.fq'),
         os.path.join(data_dir, 'reads.2.fq'),
         sam_out,
         markdup=True,
     )
     self.assertTrue(os.path.exists(sam_out))

     # The flagstat summary of the new SAM file must equal the stored one
     utils.syscall(' '.join(['samtools flagstat', sam_out, '>', stats_out]))
     self.assertTrue(
         filecmp.cmp(os.path.join(data_dir, 'markdup.flagstat'),
                     stats_out,
                     shallow=False))

     for leftover in (sam_out, stats_out):
         os.unlink(leftover)
Exemplo n.º 27
0
    def test_nextflow_qc_using_fastq_input(self):
        """test nextflow_qc using fastq input"""
        reads1 = os.path.join(data_dir, "Reads", "reads.1.1.fq.gz")
        reads2 = os.path.join(data_dir, "Reads", "reads.1.2.fq.gz")
        output_dir = "tmp.test_nextflow_qc_using_fastq_input"
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "qc.nf")
        nextflow_helper.write_config_file()
        work_dir = "tmp.nextflow_qc.work"
        dag_file = "nextflow.qc.dag.no_db.pdf"
        # Best-effort removal of any existing DAG file before the run. Only
        # filesystem errors are ignored: a bare "except:" would also swallow
        # KeyboardInterrupt and SystemExit.
        try:
            os.unlink(dag_file)
        except OSError:
            pass

        command = " ".join([
            "nextflow run",
            "--reads_in1",
            reads1,
            "--reads_in2",
            reads2,
            "--output_dir",
            output_dir,
            "--ref_fasta",
            os.path.join(data_dir, "Reference", "ref.fa"),
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # Each QC method must have produced a non-empty output directory
        self.assertTrue(os.path.exists(output_dir))
        for method in ["fastqc", "samtools_qc"]:
            qc_dir = os.path.join(output_dir, method)
            self.assertTrue(os.path.exists(qc_dir))
            self.assertTrue(len(os.listdir(qc_dir)) >= 1)

        shutil.rmtree(output_dir)
        nextflow_helper.clean_files()
Exemplo n.º 28
0
    def run(self):
        """Make the input files, run the cortex run_calls script, tidy up, then dump kmers.

        The genome size given to the script is the total_length value that
        pyfastaq reports for <self.ref_dir>/ref.fa.fai.
        """
        self._make_input_files()
        fai_stats = pyfastaq.tasks.stats_from_fai(
            os.path.join(self.ref_dir, "ref.fa.fai"))
        genome_size = fai_stats["total_length"]

        # (option, value) pairs, in the order they go on the command line
        options = (
            ("--fastaq_index", self.cortex_reads_index),
            ("--auto_cleaning", "yes"),
            ("--first_kmer", "31"),
            ("--bc", "yes"),
            ("--pd", "no"),
            ("--outdir", self.cortex_outdir),
            ("--outvcf", "cortex"),
            ("--ploidy", "2"),
            ("--stampy_hash", os.path.join(self.ref_dir, "ref.stampy")),
            ("--stampy_bin", self.stampy_script),
            ("--list_ref_fasta", self.cortex_ref_fofn),
            ("--refbindir", self.ref_dir),
            ("--genome_size", str(genome_size)),
            ("--qthresh", "5"),
            ("--mem_height", str(self.mem_height)),
            ("--mem_width", "100"),
            ("--vcftools_dir", self.vcftools_dir),
            ("--do_union", "yes"),
            ("--ref", "CoordinatesAndInCalling"),
            ("--workflow", "independent"),
            ("--logfile", self.cortex_log),
        )
        words = [self.cortex_run_calls]
        for option, value in options:
            words.extend((option, value))

        utils.syscall(" ".join(words))
        self._tidy_files()
        self._run_mccortex_view_kmers()
Exemplo n.º 29
0
    def _run_mccortex_view_kmers(self):
        """Dump kmer counts from the cleaned cortex graph to self.kmer_counts_file.

        Expects exactly one directory whose name starts with "k" inside
        <self.cortex_outdir>/binaries/cleaned, and exactly one .ctx file
        inside that directory. Example of the file being looked for:
        cortex.out/binaries/cleaned/k31/sample_name.kmer31.q5cleaned_1.ctx

        The ctx file is given to "<self.mccortex> view --kmers"; the first
        two fields of each output line are kept (awk) and gzipped into
        self.kmer_counts_file.

        If the expected layout is not found, a message is printed to stderr
        and the method returns without running anything.
        """
        cleaned_dir = os.path.join(self.cortex_outdir, "binaries", "cleaned")
        if not os.path.exists(cleaned_dir):
            print(
                "Cleaned directory not found " + cleaned_dir +
                " ... cannot run mccortex view kmers",
                file=sys.stderr,
            )
            return

        kmer_dirs = [x for x in os.listdir(cleaned_dir) if x.startswith("k")]
        if len(kmer_dirs) != 1:
            print(
                "Error finding kmers directory inside " + cleaned_dir +
                " ... cannot run mccortex view kmers",
                file=sys.stderr,
            )
            # Stop here: with no match kmer_dirs[0] would raise IndexError,
            # and with several there is no way to know which one is right.
            return

        kmer_dir = os.path.join(cleaned_dir, kmer_dirs[0])
        ctx_files = [x for x in os.listdir(kmer_dir) if x.endswith(".ctx")]

        if len(ctx_files) != 1:
            print(
                "Error finding ctx file inside " + kmer_dir +
                " ... cannot run mccortex view kmers",
                file=sys.stderr,
            )
            # Same reasoning as above: do not guess and do not crash.
            return

        ctx_file = os.path.join(kmer_dir, ctx_files[0])
        assert os.path.exists(ctx_file)
        command = " ".join([
            self.mccortex,
            "view",
            "--kmers",
            ctx_file,
            r"""| awk '{print $1,$2}' | gzip -9 > """,
            self.kmer_counts_file,
        ])
        utils.syscall(command)
def run(options):
    """Run the single-sample variant calling pipeline from parsed options.

    options.reads_files must hold an even number of files, alternating
    first-of-pair and second-of-pair. If options.force is set and
    options.outdir already exists, it is deleted before the pipeline starts.

    Attributes read from options: reads_files, force, outdir, ref_dir,
    sample_name, mem_height, debug, keep_bam.

    Raises Exception if the number of reads files is odd.
    """
    import shutil

    if len(options.reads_files) % 2 != 0:
        raise Exception(
            f"Must provide even number of reads files. Got these files: {', '.join(options.reads_files)}"
        )
    # Even positions are the first mates, odd positions the second mates
    reads1 = list(options.reads_files[0::2])
    reads2 = list(options.reads_files[1::2])
    assert len(reads1) == len(reads2)
    if options.force and os.path.exists(options.outdir):
        # Delete through the standard library, not a shell "rm -r" string:
        # an outdir containing spaces or shell metacharacters would have
        # been split or interpreted by the shell.
        if os.path.isdir(options.outdir) and not os.path.islink(options.outdir):
            shutil.rmtree(options.outdir)
        else:
            os.unlink(options.outdir)

    var_call_one_sample_pipeline.run(
        reads1,
        reads2,
        options.ref_dir,
        options.outdir,
        sample_name=options.sample_name,
        cortex_mem_height=options.mem_height,
        debug=options.debug,
        keep_bam=options.keep_bam,
    )