Example #1
def fix_mate_pairs(fq1, fq2, f_suffix="/1", r_suffix="/2"):
    """
    takes two FASTQ files (fq1 and fq2) of paired end sequencing data
    and filters out reads without a mate pair.
    """
    fq1_out = append_stem(fq1, "fixed")
    fq2_out = append_stem(fq2, "fixed")
    fq1_single = append_stem(fq1, "singles")
    fq2_single = append_stem(fq2, "singles")

    if all(map(file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    f_dict = SeqIO.index(fq1, "fastq",
                         key_function=get_read_name_function(f_suffix))
    r_dict = SeqIO.index(fq2, "fastq",
                         key_function=get_read_name_function(r_suffix))

    with open(fq1_out, 'w') as fq1_out_handle, \
         open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, \
         open(fq2_single, 'w') as fq2_single_handle:
        for key in f_dict:
            if key in r_dict:
                fq1_out_handle.write(f_dict.get_raw(key))
                fq2_out_handle.write(r_dict.get_raw(key))
            else:
                fq1_single_handle.write(f_dict.get_raw(key))
        for key in r_dict:
            if key not in f_dict:
                fq2_single_handle.write(r_dict.get_raw(key))

    return [fq1_out, fq2_out]
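
Every snippet on this page calls append_stem from bipy.utils, and Example #1 also assumes a get_read_name_function helper; neither is shown here. The sketch below is a hypothetical re-implementation inferred from how they are used (Example #22's docstring shows "control_1.fastq" becoming "control_1.groom.fastq"), not the project's actual code:

import os

def append_stem(filename, tag):
    # hypothetical sketch: insert ".tag" before the file extension, e.g.
    # append_stem("control_1.fastq", "groom") -> "control_1.groom.fastq"
    stem, ext = os.path.splitext(filename)
    return "".join([stem, ".", tag, ext])

def get_read_name_function(suffix):
    # hypothetical sketch: build a key_function for SeqIO.index that strips
    # the mate suffix ("/1" or "/2") so both reads of a pair share one key
    def read_name(record_id):
        if record_id.endswith(suffix):
            return record_id[:-len(suffix)]
        return record_id
    return read_name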
Example #2
def filter_reads_by_length(fq1, fq2, min_length=30):
    """
    removes reads shorter than min_length from a pair of FASTQ files

    """

    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    # just pick the first one if it can be multiple types
    quality_type = QUALITY_TYPE[DetectFastqFormat.run(fq1)[0]]
    fq1_out = append_stem(fq1, "fixed")
    fq2_out = append_stem(fq2, "fixed")
    fq1_single = append_stem(fq1, "singles")
    fq2_single = append_stem(fq2, "singles")
    if all(map(file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_type)
    fq2_in = SeqIO.parse(fq2, quality_type)

    with open(fq1_out, 'w') as fq1_out_handle, \
         open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, \
         open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if len(fq1_record.seq) >= min_length and len(fq2_record.seq) >= min_length:
                fq1_out_handle.write(fq1_record.format(quality_type))
                fq2_out_handle.write(fq2_record.format(quality_type))
            else:
                if len(fq1_record.seq) >= min_length:
                    fq1_single_handle.write(fq1_record.format(quality_type))
                if len(fq2_record.seq) >= min_length:
                    fq2_single_handle.write(fq2_record.format(quality_type))

    return [fq1_out, fq2_out]
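
QUALITY_TYPE and DetectFastqFormat are project helpers that are not shown on this page. A plausible sketch of the mapping, assuming the detector returns names that correspond to Biopython's FASTQ format variants (the real constant may differ):

# hypothetical mapping for illustration only
QUALITY_TYPE = {"sanger": "fastq-sanger",
                "illumina": "fastq-illumina",
                "solexa": "fastq-solexa"}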
Example #3
 def _load_gemini(self, in_file):
     log_id = os.path.join(self.log,
                           "gemini" + "_" + str(uuid.uuid4()) + ".log")
     sh.gemini.load(self.db,
                    v=in_file,
                    t=self.type,
                    _out=append_stem(log_id, "out"),
                    _err=append_stem(log_id, "err"))
Example #4
File: sam.py Project: roryk/bipy
    def _get_handles(self, in_file):
        assigned_name = append_stem(in_file, "unique")
        ambiguous_name = append_stem(in_file, "ambiguous")

        in_handle = pysam.Samfile(in_file, "rb")
        assigned = pysam.Samfile(assigned_name, "wb", template=in_handle)
        ambiguous = pysam.Samfile(ambiguous_name, "wb", template=in_handle)

        return (in_handle, assigned, ambiguous)
Example #5
    def _get_handles(self, in_file):
        assigned_name = append_stem(in_file, "unique")
        ambiguous_name = append_stem(in_file, "ambiguous")

        in_handle = pysam.Samfile(in_file, "rb")
        assigned = pysam.Samfile(assigned_name, "wb", template=in_handle)
        ambiguous = pysam.Samfile(ambiguous_name, "wb", template=in_handle)

        return (in_handle, assigned, ambiguous)
Example #6
def run_as_pe(first, second, config):
    first_out = append_stem(first, "sickle")
    second_out = append_stem(second, "sickle")
    single_out = append_stem(first, "single")
    quality_type = _get_quality_type(first)
    length_cutoff = _get_length_cutoff(config)
    quality_cutoff = _get_quality_cutoff(config)
    if all(map(os.path.exists, [first_out, second_out, single_out])):
        return (first_out, second_out)
    sh.sickle("pe", f=first, r=second, l=length_cutoff, q=quality_cutoff,
              t=quality_type, o=first_out, p=second_out, s=single_out)
    return (first_out, second_out)
Example #7
    def __call__(self, pair):
        unique_files = [append_stem(x, "unique") for x in pair]
        ambig_files = [append_stem(x, "ambiguous") for x in pair]
        if all(map(os.path.exists, unique_files + ambig_files)):
            return [unique_files, ambig_files]

        handles_0 = self._get_handles(pair[0])
        handles_1 = self._get_handles(pair[1])
        self._process_reads(handles_0, handles_1, None, None)
        [x.close() for x in handles_0]
        [x.close() for x in handles_1]
        return [unique_files, ambig_files]
Example #8
File: sam.py Project: roryk/bipy
    def __call__(self, pair):
        unique_files = [append_stem(x, "unique") for x in pair]
        ambig_files = [append_stem(x, "ambiguous") for x in pair]
        if all(map(os.path.exists, unique_files + ambig_files)):
            return [unique_files, ambig_files]

        handles_0 = self._get_handles(pair[0])
        handles_1 = self._get_handles(pair[1])
        self._process_reads(handles_0, handles_1, None, None)
        [x.close() for x in handles_0]
        [x.close() for x in handles_1]
        return [unique_files, ambig_files]
Example #9
File: sam.py Project: roryk/bipy
def downsample_bam(bam_file, target_reads, out_file=None):
    if out_file is None:
        out_file = append_stem(bam_file, "downsampled")
    percentage_to_sample = _get_percentage_to_sample(bam_file, target_reads)
    sh.samtools.view("-h", "-b", "-s", percentage_to_sample, "-o", out_file,
                     bam_file)
    return out_file
Example #10
def filter_results_by_length(filename, cutoff):
    """ filters the tsv results by the metric that both the overlap
    of the query sequence and the subject sequence must both be
    > cutoff of their length. This might be a little too restrictive though
    """
    def query_match(linedict):
        length = abs(float(linedict["qstart"]) - float(linedict["qend"]))
        if length / float(linedict["qlen"]) > (cutoff / float(100)):
            return True
        else:
            return False

    def subject_match(linedict):
        length = abs(float(linedict["sstart"]) - float(linedict["send"]))
        if length / float(linedict["slen"]) > (cutoff / float(100)):
            return True
        else:
            return False

    out_fname = append_stem(filename, str(cutoff) + "_filt")
    # skip if it already exists
    if os.path.exists(out_fname):
        return out_fname

    with open(filename) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        with open(out_fname, "w") as out_handle:
            writer = csv.writer(out_handle, delimiter="\t")
            writer.writerow(HEADER_FIELDS.split(" "))
            for line in reader:
                linedict = dict(zip(HEADER_FIELDS.split(" "), line))
                if query_match(linedict) and subject_match(linedict):
                    writer.writerow(line)
    return out_fname
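
HEADER_FIELDS is a module-level constant defined elsewhere. A plausible value, assuming the BLAST tabular output was produced with a custom output format that appends qlen and slen (the columns the two filters read); the project's actual definition may differ:

# hypothetical; standard BLAST outfmt 6 columns plus qlen and slen
HEADER_FIELDS = ("qseqid sseqid pident length mismatch gapopen "
                 "qstart qend sstart send evalue bitscore qlen slen")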
Example #11
def fix_RPKM_count_file(in_file, out_file=None):
    """
    splits the RPKM_count file id column into two separate columns;
    one with the id and the other with the feature
    """

    if not out_file:
        out_file = append_stem(in_file, "fixed")

    if file_exists(out_file):
        return out_file

    with open(in_file) as in_handle:
        rpkm = pd.io.parsers.read_table(in_handle)
        rpkm["gene_id"] = rpkm["accession"].apply(lambda x:
                                                  x.rsplit("_", 2)[0])
        rpkm["feature"] = rpkm["accession"].apply(lambda x:
                                                  x.rsplit("_", 2)[1])
        # remove the '#' character since it denotes a comment
        rpkm = rpkm.rename(columns={"#chrom": "chrom"})

    with file_transaction(out_file) as tmp_out_file:
        rpkm.to_csv(tmp_out_file, sep="\t", index=False)

    return out_file
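
The rsplit("_", 2) calls assume accessions of the form <id>_<feature>_<n>. For a hypothetical accession:

>>> "NM_000546_exon_2".rsplit("_", 2)
['NM_000546', 'exon', '2']  # [0] becomes gene_id, [1] becomes feature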
Example #12
def filter_results_by_length(filename, cutoff):
    """ filters the tsv results by the metric that both the overlap
    of the query sequence and the subject sequence must both be
    > cutoff of their length. This might be a little too restrictive though
    """
    def query_match(linedict):
        length = abs(float(linedict["qstart"]) - float(linedict["qend"]))
        if length / float(linedict["qlen"]) > (cutoff / float(100)):
            return True
        else:
            return False

    def subject_match(linedict):
        length = abs(float(linedict["sstart"]) - float(linedict["send"]))
        if length / float(linedict["slen"]) > (cutoff / float(100)):
            return True
        else:
            return False

    out_fname = append_stem(filename, str(cutoff) + "_filt")
    # skip if it already exists
    if os.path.exists(out_fname):
        return out_fname

    with open(filename) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        with open(out_fname, "w") as out_handle:
            writer = csv.writer(out_handle, delimiter="\t")
            writer.writerow(HEADER_FIELDS.split(" "))
            for line in reader:
                linedict = dict(zip(HEADER_FIELDS.split(" "), line))
                if query_match(linedict) and subject_match(linedict):
                    writer.writerow(line)
    return out_fname
Example #13
 def test_length_filter(self):
     paired = self.config["input_paired"]
     out_files = filter_reads_by_length(paired[0], paired[1], min_length=20)
     correct_files = map(self._find_length_filter_correct, out_files)
     self.assertTrue(all(map(filecmp.cmp, correct_files, out_files)))
     map(os.remove, out_files)
     map(os.remove, [append_stem(x, "singles") for x in paired])
Example #14
def downsample_bam(bam_file, target_reads, out_file=None):
    if out_file is None:
        out_file = append_stem(bam_file, "downsampled")
    percentage_to_sample = _get_percentage_to_sample(bam_file, target_reads)
    sh.samtools.view("-h", "-b", "-s", percentage_to_sample, "-o", out_file,
                     bam_file)
    return out_file
Example #15
def hard_clip(in_file, bases=8, right_side=True, quality_format="sanger", out_file=None):
    """
    hard clip a fastq file by removing N bases from each read
    bases is the number of bases to clip
    right_side is True to trim from the right side, False to trim from
    the left

    example: hard_clip(fastq_file, bases=4, right_side=False)

    """
    if right_side:
        logger.info("Hard clipping %d bases from the right side of "
                    "reads in %s." % (bases, in_file))
    else:
        logger.info("Hard clipping %d bases from the left side of "
                    "reads in %s." % (bases, in_file))

    quality_type = QUALITY_TYPE_HARD_TRIM[quality_format]
    if not out_file:
        out_file = append_stem(in_file, "clip")
    if file_exists(out_file):
        return out_file
    in_iterator = SeqIO.parse(in_file, quality_type)

    out_iterator = (_trim_read(record, bases, right_side) for
                    record in in_iterator)
    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            SeqIO.write(out_iterator, out_handle, quality_type)
    return out_file
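
_trim_read is not shown on this page. A minimal sketch of what it might look like, relying on the fact that slicing a Biopython SeqRecord slices its per-base quality annotations along with the sequence (the project's helper may differ):

def _trim_read(record, bases, right_side=True):
    # hypothetical sketch: drop `bases` positions from one end of the read;
    # SeqRecord slicing keeps the phred_quality values in sync
    return record[:-bases] if right_side else record[bases:]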
Example #16
def run(in_file, stage_config, config):
    arguments = [stage_config["program"]]
    arguments += _parse(stage_config)
    results_dir = config["dir"].get("results", None)
    if results_dir:
        out_dir = os.path.join(results_dir, "cutadapt")
        safe_makedir(out_dir)
        out_file = os.path.join(out_dir, os.path.basename(append_stem(in_file, "trimmed")))
    else:
        out_file = append_stem(in_file, "trimmed")

    if file_exists(out_file):
        return out_file

    arguments.extend(["--output", out_file, in_file])
    subprocess.check_call(arguments)
    return out_file
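
_parse turns the stage configuration into cutadapt command-line arguments and is defined elsewhere. A hypothetical sketch, assuming the stage config carries an "options" mapping of flag to value:

def _parse(stage_config):
    # hypothetical sketch only: flatten {"-q": 20, "-a": "ADAPTER"} into
    # ["-q", "20", "-a", "ADAPTER"] for subprocess.check_call
    arguments = []
    for flag, value in stage_config.get("options", {}).items():
        arguments += [str(flag), str(value)]
    return arguments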
Example #17
def only_unmapped(in_file, out_file=None):
    if out_file is None:
        out_file = append_stem(in_file, "unmapped")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        sh.samtools.view(in_file, h=True, S=True, f=4, o=tmp_out_file)
    return out_file
Example #18
File: sam.py Project: roryk/bipy
def only_unmapped(in_file, out_file=None):
    if out_file is None:
        out_file = append_stem(in_file, "unmapped")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        sh.samtools.view(in_file, h=True, S=True, f=4, o=tmp_out_file)
    return out_file
Example #19
def run_with_config(first, second=None, config=None):
    if second:
        out_files = run_as_pe(first, second, config)
        return out_files

    else:
        out_file = run_as_se(first, config)
        return out_file
Example #20
def run(in_file, stage_config, config):
    arguments = [stage_config["program"]]
    arguments += _parse(stage_config)
    results_dir = config["dir"].get("results", None)
    if results_dir:
        out_dir = os.path.join(results_dir, "cutadapt")
        safe_makedir(out_dir)
        out_file = os.path.join(out_dir,
                                os.path.basename(append_stem(in_file,
                                                             "trimmed")))
    else:
        out_file = append_stem(in_file, "trimmed")

    if file_exists(out_file):
        return out_file

    arguments.extend(["--output", out_file, in_file])
    subprocess.check_call(arguments)
    return out_file
Example #21
    def _run_vep(self, in_file):
        out_file = append_stem(in_file, "vep")
        if file_exists(out_file):
            return out_file

        with file_transaction(out_file) as tmp_out_file:
            sh.perl(self.vep, "-i", in_file, "-o", tmp_out_file,
                    species=self.species, _convert_underscore=False,
                    **self.options)

        return out_file
Example #22
    def out_file(self, in_file):
        """
        returns the expected output file name from the in_file

        example: "control_1.fastq" -> "control_1.groom.fastq"

        """
        results_dir = self.config["dirs"].get("results", "results")
        stage_dir = os.path.join(results_dir, self.stage)
        out_file = append_stem(os.path.basename(in_file), "groom")
        return os.path.join(stage_dir, out_file)
Example #23
def run(in_file, end="se", qual="sanger", l="20", out_file=None):
    if not out_file:
        out_file = append_stem(in_file, "trimmed")

    if os.path.exists(out_file):
        return out_file

    cmd = ["sickle", end, "-f", in_file, "-o", out_file,
           "-t", qual, "-l", l, "-q", qual]

    subprocess.check_call(cmd)
    return out_file
Example #24
    def sort_vcf(in_file):
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file
Example #25
    def sort_vcf(in_file):
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file
Example #26
def run(input_file, jellyfish_config, config):
    # run the jellyfish counting, this produces a set of files identified
    # by out_prefix
    out_prefix = _build_output_prefix(input_file, jellyfish_config, config)
    cmd = _build_command(input_file, out_prefix, config)
    subprocess.check_call(cmd)

    # combine the output files into one merged file and return that
    out_file = append_stem(out_prefix, "combined")
    merge_cmd = _build_merge_command(out_prefix, out_file)
    subprocess.check_call(merge_cmd)
    # find all of the output files and merge them into one file
    return out_file
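
_build_output_prefix, _build_command and _build_merge_command are project helpers not shown here. A rough sketch of the merge step, assuming jellyfish count wrote its hash files as <out_prefix>_0, <out_prefix>_1, and so on (the real helper may differ):

import glob

def _build_merge_command(out_prefix, out_file):
    # hypothetical sketch: collect the per-chunk count files and merge them
    counted = sorted(glob.glob(out_prefix + "_*"))
    return ["jellyfish", "merge", "-o", out_file] + counted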
Example #27
def _run_trim(curr_files, config):
    logger.info("Trimming poor quality ends from %s" % (str(curr_files)))
    nfiles = len(curr_files)
    min_length = str(config["stage"]["trim"].get("min_length", 20))
    pair = str(config["stage"]["trim"].get("pair", "se"))
    platform = str(config["stage"]["trim"].get("platform", "sanger"))
    out_dir = os.path.join(config["dir"]["results"], "trimmed")
    safe_makedir(out_dir)
    out_files = [append_stem(os.path.basename(x), "trim") for x in curr_files]
    out_files = [os.path.join(out_dir, x) for x in out_files]
    out_files = view.map(sickle.run, curr_files, [pair] * nfiles,
                         [platform] * nfiles, [min_length] * nfiles, out_files)
    return out_files
Example #28
def run(input_file, jellyfish_config, config):
    # run the jellyfish counting, this produces a set of files identified
    # by out_prefix
    out_prefix = _build_output_prefix(input_file, jellyfish_config, config)
    cmd = _build_command(input_file, out_prefix, config)
    subprocess.check_call(cmd)

    # combine the output files into one merged file and return that
    out_file = append_stem(out_prefix, "combined")
    merge_cmd = _build_merge_command(out_prefix, out_file)
    subprocess.check_call(merge_cmd)
    # find all of the output files and merge them into one file
    return out_file
Example #29
def make_test(in_file, config, lines=1000000):
    """
    take a small subset of the input file for testing. only makes sense for
    text files where lines gives an appropriate number of records; for example,
    FASTQ files should use a multiple of 4.

    """
    results_dir = config["dir"]["results"]
    out_dir = os.path.join(results_dir, "test", "data")
    safe_makedir(out_dir)
    out_file = os.path.join(out_dir, append_stem(os.path.basename(in_file), "test"))
    with open(in_file) as in_handle, open(out_file, "w") as out_handle:
        for line in islice(in_handle, lines):
            out_handle.write(line)

    return out_file
Example #30
    def _run_vep(self, in_file):
        out_file = append_stem(in_file, "vep")
        if file_exists(out_file):
            return out_file

        with file_transaction(out_file) as tmp_out_file:
            sh.perl(self.vep,
                    "-i",
                    in_file,
                    "-o",
                    tmp_out_file,
                    species=self.species,
                    _convert_underscore=False,
                    **self.options)

        return out_file
Example #31
def annotate_table_with_biomart(in_file,
                                join_column,
                                filter_type,
                                organism,
                                out_file=None):
    """
    join_column is the column to perform the lookups on,
    filter_type describes the type of the join_column (see the getBM
    documentation in R for details), and organism is the English name of
    the organism

    example:
    annotate_table_with_biomart(in_file, "id", "ensembl_gene_id",
                                "human")

    """

    if organism not in ORG_TO_ENSEMBL:
        logger.error("organism not supported")
        exit(1)

    logger.info("Annotating %s." % (organism))
    if not out_file:
        out_file = append_stem(in_file, "annotated")
    if os.path.exists(out_file):
        return out_file
    # use biomaRt to annotate the data file
    r = robjects.r
    r.assign('join_column', join_column)
    r.assign('in_file', in_file)
    r.assign('out_file', out_file)
    r.assign('ensembl_gene', ORG_TO_ENSEMBL[organism]["gene_ensembl"])
    r.assign('gene_symbol', ORG_TO_ENSEMBL[organism]["gene_symbol"])
    r.assign('filter_type', filter_type)
    r('''
    library(biomaRt)
    ensembl = useMart("ensembl", dataset = ensembl_gene)
    d = read.table(in_file, header=TRUE)
    a = getBM(attributes=c(filter_type,
                gene_symbol, "description"),
                filters=c(filter_type), values=d[,join_column],
                mart=ensembl)
    m = merge(d, a, by.x=join_column, by.y=filter_type)
    write.table(m, out_file, quote=FALSE, row.names=FALSE, sep="\t")
    ''')

    return out_file
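
ORG_TO_ENSEMBL maps an English organism name to biomaRt identifiers and is defined elsewhere in the module. Two hypothetical entries, shown only to make the lookups above concrete (the real table may differ):

# hypothetical entries for illustration
ORG_TO_ENSEMBL = {
    "human": {"gene_ensembl": "hsapiens_gene_ensembl",
              "gene_symbol": "hgnc_symbol"},
    "mouse": {"gene_ensembl": "mmusculus_gene_ensembl",
              "gene_symbol": "mgi_symbol"},
}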
Example #32
def make_test(in_file, config, lines=1000000):
    """
    take a small subset of the input files for testing. only makes sense for
    text files where lines gives an appopriate number of records, for example,
    FASTQ files should be a multiple of 4.

    """
    results_dir = config["dir"]["results"]
    out_dir = os.path.join(results_dir, "test", "data")
    safe_makedir(out_dir)
    out_file = os.path.join(out_dir,
                            append_stem(os.path.basename(in_file), "test"))
    with open(in_file) as in_handle, open(out_file, "w") as out_handle:
        for line in islice(in_handle, lines):
            out_handle.write(line)

    return out_file
Example #33
def _run_trim(curr_files, config):
    logger.info("Trimming poor quality ends from %s" % (str(curr_files)))
    nfiles = len(curr_files)
    min_length = str(config["stage"]["trim"].get("min_length", 20))
    pair = str(config["stage"]["trim"].get("pair", "se"))
    platform = str(config["stage"]["trim"].get("platform", "sanger"))
    out_dir = os.path.join(config["dir"]["results"], "trimmed")
    safe_makedir(out_dir)
    out_files = [append_stem(os.path.basename(x), "trim") for
                 x in curr_files]
    out_files = [os.path.join(out_dir, x) for x in out_files]
    out_files = view.map(sickle.run, curr_files,
                         [pair] * nfiles,
                         [platform] * nfiles,
                         [min_length] * nfiles,
                         out_files)
    return out_files
Example #34
def filter_single_reads_by_length(in_file, min_length=30):
    """
    removes reads from a fastq file which are below a min_length in bases

    """
    logger.info("Removing reads in %s thare are less than %d bases."
                % (in_file, min_length))
    quality_type = QUALITY_TYPE[DetectFastqFormat.run(in_file)[0]]
    out_file = append_stem(in_file, "fixed")
    if file_exists(out_file):
        return out_file
    in_iterator = SeqIO.parse(in_file, quality_type)
    out_iterator = (record for record in in_iterator if
                    len(record.seq) > min_length)
    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            SeqIO.write(out_iterator, out_handle, quality_type)
    return out_file
Example #35
def annotate_table_with_biomart(in_file, join_column,
                                filter_type, organism, out_file=None):
    """
    join_column is the column to perform the lookups on,
    filter_type describes the type of the join_column (see the getBM
    documentation in R for details), and organism is the English name of
    the organism

    example:
    annotate_table_with_biomart(in_file, "id", "ensembl_gene_id",
                                "human")

    """

    if organism not in ORG_TO_ENSEMBL:
        logger.error("organism not supported")
        exit(1)

    logger.info("Annotating %s." % (organism))
    if not out_file:
        out_file = append_stem(in_file, "annotated")
    if os.path.exists(out_file):
        return out_file
    # use biomaRt to annotate the data file
    r = robjects.r
    r.assign('join_column', join_column)
    r.assign('in_file', in_file)
    r.assign('out_file', out_file)
    r.assign('ensembl_gene', ORG_TO_ENSEMBL[organism]["gene_ensembl"])
    r.assign('gene_symbol', ORG_TO_ENSEMBL[organism]["gene_symbol"])
    r.assign('filter_type', filter_type)
    r('''
    library(biomaRt)
    ensembl = useMart("ensembl", dataset = ensembl_gene)
    d = read.table(in_file, header=TRUE)
    a = getBM(attributes=c(filter_type,
                gene_symbol, "description"),
                filters=c(filter_type), values=d[,join_column],
                mart=ensembl)
    m = merge(d, a, by.x=join_column, by.y=filter_type)
    write.table(m, out_file, quote=FALSE, row.names=FALSE, sep="\t")
    ''')

    return out_file
Example #36
 def _load_gemini(self, in_file):
     log_id = os.path.join(self.log,
                           "gemini" + "_" + str(uuid.uuid4()) + ".log")
     sh.gemini.load(self.db, v=in_file, t=self.type,
                    _out=append_stem(log_id, "out"),
                    _err=append_stem(log_id, "err"))
Example #37
 def chr_out(chrom):
     out_file = os.path.join(break_dir, append_stem(in_file, chrom))
     out_file = replace_suffix(out_file, "vcf")
     return out_file
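
replace_suffix is another bipy.utils helper that never appears on this page. A hypothetical sketch consistent with how it is called here and in Examples #40 and #44 (swapping the file extension for a new suffix):

import os

def replace_suffix(filename, suffix):
    # hypothetical sketch: "sample.bam" with "counts.bed" -> "sample.counts.bed"
    stem, _ = os.path.splitext(filename)
    return ".".join([stem, suffix])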
Example #38
def _build_output_file(input_file, suffix, config):
    base = os.path.basename(input_file)
    return os.path.join(config["dir"]["results"], "tagdust",
                        append_stem(base, suffix))
Example #39
 def chr_out(chrom):
     out_file = os.path.join(break_dir, append_stem(in_file, chrom))
     out_file = replace_suffix(out_file, "vcf")
     return out_file
Example #40
def main(config_file):

    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)

    # after the cluster is up, import the view
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [
                append_stem(os.path.basename(x), "trim") for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10 the length of the
            # minimum read to keep
            out_files = view.map(sickle.run, curr_files, ["se"] * nlen,
                                 ["sanger"] * nlen, [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [
                append_stem(os.path.basename(input_file[0]), "filt")
                for input_file in tagdust_outputs
            ]
            out_dir = os.path.join(config["dir"]["results"], "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [
                filter_seqio(x[0], length_filter, y, "fastq")
                for x, y in zip(tagdust_outputs, out_files)
            ]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [
                    reduce(count_ends,
                           apply_seqio(x, end_function, kind="fastq"), {})
                    for x in curr_files
                ]
                df = pd.DataFrame(counts, index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align, curr_files,
                                    [pair_file] * nlen, [ref_file] * nlen,
                                    [out_base] * nlen, [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles), bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and setting
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)

            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s" %
                        (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [
                os.path.join(out_dir, os.path.basename(x)) for x in out_files
            ]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf), out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs, [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [
                    os.path.join(out_dir, os.path.basename(x))
                    for x in out_files
                ]
                out_files = [
                    "_vs_".join([x, os.path.basename(bedbase)])
                    for x in out_files
                ]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf), out_files)
                count_files = [replace_suffix(x, "stats") for x in out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()
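
main() is driven entirely by the YAML file it loads at the top. A hypothetical, minimal configuration covering only the keys this function reads directly, written here as the dict yaml.load() would return (real configs will contain more, including whatever setup_logging() and start_cluster() expect; all values below are placeholders):

config = {
    "input": ["data/sample_1.fastq", "data/sample_2.fastq"],  # placeholder paths
    "dir": {"results": "results", "data": "data"},
    "run": ["fastqc", "trim", "tagdust", "filter_length", "novoalign"],
    "stage": {
        "fastqc": {},
        "trim": {"min_length": 20},
        "tagdust": {},
        "filter_length": {"min_length": 10, "max_length": 50},
        "novoalign": {},
    },
    "genome": {"file": "ref/genome.fa"},       # used by the novoalign stage
    "program": {"picard": "/path/to/picard"},  # used to build the BroadRunner
    "annotation": "ref/annotation.gtf",        # used by the coverage stage
}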
Example #41
File: sam.py Project: roryk/bipy
def coordinate_sort_sam(in_file, config, out_file=None):
    if out_file is None:
        out_file = append_stem(in_file, "sorted")
    picard = BroadRunner(config["program"]["picard"], None, {"algorithm": {}})
    picardrun.picard_sort(picard, in_file, sort_order="coordinate",
                          out_file=out_file)
    return out_file
Example #42
File: sam.py Project: roryk/bipy
def sortsam(in_file, out_file=None):
    if out_file is None:
        out_file = append_stem(in_file, "sorted")
    with file_transaction(out_file) as tmp_out_file:
        sort = sh.sort.bake(s=True, k="1,1", _out=tmp_out_file)
        sort(in_file)
    return out_file
Example #43
def run_as_se(first, config):
    first_out = append_stem(first, "sickle")
    pass
Example #44
def main(config_file):

    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)

    # after the cluster is up, import the view
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim") for
                         x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10 the length of the
            # minimum read to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]),
                         "filt") for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")
            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and setting
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)

            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for
                         x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir,
                                      os.path.basename(x)) for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf),
                     out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)
        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x)) for x in
                             out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf),
                               out_files)
                count_files = [replace_suffix(x, "stats") for x in
                               out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()
Example #45
 def out_file(self, in_file):
     results_dir = self.config["dir"].get("results", "results")
     out_dir = os.path.join(results_dir, self.stage)
     out_base = append_stem(os.path.basename(in_file), "clip")
     return os.path.join(out_dir, out_base)
Example #46
def _build_output_file(input_file, suffix, config):
    base = os.path.basename(input_file)
    return os.path.join(config["dir"]["results"], "tagdust",
                        append_stem(base, suffix))
Example #47
def coordinate_sort_sam(in_file, config, out_file=None):
    if out_file is None:
        out_file = append_stem(in_file, "sorted")
    picard = BroadRunner(config["program"]["picard"], None, {"algorithm": {}})
    picardrun.picard_sort(picard, in_file, sort_order="coordinate",
                          out_file=out_file)
    return out_file
Example #48
def sortsam(in_file, out_file=None):
    if out_file is None:
        out_file = append_stem(in_file, "sorted")
    with file_transaction(out_file) as tmp_out_file:
        sort = sh.sort.bake(s=True, k="1,1", _out=tmp_out_file)
        sort(in_file)
    return out_file