Example #1
def main(st_data_file, names_map, output_file):

    if not fileOk(st_data_file) or not fileOk(names_map):
        sys.stderr.write("Error, input file not present or invalid format\n")
        sys.exit(1)

    # load a map of ensembl id -> gene name
    genes_map = dict()
    with open(names_map, "r") as map_file:
        for line in map_file.readlines():
            tokens = line.split()
            assert (len(tokens) == 2)
            genes_map[tokens[0]] = tokens[1]

    # Iterate over the gene IDs to get the gene names
    st_data = pd.read_table(st_data_file, sep="\t", header=0, index_col=0)
    adjustedList = list()
    for gene in st_data.columns:
        try:
            gene = genes_map[gene]
        except KeyError:
            sys.stdout.write(
                "Warning, {} was not found in the MAP file\n".format(gene))
        adjustedList.append(gene)

    # Update the table with the gene names
    st_data.columns = adjustedList

    # Write table to file
    st_data.to_csv(output_file, sep="\t")
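A minimal usage sketch for the snippet above, assuming it lives in a module where pandas (as pd), sys and the fileOk helper are already imported; the file names are hypothetical:

# Rename the gene columns of a tab-separated ST counts matrix using a
# two-column "ENSEMBL_ID <tab> GENE_NAME" map, writing the result to a new file.
main("stdata_counts.tsv", "ensembl_to_name.tsv", "stdata_counts_named.tsv")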
Example #2
def sortSamFile(input_sam, outputFolder=None):
    """
    Sorts a SAM/BAM file containing mapped reads by read name (pysam sort -n).
    :param input_sam: path to a SAM/BAM file with mapped reads
    :param outputFolder: the location where to place the output file (optional)
    :type input_sam: str
    :type outputFolder: str
    :returns: the path to the sorted file
    :raises: RuntimeError
    """

    logger = logging.getLogger("STPipeline")

    sam_type = os.path.splitext(input_sam)[1].lower()
    output_sam = 'mapped_filtered_sorted{}'.format(sam_type)

    if outputFolder is not None and os.path.isdir(outputFolder):
        output_sam = os.path.join(outputFolder, output_sam)

    pysam.sort("-n", "-o", output_sam, "-O", sam_type, "-T", output_sam,
               input_sam)

    if not fileOk(output_sam):
        error = "Error sorting the SAM/BAM file.\n" \
        "Output file is not present\n {}".format(output_sam)
        logger.error(error)
        raise RuntimeError(error)

    return output_sam
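A usage sketch with a hypothetical path; pysam, os, logging and the fileOk helper are assumed to be available at module level:

# Sort a filtered BAM produced earlier in the pipeline; the sorted file is
# placed inside the given output folder and its path is returned.
sorted_bam = sortSamFile("mapped_filtered.bam", outputFolder="/tmp/st_run")
# sorted_bam -> "/tmp/st_run/mapped_filtered_sorted.bam"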
Example #3
def annotateReads(mappedReads, gtfFile, outputFile, outputDiscarded, mode,
                  strandness, htseq_no_ambiguous, include_non_annotated,
                  temp_dir, threads):
    """
    Annotates a file with mapped reads (BAM) using a modified 
    version of the htseq-count tool. It writes the annotated records to a file.
    It assumes the input reads (BAM) are single end and do not contain
    multiple alignments or un-annotated reads.
    :param mappedReads: path to a BAM file with mapped reads sorted by coordinate
    :param gtfFile: path to an annotation file in GTF format
    :param outputFile: where to write the annotated records (BAM)
    :param outputDiscarded: where to write the non-annotated records (BAM)
    :param mode: htseq-count overlapping mode (see htseq-count documentation)
    :param strandness: the type of strandness to use when annotating (yes, no or reverse)
    :param htseq_no_ambiguous: True if we want to discard ambiguous annotations
    :param include_non_annotated: True if we want to include
    non-annotated reads as __no_feature in the output
    :param temp_dir: path to the folder where to put the created files
    :param threads: the number of CPU cores to use
    :type mappedReads: str
    :type gtfFile: str
    :type outputFile: str
    :type outputDiscarded: str
    :type mode: str
    :type strandness: str
    :type htseq_no_ambiguous: bool
    :type include_non_annotated: bool
    :type temp_dir: str
    :type threads: int
    :raises: RuntimeError, ValueError
    """

    logger = logging.getLogger("STPipeline")

    if not os.path.isfile(mappedReads):
        error = "Error during annotation, input file not present {}\n".format(
            mappedReads)
        logger.error(error)
        raise RuntimeError(error)

    # split the BAM file (returns a dictionary PART -> FILENAME)
    partial_bam_files = split_bam(mappedReads, temp_dir, threads)

    # the split input files
    sub_input_files = [
        bamfile_name for _, bamfile_name in partial_bam_files.iteritems()
    ]

    # the names of the split output files
    sub_out_files = [
        os.path.join(temp_dir, "tmp_annotated_part_{}.bam".format(part))
        for part in partial_bam_files.keys()
    ]

    # the names of the split output discarded files
    sub_out_discarded_files = [
        os.path.join(temp_dir,
                     "tmp_annotated_discarded_part_{}.bam".format(part))
        if outputDiscarded else None for part in partial_bam_files.keys()
    ]
    # counter of annotated reads
    annotated = 0
    discarded_annotations = 0
    try:
        # create an annotation subprocess for each partial bam
        subprocesses = [
            multiprocessing.Process(
                target=count_reads_in_features,
                args=(
                    input,
                    gtfFile,
                    "bam",  # Type BAM for files
                    "pos",  # Order pos or name
                    strandness,  # Strand yes/no/reverse
                    mode,  # intersection_nonempty, union, intersection_strict
                    "exon",  # feature type in GFF
                    "gene_id",  # gene_id or gene_name
                    True,  # Quiet mode
                    0,  # Min quality score
                    output,
                    include_non_annotated,
                    htseq_no_ambiguous,
                    discarded)) for input, output, discarded in
            izip(sub_input_files, sub_out_files, sub_out_discarded_files)
        ]

        # start work in child processes
        for p in subprocesses:
            p.start()

        # wait for all processes to finish
        while True in [p.is_alive() for p in subprocesses]:
            time.sleep(0.1)

        # join the children
        for p in subprocesses:
            assert p.exitcode == 0, "Error during annotation: subprocess error."
            p.join()

        # merge the annotated bam files and summarize the stats
        annotated = merge_bam(outputFile, sub_out_files)
        if outputDiscarded:
            discarded_annotations = merge_bam(outputDiscarded,
                                              sub_out_discarded_files)

    except Exception as e:
        error = "Error during annotation. HTSEQ execution failed\n"
        logger.error(error)
        raise e
    finally:
        # remove the sub-files
        for input, output, discarded in izip(sub_input_files, sub_out_files,
                                             sub_out_discarded_files):
            if os.path.isfile(input):
                os.remove(input)
            if os.path.isfile(output):
                os.remove(output)
            if discarded and os.path.isfile(discarded):
                os.remove(discarded)

    if not fileOk(outputFile) or annotated == 0:
        error = "Error during annotation. Output file not present {}\n".format(
            outputFile)
        logger.error(error)
        raise RuntimeError(error)

    logger.info("Annotated reads: {}".format(annotated))
    qa_stats.reads_after_annotation = annotated
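A usage sketch with hypothetical paths, assuming the STPipeline helpers used above (split_bam, merge_bam, count_reads_in_features, qa_stats) are importable:

# Annotate a coordinate-sorted BAM against a GTF using 4 worker processes.
annotateReads(mappedReads="mapped_sorted.bam",
              gtfFile="annotation.gtf",
              outputFile="annotated.bam",
              outputDiscarded="annotated_discarded.bam",
              mode="intersection_nonempty",
              strandness="yes",
              htseq_no_ambiguous=False,
              include_non_annotated=False,
              temp_dir="/tmp/st_run",
              threads=4)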
Example #4
def filterMappedReads(mapped_reads,
                      hash_reads,
                      file_output,
                      file_output_discarded=None):
    """ 
    Iterates a BAM file containing mapped reads 
    and discards reads that are not demultiplexed with TaggD
    (for that a dictionary with the read name as key and the X,Y and UMI)
    as values must be given.
    This function will add the X,Y coordinates and UMI as extra tags
    to the output BAM file. 
    It assumes all the reads are aligned (do not contain un-aligned reads),
    filtered for minimum read length and unique (no multimap).
    Demultiplexed reads with the extra tags (x,y and UMI) will be written
    to a file.
    :param mapped_reads: path to a BAM file containing the START alignments
    :param hash_reads: a dictionary of read_names to (x,y,umi) SAM tags
    :param file_output: the path to the file where to write the records
    :param file_output_discarded: the path to the file where to write discarded files
    :type mapped_reads: str
    :type hash_reads: dict
    :type file_output: str
    :type file_output_discarded: str
    :raises: RuntimeError
    """
    logger = logging.getLogger("STPipeline")

    if not os.path.isfile(mapped_reads):
        error = "Error, input file not present {}\n".format(mapped_reads)
        logger.error(error)
        raise RuntimeError(error)

    # Create output files handlers
    flag_read = "rb"
    flag_write = "wb"
    infile = pysam.AlignmentFile(mapped_reads, flag_read)
    outfile = pysam.AlignmentFile(file_output, flag_write, template=infile)
    if file_output_discarded is not None:
        outfile_discarded = pysam.AlignmentFile(file_output_discarded,
                                                flag_write,
                                                template=infile)
    # Create some counters and loop the records
    dropped_barcode = 0
    present = 0
    for sam_record in infile.fetch(until_eof=True):
        present += 1
        discard_read = False
        # Add the UMI and X,Y coordinates as extra SAM tags
        try:
            # Using as key the read name as it was used to generate the dictionary
            # In order to save memory we truncate the read
            # name to only keep the unique part (lane, tile, x_pos, y_pos)
            # TODO this procedure is specific to only Illumina technology
            key = "".join(sam_record.query_name.split(":")[-4:])
            for tag in hash_reads[key]:
                # TODO add error check here
                tag_tokens = tag.split(":")
                sam_record.set_tag(tag_tokens[0], tag_tokens[2], tag_tokens[1])
            outfile.write(sam_record)
        except KeyError:
            dropped_barcode += 1
            if file_output_discarded is not None:
                outfile_discarded.write(sam_record)

    # Close handlers
    infile.close()
    outfile.close()
    if file_output_discarded is not None:
        outfile_discarded.close()

    if not fileOk(file_output):
        error = "Error filtering mapped reads.\n" \
        "Output file is not present\n {}".format(file_output)
        logger.error(error)
        raise RuntimeError(error)

    logger.info("Finish processing aligned reads (R2):" \
                "\nPresent: {0}" \
                "\nDropped - barcode: {1}".format(present,dropped_barcode))

    # Update QA object
    qa_stats.reads_after_mapping = present
    qa_stats.reads_after_demultiplexing = (present - dropped_barcode)
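A usage sketch showing the shape of the hash_reads dictionary the function expects: keys are read names truncated to their last four ':'-separated fields (joined without separators), values are "TAG:TYPE:VALUE" strings. The tag names and paths below are purely illustrative:

# One entry per demultiplexed read; here B1/B2 carry the array coordinates
# and B3 the UMI (hypothetical tag names).
hash_reads = {
    "11011210045607": ["B1:Z:100", "B2:Z:50", "B3:Z:ACGTACGT"],
}
filterMappedReads("mapped.bam",
                  hash_reads,
                  "mapped_filtered.bam",
                  file_output_discarded="mapped_discarded.bam")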
Example #5
def barcodeDemultiplexing(reads,
                          idFile,
                          mismatches,
                          kmer,
                          start_positon,
                          over_hang,
                          taggd_metric,
                          taggd_multiple_hits_keep_one,
                          taggd_trim_sequences,
                          cores,
                          outputFilePrefix,
                          keep_discarded_files=False):
    """ 
    This functions performs a demultiplexing using Taggd. Input reads will be filtered
    out looking at their barcodes. Only the ones that contain a barcode
    that is matched in the barcodes files will be kept.
    Information about the barcode and the array coordinates will be added
    to the output file. 
    :param reads: a file in FASTQ/BAM format containing reads with barcodes
    :param idFile: a tab delimited file (BARCODE - X - Y) containing all the barcodes
    :param mismatches: the number of allowed mismatches
    :param kmer: the kmer length
    :param start_positon: the start position of the barcode
    :param over_hang: the number of bases to allow for overhang
    :param taggd_metric: the distance metric algorithm (Subglobal, Levensthein or Hamming)
    :param taggd_multiple_hits_keep_one: when True keep one random hit when multiple candidates
    :param taggd_trim_sequences: coordinates to trim in the barcode
    :param outputFilePrefix: location and prefix for the output files
    :param keep_discarded_files: if True files with the non demultiplexed reads will be generated
    :type reads: str
    :type idFile: str
    :type mismatches: int
    :type kmer: int
    :type start_positon: int
    :type over_hang: int
    :type taggd_metric: str
    :type taggd_multiple_hits_keep_one: bool
    :type taggd_trim_sequences: list
    :type outputFilePrefix: str
    :type keep_discarded_files: bool
    :raises: RuntimeError,ValueError,OSError,CalledProcessError
    """
    logger = logging.getLogger("STPipeline")

    if not os.path.isfile(reads):
        error = "Error, input file not present {}\n".format(reads)
        logger.error(error)
        raise RuntimeError(error)

    # Taggd options
    #--metric (subglobal (default) , Levenshtein or Hamming)
    #--slider-increment (space between kmer searches, 0 is default = kmer length)
    #--seed
    #--overhang additional flanking bases around read barcode to allow
    #--estimate-min-edit-distance if set, estimate the min edit distance among true barcodes
    #--no-offset-speedup turns off speed up,
    #  it might yield more hits (exactly as findIndexes)
    #--homopolymer-filter if set excludes reads where barcode
    #  contains a homopolymer of the given length (0 no filter), default 8

    if taggd_metric == "Hamming": over_hang = 0
    args = ['taggd_demultiplex.py']

    if taggd_trim_sequences is not None:
        args.append("--trim-sequences")
        for pos in taggd_trim_sequences:
            args.append(pos)

    args += [
        "--max-edit-distance",
        mismatches,
        "--k",
        kmer,
        "--barcode-tag",
        "B0",  # if input if BAM we tell taggd what tag contains the barcode
        "--start-position",
        start_positon,
        "--homopolymer-filter",
        0,
        "--subprocesses",
        cores,
        "--metric",
        taggd_metric,
        "--overhang",
        over_hang
    ]  #,
    #'--use-samtools-merge'] # Could be added to merge using samtools instead of pysam WIP on taggd

    if taggd_multiple_hits_keep_one:
        args.append("--multiple-hits-keep-one")

    if not keep_discarded_files:
        args.append("--no-unmatched-output")
        args.append("--no-ambiguous-output")
        args.append("--no-results-output")

    args += [idFile, reads, outputFilePrefix]

    try:
        proc = subprocess.Popen([str(i) for i in args],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                close_fds=True,
                                shell=False)
        (stdout, errmsg) = proc.communicate()
    except ValueError as e:
        logger.error("Error demultiplexing with TAGGD\n Incorrect arguments.")
        raise e
    except OSError as e:
        logger.error("Error demultiplexing with TAGGD\n Executable not found.")
        raise e
    except CalledProcessError as e:
        logger.error(
            "Error demultiplexing with TAGGD\n Program returned error.")
        raise e

    # We know the output file from the prefix and suffix
    outputFile = "{}_matched{}".format(outputFilePrefix,
                                       os.path.splitext(reads)[1].lower())
    if not fileOk(outputFile):
        error = "Error demultiplexing with TAGGD.\n" \
        "Output file is not present {}\n{}\n".format(outputFile, errmsg)
        logger.error(error)
        raise RuntimeError(error)

    if len(errmsg) > 0:
        logger.warning("Taggd has generated error messages during " \
                       "demultiplexing.\n{}\n".format(errmsg))

    # TODO there must be a cleaner way to get the stats from the output file
    procOut = stdout.split("\n")
    logger.info("Demultiplexing Mapping stats:")
    for line in procOut:
        if line.find("Total reads:") != -1:
            logger.info(str(line))
        if line.find("Total reads written:") != -1:
            logger.info(str(line))
            qa_stats.reads_after_demultiplexing = line.split()[-1]
        if line.find("Perfect Matches:") != -1:
            logger.info(str(line))
        if line.find("Imperfect Matches") != -1:
            logger.info(str(line))
        if line.find("Ambiguous matches:") != -1:
            logger.info(str(line))
        if line.find("Non-unique ambiguous matches:") != -1:
            logger.info(str(line))
        if line.find("Unmatched:") != -1:
            logger.info(str(line))
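A usage sketch with hypothetical inputs; taggd_demultiplex.py must be on the PATH since the function shells out to it:

# Demultiplex barcoded reads against a BARCODE-X-Y id file, allowing 2 mismatches.
barcodeDemultiplexing("reads.fastq",
                      "barcode_ids.tsv",
                      mismatches=2,
                      kmer=6,
                      start_positon=0,
                      over_hang=2,
                      taggd_metric="Subglobal",
                      taggd_multiple_hits_keep_one=True,
                      taggd_trim_sequences=None,
                      cores=4,
                      outputFilePrefix="/tmp/st_run/demuxed",
                      keep_discarded_files=False)
# Matched reads are expected in /tmp/st_run/demuxed_matched.fastq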
Example #6
def alignReads(reverse_reads, ref_map, outputFile, annotation, outputFolder,
               trimReverse, invTrimReverse, cores, min_intron_size,
               max_intron_size, disable_multimap, diable_softclipping,
               twopassMode, min_length, include_non_mapped,
               star_genome_loading, star_sort_mem_limit):
    """
    This function will perform a sequence alignment using STAR.
    Mapped and unmapped reads are written to the paths given as
    parameters. It needs the path of the STAR genome index.
    It can optionally run the 2-pass mode.
    The annotation file is needed to use the on-the-fly mode.
    :param reverse_reads: file containing reverse reads in BAM format
    :param ref_map: a path to the genome/transcriptome STAR index
    :param outputFile: the name of the SAM/BAM output file to write the alignments to
    :param annotation: the annotation file in GTF
    :param outputFolder: the path of the output folder
    :param trimReverse: the number of bases to trim in the reverse reads (from 5')
    :param invTrimReverse: number of bases to trim from the 3'
    :param cores: the number of cores to use to speed up the alignment
    :param min_intron_size: min allowed intron size when spanning splice junctions
    :param max_intron_size: max allowed intron size when spanning splice junctions
    :param disable_multimap: if True no multiple alignments will be allowed
    :param diable_softclipping: if True no local alignment is allowed
    :param twopassMode: True to use the 2-pass mode
    :param min_length: the min allowed read length (mapped bases)
    :param include_non_mapped: True to include un-aligned reads in the output
    :param star_genome_loading: The type of genome sharing for STAR
    :param star_sort_mem_limit: The BAM sort memory limit for STAR
    :type reverse_reads: str
    :type ref_map: str
    :type outputFile: str
    :type annotation: str
    :type outputFolder: str
    :type trimReverse: int
    :type invTrimReverse: int
    :type cores: int
    :type min_intron_size: int
    :type max_intron_size: int
    :type disable_multimap: bool
    :type diable_softclipping: bool
    :type twopassMode: bool
    :type min_length: int
    :type include_non_mapped: bool
    :type star_genome_loading: str
    :type star_sort_mem_limit: int
    :raises: RuntimeError,ValueError,OSError,CalledProcessError
    """
    logger = logging.getLogger("STPipeline")

    if not os.path.isfile(reverse_reads):
        error = "Error mapping with STAR, input file not present {}\n".format(
            reverse_reads)
        logger.error(error)
        raise RuntimeError(error)

    # STAR has predefined output names for the files
    tmpOutputFile = "Aligned.sortedByCoord.out.bam"
    tmpOutputFileDiscarded = "Unmapped.out.mate1"
    log_std = "Log.std.out"
    log = "Log.out"
    log_sj = "SJ.out.tab"
    log_final = "Log.final.out"
    log_progress = "Log.progress.out"

    if outputFolder is not None and os.path.isdir(outputFolder):
        tmpOutputFile = os.path.join(outputFolder, tmpOutputFile)
        tmpOutputFileDiscarded = os.path.join(outputFolder,
                                              tmpOutputFileDiscarded)
        log_std = os.path.join(outputFolder, log_std)
        log = os.path.join(outputFolder, log)
        log_sj = os.path.join(outputFolder, log_sj)
        log_final = os.path.join(outputFolder, log_final)
        log_progress = os.path.join(outputFolder, log_progress)

    multi_map_number = 1 if disable_multimap else 20  # 10 is the STAR default
    alignment_mode = "EndToEnd" if diable_softclipping else "Local"

    flags = [
        "--clip3pNbases",
        invTrimReverse,
        "--clip5pNbases",
        trimReverse,
        "--runThreadN",
        str(max(cores, 1)),
        "--outFilterType",
        "Normal",
        "--outSAMtype",
        "BAM",
        "SortedByCoordinate",
        "--alignEndsType",
        alignment_mode,
        "--outSAMorder",
        "Paired",
        "--outSAMprimaryFlag",
        "OneBestScore",
        "--outFilterMultimapNmax",
        multi_map_number,
        "--alignIntronMin",
        min_intron_size,
        "--alignIntronMax",
        max_intron_size,
        "--outFilterMatchNmin",
        min_length,
        "--outSAMmultNmax",
        1,
        "--outMultimapperOrder",
        "Random",
        "--readMatesLengthsIn",
        "NotEqual",
        "--outFilterMismatchNoverLmax",
        0.1,  ## (0.3 default)
        "--genomeLoad",
        star_genome_loading,
        "--limitBAMsortRAM",
        star_sort_mem_limit,
        "--readFilesType",
        "SAM",
        "SE",  # Input in BAM format
        "--readFilesCommand",
        "samtools",
        "view",
        "-h"
    ]

    if twopassMode:
        flags += ["--twopassMode", "Basic"]

    if annotation is not None:
        flags += ["--sjdbGTFfile", annotation]

    if include_non_mapped:
        flags += ["--outSAMunmapped", "Within"]
    else:
        flags += ["--outSAMunmapped", "None"]

    args = [
        "STAR", "--genomeDir", ref_map, "--readFilesIn", reverse_reads,
        # guard against a missing outputFolder by defaulting to the working directory
        "--outFileNamePrefix",
        (outputFolder + os.sep) if outputFolder is not None else "." + os.sep
    ]
    args += flags

    try:
        proc = subprocess.Popen([str(i) for i in args],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                close_fds=True,
                                shell=False)
        (stdout, errmsg) = proc.communicate()
    except ValueError as e:
        logger.error("Error mapping with STAR\n Incorrect arguments.")
        raise e
    except OSError as e:
        logger.error("Error mapping with STAR\n Executable not found.")
        raise e
    except CalledProcessError as e:
        logger.error("Error mapping with STAR\n Program returned error.")
        raise e

    if not fileOk(tmpOutputFile):
        error = "Error mapping with STAR.\n" \
        "Output file not present {}\n{}\n".format(tmpOutputFile, errmsg)
        logger.error(error)
        raise RuntimeError(error)

    if len(errmsg) > 0:
        logger.warning(
            "STAR has generated error messages during mapping.\n{}\n".format(
                errmsg))

    # Rename output files.
    shutil.move(tmpOutputFile, outputFile)

    # Remove temp files from STAR
    if os.path.isfile(log_std): os.remove(log_std)
    if os.path.isfile(log): os.remove(log)
    if os.path.isfile(log_progress): os.remove(log_progress)
    if os.path.isfile(log_sj): os.remove(log_sj)

    if not os.path.isfile(log_final):
        logger.warning("Log output file from STAR is not present")
    else:
        logger.info("Mapping stats: ")
        logger.info(
            "Mapping stats are computed from all the paired reads present in the raw files"
        )
        uniquely_mapped = 0
        multiple_mapped = 0
        # Parse log file from STAR to get stats
        # TODO find a cleaner way to do this
        with open(log_final, "r") as star_log:
            for line in star_log.readlines():
                if line.find("Uniquely mapped reads %") != -1 \
                or line.find("Uniquely mapped reads number") != -1 \
                or line.find("Number of reads mapped to multiple loci") != -1 \
                or line.find("% of reads mapped to multiple loci") != -1 \
                or line.find("% of reads unmapped: too short") != -1:
                    logger.info(str(line).rstrip())
                # Some duplicated code here; TODO refactor
                if line.find("Uniquely mapped reads number") != -1:
                    uniquely_mapped = int(str(line).rstrip().split()[-1])
                if line.find("Number of reads mapped to multiple loci") != -1:
                    multiple_mapped = int(str(line).rstrip().split()[-1])
            logger.info("Total mapped reads: {}".format(uniquely_mapped +
                                                        multiple_mapped))

    # Remove log file
    if os.path.isfile(log_final): os.remove(log_final)
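A usage sketch with hypothetical paths; STAR and samtools must be on the PATH, and the argument values below are illustrative rather than recommended settings:

# Map quality-trimmed R2 reads (BAM) against a STAR genome index.
alignReads(reverse_reads="R2_quality_trimmed.bam",
           ref_map="/data/star_index",
           outputFile="mapped.bam",
           annotation="annotation.gtf",
           outputFolder="/tmp/st_run",
           trimReverse=0,
           invTrimReverse=0,
           cores=8,
           min_intron_size=20,
           max_intron_size=1000000,
           disable_multimap=False,
           diable_softclipping=False,
           twopassMode=False,
           min_length=20,
           include_non_mapped=False,
           star_genome_loading="NoSharedMemory",
           star_sort_mem_limit=0)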
Example #9
def filterInputReads(fw, 
                     rw,
                     out_fw,
                     out_rw,
                     out_rw_discarded=None,
                     barcode_start=0, 
                     barcode_length=18,
                     filter_AT_content=90,
                     molecular_barcodes=False, 
                     mc_start=18, 
                     mc_end=27,
                     min_qual=20, 
                     min_length=28,
                     polyA_min_distance=0, 
                     polyT_min_distance=0, 
                     polyG_min_distance=0, 
                     polyC_min_distance=0,
                     qual64=False,
                     umi_filter=False,
                     umi_filter_template="WSNNWSNNV",
                     umi_quality_bases=3):
    """
    This function does four things (all done in one loop for performance reasons)
      - It performs a sanity check (forward and reverse reads same length and order)
      - It performs a BWA quality trimming discarding very short reads
      - It removes adaptors from the reads (optional)
      - It performs a sanity check on the UMI (optional)
    Reads that do not pass the filters are discarded (both R1 and R2)
    :param fw: the fastq file with the forward reads
    :param rw: the fastq file with the reverse reads
    :param out_fw: the name of the output file for the forward reads
    :param out_rw: the name of the output file for the reverse reads
    :param out_rw_discarded: the name of the output file for discarded reverse reads
    :param barcode_start: the base index where the barcode sequence starts
    :param barcode_length: the number of bases present in the barcodes
    :param filter_AT_content: % of A and T bases a reverse read must have to be discarded
    :param molecular_barcodes: if True the forward reads contain molecular barcodes (UMIs)
    :param mc_start: the start position of the molecular barcodes if any
    :param mc_end: the end position of the molecular barcodes if any
    :param min_qual: the min quality value to use when trimming
    :param min_length: the min valid length for a read after trimming
    :param polyA_min_distance: if >0 we remove PolyA adaptors from the reads
    :param polyT_min_distance: if >0 we remove PolyT adaptors from the reads
    :param polyG_min_distance: if >0 we remove PolyG adaptors from the reads
    :param polyC_min_distance: if >0 we remove PolyC adaptors from the reads
    :param qual64: True if qualities are in phred64 format
    :param umi_filter: performs a UMI quality filter when True
    :param umi_filter_template: the template to use for the UMI filter
    :param umi_quality_bases: the number of low quality bases allowed in a UMI
    """
    logger = logging.getLogger("STPipeline")
    
    if not os.path.isfile(fw) or not os.path.isfile(rw):
        error = "Error, input file/s not present {}\n{}\n".format(fw,rw)
        logger.error(error)
        raise RuntimeError(error)
    
    # Check if discarded files must be written out 
    keep_discarded_files = out_rw_discarded is not None
    
    # Create output file writers
    out_rw_handle = safeOpenFile(out_rw, 'w')
    out_rw_writer = writefq(out_rw_handle)
    out_fw_handle = safeOpenFile(out_fw, 'w')
    out_fw_writer = writefq(out_fw_handle)
    if keep_discarded_files:
        out_rw_handle_discarded = safeOpenFile(out_rw_discarded, 'w')
        out_rw_writer_discarded = writefq(out_rw_handle_discarded)
    
    # Some counters
    total_reads = 0
    dropped_rw = 0
    dropped_umi = 0
    dropped_umi_template = 0
    dropped_AT = 0
    dropped_adaptor = 0
    
    # Build fake sequence adaptors with the parameters given
    adaptorA = "".join("A" for k in xrange(polyA_min_distance))
    adaptorT = "".join("T" for k in xrange(polyT_min_distance))
    adaptorG = "".join("G" for k in xrange(polyG_min_distance))
    adaptorC = "".join("C" for k in xrange(polyC_min_distance))
    do_adaptorA = polyA_min_distance > 0
    do_adaptorT = polyT_min_distance > 0
    do_adaptorG = polyG_min_distance > 0
    do_adaptorC = polyC_min_distance > 0
    
    # Quality format
    phred = 64 if qual64 else 33
    
    # Check if barcode settings are correct
    iscorrect_mc = molecular_barcodes
    if mc_start < (barcode_start + barcode_length) \
    or mc_end < (barcode_start + barcode_length):
        logger.warning("Your UMI sequences overlap with the barcodes sequences")
        iscorrect_mc = False
    
    # Open fastq files with the fastq parser
    fw_file = safeOpenFile(fw, "rU")
    rw_file = safeOpenFile(rw, "rU")
    for (header_fw, sequence_fw, quality_fw), (header_rv, sequence_rv, quality_rv) \
    in izip(readfq(fw_file), readfq(rw_file)):
        
        if not sequence_fw or not sequence_rv:
            error = "Error doing quality trimming checks of raw reads.\n" \
            "The input files {},{} are not of the same length".format(fw,rw)
            logger.error(error)
            fw_file.close()
            rw_file.close()
            out_rw_handle.flush()
            out_rw_writer.close()
            out_fw_handle.flush()
            out_fw_writer.close()
            if keep_discarded_files:
                out_rw_handle_discarded.flush()
                out_rw_writer_discarded.close()
            raise RuntimeError(error)
        
        if header_fw.split()[0] != header_rv.split()[0]:
            logger.warning("Pair reads found with different " \
                           "names {} and {}".format(header_fw,header_rv))
            
        # Increase reads counter
        total_reads += 1
        discard_read = False
        
        # If we want to check for UMI quality and the UMI is incorrect
        # then we discard the reads
        if iscorrect_mc and umi_filter \
        and not check_umi_template(sequence_fw[mc_start:mc_end], umi_filter_template):
            dropped_umi_template += 1
            discard_read = True
        
        # Check if the UMI has any low quality base
        if not discard_read and iscorrect_mc and \
        len([b for b in quality_fw[mc_start:mc_end] if (ord(b) - phred) < min_qual]) > umi_quality_bases:
            dropped_umi += 1
            discard_read = True
                                                            
        # If reverse read has a high AT content discard...
        if not discard_read and \
        ((sequence_rv.count("A") + sequence_rv.count("T")) / len(sequence_rv)) * 100 >= filter_AT_content:
            dropped_AT += 1
            discard_read = True
        
        # Store the original reads to write them to the discarded output if applies
        if keep_discarded_files:    
            orig_sequence_rv = sequence_rv
            orig_quality_rv = quality_rv 
            
        if not discard_read:  
            # if indicated we remove the artifacts PolyA from reverse reads
            if do_adaptorA: 
                sequence_rv, quality_rv = removeAdaptor(sequence_rv, quality_rv, adaptorA) 
            # if indicated we remove the artifacts PolyT from reverse reads
            if do_adaptorT: 
                sequence_rv, quality_rv = removeAdaptor(sequence_rv, quality_rv, adaptorT) 
            # if indicated we remove the artifacts PolyG from reverse reads
            if do_adaptorG: 
                sequence_rv, quality_rv = removeAdaptor(sequence_rv, quality_rv, adaptorG) 
            # if indicated we remove the artifacts PolyC from reverse reads
            if do_adaptorC: 
                sequence_rv, quality_rv = removeAdaptor(sequence_rv, quality_rv, adaptorC)
            # Check if the read is smaller than the minimum after removing artifacts   
            if len(sequence_rv) < min_length:
                dropped_adaptor += 1
                discard_read = True
            else:              
                # Trim reverse read (will return None if length of trimmed sequence is lower than min)
                sequence_rv, quality_rv = trim_quality(sequence_rv, quality_rv, 
                                                       min_qual, min_length, phred)
                if not sequence_rv or not quality_rv:
                    discard_read = True

                
        # Write reverse read to output
        if not discard_read:
            out_rw_writer.send((header_rv, sequence_rv, quality_rv))
            out_fw_writer.send((header_fw, sequence_fw, quality_fw))
        else:
            dropped_rw += 1  
            if keep_discarded_files:
                out_rw_writer_discarded.send((header_rv, orig_sequence_rv, orig_quality_rv))
    
    fw_file.close()
    rw_file.close()
    out_rw_handle.flush()
    out_rw_writer.close()
    out_fw_handle.flush()
    out_fw_writer.close()
    if keep_discarded_files:
        out_rw_handle_discarded.flush()
        out_rw_writer_discarded.close()
        
    # Write info to the log
    logger.info("Trimming stats total reads (pair): {}".format(total_reads))
    logger.info("Trimming stats {} reads have been dropped!".format(dropped_rw)) 
    perc2 = '{percent:.2%}'.format(percent=float(dropped_rw) / float(total_reads))
    logger.info("Trimming stats you just lost about {} of your data".format(perc2))
    logger.info("Trimming stats reads remaining: {}".format(total_reads - dropped_rw))
    logger.info("Trimming stats dropped pairs due to incorrect UMI: {}".format(dropped_umi_template))
    logger.info("Trimming stats dropped pairs due to low quality UMI: {}".format(dropped_umi))
    logger.info("Trimming stats dropped pairs due to high AT content: {}".format(dropped_AT))
    logger.info("Trimming stats dropped pairs due to presence of artifacts: {}".format(dropped_adaptor))
    
    # Check that output file was written ok
    if not fileOk(out_rw):
        error = "Error doing quality trimming checks of raw reads." \
        "\nOutput file not present {}\n".format(out_rw)
        logger.error(error)
        raise RuntimeError(error)
    
    # Adding stats to QA Stats object
    qa_stats.input_reads_forward = total_reads
    qa_stats.input_reads_reverse = total_reads
    qa_stats.reads_after_trimming_forward = total_reads
    qa_stats.reads_after_trimming_reverse = total_reads - dropped_rw
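A usage sketch for the filterInputReads variant above (hypothetical FASTQ names; readfq, writefq, safeOpenFile, trim_quality, removeAdaptor, check_umi_template and qa_stats are assumed to come from the surrounding STPipeline modules):

# Quality-trim a read pair whose R1 carries an 18bp spatial barcode followed
# by a 9bp UMI, discarding pairs that fail the filters.
filterInputReads("R1.fastq", "R2.fastq",
                 "R1_trimmed.fastq", "R2_trimmed.fastq",
                 out_rw_discarded="R2_discarded.fastq",
                 barcode_start=0, barcode_length=18,
                 molecular_barcodes=True, mc_start=18, mc_end=27,
                 min_qual=20, min_length=28,
                 polyA_min_distance=10, polyT_min_distance=10)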
def annotateReads(mappedReads, 
                  gtfFile,
                  outputFile,
                  outputDiscarded,
                  mode,
                  strandness,
                  htseq_no_ambiguous, 
                  include_non_annotated,
                  temp_dir,
                  threads):
    """
    Annotates a file with mapped reads (BAM) using a modified 
    version of the htseq-count tool. It writes the annotated records to a file.
    It assumes the input reads (BAM) are single end and do not contain
    multiple alignments or un-annotated reads.
    :param mappedReads: path to a BAM file with mapped reads sorted by coordinate
    :param gtfFile: path to an annotation file in GTF format
    :param outputFile: where to write the annotated records (BAM)
    :param outputDiscarded: where to write the non-annotated records (BAM)
    :param mode: htseq-count overlapping mode (see htseq-count documentation)
    :param strandness: the type of strandness to use when annotating (yes, no or reverse)
    :param htseq_no_ambiguous: True if we want to discard ambiguous annotations
    :param include_non_annotated: True if we want to include
    non-annotated reads as __no_feature in the output
    :param temp_dir: path to the folder where to put the created files
    :param threads: the number of CPU cores to use
    :type mappedReads: str
    :type gtfFile: str
    :type outputFile: str
    :type outputDiscarded: str
    :type mode: str
    :type strandness: str
    :type htseq_no_ambiguous: bool
    :type include_non_annotated: bool
    :type temp_dir: str
    :type threads: int
    :raises: RuntimeError, ValueError
    """
    
    logger = logging.getLogger("STPipeline")
    
    if not os.path.isfile(mappedReads):
        error = "Error during annotation, input file not present {}\n".format(mappedReads)
        logger.error(error)
        raise RuntimeError(error)
    
    try:
        annotated = count_reads_in_features(mappedReads,
                                            gtfFile,
                                            "bam", # Type BAM for filesz
                                            strandness, # Strand yes/no/reverse
                                            mode, # intersection_nonempty, union, intersection_strict
                                            "exon", # feature type in GFF
                                            "gene_id", # gene_id or gene_name
                                            True, # Quiet mode
                                            0, # Min quality score
                                            outputFile,
                                            include_non_annotated,
                                            htseq_no_ambiguous,
                                            outputDiscarded)
    except Exception as e:
        error = "Error during annotation. HTSEQ execution failed\n"
        logger.error(error)
        raise e
    
    if not fileOk(outputFile) or annotated == 0:
        error = "Error during annotation. Output file not present {}\n".format(outputFile)
        logger.error(error)
        raise RuntimeError(error)
    
    logger.info("Annotated reads: {}".format(annotated))
    qa_stats.reads_after_annotation = annotated
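Unlike the multiprocessing variant in Example #3, this version hands the whole BAM file to a single count_reads_in_features call, so temp_dir and threads are accepted but not used for splitting. A minimal call with hypothetical paths:

# Single-process annotation of a coordinate-sorted BAM against a GTF file.
annotateReads("mapped_sorted.bam", "annotation.gtf",
              "annotated.bam", "annotated_discarded.bam",
              "intersection_nonempty", "yes",
              False, True,
              "/tmp/st_run", 1)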
def filterInputReads(fw, rv, out_fw, out_rw, out_rw_discarded,
                     filter_AT_content, filter_GC_content, umi_start, umi_end,
                     min_qual, min_length, polyA_min_distance,
                     polyT_min_distance, polyG_min_distance,
                     polyC_min_distance, polyN_min_distance, qual64,
                     umi_filter, umi_filter_template, umi_quality_bases,
                     adaptor_missmatches):
    """
    This function does few things (all done in one loop for performance reasons)
      - It performs a sanity check (forward and reverse reads same length and order)
      - It performs a BWA-based quality trimming discarding very short reads
      - It removes adaptors from the reads (optional)
      - It checks for AT and GC content (optional)
      - It performs a sanity check on the UMI (optional)
    Reads that do not pass the filters are discarded (both R1 and R2)
    :param fw: the fastq file with the forward reads
    :param rv: the fastq file with the reverse reads
    :param out_fw: the name of the output file for the forward reads
    :param out_rw: the name of the output file for the reverse reads
    :param out_rw_discarded: the name of the output file for discarded reverse reads
    :param filter_AT_content: % of A and T bases a read2 must have to be discarded
    :param filter_GC_content: % of G and C bases a read2 must have to be discarded
    :param umi_start: the start position of the UMI
    :param umi_end: the end position of the UMI
    :param min_qual: the min quality value to use in the trimming
    :param min_length: the min valid length for a read after trimming
    :param polyA_min_distance: if >5 remove PolyA adaptors from the reads
    :param polyT_min_distance: if >5 remove PolyT adaptors from the reads
    :param polyG_min_distance: if >5 remove PolyG adaptors from the reads
    :param polyC_min_distance: if >5 remove PolyC adaptors from the reads
    :param polyN_min_distance: if >5 remove PolyN adaptors from the reads
    :param qual64: True if qualities are in phred64 format
    :param umi_filter: performs a UMI quality template filter when True
    :param umi_filter_template: the template to use for the UMI filter
    :param umi_quality_bases: the number of low quality bases allowed in a UMI
    :param adaptor_missmatches: number of mismatches allowed when removing adaptors
    """
    logger = logging.getLogger("STPipeline")

    if not os.path.isfile(fw) or not os.path.isfile(rv):
        error = "Error doing quality trimming, input file/s not present {}\n{}\n".format(
            fw, rv)
        logger.error(error)
        raise RuntimeError(error)

    # Check if discarded files must be written out
    keep_discarded_files = out_rw_discarded is not None

    # Create output file writers
    out_rv_handle = safeOpenFile(out_rw, 'w')
    out_rv_writer = writefq(out_rv_handle)
    out_fw_handle = safeOpenFile(out_fw, 'w')
    out_fw_writer = writefq(out_fw_handle)
    if keep_discarded_files:
        out_rv_handle_discarded = safeOpenFile(out_rw_discarded, 'w')
        out_rv_writer_discarded = writefq(out_rv_handle_discarded)

    # Some counters
    total_reads = 0
    dropped_rv = 0
    dropped_umi = 0
    dropped_umi_template = 0
    dropped_AT = 0
    dropped_GC = 0
    dropped_adaptor = 0

    # Build fake sequence adaptors with the parameters given
    adaptorA = "".join("A" for k in xrange(polyA_min_distance))
    adaptorT = "".join("T" for k in xrange(polyT_min_distance))
    adaptorG = "".join("G" for k in xrange(polyG_min_distance))
    adaptorC = "".join("C" for k in xrange(polyC_min_distance))
    adaptorN = "".join("N" for k in xrange(polyN_min_distance))

    # Not recommended to do adaptor trimming for adaptors smaller than 5
    do_adaptorA = polyA_min_distance >= 5
    do_adaptorT = polyT_min_distance >= 5
    do_adaptorG = polyG_min_distance >= 5
    do_adaptorC = polyC_min_distance >= 5
    do_adaptorN = polyN_min_distance >= 5
    do_AT_filter = filter_AT_content > 0
    do_GC_filter = filter_GC_content > 0

    # Quality format
    phred = 64 if qual64 else 33

    # Open fastq files with the fastq parser
    fw_file = safeOpenFile(fw, "rU")
    rv_file = safeOpenFile(rv, "rU")
    for (header_fw, sequence_fw, quality_fw), (header_rv, sequence_rv, quality_rv) \
    in izip(readfq(fw_file), readfq(rv_file)):

        if not sequence_fw or not sequence_rv:
            error = "Error doing quality trimming, Checks of raw reads.\n" \
            "The input files {},{} are not of the same length".format(fw,rv)
            logger.error(error)
            break

        if header_fw.split()[0] != header_rv.split()[0]:
            logger.warning("Pair reads found with different " \
                           "names {} and {}".format(header_fw,header_rv))

        # Increase reads counter
        total_reads += 1
        discard_read = False

        # If we want to check for UMI quality and the UMI is incorrect
        # then we discard the reads
        if umi_filter \
        and not check_umi_template(sequence_fw[umi_start:umi_end], umi_filter_template):
            dropped_umi_template += 1
            discard_read = True

        # Check if the UMI has many low quality bases
        if not discard_read and (umi_end - umi_start) >= umi_quality_bases and \
        len([b for b in quality_fw[umi_start:umi_end] if (ord(b) - phred) < min_qual]) > umi_quality_bases:
            dropped_umi += 1
            discard_read = True

        # If reverse read has a high AT content discard...
        if not discard_read and do_AT_filter and \
        100.0 * (sequence_rv.count("A") + sequence_rv.count("T")) / len(sequence_rv) >= filter_AT_content:
            dropped_AT += 1
            discard_read = True

        # If reverse read has a high GC content discard...
        if not discard_read and do_GC_filter and \
        100.0 * (sequence_rv.count("G") + sequence_rv.count("C")) / len(sequence_rv) >= filter_GC_content:
            dropped_GC += 1
            discard_read = True

        # Store the original reads to write them to the discarded output if applies
        if keep_discarded_files:
            orig_sequence_rv = sequence_rv
            orig_quality_rv = quality_rv

        if not discard_read:
            # if indicated we remove the artifacts PolyA from reverse reads
            if do_adaptorA and len(sequence_rv) > min_length:
                sequence_rv, quality_rv = removeAdaptor(
                    sequence_rv, quality_rv, adaptorA, adaptor_missmatches)
            # if indicated we remove the artifacts PolyT from reverse reads
            if do_adaptorT and len(sequence_rv) > min_length:
                sequence_rv, quality_rv = removeAdaptor(
                    sequence_rv, quality_rv, adaptorT, adaptor_missmatches)
            # if indicated we remove the artifacts PolyG from reverse reads
            if do_adaptorG and len(sequence_rv) > min_length:
                sequence_rv, quality_rv = removeAdaptor(
                    sequence_rv, quality_rv, adaptorG, adaptor_missmatches)
            # if indicated we remove the artifacts PolyC from reverse reads
            if do_adaptorC and len(sequence_rv) > min_length:
                sequence_rv, quality_rv = removeAdaptor(
                    sequence_rv, quality_rv, adaptorC, adaptor_missmatches)

            # if indicated we remove the artifacts PolyN from reverse reads
            if do_adaptorN and len(sequence_rv) > min_length:
                sequence_rv, quality_rv = removeAdaptor(
                    sequence_rv, quality_rv, adaptorN, adaptor_missmatches)

            # Check if the read is smaller than the minimum after removing artifacts
            if len(sequence_rv) < min_length:
                dropped_adaptor += 1
                discard_read = True
            else:
                # Trim reverse read (will return None if length of trimmed sequence is less than min_length)
                sequence_rv, quality_rv = trim_quality(sequence_rv, quality_rv,
                                                       min_qual, min_length,
                                                       phred)
                if not sequence_rv or not quality_rv:
                    discard_read = True

        # Write reverse read to output
        if not discard_read:
            out_rv_writer.send((header_rv, sequence_rv, quality_rv))
            out_fw_writer.send((header_fw, sequence_fw, quality_fw))
        else:
            dropped_rv += 1
            if keep_discarded_files:
                out_rv_writer_discarded.send(
                    (header_rv, orig_sequence_rv, orig_quality_rv))

    fw_file.close()
    rv_file.close()
    out_rv_handle.flush()
    out_rv_handle.close()
    out_rv_writer.close()
    out_fw_handle.flush()
    out_fw_handle.close()
    out_fw_writer.close()
    if keep_discarded_files:
        out_rv_handle_discarded.flush()
        out_rv_handle_discarded.close()
        out_rv_writer_discarded.close()

    # Write info to the log
    logger.info("Trimming stats total reads (pair): {}".format(total_reads))
    logger.info(
        "Trimming stats {} reads have been dropped!".format(dropped_rv))
    perc2 = '{percent:.2%}'.format(percent=float(dropped_rv) /
                                   float(total_reads))
    logger.info(
        "Trimming stats you just lost about {} of your data".format(perc2))
    logger.info("Trimming stats reads remaining: {}".format(total_reads -
                                                            dropped_rv))
    logger.info("Trimming stats dropped pairs due to incorrect UMI: {}".format(
        dropped_umi_template))
    logger.info(
        "Trimming stats dropped pairs due to low quality UMI: {}".format(
            dropped_umi))
    logger.info(
        "Trimming stats dropped pairs due to high AT content: {}".format(
            dropped_AT))
    logger.info(
        "Trimming stats dropped pairs due to high GC content: {}".format(
            dropped_GC))
    logger.info(
        "Trimming stats dropped pairs due to presence of artifacts: {}".format(
            dropped_adaptor))

    # Check that output file was written ok
    if not fileOk(out_rw):
        error = "Error doing quality trimming checks of raw reads." \
        "\nOutput file not present {}\n".format(out_rw)
        logger.error(error)
        raise RuntimeError(error)

    # Adding stats to QA Stats object
    qa_stats.input_reads_forward = total_reads
    qa_stats.input_reads_reverse = total_reads
    qa_stats.reads_after_trimming_forward = (total_reads - dropped_rv)
    qa_stats.reads_after_trimming_reverse = (total_reads - dropped_rv)
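
# A minimal sketch, not part of the snippet above, illustrating two of the
# checks used in the trimming loop: decoding qualities with the selected Phred
# offset and computing the A/T content of a read as a percentage. The read and
# quality strings are made-up values.
example_quality = "IIIII#####"
phred_offset = 33                       # 64 when qual64 is True
example_scores = [ord(b) - phred_offset for b in example_quality]
print(example_scores)                   # [40, 40, 40, 40, 40, 2, 2, 2, 2, 2]

example_sequence = "AATTAATTGC"
at_content = 100.0 * (example_sequence.count("A") +
                      example_sequence.count("T")) / len(example_sequence)
print(at_content)                       # 80.0 -> dropped when filter_AT_content <= 80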
Example #12
0
def alignReads(reverse_reads, 
               ref_map,
               outputFile,
               outputFileDiscarded=None,
               outputFolder=None,
               trimReverse=0, 
               cores=4,
               min_intron_size=20,
               max_intron_size=1000000,
               max_gap_size=1000000,
               use_splice_junctions=True,
               disable_multimap=False,
               disable_softclipping=False,
               invTrimReverse=0,
               sortedBAMOutput=True):
    """
    This function will perform a sequence alignment using STAR.
    Mapped and unmapped reads are written to the paths given as
    parameters. It needs the path of the STAR genome index. 
    :param reverse_reads: file containing reverse reads in fastq format (Illumina paired-end)
    :param ref_map: a path to the genome/transcriptome STAR index
    :param outputFile: the name of the SAM/BAM output file to write the alignments to
    :param outputFileDiscarded: the name of the SAM/BAM output file to write discarded alignments
    :param outputFolder: the path of the output folder
    :param trimReverse: the number of bases to trim in the reverse reads (to not map)
    :param cores: the number of cores to use to speed up the alignment
    :param min_intron_size: min allowed intron size when spanning splice junctions
    :param max_intron_size: max allowed intron size when spanning splice junctions
    :param max_gap_size: max allowed gap between pairs
    :param use_splice_junctions: whether to use splice-aware alignment or not
    :param disable_multimap: if True no multiple alignments will be allowed
    :param disable_softclipping: if True no local alignment (soft-clipping) is allowed
    :param invTrimReverse: number of bases to trim in the 5' of the read2
    :param sortedBAMOutput: True if the BAM output must be sorted
    :type reverse_reads: str
    :type ref_map: str
    :type outputFile: str
    :type outputFileDiscarded: str
    :type outputFolder: str
    :type trimReverse: int
    :type cores: int
    :type min_intron_size: int
    :type max_intron_size: int
    :type max_gap_size: int
    :type use_splice_junctions: bool
    :type disable_multimap: bool
    :type disable_softclipping: bool
    :type invTrimReverse: int
    :type sortedBAMOutput: bool
    :raises: RuntimeError,ValueError,OSError,CalledProcessError
    """
    logger = logging.getLogger("STPipeline")
    
    if not os.path.isfile(reverse_reads):
        error = "Error, input file not present {}\n".format(reverse_reads)
        logger.error(error)
        raise RuntimeError(error)
    
    # STAR has predefined output names for the files
    tmpOutputFile = "Aligned.sortedByCoord.out.bam" if sortedBAMOutput else "Aligned.out.bam"
    tmpOutputFileDiscarded = "Unmapped.out.mate1"
    log_std = "Log.std.out"
    log = "Log.out"
    log_sj = "SJ.out.tab"
    log_final = "Log.final.out"
    log_progress = "Log.progress.out"
    
    if outputFolder is not None and os.path.isdir(outputFolder):
        tmpOutputFile = os.path.join(outputFolder, tmpOutputFile)
        tmpOutputFileDiscarded = os.path.join(outputFolder, tmpOutputFileDiscarded)
        log_std = os.path.join(outputFolder, log_std)
        log = os.path.join(outputFolder, log)
        log_sj = os.path.join(outputFolder, log_sj)
        log_final = os.path.join(outputFolder, log_final)
        log_progress = os.path.join(outputFolder, log_progress)
    
    # Options
    # outFilterType(BySJout) this will keep only reads
    #     that contain junctions present in SJ.out.tab
    # outSAMorder(Paired) one mate after the other
    # outSAMprimaryFlag(OneBestScore) only one alignment with the best score is primary
    # outFilterMultimapNmax 
    #     read alignments will be output only if the read maps fewer than this value
    # outFilterMismatchNmax = alignment will be output only if 
    #     it has fewer mismatches than this value
    # outFilterMismatchNoverLmax = alignment will be output only if 
    #     its ratio of mismatches to *mapped* length is less than this value
    # alignIntronMin minimum intron size: genomic gap is considered intron 
    #     if its length>=alignIntronMin, otherwise it is considered Deletion
    # alignIntronMax maximum intron size, if 0, max intron size will be 
    #     determined by (2 to the power of winBinNbits)*winAnchorDistNbins
    # alignMatesGapMax maximum gap between two mates, if 0, max intron gap will 
    #     be determined by (2 to the power of winBinNbits)*winAnchorDistNbins
    # alignEndsType Local: standard local alignment with soft-clipping allowed; EndToEnd:
    #     force end-to-end read alignment, do not soft-clip
    # chimSegmentMin if >0 switches on detection of chimeric (fusion) alignments
    # --outMultimapperOrder Random: multimapped alignments are written in random order
    # --outSAMmultNmax number of multimapped alignments to output;
    #     set to 1 to exclude multiple mappings (default 10)
    
    multi_map_number = 1 if disable_multimap else 10
    alignment_mode = "EndToEnd" if diable_softclipping else "Local"
    sjdb_overhang = 100 if use_splice_juntions else 0
    bam_sorting = "SortedByCoordinate" if sortedBAMOutput else "Unsorted"
    
    core_flags = ["--runThreadN", str(max(cores, 1))]
    trim_flags = ["--clip5pNbases", trimReverse, 
                  "--clip3pNbases", invTrimReverse]
    io_flags   = ["--outFilterType", "Normal", 
                  "--outSAMtype", "BAM", bam_sorting, 
                  "--alignEndsType", alignment_mode, 
                  "--outSAMunmapped", "None", # unmapped reads not included in main output
                  "--outSAMorder", "Paired",    
                  "--outSAMprimaryFlag", "OneBestScore", 
                  "--outFilterMultimapNmax", multi_map_number, 
                  "--alignSJoverhangMin", 5, # default is 5
                  "--alignSJDBoverhangMin", 3, # default is 3
                  "--sjdbOverhang", sjdb_overhang, # 0 to not use splice junction database
                  "--outFilterMismatchNmax", 10, # large number switches it off (default 10)
                  "--outFilterMismatchNoverLmax", 0.3, # default is 0.3
                  "--alignIntronMin", min_intron_size,
                  "--alignIntronMax", max_intron_size, 
                  "--alignMatesGapMax", max_gap_size,
                  "--winBinNbits", 16,
                  "--winAnchorDistNbins", 9,
                  "--chimSegmentMin", 0,
                  "--readMatesLengthsIn", "NotEqual",
                  "--genomeLoad", "NoSharedMemory"] # Options to use share remove can be given here 

    args = ['STAR']
    args += trim_flags
    args += core_flags
    args += io_flags
    args += ["--genomeDir", ref_map,
             "--readFilesIn", reverse_reads,
             "--outFileNamePrefix", outputFolder + os.sep]  # MUST ENSURE AT LEAST ONE SLASH
    args += ["--outReadsUnmapped", "Fastx"]
    
    try:
        proc = subprocess.Popen([str(i) for i in args],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                close_fds=True, shell=False)
        (stdout, errmsg) = proc.communicate()
    except ValueError as e:
        logger.error("Error mapping with STAR\n Incorrect arguments.")
        raise e
    except OSError as e:
        logger.error("Error mapping with STAR\n Executable not found.")
        raise e
    except CalledProcessError as e:
        logger.error("Error mapping with STAR\n Program returned error.")
        raise e
        
    if not fileOk(tmpOutputFile):
        error = "Error mapping with STAR.\n" \
        "Output file not present {}\n{}\n".format(tmpOutputFile, errmsg)
        logger.error(error)
        raise RuntimeError(error)

    if len(errmsg) > 0:
        logger.warning("STAR has generated error messages during mapping.\n{}\n".format(errmsg))
        
    # Rename output files.
    shutil.move(tmpOutputFile, outputFile)
    shutil.move(tmpOutputFileDiscarded, outputFileDiscarded)
        
    # Remove temp files from STAR
    if os.path.isfile(log_std): os.remove(log_std)
    if os.path.isfile(log): os.remove(log)
    if os.path.isfile(log_progress): os.remove(log_progress)
    # Do not remove to use it for computing a new index in 2pass mode
    # if os.path.isfile(log_sj): os.remove(log_sj)
    
    if not os.path.isfile(log_final):
        logger.warning("Log output file from STAR is not present")
    else:
        logger.info("Mapping stats: ")
        logger.info("Mapping stats are computed from all the pair reads present in the raw files")
        uniquely_mapped = 0
        multiple_mapped = 0
        # Parse log file from STAR to get stats
        # TODO find a cleaner way to do this
        with open(log_final, "r") as star_log:
            for line in star_log.readlines():
                if line.find("Uniquely mapped reads %") != -1 \
                or line.find("Uniquely mapped reads number") != -1 \
                or line.find("Number of reads mapped to multiple loci") != -1 \
                or line.find("% of reads mapped to multiple loci") != -1 \
                or line.find("% of reads unmapped: too short") != -1:
                    logger.info(str(line).rstrip())
                # Some duplicated code here; TODO refactor
                if line.find("Uniquely mapped reads number") != -1:
                    uniquely_mapped = int(str(line).rstrip().split()[-1])
                if line.find("Number of reads mapped to multiple loci") != -1:
                    multiple_mapped = int(str(line).rstrip().split()[-1])
            logger.info("Total mapped reads: {}".format(uniquely_mapped + multiple_mapped))   
             
    # Remove log file       
    if os.path.isfile(log_final): os.remove(log_final)
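
# A hypothetical sketch, not part of the snippet above, of the kind of STAR
# command line the argument list assembled in alignReads() expands to; the
# index, FASTQ and output paths are placeholders.
example_star_cmd = ["STAR",
                    "--clip5pNbases", "0", "--clip3pNbases", "0",
                    "--runThreadN", "4",
                    "--outSAMtype", "BAM", "SortedByCoordinate",
                    "--alignEndsType", "Local",
                    "--outFilterMultimapNmax", "10",
                    "--genomeDir", "/path/to/star_index",
                    "--readFilesIn", "R2_quality_trimmed.fastq",
                    "--outFileNamePrefix", "/path/to/output/",
                    "--outReadsUnmapped", "Fastx"]
print(" ".join(example_star_cmd))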
Example #13
0
def annotateReads(mappedReads, gtfFile, outputFile, outputDiscarded, mode,
                  strandness, htseq_no_ambiguous, include_non_annotated,
                  temp_dir, threads):
    """
    Annotates a file with mapped reads (BAM) using a modified 
    version of the htseq-count tool. It writes the annotated records to a file.
    It assumes the input reads (BAM) are single end and do not contain
    multiple alignments or un-annotated reads.
    :param mappedReads: path to a BAM file with mapped reads sorted by coordinate
    :param gtfFile: path to an annotation file in GTF format
    :param outputFile: where to write the annotated records (BAM)
    :param outputDiscarded: where to write the non-annotated records (BAM)
    :param mode: htseq-count overlapping mode (see htseq-count documentation)
    :param strandness: the type of strandness to use when annotating (yes, no or reverse)
    :param htseq_no_ambiguous: true if we want to discard ambiguous annotations
    :param include_non_annotated: true if we want to include 
    non annotated reads as __no_feature in the output
    :param temp_dir: path to the folder where to put the created files
    :param threads: the number of CPU cores to use
    :type mappedReads: str
    :type gtfFile: str
    :type outputFile: str
    :type outputDiscarded: str
    :type mode: str
    :type strandness: str
    :type htseq_no_ambiguous: bool
    :type include_non_annotated: bool
    :type temp_dir: str
    :type threads: int
    :raises: RuntimeError, ValueError
    """

    logger = logging.getLogger("STPipeline")

    if not os.path.isfile(mappedReads):
        error = "Error during annotation, input file not present {}\n".format(
            mappedReads)
        logger.error(error)
        raise RuntimeError(error)

    try:
        annotated = count_reads_in_features(
            mappedReads,
            gtfFile,
            "bam",  # Type BAM for filesz
            strandness,  # Strand yes/no/reverse
            mode,  # intersection_nonempty, union, intersection_strict
            "exon",  # feature type in GFF
            "gene_id",  # gene_id or gene_name
            True,  # Quiet mode
            0,  # Min quality score
            outputFile,
            include_non_annotated,
            htseq_no_ambiguous,
            outputDiscarded)
    except Exception as e:
        error = "Error during annotation. HTSEQ execution failed\n"
        logger.error(error)
        raise e

    if not fileOk(outputFile) or annotated == 0:
        error = "Error during annotation. Output file not present {}\n".format(
            outputFile)
        logger.error(error)
        raise RuntimeError(error)

    logger.info("Annotated reads: {}".format(annotated))
    qa_stats.reads_after_annotation = annotated
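
# A rough illustration, not part of the snippet above, of the stock
# htseq-count command the parameters of annotateReads() map to. The pipeline
# calls a modified, in-process version of htseq-count, so this is only an
# approximation; the input file names are placeholders.
example_htseq_cmd = ["htseq-count",
                     "-f", "bam",                    # input format
                     "-r", "pos",                    # input sorted by coordinate
                     "-s", "reverse",                # strandness
                     "-m", "intersection-nonempty",  # overlapping mode
                     "-t", "exon",                   # feature type in the GTF
                     "-i", "gene_id",                # attribute used as feature ID
                     "-a", "0",                      # minimum alignment quality
                     "mapped_sorted.bam", "annotation.gtf"]
print(" ".join(example_htseq_cmd))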
Example #14
0
def filterMappedReads(mapped_reads,
                      hash_reads,
                      file_output,
                      file_output_discarded=None,
                      min_length=28):
    """ 
    Iterate a SAM/BAM file containing mapped reads 
    and discards the reads that are secondary or too short.
    It also discards reads that do not contain a valid barcode.
    It will add the barcode, coordinates and UMI as extra tags
    to the output SAM/BAM file. The UMI will be added only if it is present.
    It assumes all the reads are mapped (do not contain un-aligned reads).
    :param mapped_reads: path to a SAM/BAM file containing the alignments
    :param hash_reads: a hash table of read_names to (x,y,umi) tags
    :param min_length: the min number of mapped bases we enforce in an alignment
    :param file_output: the path where to put the records
    :param file_output_discarded: the path where to put the discarded records
    :type mapped_reads: str
    :type hash_reads: dict
    :type min_length: integer
    :type file_output: str
    :type file_output_discarded: str
    :raises: RuntimeError
    """
    logger = logging.getLogger("STPipeline")

    if not os.path.isfile(mapped_reads):
        error = "Error, input file not present {}\n".format(mapped_reads)
        logger.error(error)
        raise RuntimeError(error)

    # Create output files handlers
    flag_read = "rb"
    flag_write = "wb"
    infile = pysam.AlignmentFile(mapped_reads, flag_read)
    outfile = pysam.AlignmentFile(file_output, flag_write, template=infile)
    if file_output_discarded is not None:
        outfile_discarded = pysam.AlignmentFile(file_output_discarded,
                                                flag_write,
                                                template=infile)
    # Create some counters and loop the records
    dropped_secondary = 0
    dropped_short = 0
    dropped_barcode = 0
    present = 0
    for sam_record in infile.fetch(until_eof=True):
        present += 1
        discard_read = False

        # Add the barcode and coordinates info if present otherwise discard
        try:
            # The probability of a collision is very very low
            key = hash(sam_record.query_name)
            for tag in hash_reads[key]:
                tag_tokens = tag.split(":")
                sam_record.set_tag(tag_tokens[0], tag_tokens[2], tag_tokens[1])
        except KeyError:
            dropped_barcode += 1
            continue

        # Get how many bases were mapped
        mapped_bases = 0
        for cigar_tuple in sam_record.cigartuples:
            if cigar_tuple[0] == 0:
                mapped_bases += cigar_tuple[1]

        # We need this so we don't duplicate reads
        if not sam_record.is_secondary:
            sam_record.set_tag("NH", None)

        # Discard if secondary alignment or only few bases mapped
        if sam_record.is_secondary:
            dropped_secondary += 1
            discard_read = True
        elif mapped_bases != 0 and mapped_bases < min_length:
            dropped_short += 1
            discard_read = True

        if discard_read:
            if file_output_discarded is not None:
                outfile_discarded.write(sam_record)
        else:
            outfile.write(sam_record)

    # Close handlers
    infile.close()
    outfile.close()
    if file_output_discarded is not None:
        outfile_discarded.close()

    if not fileOk(file_output):
        error = "Error filtering mapped reads.\n" \
        "Output file is not present\n {}".format(file_output)
        logger.error(error)
        raise RuntimeError(error)

    logger.info("Finish filtering mapped reads, stats:" \
                "\nPresent: {0}" \
                "\nDropped - secondary alignment: {1}" \
                "\nDropped - too short: {2}" \
                "\nDropped - barcode: {3}".format(present,
                                                  dropped_secondary,
                                                  dropped_short,
                                                  dropped_barcode))

    # Update QA object
    qa_stats.reads_after_mapping = present - (dropped_secondary +
                                              dropped_short)
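
# A hypothetical illustration, not taken from the original code, of the
# hash_reads layout that filterMappedReads() expects: keys are hash(read_name)
# and each value is a list of "TAG:TYPE:VALUE" strings that are unpacked into
# pysam set_tag(tag, value, value_type) calls. The tag names and values below
# are made up.
example_read_name = "NS500:121:HTW2NBGXX:1:11101:10000:1000"
example_hash_reads = {hash(example_read_name): ["B1:Z:ACGTACGTACGTACGT",  # e.g. a spot barcode
                                                "B2:Z:TTAGGC",            # e.g. a UMI
                                                "B3:i:12"]}               # e.g. a coordinate
for example_tag in example_hash_reads[hash(example_read_name)]:
    tag_name, value_type, value = example_tag.split(":")
    print("{} {} {}".format(tag_name, value_type, value))  # what set_tag() receives, reordered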
Example #15
0
def annotateReads(mappedReads,
                  gtfFile,
                  outputFile,
                  mode,
                  strandness="reverse",
                  htseq_no_ambiguous=True,
                  include_non_annotated=False):
    """
    Annotates a file with mapped reads (SAM/BAM) using a modified 
    version of the htseq-count tool. It writes the annotated records to a file.
    :param mappedReads: path to a SAM/BAM file with mapped reads sorted by coordinate
    :param gtfFile: path to an annotation file in GTF format
    :param mode: htseq-count overlapping mode (see htseq-count documentation)
    :param strandness: the type of strandness to use when annotating (yes, no or reverse)
    :param htseq_no_ambiguous: true if we want to discard ambiguous annotations
    :param include_non_annotated: true if we want to include 
    non annotated reads as __no_feature in the output
    :param outputFile: the name/path to the output file
    :type mappedReads: str
    :type gtfFile: str
    :type mode: str
    :type strandness: str
    :type htseq_no_ambiguous: bool
    :type include_non_annotated: bool
    :type outputFile: str
    :raises: RuntimeError, ValueError
    """

    logger = logging.getLogger("STPipeline")

    if not os.path.isfile(mappedReads):
        error = "Error, input file not present {}\n".format(mappedReads)
        logger.error(error)
        raise RuntimeError(error)

    try:
        annotated = count_reads_in_features(
            mappedReads,
            gtfFile,
            "bam",  # Type BAM for files
            "pos",  # Order pos or name
            strandness,  # Strand yes/no/reverse
            mode,  # intersection_nonempty, union, intersection_strict
            "exon",  # feature type in GFF
            "gene_id",  # gene_id or gene_name
            True,  # Quiet mode
            0,  # Min quality score
            outputFile,
            include_non_annotated,
            htseq_no_ambiguous)
    except Exception as e:
        error = "Error during annotation. HTSEQ execution failed\n"
        logger.error(error)
        raise e

    if not fileOk(outputFile):
        error = "Error during annotation. Output file not present {}\n".format(
            outputFile)
        logger.error(error)
        raise RuntimeError(error)

    logger.info("Annotated reads: {}".format(annotated))
    qa_stats.reads_after_annotation = annotated
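
# A hypothetical usage sketch for the function above; the file names are
# placeholders and the mode string follows the spelling listed in the comments
# of the count_reads_in_features() call (check the modified htseq-count for the
# exact accepted values).
annotateReads("mapped_filtered_sorted.bam",
              "annotation.gtf",
              "annotated.bam",
              mode="intersection_nonempty",
              strandness="reverse",
              htseq_no_ambiguous=True,
              include_non_annotated=False)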