Example #1
def checkFileExistence(infile, outfile):
    '''check whether file exists.

    Files are uncompressed before checking existence.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(infile,
                         outfile,
                         metric="file",
                         suffixes=P.as_list(
                             PARAMS.get('%s_regex_exist' % track, "")))
Example #2
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(infile,
                         outfile,
                         metric="md5sum",
                         suffixes=P.as_list(
                             PARAMS.get('%s_regex_md5' % track, "")))
Example #3
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(infile,
                         outfile,
                         metric="wc -l",
                         suffixes=P.as_list(
                             PARAMS.get('%s_regex_linecount' % track, "")))
Example #4
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against known motif databases using tomtom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "tomtom", outfile)

    if iotools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        iotools.touch_file(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Example #5
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substitute_parameters(**locals())
    nseq = motifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.as_list(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
Example #6
def run_test(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively.
    '''

    track = P.snip(outfile, ".log")
    pipeline_name = PARAMS.get("%s_pipeline" % track, track[len("test_"):])

    pipeline_targets = P.as_list(PARAMS.get("%s_target" % track, "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    #to_cluster = False

    template_statement = ("cd %%(track)s.dir; "
                          "xvfb-run -d cgatflow %%(pipeline_name)s "
                          "%%(pipeline_options)s "
                          "%%(workflow_options)s make %s "
                          "-L ../%%(outfile)s "
                          "-S ../%%(outfile)s.stdout "
                          "-E ../%%(outfile)s.stderr")

    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(statement, ignore_errors=True, job_memory="unlimited")
    else:
        statements = []
        for pipeline_target in pipeline_targets:
            statements.append(template_statement % pipeline_target)
        P.run(statements, ignore_errors=True, job_memory="unlimited")
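
The doubled percent signs in template_statement are what make the two-stage substitution work: the first, plain % substitution only fills in the make target (collapsing %% to %), and the surviving %(...)s placeholders are expanded afterwards when the statement is run against the caller's variables. A minimal sketch of that behaviour with hypothetical values, using plain Python mapping-based formatting to stand in for the interpolation P.run performs:

template = ("cd %%(track)s.dir; cgatflow %%(pipeline_name)s make %s "
            "-L ../%%(outfile)s")
# first pass: only the make target is filled in, %% collapses to %
statement = template % "full"
# second pass: done here by hand via locals()
track, pipeline_name, outfile = "test_mapping", "mapping", "test_mapping.log"
print(statement % locals())
# cd test_mapping.dir; cgatflow mapping make full -L ../test_mapping.log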
Example #7
def get_repeat_gff(outfile):
    """This task downloads UCSC repetetive RNA types.
    """
    ModuleTrna.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.as_list(PARAMS["ucsc_rnatypes"]),
        outfile=outfile,
        remove_contigs_regex=PARAMS["ucsc_remove_contigs"],
        job_memory="3G")
Example #8
def importRepeatsFromUCSC(outfile):
    """This task downloads UCSC repeats types as identified
    in the configuration file.
    """
    gtfsubset.getRepeatDataFromUCSC(dbhandle=connectToUCSC(),
                                    repclasses=P.as_list(
                                        PARAMS["ucsc_repeattypes"]),
                                    outfile=outfile,
                                    job_memory=PARAMS["job_memory"])
Example #9
def importRNAAnnotationFromUCSC(outfile):
    """This task downloads UCSC repetetive RNA types.
    """
    gtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.as_list(PARAMS["ucsc_rnatypes"]),
        outfile=outfile,
        remove_contigs_regex=PARAMS["ncbi_remove_contigs"],
        job_memory=PARAMS["job_memory"])
Example #10
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
Example #11
def exportIntervalSequences(infile, outfile, track, method):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    dbhandle = connect()

    try:
        halfwidth = int(PARAMS[method+"_halfwidth"])
        full = False
    except ValueError:
        full = True
        halfwidth = None

    try:
        maxsize = int(PARAMS[method+"_max_size"])
    except ValueError:
        maxsize = None

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=full,
        masker=P.as_list(PARAMS[method+'_masker']),
        halfwidth=halfwidth,
        maxsize=maxsize,
        num_sequences=PARAMS[method+"_num_sequences"],
        proportion=PARAMS[method+"_proportion"],
        min_sequences=PARAMS[method+"_min_sequences"],
        order=PARAMS[method+'_score'])

    if nseq == 0:
        E.warn("%s: no sequences - %s skipped" % (outfile, method))
        P.touch_file(outfile)
Example #12
    def processReads(infile, outfiles):
        '''process reads from .fastq and other sequence files.
        '''
        trimmomatic_options = P.get_params()["trimmomatic_options"]

        if P.get_params()["auto_remove"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                "contaminants.fasta",
                P.get_params()["trimmomatic_mismatches"],
                P.get_params()["trimmomatic_p_thresh"],
                P.get_params()["trimmomatic_c_thresh"],
                P.get_params()["trimmomatic_min_adapter_len"],
                P.get_params()["trimmomatic_keep_both_reads"]) + trimmomatic_options

        elif P.get_params()["trimmomatic_adapter"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                P.get_params()["trimmomatic_adapter"],
                P.get_params()["trimmomatic_mismatches"],
                P.get_params()["trimmomatic_p_thresh"],
                P.get_params()["trimmomatic_c_thresh"],
                P.get_params()["trimmomatic_min_adapter_len"],
                P.get_params()["trimmomatic_keep_both_reads"]) + trimmomatic_options

        job_threads = P.get_params()["threads"]
        job_memory = "12G"

        track = re.match(REGEX_TRACK, infile).groups()[0]

        m = preprocess.MasterProcessor(
            save=P.get_params()["save"],
            summarize=P.get_params()["summarize"],
            threads=P.get_params()["threads"],
            qual_format=P.get_params()['qual_format'])

        for tool in P.as_list(P.get_params()["preprocessors"]):

            if tool == "fastx_trimmer":
                m.add(preprocess.FastxTrimmer(
                    P.get_params()["fastx_trimmer_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "trimmomatic":
                m.add(preprocess.Trimmomatic(
                    trimmomatic_options,
                    threads=P.get_params()["threads"]))
            elif tool == "sickle":
                m.add(preprocess.Sickle(
                    P.get_params()["sickle_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "trimgalore":
                m.add(preprocess.Trimgalore(
                    P.get_params()["trimgalore_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "flash":
                m.add(preprocess.Flash(
                    P.get_params()["flash_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "reversecomplement":
                m.add(preprocess.ReverseComplement(
                    P.get_params()["reversecomplement_options"]))
            elif tool == "pandaseq":
                m.add(preprocess.Pandaseq(
                    P.get_params()["pandaseq_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "cutadapt":
                cutadapt_options = P.get_params()["cutadapt_options"]
                if P.get_params()["auto_remove"]:
                    cutadapt_options += " -a file:contaminants.fasta "
                m.add(preprocess.Cutadapt(
                    cutadapt_options,
                    threads=P.get_params()["threads"],
                    untrimmed=P.get_params()['cutadapt_reroute_untrimmed'],
                    process_paired=P.get_params()["cutadapt_process_paired"]))
            else:
                raise NotImplementedError("tool '%s' not implemented" % tool)

        statement = m.build((infile,), "processed.dir/trimmed-", track)
        P.run(statement)
Example #13
def checkFile(infile, outfile):
    seqdat = PipelineAssembly.SequencingData(infile)
    outf = open(outfile, 'w')
    outf.write(
        "name\t{}\nformat\t{}\ncompressed\t{}\npaired\t{}\ninterleaved\t{}\n".
        format(seqdat.filename, seqdat.fileformat, seqdat.compressed,
               seqdat.paired, seqdat.interleaved))
    outf.close()


##################################################
# Run Selected Assemblers
##################################################

# get the list of assemblers to run on the data
ASSEMBLERS = P.as_list(PARAMS.get("Assembler_assemblers", ""))


###################################################
# Run Megahit
###################################################
@active_if("megahit" in ASSEMBLERS)
@follows(checkFile)
@follows(mkdir("megahit_out.dir"))
@transform(SEQUENCEFILES, SEQUENCEFILES_REGEX,
           r"megahit_out.dir/\1_complete.log")
def runMegahit(infile, outfile):
    job_memory = str(PARAMS["Megahit_clus_memory"]) + "G"
    job_threads = int(PARAMS["Megahit_clus_threads"])
    seqdat = PipelineAssembly.SequencingData(infile)
    assembler = PipelineAssembly.Megahit(seqdat, "megahit_out.dir", PARAMS)
Example #14
import subprocess

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
# load options from the config file
import cgatcore.pipeline as P
P.get_parameters([
    "%s/pipeline.yml" % __file__[:-len(".py")], "../pipeline.yml",
    "pipeline.yml"
])
PARAMS = P.PARAMS

FEATURES = P.as_list(PARAMS.get("General_feature_list"))
FEATUREPAIRS = P.as_list(PARAMS.get("General_feature_pairs"))
FEATUREPAIRS = [
    "{}_BY_{}".format(x.split(":")[0],
                      x.split(":")[1]) for x in FEATUREPAIRS
]
ALLFEATURES = FEATURES + FEATUREPAIRS
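
# Worked example with hypothetical config values: each colon-separated entry in
# General_feature_pairs is rewritten as "A_BY_B" by the comprehension above.
_example_pairs = ["genus:kegg", "family:cog"]
assert ["{}_BY_{}".format(x.split(":")[0], x.split(":")[1])
        for x in _example_pairs] == ["genus_BY_kegg", "family_BY_cog"]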

from pipeline_assembly import PipelineAssembly
from pipeline_enumerate import PipelineEnumerate
from pipeline_filter import PipelineFilter

# get all files within the directory to process
SEQUENCEFILES = ("*.fasta", "*.fasta.gz", "*.fasta.1.gz", "*.fasta.1", "*.fna",
                 "*.fna.gz", "*.fna.1.gz", "*.fna.1", "*.fa", "*.fa.gz",
                 "*.fa.1.gz", "*.fa.1", "*.fastq", "*.fastq.gz",
Example #15
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "../pipeline.yml",
     "pipeline.yml"])

dbname = PARAMS['db_name']
unmapped = enrichment.getUnmapped(PARAMS)
outfilesuffixes = ["_genestoterms.tsv",
                   "_termstogenes.tsv",
                   "_termstodetails.tsv",
                   "_termstoont.tsv"]

unmappedouts = [["annotations.dir/%s%s" % (u, s)
                 for s in outfilesuffixes]
                for u in unmapped]

hpatissues = P.as_list(PARAMS.get('hpa_tissue', []))
hpatissues = ['clean_backgrounds.dir/%s_hpa_background.tsv'
              % tissue.replace(" ", "_") for tissue in hpatissues]

########################################################
# Set up database connection
########################################################


def connect():
    '''utility function to connect to database.

    Use this method to connect to the pipeline database.
    Additional databases can be attached here as well.

    Returns an sqlite3 database handle.
Example #16
           regex(".*/(.*).bed.gz"),
           r"motifs/\1.control.fasta")
def exportMotifControlSequences(infile, outfile):
    '''for each interval, export the left and right
    sequence segment of the same size.
    '''
    PipelineMotifs.exportSequencesFromBedFile(
        infile, outfile,
        masker=PARAMS['motifs_masker'],
        mode="leftright")


############################################################
############################################################
############################################################
@active_if("meme" in P.as_list(PARAMS["methods"]) or
           "disc_meme" in P.as_list(PARAMS["methods"]))
@transform(loadIntervals,
           suffix("_intervals.load"),
           ".meme.fasta")
def exportMemeIntervalSequences(infile, outfile):
    '''export interval sequences for MEME motif discovery.'''
    track = os.path.basename(P.snip(infile, "_intervals.load"))

    exportIntervalSequences(infile, outfile, track, "meme")


############################################################
@follows(mkdir("meme.dir"))
@active_if("meme" in P.as_list(PARAMS["methods"]))
@transform(exportMemeIntervalSequences, regex("(.+).meme.fasta"),
Example #17
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .yml file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

        [bams]
        %=all.bam

    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.as_list(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", "\S+", pattern)
                    if re.search(p, fn, re.IGNORECASE):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = list(map(int, P.as_list(PARAMS["offsets_%s" % fn.lower()])))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", "\S+", pattern)
                if re.search(p, fn, re.IGNORECASE):
                    offsets.extend(list(map(int, value.split(","))))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError("number of BAM files %s is not the "
                         "same as number of offsets: %s" %
                         (str(bamfiles), str(offsets)))

    return bamfiles, offsets
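
A minimal usage sketch (hypothetical track name and configuration, mirroring the snippets in the docstring above):

# configuration excerpt (hypothetical):
#    [bams]
#    track1=track1.bam,track2.bam
#    [offsets]
#    track1=120,200
bamfiles, offsets = getAssociatedBAMFiles(track)
# e.g. bamfiles == ["track1.bam", "track2.bam"], offsets == [120, 200];
# offsets defaults to [0] * len(bamfiles) when no [offsets] entry matches.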
Example #18
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
])

# WARNING: pipeline names containing underscores are not allowed
TESTS = sorted(
    set([
        "test_{}".format(x.split("_")[1]) for x in PARAMS.keys()
        if x.startswith("test_")
    ]))
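
# Worked example with hypothetical keys: every "test_<name>_*" configuration
# entry collapses to a single "test_<name>" item in TESTS.
_example_keys = ["test_mapping_target", "test_peakcalling_pipeline"]
assert sorted(set("test_{}".format(x.split("_")[1])
                  for x in _example_keys)) == ["test_mapping", "test_peakcalling"]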


# obtain prerequisite generic data
@files([(None, "%s.tgz" % x)
        for x in P.as_list(PARAMS.get("prerequisites", ""))])
def setupPrerequisites(infile, outfile):
    '''setup pre-requisites.

    These are tar-balls that are unpacked, but not run.
    '''

    #to_cluster = False
    track = P.snip(outfile, ".tgz")

    # obtain data - should overwrite pipeline.yml file
    statement = '''
    wget --no-check-certificate -O %(track)s.tgz %(data_url)s/%(track)s.tgz'''
    P.run(statement)

    tf = tarfile.open(outfile)
Example #19
def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.
    '''

    outf = iotools.open_file(outfile, "w")
    outf.write("\t".join((
        ("track", "status", "job_finished", "nfiles", "nref", "missing",
         "extra", "different", "different_md5", "different_lines", "same",
         "same_md5", "same_lines", "same_exist", "files_missing",
         "files_extra", "files_different_md5", "files_different_lines"))) +
               "\n")

    for infile in infiles:
        E.info("working on {}".format(infile))
        track = P.snip(infile, ".stats")

        logfiles = glob.glob(track + "*.log")
        job_finished = True
        for logfile in logfiles:
            is_complete = iotools.is_complete(logfile)
            E.debug("logcheck: {} = {}".format(logfile, is_complete))
            job_finished = job_finished and is_complete

        reffile = track + ".ref"

        # regular expression of files to test only for existence
        regex_exist = PARAMS.get('%s_regex_exist' % track, None)
        if regex_exist:
            regex_exist = re.compile("|".join(P.as_list(regex_exist)))

        regex_linecount = PARAMS.get('%s_regex_linecount' % track, None)
        if regex_linecount:
            regex_linecount = re.compile("|".join(P.as_list(regex_linecount)))

        regex_md5 = PARAMS.get('%s_regex_md5' % track, None)
        if regex_md5:
            regex_md5 = re.compile("|".join(P.as_list(regex_md5)))

        if not os.path.exists(reffile):
            raise ValueError('no reference data defined for %s' % track)

        cmp_data = pandas.read_csv(iotools.open_file(infile),
                                   sep="\t",
                                   index_col=0)

        ref_data = pandas.read_csv(iotools.open_file(reffile),
                                   sep="\t",
                                   index_col=0)

        shared_files = set(cmp_data.index).intersection(ref_data.index)
        missing = set(ref_data.index).difference(cmp_data.index)
        extra = set(cmp_data.index).difference(ref_data.index)

        different = set(shared_files)

        # remove those for which only check for existence
        if regex_exist:
            same_exist = set([x for x in different if regex_exist.search(x)])

            different = set(
                [x for x in different if not regex_exist.search(x)])
        else:
            same_exist = set()

        # select those for which only check for number of lines
        if regex_linecount:
            check_lines = [x for x in different if regex_linecount.search(x)]

            dd = (cmp_data['nlines'][check_lines] !=
                  ref_data['nlines'][check_lines])
            different_lines = set(dd.index[dd])
            different = different.difference(check_lines)

            dd = (cmp_data['nlines'][check_lines] == ref_data['nlines']
                  [check_lines])
            same_lines = set(dd.index[dd])

        else:
            different_lines = set()
            same_lines = set()

        # remainder - check md5
        if regex_md5:
            check_md5 = [x for x in different if regex_md5.search(x)]

            dd = (cmp_data['md5'][check_md5] != ref_data['md5'][check_md5])
            different_md5 = set(dd.index[dd])

            dd = (cmp_data['md5'][check_md5] == ref_data['md5'][check_md5])
            same_md5 = set(dd.index[dd])

        else:
            different_md5 = set()
            same_md5 = set()

        if job_finished and (len(missing) + len(extra) + len(different_md5) +
                             len(different_lines) == 0):
            status = "OK"
        else:
            status = "FAIL"

        outf.write("\t".join(
            map(str, (
                track,
                status,
                job_finished,
                len(cmp_data),
                len(ref_data),
                len(missing),
                len(extra),
                len(different_md5) + len(different_lines),
                len(different_md5),
                len(different_lines),
                len(same_md5) + len(same_lines) + len(same_exist),
                len(same_md5),
                len(same_lines),
                len(same_exist),
                ",".join(missing),
                ",".join(extra),
                ",".join(different_md5),
                ",".join(different_lines),
            ))) + "\n")

    outf.close()
Example #20
                    (entry.gene_id, transcript2gene_dict[entry.transcript_id]))
        else:
            transcript2gene_dict[entry.transcript_id] = entry.gene_id

    with iotools.open_file(outfile, "w") as outf:
        outf.write("transcript_id\tgene_id\n")
        for key, value in sorted(transcript2gene_dict.items()):
            outf.write("%s\t%s\n" % (key, value))


###################################################
# count-based quantifiers
###################################################


@active_if("featurecounts" in P.as_list(PARAMS["quantifiers"]))
@follows(mkdir("featurecounts.dir"))
@transform(["%s.bam" % x.asFile() for x in BAM_TRACKS], regex("(\S+).bam"),
           add_inputs(PARAMS['geneset']), [
               r"featurecounts.dir/\1/transcripts.tsv.gz",
               r"featurecounts.dir/\1/genes.tsv.gz"
           ])
def runFeatureCounts(infiles, outfiles):
    '''
    Counts reads falling into "features" - in each transcript and
    each gene.

    A read is counted as overlapping with a feature if at least one bp
    overlaps.

    Pairs and strandedness can be used to resolve reads falling into