Exemplo n.º 1
0
def my_taxonomy_tuple_provider(filename, *xargs):
    """
    Yield (sequence_id, taxonomy_label) pairs from a taxonomy summary file.

    The summary file is tab-delimited and looks like this:

    Sequence11_count=1      Bacteria        Spirochaetes    Spirochaetia    Spirochaetales  Spirochaetaceae Treponema       Treponema bryantii
    Sequence57_count=1      Bacteria        Firmicutes      Clostridia      Clostridiales   Lachnospiraceae Butyrivibrio    NA
    Sequence117_count=1     Archaea Euryarchaeota   Methanobacteria Methanobacteriales      Methanobacteriaceae     Methanobrevibacter      NA
    Sequence188_count=2     Bacteria        Bacteroidetes   Bacteroidia     Bacteroidales   Prevotellaceae  Prevotella      NA
    Sequence188_count=2     Bacteria        Bacteroidetes   Bacteroidia     Bacteroidales   Prevotellaceae  Prevotella      NA
    Sequence197_count=1     Bacteria        Bacteroidetes   Bacteroidia     Bacteroidales   Prevotellaceae  Prevotella      Prevotella brevis

    Arguments:
        filename: path handed to from_tab_delimited_file.
        *xargs:   forwarded to from_tab_delimited_file after a leading 0
                  (presumably column indexes - they pick which fields
                  define the bins; confirm against from_tab_delimited_file).

    Yields:
        2-tuples of (first selected field, remaining selected fields joined
        with "_"). Every occurrence of the substring "NA" in the joined
        label is rewritten to "taxNA". Note this is a plain substring
        replacement, so it applies wherever "NA" appears in the label.
    """
    tuple_stream = from_tab_delimited_file(
        filename, 0, *xargs)  # pick which fields define the bins

    # A for loop ends the generator cleanly when the stream is exhausted.
    # The previous "while True: stream.next()" form relied on StopIteration
    # escaping the generator body, which only works on Python 2 (there is no
    # .next() method on Python 3, and PEP 479 turns the leak into a
    # RuntimeError).
    for atuple in tuple_stream:
        # note that we patch NA to taxNA , as NA confuses R
        yield (atuple[0], re.sub("NA", "taxNA", "_".join(atuple[1:])))
Exemplo n.º 2
0
def my_description_provider(filename, *xargs):
    """
    Yield ((query, database, description), weight) for each hit / "no hit" record.

    Transforms the tab-delimited stream so that only records relating either
    to a hit or to "no hit" are yielded. Note that sometimes this format
    reports multiple hits to the same target; this generator does not
    de-duplicate them (per the original note, picking the top hit is left to
    a later step).

    Example input:

    # BLASTN 2.6.0+
    # Query: seq_20382 count=638
    # Database: /bifo/scratch/datacache/ncbi/indexes/blast/capra_hircus_ncbi_PRJNA290100.fasta
    # 0 hits found
    # BLAST processed 1 queries
    # BLASTN 2.6.0+
    # Query: seq_21074 count=204
    # Database: /bifo/scratch/datacache/ncbi/indexes/blast/capra_hircus_ncbi_PRJNA290100.fasta
    # Fields: query acc.ver, subject acc.ver, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
    # 17 hits found
    seq_21074       CM004590.1      100.000 64      0       0       1       64      4028254 4028191 1.14e-25        119
    seq_21074       CM004590.1      100.000 64      0       0       1       64      39689322        39689385        1.14e-25        119

    Example output record:

    (('seq_91347 count=1.001001', 'nt', 'PREDICTED: Salmo salar uncharacterized LOC106591627 (LOC106591627), ncRNA'), 1.001001)

    Arguments:
        filename: path handed to from_tab_delimited_file.
        *xargs:   xargs[0] is the weighting method; the rest are forwarded to
                  from_tab_delimited_file (presumably the column indexes of
                  the query and description fields - confirm against that
                  helper).

    Yields:
        ((query, database, description), weight) where
        - query is the text of the most recent "# Query:" line,
        - database is the basename, without extension, of the most recent
          "# Database:" path (None until one has been seen),
        - description is 'No hits' for a "0 hits" line, otherwise the second
          selected field of the hit record,
        - weight is 1, or, when the weighting method is "tag_count" and the
          query ends in "count=<number>", that number as a float.
    """
    weighting_method = xargs[0]
    raw_tuple_stream = from_tab_delimited_file(filename, *xargs[1:])   # query, description

    database = None
    query = ""
    # Plain for loop: the previous "while True: stream.next()" form only
    # terminated by leaking StopIteration out of the generator, which is
    # Python-2-only behaviour (see PEP 479).
    for record in raw_tuple_stream:
        first_field, description = record[0], record[1]
        stripped = first_field.strip()

        # Header comment lines update the running database / query context.
        database_match = re.search(r"^#\s+Database:\s+(\S+)$", stripped)
        if database_match is not None:
            database = os.path.splitext(
                os.path.basename(database_match.group(1)))[0]

        query_match = re.search(r"^#\s+Query:\s+(.*)$", stripped)
        if query_match is not None:
            query = query_match.group(1)

        if re.search(" 0 hits", first_field, re.IGNORECASE) is not None:
            description = 'No hits'
        elif first_field != re.split(r"\s+", query)[0]:
            # Neither a "0 hits" marker nor a hit row for the current query
            # (the first field must equal the first token of the query).
            continue

        weight = 1
        if weighting_method == "tag_count":
            weighting_match = re.search(r"count=(\d*\.*\d*)\s*$", query)
            # Guard against queries with no count= suffix (previously an
            # AttributeError on None); fall back to the default weight, as
            # my_hit_provider does.
            if weighting_match is not None:
                weight = float(weighting_match.group(1))

        yield ((query, database, description), weight)
Exemplo n.º 3
0
def build_run_tax_distributions(run_name,
                                tax_pattern=None,
                                name_infix="",
                                exclusions=None):
    """
    Build and save one taxonomy distribution per sample in a run.

    The run's taxonomy table is read from
    RUN_ROOT/<run_name>.processed/taxonomy_analysis/samples_taxonomy_table.txt
    (RUN_ROOT is a module-level global). The input tax table looks like:

    Kingdom Family  Sample_SQ0032   Sample_SQ2525   Sample_SQ2526   Sample_SQ2527   Sample_SQ2528   Sample_SQ2529   Sample_SQ2530   Sample_SQ2531
    B       Actinoplanes friuliensis DSM 7358       0       0       0       0       3       0       0       0
    B       Variovorax paradoxus B4 0       0       0       0       1       1       1       0
    Bacteria        Achromobacter xylosoxidans      1       0       0       0       4       2       0       0
    Bacteria        Achromobacter xylosoxidans A8   1       0       0       0       4       7       0       0
    Bacteria        Acidovorax sp. KKS102   0       1       0       0       1       8       0       1
    Bacteria        Acinetobacter baumannii 0       0       0       0       0       1       0       0

    Only the header record is read here: its fields from the third onwards
    are taken as the sample names. build_sample_tax_distribution is then
    called once per sample with the same table file.

    Arguments:
        run_name:    name of the run; used to locate the table and passed on.
        tax_pattern: passed through to build_sample_tax_distribution.
        name_infix:  passed through to build_sample_tax_distribution.
        exclusions:  passed through to build_sample_tax_distribution.

    Returns:
        list of the values returned by build_sample_tax_distribution, one per
        sample, in header order.

    Raises:
        StopIteration if the table yields no records at all (no header).
    """
    run_taxtable_file = os.path.join(RUN_ROOT, "%s.processed" % run_name,
                                     "taxonomy_analysis",
                                     "samples_taxonomy_table.txt")

    # builtin next() works on both Python 2 and 3; the .next() method is
    # Python-2-only.
    header = next(from_tab_delimited_file(run_taxtable_file))
    sample_names = header[2:]  # skip the two leading taxonomy columns

    return [
        build_sample_tax_distribution(run_taxtable_file, run_name,
                                      sample_name, tax_pattern,
                                      name_infix, exclusions)
        for sample_name in sample_names
    ]
Exemplo n.º 4
0
def my_hit_provider(filename, *xargs):
    """
    Yield (record, weight) for each hit / "no hit" record in the input.

    Transforms the tab-delimited stream so that only records relating either
    to a hit or to "no hit" are yielded. Note that sometimes this format
    reports multiple hits to the same target; this generator does not
    de-duplicate them (per the original note, picking the top hit is left to
    a later step).

    Arguments:
        filename: path handed to from_tab_delimited_file.
        *xargs:   xargs[0] is the weighting method; the rest are forwarded to
                  from_tab_delimited_file (presumably column indexes -
                  confirm against that helper).

    Yields:
        - ((query, 'No hits', 'No hits'), weight) for a line whose first
          field contains " 0 hits" (case-insensitive), where query is the
          text of the most recent "# Query:" line;
        - (record, weight) for any other record whose fields after the first
          are not all None.
        weight is 1.0, or, when the weighting method is "tag_count" and the
        current query ends in "count=<number>", that number as a float.

    Raises:
        Exception: if a record has some, but not all, of its fields after
        the first populated and the second or third field is None.
    """
    weighting_method = xargs[0]
    tuple_stream = from_tab_delimited_file(filename, *xargs[1:])

    def weight_for(current_query):
        # Weight of a record belonging to current_query; falls back to 1.0
        # when not weighting by tag count or when no count= suffix exists.
        if weighting_method == "tag_count":
            weighting_match = re.search(r"count=(\d*\.*\d*)\s*$",
                                        current_query)
            if weighting_match is not None:
                return float(weighting_match.group(1))
        return 1.0

    query = ""
    # Plain for loop: the previous "while True: stream.next()" form only
    # terminated by leaking StopIteration out of the generator, which is
    # Python-2-only behaviour (see PEP 479).
    for atuple in tuple_stream:
        query_match = re.search(r"^#\s+Query:\s+(.*)$", atuple[0].strip())
        if query_match is not None:
            query = query_match.group(1)

        if re.search(" 0 hits", atuple[0], re.IGNORECASE) is not None:
            yield ((query, 'No hits', 'No hits'), weight_for(query))
        elif atuple[1:] == (None,) * (len(atuple) - 1):
            # Only the first field is populated (e.g. a comment line):
            # nothing to report.
            continue
        elif atuple[1] is None or atuple[2] is None:
            raise Exception(
                "error - unexpected results %s from blast output - incomplete taxonomy tuple"
                % str(atuple))
        else:
            yield (atuple, weight_for(query))
Exemplo n.º 5
0
def build_sample_tax_distribution(datafile,
                                  run_name,
                                  sample_name,
                                  tax_pattern=None,
                                  name_infix="",
                                  exclusions=None):
    """
    Build and save the taxonomy distribution of one sample.

    Each record - i.e. taxa - is a bin. Build a distribution of reads across
    these bins for the given sample. The counts are already provided by the
    summary table - we just collate them and store them in our own
    Distribution object (described in the original notes as a sparse prbdf
    structure).

    Arguments:
        datafile:    tab-delimited table whose first record is a header
                     containing sample_name.
        run_name:    used in the name of the saved file.
        sample_name: header field naming the column of counts to use.
        tax_pattern: optional regular expression; when given, only records
                     whose first field matches it (case-insensitive) are
                     kept. Used for selecting sub-sets of taxa.
        name_infix:  optional suffix for the saved file name, used for
                     naming sub-sets of taxa.
        exclusions:  None, or "nohit" to drop records whose first field
                     matches "no hit" (case-insensitive, optional
                     whitespace).

    Returns:
        path of the saved file:
        BUILD_ROOT/<run_name>_<sample_name>[_<name_infix>].pickle
        (BUILD_ROOT is a module-level global).

    Raises:
        ValueError: if sample_name is not in the header.
        Exception:  if exclusions is neither None nor "nohit".
    """
    records = from_tab_delimited_file(datafile)
    # builtin next() works on both Python 2 and 3; the .next() method is
    # Python-2-only.
    header = next(records)
    sample_index = header.index(sample_name)

    if exclusions is None:
        exclude_nohit = False
    elif exclusions == "nohit":
        exclude_nohit = True
    else:
        raise Exception("unsupported exclusions spec %s" % exclusions)

    def keep(record):
        # True if the record has a positive count for this sample and passes
        # the optional taxonomy-pattern and no-hit filters.
        if not float(record[sample_index]) > 0:
            return False
        if tax_pattern is not None and re.search(
                tax_pattern, record[0], re.IGNORECASE) is None:
            return False
        if exclude_nohit and re.search(
                r"no\s*hit", record[0], re.IGNORECASE) is not None:
            return False
        return True

    # first field, second field, count for this sample
    data_stream = ((record[0], record[1], record[sample_index])
                   for record in records if keep(record))

    distob = Distribution(None, 1, [data_stream])

    distob.interval_locator_funcs = [bin_discrete_value, bin_discrete_value]
    distob.assignments_files = ["kingdom_binning.txt", "family_binning.txt"]
    distob.weight_value_provider_func = my_weight_value_provider
    build(distob, "singlethread")

    # Only include the infix in the file name when one was supplied.
    if name_infix:
        save_basename = "%s_%s_%s.pickle" % (run_name, sample_name,
                                             name_infix)
    else:
        save_basename = "%s_%s.pickle" % (run_name, sample_name)
    save_filename = os.path.join(BUILD_ROOT, save_basename)
    distob.save(save_filename)

    return save_filename