Example 1
    def __init__(self, name, controller="Local"):

        ## obj name
        self.name = name
        print("New Assembly object `{}` created".format(self.name))

        ## launch ipcluster and register for later destruction
        self.__ipname__ = ipcontroller_init(controller)

        ## get binaries of dependencies
        self.vsearch, self.muscle, self.smalt, self.samtools = getbins()

        ## keep a log history of the executed workflow
        self.log = []
        self._stamp(self.name + " created")
        self.statsfiles = ObjDict()

        ## samples linked
        self.samples = ObjDict()

        ## multiplex files linked
        self.barcodes = ObjDict()

        ## an object for storing data directories for this Assembly
        self.dirs = ObjDict()

        ## the default params dict
        self.paramsdict = OrderedDict([
            ("working_directory", os.path.realpath(os.path.curdir)),
            ("raw_fastq_path",
             os.path.join(os.path.realpath(os.path.curdir), "*.fastq")),
            ("barcodes_path",
             os.path.join(os.path.realpath(os.path.curdir), "*.barcodes.txt")),
            ("sorted_fastq_path", ""), ("restriction_overhang", ("TGCAG", "")),
            ("max_low_qual_bases", 5), ("engines_per_job", 4),
            ("mindepth_statistical", 6), ("mindepth_majrule", 6),
            ("datatype", 'rad'), ("clust_threshold", .85), ("minsamp", 4),
            ("max_shared_heterozygosity", .25), ("prefix_outname", self.name),
            ("phred_Qscore_offset", 33), ("max_barcode_mismatch", 1),
            ("filter_adapters", 0), ("filter_min_trim_len", 35), ("ploidy", 2),
            ("max_stack_size", 1000), ("max_Ns_consens", (5, 5)),
            ("max_Hs_consens", (8, 8)), ("max_SNPs_locus", (100, 100)),
            ("max_Indels_locus", (5, 99)), ("trim_overhang", (1, 2, 2, 1)),
            ("hierarchical_clustering", 0), ("assembly_method", "denovo"),
            ("reference_sequence", "")
        ])
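
The defaults live in an OrderedDict so parameters keep a stable order and can be read or overridden by name. A minimal, self-contained sketch of that pattern (the keys shown are only a small subset of the full dict above):

## hypothetical, trimmed-down version of the paramsdict pattern shown above
import os
from collections import OrderedDict

paramsdict = OrderedDict([
    ("working_directory", os.path.realpath(os.path.curdir)),
    ("clust_threshold", .85),
    ("mindepth_majrule", 6),
])

## entries keep insertion order and are addressed by name
paramsdict["clust_threshold"] = 0.90
for key, val in paramsdict.items():
    print("{:<20} {}".format(key, val))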
Example 2
    def __init__(self, name=""):
        ## a sample name
        self.name = name
        self.barcode = ""
        self.merged = 0

        ## stats dictionary
        self.stats = pd.Series(index=[
            "state",
            "reads_raw",
            "reads_filtered",
            "refseq_mapped_reads",
            "refseq_unmapped_reads",
            "clusters_total",
            "clusters_kept",
            "hetero_est",
            "error_est",
            "reads_consens",
        ])

        ## link to files
        self.files = ObjDict({
            "fastqs": [],
            "edits": [],
            "mapped_reads": [],
            "unmapped_reads": [],
            "clusters": [],
            "depths": [],
            "consens": [],
            "database": []
        })

        ## store cluster depth information
        self.depths = ObjDict()
        self.depths.total = []
        self.depths.mjmin = []
        self.depths.statmin = []

        ## assignments for hierarchical clustering
        self.group = []
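
Per-sample statistics are stored in a pandas Series indexed by stat name, so individual fields can be assigned one at a time and printed as a labeled column. A minimal sketch of that pattern (the index below is a subset of the one above, and the counts are placeholder values):

## minimal sketch of the named-index stats Series used above
import pandas as pd

stats = pd.Series(index=["state", "reads_raw", "reads_filtered"], dtype=float)
stats["reads_raw"] = 20000
stats["reads_filtered"] = 18500
print(stats)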
Example 3
def consensus(args):
    """
    Reads in all copies at a locus from a clust file handle, sorts the
    bases at each site, tests each site for sequencing error according to
    the estimated error rate, and calls the consensus sequence.
    """

    ## unpack args
    data, sample, tmpchunk, point = args

    ## read in cluster file 2 lines at a time
    infile = gzip.open(tmpchunk)  #sample.files["clusters"])
    duo = itertools.izip(*[iter(infile)] * 2)

    ## store read depth info for later output files
    datadict = {}

    ## counters
    locus = 0
    minsamp_filtered = 0
    nheteros = 0

    ## iterate over clusters
    while 1:
        try:
            first = duo.next()
        except StopIteration:
            break
        itera = [first[0], first[1]]
        fname = itera[0].split(";")[0]

        ## local containers and counters for this locus
        locus += 1  ## recording n loci
        sloc = []  ## list for sequence data
        nloc = []  ## list for names used for gbs filters

        ## grab seqs until end of cluster
        while itera[0] != "//\n":
            ## append the sequence once per derep count
            nreps = int(itera[0].split(";")[-2].split("=")[1])
            for _ in xrange(nreps):
                sloc.append(tuple(itera[1].strip()))
                nloc.append(itera[0])
            ## move on to the next sequence
            itera = duo.next()

        ## now that all seqs in this loc are read in
        ## check that none overlap leftjust overhang if gbs
        if data.paramsdict["datatype"] in ['gbs', 'merged']:
            ## TODO: test these new changes to gbs filter
            ## edge filters
            leftjust = rightjust = None
            rights = []

            ## get leftjust and rights
            for i, j in zip(nloc, sloc):
                leftjust, rights = gbs_edgefilter(i, j, leftjust, rights)
            if rights:
                ## record in the name that there was a reverse hit
                fname = fname[:-2] + "c1"
                try:
                    rightjust = min([min(i) for i in rights])
                except ValueError:
                    sloc = ""

            for seq in xrange(len(sloc)):
                sloc[seq] = sloc[seq][leftjust:]
                if rightjust:
                    sloc[seq] = sloc[seq][:rightjust + 1]

        ## Apply depth filter
        if (len(sloc) >= data.paramsdict["mindepth_majrule"]) and \
           (len(sloc) <= data.paramsdict["max_stack_size"]):

            ## this loc passed the minsamp filter
            minsamp_filtered += 1

            ## get stacks of bases at each site
            arrayed = numpy.array(sloc)
            stacked = [Counter(seq) for seq in arrayed.T]

            ## apply functions to list of sites in stacked
            ## filter by site for paralogs and make consens calls
            consens = [filter2(data, site) for site in stacked]

            ## filtered by locus for paralog
            if "@" not in consens:
                ## get hetero sites
                heteros = [i[0] for i in enumerate(consens) \
                                  if i[1] in list("RKSYWM")]

                ## filter for max number of hetero sites
                exceedmaxploid = 0
                if len(heteros) <= data.paramsdict["max_Hs_consens"]:
                    ## filter for more than x alleles given ploidy. Only
                    ## relevant if locus is polymorphic at more than one site
                    if len(heteros) > 1:
                        consens, exceedmaxploid = filter3(
                            data, consens, heteros, sloc)

                    ## if the locus passed paralog filtering
                    if not exceedmaxploid:

                        consens = "".join(consens).replace("-", "N")
                        ## if a site is stripped then I need to remove the site
                        ## from the site counter (stacked)
                        shortconl = consens.lstrip("N")
                        if len(shortconl) < len(consens):
                            stacked = stacked[-len(shortconl):]
                        shortcon = consens.rstrip("N")
                        if len(shortcon) < len(shortconl):
                            stacked = stacked[:len(shortcon)]

                        ## remove low-coverage sites next to poly repeats that
                        ## are likely sequencing errors; this also edits
                        ## 'stacked' in place
                        shortcon, stacked = removerepeat_Ns(shortcon, stacked)

                        ## only allow maxN internal "N"s in a locus
                        if shortcon.count("N") <= int(
                                data.paramsdict["max_Ns_consens"]):
                            ## minimum length for clustering in vsearch
                            if len(shortcon) >= 32:
                                ## keep for counter
                                nheteros += len(heteros)

                                ## store the consens seq
                                #consdic[fname] = shortcon

                                ## create dataobj w/ name fname
                                dataobj = ObjDict()
                                ## store qual and depth data
                                dataobj.seq = shortcon  #[len(cut1):]
                                dataobj.Cs = [i["C"] for i in stacked]
                                dataobj.As = [i["A"] for i in stacked]
                                dataobj.Ts = [i["T"] for i in stacked]
                                dataobj.Gs = [i["G"] for i in stacked]
                                #Cs = [i["C"] for i in stacked]
                                #As = [i["A"] for i in stacked]
                                #Ts = [i["T"] for i in stacked]
                                #Gs = [i["G"] for i in stacked]
                                #dfconsens = pd.DataFrame([list(shortcon),
                                #                          Cs, As, Ts, Gs])
                                tag = "_".join(fname.split("_")[-2:])
                                datadict[tag] = dataobj
                                #datadict[tag] = dfconsens
                        else:
                            pass  #print "maxN filtered loc", locus
                    else:
                        pass  #print "ploid filtered loc", locus
                else:
                    pass  #print "maxH filtered loc", locus
            else:
                pass  #print "third base filtered loc", locus
        else:
            pass  #print "mindepth filtered loc", locus

    data.dirs.consens = os.path.join(data.dirs.clusts, "consens")

    #if not os.path.exists(os.path.join(...)):
    #    os.mkdir(consensdir)
    ## get filename
    consenshandle = ""  #os.path.join([consensdir, sample.name+"consens.gz"])

    ## write to file
    with gzip.open(consenshandle, 'wb') as outfile:
        outfile.write("\n".join([">"+sample.name+"_"+obj+"\n"+\
                                 datadict[obj].seq for obj in datadict]))
        #for obj in datadict:
        #    outfile.write(">"+sample.name+"_"+obj+"\n"+datadict[obj].seq+"\n")

    ## count the number of polymorphic sites
    if 'ddrad' in data.paramsdict["datatype"]:
        if 'pair' in data.paramsdict["datatype"]:
            sub = 4 + len(data.paramsdict["restriction_overhang"][0])
        else:
            sub = len(data.paramsdict["restriction_overhang"][0])
    elif 'gbs' in data.paramsdict["datatype"]:
        if 'pair' in data.paramsdict["datatype"]:
            sub = 4 + len(data.paramsdict["restriction_overhang"][0]) * 2
            #  (len(params["cut"])*2)
        else:
            sub = len(data.paramsdict["restriction_overhang"][0])
    elif data.paramsdict["datatype"] == "merged":
        sub = len(data.paramsdict["restriction_overhang"][0]) * 2
    else:
        sub = len(data.paramsdict["restriction_overhang"][0])
    nsites = sum([len(datadict[i].seq) - sub for i in datadict])
    ldic = len(datadict)
    try:
        poly = nheteros / float(nsites)
    except ZeroDivisionError:
        poly = 0.

    ## dump the quality score and depth info into a pickle
    #pickleout = gzip.open(handle.replace("clustS.gz", "bindata"), 'wb')
    #pickle.dump(datadict, pickleout)
    #pickleout.close()

    return [sample.name, locus, minsamp_filtered, ldic, nsites, nheteros, poly]
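
To make the per-site stacking step concrete, here is a small self-contained sketch of the same technique used above (transpose the read-by-site array, then count bases per column). The majority-rule call at the end is a simplification; the real filter2 also tests error and heterozygosity probabilities:

## self-contained illustration of the base-stacking step; reads are made up
import numpy
from collections import Counter

## four derep-expanded reads from one locus (the "sloc" list above)
sloc = [tuple("TGCAGAATT"),
        tuple("TGCAGAATT"),
        tuple("TGCAGACTT"),
        tuple("TGCAGAATT")]

## transpose so each row holds the bases observed at one site
arrayed = numpy.array(sloc)
stacked = [Counter(site) for site in arrayed.T]

## simple majority-rule call per site (stands in for filter2)
consens = "".join(max(site, key=site.get) for site in stacked)
print(consens)   # -> TGCAGAATT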