Example #1
    def __init__(self, name, controller="Local"):

        ## obj name
        self.name = name
        print("New Assembly object `{}` created".format(self.name))

        ## launch ipcluster and register for later destruction
        self.__ipname__ = ipcontroller_init(controller)

        ## get binaries of dependencies
        self.vsearch, self.muscle, self.smalt, self.samtools = getbins()

        ## link a log history of executed workflow
        self.log = []
        self._stamp(self.name + " created")
        self.statsfiles = ObjDict()

        ## samples linked
        self.samples = ObjDict()

        ## multiplex files linked
        self.barcodes = ObjDict()

        ## an object for storing data directories for this Assembly
        self.dirs = ObjDict()

        ## the default params dict
        self.paramsdict = OrderedDict([
            ("working_directory", os.path.realpath(os.path.curdir)),
            ("raw_fastq_path",
             os.path.join(os.path.realpath(os.path.curdir), "*.fastq")),
            ("barcodes_path",
             os.path.join(os.path.realpath(os.path.curdir), "*.barcodes.txt")),
            ("sorted_fastq_path", ""), ("restriction_overhang", ("TGCAG", "")),
            ("max_low_qual_bases", 5), ("engines_per_job", 4),
            ("mindepth_statistical", 6), ("mindepth_majrule", 6),
            ("datatype", 'rad'), ("clust_threshold", .85), ("minsamp", 4),
            ("max_shared_heterozygosity", .25), ("prefix_outname", self.name),
            ("phred_Qscore_offset", 33), ("max_barcode_mismatch", 1),
            ("filter_adapters", 0), ("filter_min_trim_len", 35), ("ploidy", 2),
            ("max_stack_size", 1000), ("max_Ns_consens", (5, 5)),
            ("max_Hs_consens", (8, 8)), ("max_SNPs_locus", (100, 100)),
            ("max_Indels_locus", (5, 99)), ("trim_overhang", (1, 2, 2, 1)),
            ("hierarchical_clustering", 0), ("assembly_method", "denovo"),
            ("reference_sequence", "")
        ])
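
A minimal usage sketch of the constructor above (a hypothetical session: the `ip` alias and the "data1" name are illustrative, and creating an Assembly assumes ipcontroller_init() can start or reach a local ipcluster):

import ipyrad as ip

## constructing the object prints a confirmation, stamps the log,
## and fills paramsdict with the defaults shown above
data1 = ip.Assembly("data1")

## parameter defaults are read straight from the OrderedDict
print(data1.paramsdict["mindepth_majrule"])      # 6
print(data1.paramsdict["restriction_overhang"])  # ('TGCAG', '')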
Example #2
    def __init__(self, name=""):
        ## a sample name
        self.name = name
        self.barcode = ""
        self.merged = 0

        ## stats dictionary
        self.stats = pd.Series(index=[
            "state",
            "reads_raw",
            "reads_filtered",
            "refseq_mapped_reads",
            "refseq_unmapped_reads",
            "clusters_total",
            "clusters_kept",
            "hetero_est",
            "error_est",
            "reads_consens",
        ])

        ## link to files
        self.files = ObjDict({
            "fastqs": [],
            "edits": [],
            "mapped_reads": [],
            "unmapped_reads": [],
            "clusters": [],
            "depths": [],
            "consens": [],
            "database": []
        })

        ## store cluster depth information
        self.depths = ObjDict()
        self.depths.total = []
        self.depths.mjmin = []
        self.depths.statmin = []

        ## assignments for hierarchical clustering
        self.group = []
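
A short sketch of how this Sample container is typically filled (hypothetical values; the sample name, barcode, and fastq file name are placeholders):

samp = Sample("1A_0")
samp.barcode = "CATCATCAT"
samp.stats["state"] = 1

## fastq files are stored as tuples so paired reads can share one entry
samp.files.fastqs.append(("1A_0_R1_.fastq.gz",))
samp.depths.total.append(12)

print(samp.stats)   # a pandas Series; fields that were never set remain NaN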
Example #3
    def __init__(self, name, controller="Local"):

        ## obj name
        self.name = name    
        print("New Assembly object `{}` created".format(self.name))

        ## launch ipcluster and register for later destruction
        self.__ipname__ = ipcontroller_init(controller)

        ## get binaries of dependencies
        self.vsearch, self.muscle, self.smalt, self.samtools = getbins()

        ## link a log history of executed workflow
        self.log = []
        self._stamp(self.name+" created")
        self.statsfiles = ObjDict()

        ## samples linked 
        self.samples = ObjDict()

        ## multiplex files linked
        self.barcodes = ObjDict()

        ## an object for storing data directories for this Assembly
        self.dirs = ObjDict()

        ## the default params dict
        self.paramsdict = OrderedDict([
                       ("working_directory", os.path.realpath(
                                                os.path.curdir)),
                       ("raw_fastq_path", os.path.join(
                                            os.path.realpath(
                                                 os.path.curdir),
                                                 "*.fastq")),
                       ("barcodes_path", os.path.join(
                                            os.path.realpath(
                                                os.path.curdir),
                                                "*.barcodes.txt")),
                       ("sorted_fastq_path", ""),
                       ("restriction_overhang", ("TGCAG", "")),
                       ("max_low_qual_bases", 5),
                       ("engines_per_job", 4),
                       ("mindepth_statistical", 6), 
                       ("mindepth_majrule", 6), 
                       ("datatype", 'rad'), 
                       ("clust_threshold", .85),
                       ("minsamp", 4), 
                       ("max_shared_heterozygosity", .25), 
                       ("prefix_outname", self.name),
                       ("phred_Qscore_offset", 33),
                       ("max_barcode_mismatch", 1),
                       ("filter_adapters", 0), 
                       ("filter_min_trim_len", 35), 
                       ("ploidy", 2), 
                       ("max_stack_size", 1000),
                       ("max_Ns_consens", (5, 5)), 
                       ("max_Hs_consens", (8, 8)), 
                       ("max_SNPs_locus", (100, 100)), 
                       ("max_Indels_locus", (5, 99)), 
                       ("trim_overhang", (1, 2, 2, 1)), 
                       ("hierarchical_clustering", 0),
                       ("assembly_method", "denovo"),
                       ("reference_sequence", "")
        ])
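
Continuing the hypothetical session from Example #1: the numeric parameter indices used by get_params()/set_params() simply follow the insertion order of this OrderedDict (1-based), which can be listed like this:

for index, key in enumerate(data1.paramsdict, start=1):
    print(index, key)
## 1 working_directory
## 2 raw_fastq_path
## 3 barcodes_path
## ...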
Example #4
class Assembly(object):
    """ An ipyrad Assembly class object.

    The core object in ipyrad used to store and retrieve results, to
    call assembly functions, and to link to Sample objects.

    Parameters
    ----------
    name : str
        A name should be passed when creating a new Assembly object.
        This name will be used as a prefix for all files saved to disk
        associated with this Assembly. It is automatically set as the
        prefix name (parameter 14).          


    Attributes
    ----------
    name : str
        A name for the Assembly object. Used for all saved files on disk.
    samples : dict
        Returns a dict with Sample names as keys and Sample objects as values.
    barcodes : dict
        Returns a dictionary with Sample names as keys and barcodes as values.
        The barcodes information is fetched from parameter 3
        `[Assembly].paramsdict['barcodes_path']`.
    vsearch : str
        The path to the default vsearch executable. If not found, this can be 
        changed by setting [Assembly].vsearch = [newpath].
    muscle : str
        The path to the default muscle executable. If not found, this can be 
        changed by setting `[Assembly].muscle = [newpath]`.
    smalt : str
        The path to the default smalt executable. If not found, this can be 
        changed by setting `[Assembly].smalt = [newpath]`.
    samtools : str
        The path to the default samtools executable. If not found, this can be 
        changed by setting `[Assembly].samtools = [newpath]`.
    log : list
        A list of all modifications to the Assembly object and its Samples with
        time stamps. Use `print [Assembly].log` for easier viewing.
    dirs : dict
        Returns a dictionary with the location of directories that contain 
        linked Sample object files and stats results.

        
    Returns
    -------
    object
         A new assembly object is returned.


     """


    def __init__(self, name, controller="Local"):

        ## obj name
        self.name = name    
        print("New Assembly object `{}` created".format(self.name))

        ## launch ipcluster and register for later destruction
        self.__ipname__ = ipcontroller_init(controller)

        ## get binaries of dependencies
        self.vsearch, self.muscle, self.smalt, self.samtools = getbins()

        ## link a log history of executed workflow
        self.log = []
        self._stamp(self.name+" created")
        self.statsfiles = ObjDict()

        ## samples linked 
        self.samples = ObjDict()

        ## multiplex files linked
        self.barcodes = ObjDict()

        ## an object for storing data directories for this Assembly
        self.dirs = ObjDict()

        ## the default params dict
        self.paramsdict = OrderedDict([
                       ("working_directory", os.path.realpath(
                                                os.path.curdir)),
                       ("raw_fastq_path", os.path.join(
                                            os.path.realpath(
                                                 os.path.curdir),
                                                 "*.fastq")),
                       ("barcodes_path", os.path.join(
                                            os.path.realpath(
                                                os.path.curdir),
                                                "*.barcodes.txt")),
                       ("sorted_fastq_path", ""),
                       ("restriction_overhang", ("TGCAG", "")),
                       ("max_low_qual_bases", 5),
                       ("engines_per_job", 4),
                       ("mindepth_statistical", 6), 
                       ("mindepth_majrule", 6), 
                       ("datatype", 'rad'), 
                       ("clust_threshold", .85),
                       ("minsamp", 4), 
                       ("max_shared_heterozygosity", .25), 
                       ("prefix_outname", self.name),
                       ("phred_Qscore_offset", 33),
                       ("max_barcode_mismatch", 1),
                       ("filter_adapters", 0), 
                       ("filter_min_trim_len", 35), 
                       ("ploidy", 2), 
                       ("max_stack_size", 1000),
                       ("max_Ns_consens", (5, 5)), 
                       ("max_Hs_consens", (8, 8)), 
                       ("max_SNPs_locus", (100, 100)), 
                       ("max_Indels_locus", (5, 99)), 
                       ("trim_overhang", (1, 2, 2, 1)), 
                       ("hierarchical_clustering", 0),
                       ("assembly_method", "denovo"),
                       ("reference_sequence", "")
        ])

    def __str__(self):
        return "<ipyrad.Assembly object {}>".format(self.name)

    @property
    def stats(self):
        """ Returns a data frame with Sample data and state. """
        nameordered = sorted(self.samples.keys())
        return pd.DataFrame([self.samples[i].stats for i in nameordered], 
                      index=nameordered).dropna(axis=1, how='all')
                      #dtype=[int, int, int, int, int, float, float, int])

    @property
    def files(self):
        """ Returns a data frame with Sample files. Not very readable... """
        nameordered = sorted(self.samples.keys())
        ## replace curdir with . for shorter printing
        #fullcurdir = os.path.realpath(os.path.curdir)
        return pd.DataFrame([self.samples[i].files for i in nameordered], 
                      index=nameordered).dropna(axis=1, how='all')


                      
    def _stamp(self, event):
        """ Stamps an event into the log history. """
        tev = time.strftime("%m/%d/%y %H:%M:%S", time.gmtime())
        self.log.append((self.name, tev, event))



    def link_fastqs(self, path=None, merged=False, force=False, append=False):
        """ Create Sample objects for samples in sorted_fastq_path.

        Note
        ----
        link_fastqs() is called automatically during step2() if no Samples
        are yet present in the Assembly object (data were not demultiplexed
        in step1().) It looks for demultiplexed data files located in the
        [sorted_fastq_path].


        Parameters
        ----------
        path : str
            Path to the fastq files to be linked to Sample objects. The default
            location is to select all files in the 'sorted_fastq_path'. 
            Alternatively a different path can be entered here. 

        merged : bool
            Set to True if files represent first and second reads that were 
            merged using some external software such as `PEAR` or `VSEARCH`. 

        append : bool
            The default action is to overwrite fastq files linked to Samples if 
            they already have linked files. Use append=True to instead append 
            additional fastq files to a Sample (file names should be formatted 
            the same as usual, e.g., [name]_R1_[optional].fastq.gz).

        Returns
        -------
        str
            Prints the number of new Sample objects created and the number of 
            fastq files linked to Sample objects in the Assembly object. 
        
        """
        ## cannot both force and append at once
        if force and append:
            raise Exception("Cannot use force and append at the same time.")

        if self.samples and not (force or append):
            raise Exception("Files already linked to `{}`. ".format(self.name)\
                +"Use force=True to replace all files, or append=True to "
                +"add additional files to existing Samples.")

        ## get path to data files
        if not path:
            path = self.paramsdict["sorted_fastq_path"]

        ## does location exist, if no files selected, try selecting all
        if os.path.isdir(path):
            path += "*"

        ## grab fastqs/fq/gzip/all
        fastqs = glob.glob(path)
        fastqs = [i for i in fastqs if i.endswith(".gz") \
                                    or i.endswith(".fastq") \
                                    or i.endswith(".fq")]

        ## sort alphabetical
        fastqs.sort()

        ## link pairs into tuples        
        if 'pair' in self.paramsdict["datatype"]:
            ## check that names fit the paired naming convention
            r1_files = [i for i in fastqs if "_R1_" in i]
            r2_files = [i.replace("_R1_", "_R2_") for i in r1_files]

            if not any(["_R1_" in i for i in fastqs]) or \
                   (len(r1_files) != len(r2_files)):
                raise Exception("File name format error: paired file names " \
                +"must be identical except for _R1_ and _R2_ in their names.")
            fastqs = [(i, j) for i, j in zip(r1_files, r2_files)]

        ## data are not paired, create empty tuple pair
        else:
            if any(["_R2_" in i for i in fastqs]):
                print("Given the presence of '_R2_' in file names, this "\
              +"is a warning that if your data are paired-end you should set "\
              +"the Assembly object datatype to a paired type (e.g., "\
              +"pairddrad or pairgbs) prior to running link_fastqs().")
            fastqs = [(i, ) for i in fastqs]

        ## counters for the printed output
        created = 0
        linked = 0
        appended = 0
        for fastqtuple in list(fastqs):
            assert isinstance(fastqtuple, tuple), "fastqs not a tuple."
            ## local counters
            createdinc = 0
            linkedinc = 0
            appendinc = 0
            ## remove file extension from name
            sname = _name_from_file(fastqtuple[0])

            if sname not in self.samples:
                ## create new Sample
                self.samples[sname] = Sample(sname)
                self.samples[sname].stats.state = 1
                self.samples[sname].barcode = None 
                self.samples[sname].files.fastqs.append(fastqtuple)
                createdinc += 1
                linkedinc += 1
            else:
                ## if not forcing, shouldn't be here with existing Samples
                if append:
                    if fastqtuple not in self.samples[sname].files.fastqs:
                        self.samples[sname].files.fastqs.append(fastqtuple)
                        appendinc += 1
                    else:
                        print("The files {} are already in Sample {}, "\
                              .format(fastqtuple, sname) \
                              +"cannot append duplicate files to a Sample.\n")
                elif force:
                    ## create new Sample
                    self.samples[sname] = Sample(sname)
                    self.samples[sname].stats.state = 1
                    self.samples[sname].barcode = None 
                    self.samples[sname].files.fastqs.append(fastqtuple)
                    createdinc += 1
                    linkedinc += 1
                else:
                    print("The files {} are already in Sample.".format(sname) \
                    + " Use append=True to append additional files to a Sample"\
                    + " or force=True to replace all existing Samples.")

            ## record whether data were merged.
            if merged:
                self.samples[sname].merged = 1

            ## do not allow merged=False and .forward in file names
            if (merged == False) and ('forward' in fastqtuple[0]):
                print(\
                "If R1 and R2 data are merged (e.g., with PEAR) " \
              + "use link_fastqs(merge=True) to indicate this. You " \
              + "may need force=True to overwrite existing files.\n")

            ## if fastqs already demultiplexed, try to link stats
            if any([linkedinc, createdinc, appendinc]):
                gzipped = bool(fastqtuple[0].endswith(".gz"))
                nreads = 0
                ## iterate over files if there are multiple
                for alltuples in self.samples[sname].files.fastqs:
                    nreads += bufcount(alltuples[0], gzipped)
                self.samples[sname].stats.reads_raw = nreads/4
                created += createdinc
                linked += linkedinc
                appended += appendinc

        ## print if data were linked
        print("{} new Samples created in `{}`.".format(created, self.name))
        if linked:
            print("{} fastq files linked to {} new Samples.".\
                  format(linked, len(self.samples)))
        if appended:
            print("{} fastq files appended to {} existing Samples.".\
                  format(appended, len(self.samples)))



    def link_barcodes(self):
        """ creates a self.barcodes object to save barcodes info 
            as a dictionary, if there is a barcodes file in 
            self.paramsdict["barcodes_path"] """
        ## in case fuzzy selected
        try: 
            barcodefile = glob.glob(self.paramsdict["barcodes_path"])[0]
        except IndexError: 
            print("Barcodes file not found:", self.paramsdict["barcodes_path"])

        ## parse barcodefile
        bdf = pd.read_csv(barcodefile, header=None, delim_whitespace=1)
        bdf = bdf.dropna()
        ## make sure upper case
        bdf[1] = bdf[1].str.upper()
        ## set attribute on Assembly object
        self.barcodes = dict(zip(bdf[0], bdf[1]))

            # ## for each barcode create a Sample
            # for key in self.barcodes:
            #     samp = Sample(key)
            #     samp.state = 0
            #     samp.barcode = self.barcodes[key]
            #     if samp not in self.samples:
            #         self.samples[samp.name] = samp


    def get_params(self, param=""):
        """ pretty prints params if called as a function """
        fullcurdir = os.path.realpath(os.path.curdir)
        if not param:
            for index, (key, value) in enumerate(self.paramsdict.items()):
                if isinstance(value, str):
                    value = value.replace(fullcurdir, ".")
                sys.stdout.write("  {:<4}{:<28}{:<45}\n".format(index+1,
                           key, value))
        else:
            try:
                if int(param):
                    #sys.stdout.write(self.paramsdict.values()[int(param)-1])
                    return list(self.paramsdict.values())[int(param)-1]
            except (ValueError, TypeError, NameError, IndexError):
                return 'key not recognized'


        #def save(self, name=""):
        #    if not name:
        #        print("must enter a filename for saved object")
        #    else:
        #        json.dumps(self)


    def set_params(self, param, newvalue):
        """ Set a parameter to a new value. Raises error if newvalue 
        is wrong type.

        Note
        ----
        Use [Assembly].get_params() to see the parameter values currently
        linked to the Assembly object.

        Parameters
        ----------
        param : int or str
            The index (e.g., 1) or string name (e.g., "working_directory")
            for the parameter that will be changed.

        newvalue : int, str, or tuple
            The new value for the parameter selected for `param`. Use
            `ipyrad.get_params_info()` to get further information about
            a given parameter. If the wrong type is entered for newvalue
            (e.g., a str when it should be an int), an error will be raised.
            Further information about each parameter is also available
            in the documentation.

        Examples
        --------
        ## param 1 takes only a str as input
        [Assembly].set_params(1, 'new_directory')
        [Assembly].set_params('working_directory', 'new_directory')

        ## param 6 must be a tuple or str, if str it is converted to a tuple
        ## with the second entry empty.
        [Assembly].set_params(6, 'TGCAG')
        [Assembly].set_params('restriction_overhang', ('CTGCAG', 'CCGG'))

        ## param 13 can be an int or a float:
        [Assembly].set_params(13, 4)
        [Assembly].set_params('max_shared_heterozygosity', 0.25)
            
        """

        ## require parameter recognition
        assert (param in range(50)) or \
               (param in [str(i) for i in range(50)]) or \
               (param in self.paramsdict.keys()), \
            "Parameter key not recognized: `{}`.".format(param)

        ## make string
        param = str(param)

        ## if matching
        if param in ['1', 'working_directory']:
            self.paramsdict['working_directory'] = expander(newvalue)
            self._stamp("[1] set to "+newvalue)
            self.dirs["working"] = self.paramsdict["working_directory"]


        elif param in ['2', 'raw_fastq_path']:
            fullrawpath = expander(newvalue)
            if os.path.isdir(fullrawpath):
                fullrawpath = os.path.join(fullrawpath, "*.gz")
            self.paramsdict['raw_fastq_path'] = fullrawpath
            self._stamp("[2] set to "+newvalue)
            #if not self.paramdict["raw_fastq_path"]:
            self.dirs["fastqs"] = os.path.dirname(
                                     self.paramsdict["raw_fastq_path"])


        elif param in ['3', 'barcodes_path']:
            #assert type(newvalue) is StringType, "arg must be a string"
            fullbarpath = expander(newvalue)
            if glob.glob(fullbarpath):
                self.paramsdict['barcodes_path'] = fullbarpath
                self.link_barcodes()
                self._stamp("[3] set to "+newvalue)
            elif not fullbarpath:
                self.paramsdict['barcodes_path'] = fullbarpath                
                self._stamp("[3] set to empty")
            else:
                print('cannot find barcodes file')


        elif param in ['4', 'sorted_fastq_path']:
            assert isinstance(newvalue, str), \
            "sorted_fastq_path must be a string, e.g., /home/data/fastqs/*"
            newvalue = expander(newvalue)
            if os.path.isdir(newvalue):
                newvalue = os.path.join(newvalue, "*.gz")
            self.paramsdict['sorted_fastq_path'] = newvalue
            ## link_fastqs will check that files exist
            #self.link_fastqs()
            self._stamp("[4] set to "+newvalue)
            self.dirs["fastqs"] = os.path.dirname(
                                   self.paramsdict["sorted_fastq_path"])


        elif param in ['5', 'restriction_overhang']:
            newvalue = tuplecheck(newvalue, str)                        
            assert isinstance(newvalue, tuple), \
            "cut site must be a tuple, e.g., (TGCAG, '') or (TGCAG, CCGG)"
            self.paramsdict['restriction_overhang'] = newvalue
            self._stamp("[5] set to "+str(newvalue))


        elif param in ['6', 'max_low_qual_bases']:
            self.paramsdict['max_low_qual_bases'] = int(newvalue)
            self._stamp("[6] set to "+str(newvalue))


        elif param in ['7', "engines_per_job"]:
            self.paramsdict['engines_per_job'] = int(newvalue)
            self._stamp("[7] set to "+str(newvalue))


        elif param in ['8', 'mindepth_statistical']:
            ## do not allow values below 5
            if int(newvalue) < 5:
                print("error: mindepth statistical cannot be set < 5")
            ## do not allow majrule to be > statistical
            elif int(newvalue) < self.paramsdict["mindepth_majrule"]:
                print("error: mindepth statistical cannot be less than \
                       mindepth_majrule")                
            else:
                self.paramsdict['mindepth_statistical'] = int(newvalue)
                self._stamp("[8] set to "+str(newvalue))


        elif param in ['9', 'mindepth_majrule']:
            if int(newvalue) > self.paramsdict["mindepth_statistical"]:
                print("error: mindepth_majrule cannot be > \
                       mindepth_statistical")
            else:
                self.paramsdict['mindepth_majrule'] = int(newvalue)
                self._stamp("[9] set to "+str(newvalue))


        elif param in ['10', 'datatype']:
            ## list of allowed datatypes
            datatypes = ['rad', 'gbs', 'ddrad', 'pairddrad',
                         'pairgbs', 'merged', '2brad']
            ## raise error if something else
            if str(newvalue) not in datatypes:
                print("error: datatype not recognized")
            else:
                self.paramsdict['datatype'] = str(newvalue)
                self._stamp("[10] set to "+newvalue)


        elif param in ['11', 'clust_threshold']:
            self.paramsdict['clust_threshold'] = float(newvalue)
            self._stamp("[11] set to {}".format(newvalue))


        elif param in ['12', 'minsamp']:
            self.paramsdict['minsamp'] = int(newvalue)
            self._stamp("[12] set to {}".format(int(newvalue)))


        elif param in ['13', 'max_shared_heterozygosity']:
            self.paramsdict['max_shared_heterozygosity'] = newvalue
            self._stamp("[13] set to {}".format(newvalue))


        elif param in ['14', 'prefix_outname']:
            self.paramsdict['prefix_outname'] = newvalue
            self._stamp("[14] set to {}".format(newvalue))


        elif param in ['15', 'phred_Qscore_offset']:
            self.paramsdict['phred_Qscore_offset'] = int(newvalue)
            self._stamp("[15] set to {}".format(int(newvalue)))


        elif param in ['16', 'max_barcode_mismatch']:
            self.paramsdict['max_barcode_mismatch'] = int(newvalue)
            self._stamp("[16] set to {}".format(int(newvalue)))

        ### ....
        elif param in ['17', 'filter_adapters']:
            self.paramsdict['filter_adapters'] = int(newvalue)
            self._stamp("[17] set to "+str(newvalue))


        elif param in ['18', 'filter_min_trim_len']:
            self.paramsdict['filter_min_trim_len'] = int(newvalue)
            self._stamp("[18] set to {}".format(int(newvalue)))


        elif param in ['19', 'ploidy']:
            self.paramsdict['ploidy'] = int(newvalue)
            self._stamp("[19] set to {}".format(int(newvalue)))


        elif param in ['20', 'max_stack_size']:
            self.paramsdict['max_stack_size'] = int(newvalue)
            self._stamp("[20] set to {}".format(int(newvalue)))


        elif param in ['21', 'max_Ns_consens']:
            newvalue = tuplecheck(newvalue)                        
            assert isinstance(newvalue, tuple), \
            "max_Ns_consens should be a tuple e.g., (8,8)"
            self.paramsdict['max_Ns_consens'] = newvalue
            self._stamp("[21] set to {}".format(newvalue))


        elif param in ['22', 'max_Hs_consens']:
            newvalue = tuplecheck(newvalue)                        
            assert isinstance(newvalue, tuple), \
            "max_Hs_consens should be a tuple e.g., (1,2,2,1)"
            self.paramsdict['max_Hs_consens'] = newvalue
            self._stamp("[22] set to {}".format(newvalue))


        elif param in ['23', 'max_SNPs_locus']:
            newvalue = tuplecheck(newvalue)                        
            assert isinstance(newvalue, tuple), \
            "max_SNPs_locus should be a tuple e.g., (20,20)"
            self.paramsdict['max_SNPs_locus'] = newvalue
            self._stamp("[23] set to {}".format(newvalue))


        elif param in ['24', 'max_Indels_locus']:
            newvalue = tuplecheck(newvalue)            
            assert isinstance(newvalue, tuple), \
            "max_Indels_locus should be a tuple e.g., (5, 100)" 
            self.paramsdict['max_Indels_locus'] = newvalue
            self._stamp("[24] set to {}".format(newvalue))


        elif param in ['25', 'trim_overhang']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
            "trim_overhang should be a tuple e.g., (1,2,2,1)"
            self.paramsdict['trim_overhang'] = newvalue
            self._stamp("[25] set to {}".format(newvalue))


        elif param in ['27', 'assembly_method']:
            self.paramsdict['assembly_method'] = newvalue
            LOGGER.info("assembly method set to %s", newvalue)
            assert self.paramsdict['assembly_method'] in \
                              ["denovo", "reference", "hybrid"], \
                 "The assembly_method option must be one of the following: "+\
                 "denovo, reference, or hybrid."
            self._stamp("[27] set to {}".format(newvalue))

        elif param in ['28', 'reference_sequence']:
            fullrawpath = expander(newvalue)
            if not os.path.isfile(fullrawpath):
                raise Exception(\
            "Reference sequence file not found. This must be an absolute path "\
            +"(/home/wat/ipyrad/data/referece.gz) or a path relative to the "\
            +"directory where you're running ipyrad (./data/reference.gz). ")
            self.paramsdict['reference_sequence'] = fullrawpath
            self._stamp("[28] set to "+fullrawpath)


    def copy(self, newname):
        """ Returns a copy of the Assemlbly object. Does not allow Assembly 
        object names to be replicated in namespace or path. """
        if (newname == self.name) or (os.path.exists(newname+".assembly")):
            print("Assembly object named {} already exists".format(newname))
        else:
            ## create a copy of the Assembly obj
            newobj = copy.deepcopy(self)
            newobj.name = newname
            newobj.set_params(14, newname)

            ## create copies of each Sample obj
            for sample in self.samples:
                newobj.samples[sample] = copy.deepcopy(self.samples[sample])
            return newobj



    def file_tree(self):
        """ prints the project data structure. TODO: this needs work.
        prints way too much other junk if [work] is home dir. """
        startpath = self.paramsdict["working_directory"]
        if startpath in [".", "", "./", os.path.expanduser(startpath)]:
            print("./")
        else:
            for root, _, files in os.walk(startpath):
                level = root.replace(startpath, '').count(os.sep)
                indent = ' ' * 4 * (level)
                print('{}{}/'.format(indent, os.path.basename(root)))
                subindent = ' ' * 4 * (level + 1)
                for fname in files:
                    print('{}{}'.format(subindent, fname))



    def _save(self):
        """ Pickle the Assembly object. Could be used for checkpointing before
        and after assembly steps. Currently it is called after assembly steps.
        """
        dillout = open(os.path.join(
                          self.paramsdict["working_directory"],
                          self.name+".assembly"), "wb")
        dill.dump(self, dillout)
        dillout.close()



    def step1(self, preview=0):
        """ step 1: demultiplex raw reads """

        ## launch parallel client within guarded statement
        try: 
            ipyclient = ipp.Client(cluster_id=self.__ipname__)

            if not self.samples:
                assemble.demultiplex.run(self, preview, ipyclient)
                self._stamp("s1_demultiplexing:")
            else:
                print("Samples already found in `{}`.".format(self.name) \
                    + "Use ip.merge() to combine samples \nfrom multiple " \
                    + "Assembly objects.\n")
        except (KeyboardInterrupt, SystemExit, AttributeError):
            logging.error("assembly step1 interrupted.")
            raise

        ## close client when done or if interrupted
        finally:
            ipyclient.shutdown(block=1)
            ipyclient.close()

        ## pickle the data obj
        self._save()


    ## TODO: make a step Class object
    def step2(self, samples="", preview=0, force=False):
        """ step 2: edit raw reads. Takes dictionary keys (sample names)
        either individually, or as a list, or it takes no argument to 
        select all samples in the Assembly object. Only samples in state
        =1 will be edited, all others are skipped. To overwrite data
        use the argument force=True. 

        """

        ## launch parallel client within guarded statement
        ipyclient = ipp.Client(cluster_id=self.__ipname__)
        try:

            if samples:
                ## if sample key, replace with sample obj
                assert isinstance(samples, list), \
                "to subselect samples enter as a list, e.g., [A, B]."
                for sample in samples:
                    ## get sample from dict key
                    sample = self.samples[sample]
                    assemble.rawedit.run(self, sample, ipyclient, force)
            else:
                ## TODO: Remove return of client
                assert self.samples, "No Samples in " + self.name
                for _, sample in self.samples.items():
                    assemble.rawedit.run(self, sample, ipyclient, force)


        except (KeyboardInterrupt, AttributeError, SystemExit):
            LOGGER.error("assembly step2 interrupted!")
            raise
            
        ## close parallel client if done or interrupted
        finally:
            logging.info("assembly step2 cleaning up.")
            ipyclient.shutdown(block=1)
            ipyclient.close()

        ## checkpoint the data obj
        self._save()



    def step3(self, samples=None, preview=0, noreverse=0, force=False):
        """ step 3: clustering within samples """

        ## Require reference seq for reference-based methods
        if self.paramsdict['assembly_method'] != "denovo":
            assert self.paramsdict['reference_sequence'], \
            "Reference or hybrid assembly requires a value for "+\
            "reference_sequence_path paramter."

            ## index the reference sequence
            index_reference_sequence(self)

        ## launch parallel client
        ipyclient = ipp.Client(cluster_id=self.__ipname__)

        try:
            ## sampling
            if samples:

                ## if string make a list(tuple)
                assert isinstance(samples, list), \
                "to subselect samples enter as a list, e.g., [A, B]."

                ## make into a tuple list with (key, sample)
                ## allows for names as keys or Sample objects
                subsamples = []
                for sample in samples:
                    if self.samples.get(sample):
                        subsamples.append((sample, self.samples[sample]))

                if subsamples:
                    print("Clustering {} samples using {} engines per job.".\
                      format(len(samples), self.paramsdict["engines_per_job"]))
                    ## run
                    assemble.cluster_within.run(self, subsamples, ipyclient, 
                                                preview, noreverse, force)
                else:
                    print("No samples found. Check that names are correct")
            else:
                ## if no samples selected and no samples exist
                assert self.samples, "no Samples found in {}".format(self.name)
                
                ## print to screen
                print("clustering {} samples using {} engines per job".\
                  format(len(self.samples), self.paramsdict["engines_per_job"]))
                ## run
                assemble.cluster_within.run(self, self.samples.items(), 
                                        ipyclient, preview, noreverse, force)

        except (KeyboardInterrupt, SystemExit):
            print("assembly step3 interrupted")
            raise
        ## close parallel client if done or interrupted
        finally:
            ipyclient.close()
            if preview:
                print(".")

        ## pickle the data object
        self._save()



    def step4(self, samples=None, preview=0, force=False, subsample=None):
        """ step 4: Joint estimation of error rate and heterozygosity. 
        If you want to overwrite data for a file, first set its state to 3:
        data.samples['sample'].stats['state'] = 3 """

        ## launch parallel client
        ipyclient = ipp.Client(cluster_id=self.__ipname__)

        try: 
            ## sampling
            if samples:
                ## make a list keys or samples
                if isinstance(samples, str):
                    samples = list([samples])
                else:
                    samples = list(samples)

                ## if keys are in list
                if any([isinstance(i, str) for i in samples]):
                    ## make into a subsampled sample dict
                    subsamples = {i: self.samples[i] for i in samples}

                ## send to function
                assemble.jointestimate.run(self, subsamples.values(), 
                                           ipyclient, force, subsample)
            else:
                ## if no sample, then do all samples
                if not self.samples:
                    ## if no samples in data, try linking edits from working dir
                    #self.link_clustfiles()
                    if not self.samples:
                        print("Assembly object has no samples in state 3.")
                ## run clustering for all samples
                assemble.jointestimate.run(self, self.samples.values(), 
                                           ipyclient, force, subsample)

        except (KeyboardInterrupt, SystemExit):
            print("assembly step4 interrupted")
            raise
        ## close parallel client if done or interrupted
        finally:
            ipyclient.close()
            if preview:
                print(".")

        ## pickle the data object
        self._save()




    def step5(self, samples="", preview=0):
        """ step 5: Consensus base calling from clusters within samples.
        If you want to overwrite data for a file, first set its state to 
        3 or 4. e.g., data.samples['sample'].stats['state'] = 3 """

        ## sampling
        if samples:
            ## make a list keys or samples
            if isinstance(samples, str):
                samples = list([samples])
            else:
                samples = list(samples)

            ## if keys are in list
            if any([isinstance(i, str) for i in samples]):
                ## make into a subsampled sample dict
                subsamples = {i: self.samples[i] for i in samples}

            ## send to function
            assemble.consens_se.run(self, subsamples.values())
        else:
            ## if no sample, then do all samples
            if not self.samples:
                ## if no samples in data, try linking edits from working dir
                #self.link_clustfiles()
                if not self.samples:
                    print("Assembly object has no samples in state=3")
            ## run clustering for all samples
            assemble.consens_se.run(self, self.samples.values())

        ## pickle the data object
        self._save()


    def run(self, steps=0, force=False, preview=False):
        """ Select steps of an analysis. If no steps are entered then all
        steps are run. Enter steps as a string, e.g., "1", "123", "12345" """
        if not steps:
            steps = "12345"
        else:
            steps = str(steps)
        if '1' in steps:
            self.step1(preview=preview)
        if '2' in steps:
            self.step2(force=force, preview=preview)
        if '3' in steps:
            self.step3(force=force, preview=preview)
        if '4' in steps:
            self.step4(force=force, preview=preview)
        if '5' in steps:
            self.step5(preview=preview)
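
Putting the pieces of this class together, a hedged end-to-end sketch might look like the following (the paths are placeholders, an ipcluster must be reachable for the parallel steps, and set_params() on barcodes_path expects the barcodes file to exist):

import ipyrad as ip

data1 = ip.Assembly("data1")
data1.set_params("working_directory", "./analysis")
data1.set_params("raw_fastq_path", "./raw/*.fastq.gz")
data1.set_params("barcodes_path", "./raw/barcodes.txt")
data1.get_params()            # pretty-print the current settings

## demultiplex and edit reads (steps 1 and 2 only)
data1.run("12")

## per-sample results accumulate in the stats data frame
print(data1.stats)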
Example #5
class Assembly(object):
    """ An ipyrad Assembly class object.

    The core object in ipyrad used to store and retrieve results, to
    call assembly functions, and to link to Sample objects.

    Parameters
    ----------
    name : str
        A name should be passed when creating a new Assembly object.
        This name will be used as a prefix for all files saved to disk
        associated with this Assembly. It is automatically set as the
        prefix name (parameter 14).          


    Attributes
    ----------
    name : str
        A name for the Assembly object. Used for all saved files on disk.
    samples : dict
        Returns a dict with Sample names as keys and Sample objects as values.
    barcodes : dict
        Returns a dictionary with Sample names as keys and barcodes as values.
        The barcodes information is fetched from parameter 3
        `[Assembly].paramsdict['barcodes_path']`.
    vsearch : str
        The path to the default vsearch executable. If not found, this can be 
        changed by setting [Assembly].vsearch = [newpath].
    muscle : str
        The path to the default muscle executable. If not found, this can be 
        changed by setting `[Assembly].muscle = [newpath]`.
    smalt : str
        The path to the default smalt executable. If not found, this can be 
        changed by setting `[Assembly].smalt = [newpath]`.
    samtools : str
        The path to the default samtools executable. If not found, this can be 
        changed by setting `[Assembly].samtools = [newpath]`.
    log : list
        A list of all modifications to the Assembly object and its Samples with
        time stamps. Use `print [Assembly].log` for easier viewing.
    dirs : dict
        Returns a dictionary with the location of directories that contain 
        linked Sample object files and stats results.

        
    Returns
    -------
    object
         A new assembly object is returned.


     """
    def __init__(self, name, controller="Local"):

        ## obj name
        self.name = name
        print("New Assembly object `{}` created".format(self.name))

        ## launch ipcluster and register for later destruction
        self.__ipname__ = ipcontroller_init(controller)

        ## get binaries of dependencies
        self.vsearch, self.muscle, self.smalt, self.samtools = getbins()

        ## link a log history of executed workflow
        self.log = []
        self._stamp(self.name + " created")
        self.statsfiles = ObjDict()

        ## samples linked
        self.samples = ObjDict()

        ## multiplex files linked
        self.barcodes = ObjDict()

        ## an object for storing data directories for this Assembly
        self.dirs = ObjDict()

        ## the default params dict
        self.paramsdict = OrderedDict([
            ("working_directory", os.path.realpath(os.path.curdir)),
            ("raw_fastq_path",
             os.path.join(os.path.realpath(os.path.curdir), "*.fastq")),
            ("barcodes_path",
             os.path.join(os.path.realpath(os.path.curdir), "*.barcodes.txt")),
            ("sorted_fastq_path", ""), ("restriction_overhang", ("TGCAG", "")),
            ("max_low_qual_bases", 5), ("engines_per_job", 4),
            ("mindepth_statistical", 6), ("mindepth_majrule", 6),
            ("datatype", 'rad'), ("clust_threshold", .85), ("minsamp", 4),
            ("max_shared_heterozygosity", .25), ("prefix_outname", self.name),
            ("phred_Qscore_offset", 33), ("max_barcode_mismatch", 1),
            ("filter_adapters", 0), ("filter_min_trim_len", 35), ("ploidy", 2),
            ("max_stack_size", 1000), ("max_Ns_consens", (5, 5)),
            ("max_Hs_consens", (8, 8)), ("max_SNPs_locus", (100, 100)),
            ("max_Indels_locus", (5, 99)), ("trim_overhang", (1, 2, 2, 1)),
            ("hierarchical_clustering", 0), ("assembly_method", "denovo"),
            ("reference_sequence", "")
        ])

    def __str__(self):
        return "<ipyrad.Assembly object {}>".format(self.name)

    @property
    def stats(self):
        """ Returns a data frame with Sample data and state. """
        nameordered = sorted(self.samples.keys())
        return pd.DataFrame([self.samples[i].stats for i in nameordered],
                            index=nameordered).dropna(axis=1, how='all')
        #dtype=[int, int, int, int, int, float, float, int])

    @property
    def files(self):
        """ Returns a data frame with Sample files. Not very readable... """
        nameordered = sorted(self.samples.keys())
        ## replace curdir with . for shorter printing
        #fullcurdir = os.path.realpath(os.path.curdir)
        return pd.DataFrame([self.samples[i].files for i in nameordered],
                            index=nameordered).dropna(axis=1, how='all')

    def _stamp(self, event):
        """ Stamps an event into the log history. """
        tev = time.strftime("%m/%d/%y %H:%M:%S", time.gmtime())
        self.log.append((self.name, tev, event))

    def link_fastqs(self, path=None, merged=False, force=False, append=False):
        """ Create Sample objects for samples in sorted_fastq_path.

        Note
        ----
        link_fastqs() is called automatically during step2() if no Samples
        are yet present in the Assembly object (data were not demultiplexed
        in step1().) It looks for demultiplexed data files located in the
        [sorted_fastq_path].


        Parameters
        ----------
        path : str
            Path to the fastq files to be linked to Sample objects. The default
            location is to select all files in the 'sorted_fastq_path'. 
            Alternatively a different path can be entered here. 

        merged : bool
            Set to True if files represent first and second reads that were 
            merged using some external software such as `PEAR` or `VSEARCH`. 

        append : bool
            The default action is to overwrite fastq files linked to Samples if 
            they already have linked files. Use append=True to instead append 
            additional fastq files to a Sample (file names should be formatted 
            the same as usual, e.g., [name]_R1_[optional].fastq.gz).

        Returns
        -------
        str
            Prints the number of new Sample objects created and the number of 
            fastq files linked to Sample objects in the Assembly object. 
        
        """
        ## cannot both force and append at once
        if force and append:
            raise Exception("Cannot use force and append at the same time.")

        if self.samples and not (force or append):
            raise Exception("Files already linked to `{}`. ".format(self.name)\
                +"Use force=True to replace all files, or append=True to "
                +"add additional files to existing Samples.")

        ## get path to data files
        if not path:
            path = self.paramsdict["sorted_fastq_path"]

        ## does location exist, if no files selected, try selecting all
        if os.path.isdir(path):
            path += "*"

        ## grab fastqs/fq/gzip/all
        fastqs = glob.glob(path)
        fastqs = [i for i in fastqs if i.endswith(".gz") \
                                    or i.endswith(".fastq") \
                                    or i.endswith(".fq")]

        ## sort alphabetical
        fastqs.sort()

        ## link pairs into tuples
        if 'pair' in self.paramsdict["datatype"]:
            ## check that names fit the paired naming convention
            r1_files = [i for i in fastqs if "_R1_" in i]
            r2_files = [i.replace("_R1_", "_R2_") for i in r1_files]

            if not any(["_R1_" in i for i in fastqs]) or \
                   (len(r1_files) != len(r2_files)):
                raise Exception("File name format error: paired file names " \
                +"must be identical except for _R1_ and _R2_ in their names.")
            fastqs = [(i, j) for i, j in zip(r1_files, r2_files)]

        ## data are not paired, create empty tuple pair
        else:
            if any(["_R2_" in i for i in fastqs]):
                print("Given the presence of '_R2_' in file names, this "\
              +"is a warning that if your data are paired-end you should set "\
              +"the Assembly object datatype to a paired type (e.g., "\
              +"pairddrad or pairgbs) prior to running link_fastqs().")
            fastqs = [(i, ) for i in fastqs]

        ## counters for the printed output
        created = 0
        linked = 0
        appended = 0
        for fastqtuple in list(fastqs):
            assert isinstance(fastqtuple, tuple), "fastqs not a tuple."
            ## local counters
            createdinc = 0
            linkedinc = 0
            appendinc = 0
            ## remove file extension from name
            sname = _name_from_file(fastqtuple[0])

            if sname not in self.samples:
                ## create new Sample
                self.samples[sname] = Sample(sname)
                self.samples[sname].stats.state = 1
                self.samples[sname].barcode = None
                self.samples[sname].files.fastqs.append(fastqtuple)
                createdinc += 1
                linkedinc += 1
            else:
                ## if not forcing, shouldn't be here with existing Samples
                if append:
                    if fastqtuple not in self.samples[sname].files.fastqs:
                        self.samples[sname].files.fastqs.append(fastqtuple)
                        appendinc += 1
                    else:
                        print("The files {} are already in Sample {}, "\
                              .format(fastqtuple, sname) \
                              +"cannot append duplicate files to a Sample.\n")
                elif force:
                    ## create new Sample
                    self.samples[sname] = Sample(sname)
                    self.samples[sname].stats.state = 1
                    self.samples[sname].barcode = None
                    self.samples[sname].files.fastqs.append(fastqtuple)
                    createdinc += 1
                    linkedinc += 1
                else:
                    print("The files {} are already in Sample.".format(sname) \
                    + " Use append=True to append additional files to a Sample"\
                    + " or force=True to replace all existing Samples.")

            ## record whether data were merged.
            if merged:
                self.samples[sname].merged = 1

            ## do not allow merged=False and .forward in file names
            if (merged == False) and ('forward' in fastqtuple[0]):
                print(\
                "If R1 and R2 data are merged (e.g., with PEAR) " \
              + "use link_fastqs(merge=True) to indicate this. You " \
              + "may need force=True to overwrite existing files.\n")

            ## if fastqs already demultiplexed, try to link stats
            if any([linkedinc, createdinc, appendinc]):
                gzipped = bool(fastqtuple[0].endswith(".gz"))
                nreads = 0
                ## iterate over files if there are multiple
                for alltuples in self.samples[sname].files.fastqs:
                    nreads += bufcount(alltuples[0], gzipped)
                self.samples[sname].stats.reads_raw = nreads / 4
                created += createdinc
                linked += linkedinc
                appended += appendinc

        ## print if data were linked
        print("{} new Samples created in `{}`.".format(created, self.name))
        if linked:
            print("{} fastq files linked to {} new Samples.".\
                  format(linked, len(self.samples)))
        if appended:
            print("{} fastq files appended to {} existing Samples.".\
                  format(appended, len(self.samples)))

    def link_barcodes(self):
        """ creates a self.barcodes object to save barcodes info 
            as a dictionary, if there is a barcodes file in 
            self.paramsdict["barcodes_path"] """
        ## in case fuzzy selected
        try:
            barcodefile = glob.glob(self.paramsdict["barcodes_path"])[0]
        except IndexError:
            print("Barcodes file not found:", self.paramsdict["barcodes_path"])

        ## parse barcodefile
        bdf = pd.read_csv(barcodefile, header=None, delim_whitespace=1)
        bdf = bdf.dropna()
        ## make sure upper case
        bdf[1] = bdf[1].str.upper()
        ## set attribute on Assembly object
        self.barcodes = dict(zip(bdf[0], bdf[1]))

        # ## for each barcode create a Sample
        # for key in self.barcodes:
        #     samp = Sample(key)
        #     samp.state = 0
        #     samp.barcode = self.barcodes[key]
        #     if samp not in self.samples:
        #         self.samples[samp.name] = samp

    def get_params(self, param=""):
        """ pretty prints params if called as a function """
        fullcurdir = os.path.realpath(os.path.curdir)
        if not param:
            for index, (key, value) in enumerate(self.paramsdict.items()):
                if isinstance(value, str):
                    value = value.replace(fullcurdir, ".")
                sys.stdout.write("  {:<4}{:<28}{:<45}\n".format(
                    index + 1, key, value))
        else:
            try:
                if int(param):
                    #sys.stdout.write(self.paramsdict.values()[int(param)-1])
                    return list(self.paramsdict.values())[int(param) - 1]
            except (ValueError, TypeError, NameError, IndexError):
                return 'key not recognized'

        #def save(self, name=""):
        #    if not name:
        #        print("must enter a filename for saved object")
        #    else:
        #        json.dumps(self)

    def set_params(self, param, newvalue):
        """ Set a parameter to a new value. Raises error if newvalue 
        is wrong type.

        Note
        ----
        Use [Assembly].get_params() to see the parameter values currently
        linked to the Assembly object.

        Parameters
        ----------
        param : int or str
            The index (e.g., 1) or string name (e.g., "working_directory")
            for the parameter that will be changed.

        newvalue : int, str, or tuple
            The new value for the parameter selected for `param`. Use
            `ipyrad.get_params_info()` to get further information about
            a given parameter. If the wrong type is entered for newvalue
            (e.g., a str when it should be an int), an error will be raised.
            Further information about each parameter is also available
            in the documentation.

        Examples
        --------
        ## param 1 takes only a str as input
        [Assembly].set_params(1, 'new_directory')
        [Assembly].set_params('working_directory', 'new_directory')

        ## param 5 must be a tuple or str, if str it is converted to a tuple
        ## with the second entry empty.
        [Assembly].set_params(5, 'TGCAG')
        [Assembly].set_params('restriction_overhang', ('CTGCAG', 'CCGG'))

        ## param 13 can be an int or a float:
        [Assembly].set_params(13, 4)
        [Assembly].set_params('max_shared_heterozygosity', 0.25)
            
        """

        ## require parameter recognition
        assert (param in range(50)) or \
               (param in [str(i) for i in range(50)]) or \
               (param in self.paramsdict.keys()), \
            "Parameter key not recognized: `{}`.".format(param)

        ## make string
        param = str(param)

        ## if matching
        if param in ['1', 'working_directory']:
            self.paramsdict['working_directory'] = expander(newvalue)
            self._stamp("[1] set to " + newvalue)
            self.dirs["working"] = self.paramsdict["working_directory"]

        elif param in ['2', 'raw_fastq_path']:
            fullrawpath = expander(newvalue)
            if os.path.isdir(fullrawpath):
                fullrawpath = os.path.join(fullrawpath, "*.gz")
            self.paramsdict['raw_fastq_path'] = fullrawpath
            self._stamp("[2] set to " + newvalue)
            #if not self.paramdict["raw_fastq_path"]:
            self.dirs["fastqs"] = os.path.dirname(
                self.paramsdict["raw_fastq_path"])

        elif param in ['3', 'barcodes_path']:
            #assert type(newvalue) is StringType, "arg must be a string"
            fullbarpath = expander(newvalue)
            if glob.glob(fullbarpath):
                self.paramsdict['barcodes_path'] = fullbarpath
                self.link_barcodes()
                self._stamp("[3] set to " + newvalue)
            elif not fullbarpath:
                self.paramsdict['barcodes_path'] = fullbarpath
                self._stamp("[3] set to empty")
            else:
                print('cannot find barcodes file')

        elif param in ['4', 'sorted_fastq_path']:
            assert isinstance(newvalue, str), \
            "sorted_fastq_path must be a string, e.g., /home/data/fastqs/*"
            newvalue = expander(newvalue)
            if os.path.isdir(newvalue):
                newvalue = os.path.join(newvalue, "*.gz")
            self.paramsdict['sorted_fastq_path'] = newvalue
            ## link_fastqs will check that files exist
            #self.link_fastqs()
            self._stamp("[4] set to " + newvalue)
            self.dirs["fastqs"] = os.path.dirname(
                self.paramsdict["sorted_fastq_path"])

        elif param in ['5', 'restriction_overhang']:
            newvalue = tuplecheck(newvalue, str)
            assert isinstance(newvalue, tuple), \
            "cut site must be a tuple, e.g., (TGCAG, '') or (TGCAG, CCGG)"
            self.paramsdict['restriction_overhang'] = newvalue
            self._stamp("[5] set to " + str(newvalue))

        elif param in ['6', 'max_low_qual_bases']:
            self.paramsdict['max_low_qual_bases'] = int(newvalue)
            self._stamp("[6] set to " + str(newvalue))

        elif param in ['7', "engines_per_job"]:
            self.paramsdict['engines_per_job'] = int(newvalue)
            self._stamp("[7] set to " + str(newvalue))

        elif param in ['8', 'mindepth_statistical']:
            ## do not allow values below 5
            if int(newvalue) < 5:
                print("error: mindepth_statistical cannot be set < 5")
            ## do not allow statistical to be set below majrule
            elif int(newvalue) < self.paramsdict["mindepth_majrule"]:
                print("error: mindepth_statistical cannot be less than "
                      "mindepth_majrule")
            else:
                self.paramsdict['mindepth_statistical'] = int(newvalue)
                self._stamp("[8] set to " + str(newvalue))

        elif param in ['9', 'mindepth_majrule']:
            if int(newvalue) > self.paramsdict["mindepth_statistical"]:
                print("error: mindepth_majrule cannot be > \
                       mindepth_statistical")
            else:
                self.paramsdict['mindepth_majrule'] = int(newvalue)
                self._stamp("[9] set to " + str(newvalue))

        elif param in ['10', 'datatype']:
            ## list of allowed datatypes
            datatypes = [
                'rad', 'gbs', 'ddrad', 'pairddrad', 'pairgbs', 'merged',
                '2brad'
            ]
            ## raise error if the new value is not a recognized datatype
            if str(newvalue) not in datatypes:
                print("error: datatype not recognized")
            else:
                self.paramsdict['datatype'] = str(newvalue)
                self._stamp("[10] set to " + newvalue)

        elif param in ['11', 'clust_threshold']:
            self.paramsdict['clust_threshold'] = float(newvalue)
            self._stamp("[11] set to {}".format(newvalue))

        elif param in ['12', 'minsamp']:
            self.paramsdict['minsamp'] = int(newvalue)
            self._stamp("[12] set to {}".format(int(newvalue)))

        elif param in ['13', 'max_shared_heterozygosity']:
            self.paramsdict['max_shared_heterozygosity'] = float(newvalue)
            self._stamp("[13] set to {}".format(newvalue))

        elif param in ['14', 'prefix_outname']:
            self.paramsdict['prefix_outname'] = newvalue
            self._stamp("[14] set to {}".format(newvalue))

        elif param in ['15', 'phred_Qscore_offset']:
            self.paramsdict['phred_Qscore_offset'] = int(newvalue)
            self._stamp("[15] set to {}".format(int(newvalue)))

        elif param in ['16', 'max_barcode_mismatch']:
            self.paramsdict['max_barcode_mismatch'] = int(newvalue)
            self._stamp("[16] set to {}".format(int(newvalue)))

        ### ....
        elif param in ['17', 'filter_adapters']:
            self.paramsdict['filter_adapters'] = int(newvalue)
            self._stamp("[17] set to " + str(newvalue))

        elif param in ['18', 'filter_min_trim_len']:
            self.paramsdict['filter_min_trim_len'] = int(newvalue)
            self._stamp("[18] set to {}".format(int(newvalue)))

        elif param in ['19', 'ploidy']:
            self.paramsdict['ploidy'] = int(newvalue)
            self._stamp("[19] set to {}".format(int(newvalue)))

        elif param in ['20', 'max_stack_size']:
            self.paramsdict['max_stack_size'] = int(newvalue)
            self._stamp("[20] set to {}".format(int(newvalue)))

        elif param in ['21', 'max_Ns_consens']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
            "max_Ns_consens should be a tuple e.g., (8,8)"
            self.paramsdict['max_Ns_consens'] = newvalue
            self._stamp("[21] set to {}".format(newvalue))

        elif param in ['22', 'max_Hs_consens']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
            "max_Hs_consens should be a tuple e.g., (1,2,2,1)"
            self.paramsdict['max_Hs_consens'] = newvalue
            self._stamp("[22] set to {}".format(newvalue))

        elif param in ['23', 'max_SNPs_locus']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
            "max_SNPs_locus should be a tuple e.g., (20,20)"
            self.paramsdict['max_SNPs_locus'] = newvalue
            self._stamp("[23] set to {}".format(newvalue))

        elif param in ['24', 'max_Indels_locus']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
            "max_Indels_locus should be a tuple e.g., (5, 100)"
            self.paramsdict['max_Indels_locus'] = newvalue
            self._stamp("[24] set to {}".format(newvalue))

        elif param in ['25', 'trim_overhang']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
            "trim_overhang should be a tuple e.g., (1,2,2,1)"
            self.paramsdict['trim_overhang'] = newvalue
            self._stamp("[25] set to {}".format(newvalue))

        elif param in ['27', 'assembly_method']:
            ## validate before storing so an invalid value is never kept
            assert newvalue in ["denovo", "reference", "hybrid"], \
                 "The assembly_method option must be one of the following: "+\
                 "denovo, reference, or hybrid."
            self.paramsdict['assembly_method'] = newvalue
            LOGGER.info("assembly method set to %s", newvalue)
            self._stamp("[27] set to {}".format(newvalue))

        elif param in ['28', 'reference_sequence']:
            fullrawpath = expander(newvalue)
            if not os.path.isfile(fullrawpath):
                raise Exception(\
            "Reference sequence file not found. This must be an absolute path "\
            +"(/home/wat/ipyrad/data/reference.gz) or a path relative to the "\
            +"directory where you're running ipyrad (./data/reference.gz). ")
            self.paramsdict['reference_sequence'] = fullrawpath
            self._stamp("[28] set to " + fullrawpath)

    def copy(self, newname):
        """ Returns a copy of the Assemlbly object. Does not allow Assembly 
        object names to be replicated in namespace or path. """
        if (newname == self.name) or (os.path.exists(newname + ".assembly")):
            print("Assembly object named {} already exists".format(newname))
        else:
            ## create a copy of the Assembly obj
            newobj = copy.deepcopy(self)
            newobj.name = newname
            newobj.set_params(14, newname)

            ## create copies of each Sample obj
            for sample in self.samples:
                newobj.samples[sample] = copy.deepcopy(self.samples[sample])
            return newobj
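    ## Hedged usage sketch for copy() (object names are illustrative):
    ##
    ##     data2 = data1.copy("data2")
    ##     data2.set_params(11, 0.90)   ## e.g., branch off with a new clust_threshold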

    def file_tree(self):
        """ prints the project data structure. TODO: this needs work.
        prints way too much other junk if [work] is home dir. """
        startpath = self.paramsdict["working_directory"]
        if startpath in [".", "", "./", os.path.expanduser(startpath)]:
            print("./")
        else:
            for root, _, files in os.walk(startpath):
                level = root.replace(startpath, '').count(os.sep)
                indent = ' ' * 4 * (level)
                print('{}{}/'.format(indent, os.path.basename(root)))
                subindent = ' ' * 4 * (level + 1)
                for fname in files:
                    print('{}{}'.format(subindent, fname))

    def _save(self):
        """ Pickle the Assembly object. Could be used for checkpointing before
        and after assembly steps. Currently it is called after assembly steps.
        """
        dillout = open(
            os.path.join(self.paramsdict["working_directory"],
                         self.name + ".assembly"), "wb")
        dill.dump(self, dillout)
        dillout.close()
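    ## A minimal sketch of reloading a pickled Assembly, assuming dill was
    ## used to write the ".assembly" file as in _save() above:
    ##
    ##     import dill
    ##     with open("./data1.assembly", "rb") as infile:
    ##         data1 = dill.load(infile)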

    def step1(self, preview=0):
        """ step 1: demultiplex raw reads """

        ## launch parallel client; it is closed in the finally block below
        ipyclient = ipp.Client(cluster_id=self.__ipname__)
        try:

            if not self.samples:
                assemble.demultiplex.run(self, preview, ipyclient)
                self._stamp("s1_demultiplexing:")
            else:
                print("Samples already found in `{}`.".format(self.name) \
                    + "Use ip.merge() to combine samples \nfrom multiple " \
                    + "Assembly objects.\n")
        except (KeyboardInterrupt, SystemExit, AttributeError):
            logging.error("assembly step1 interrupted.")
            raise

        ## close client when done or if interrupted
        finally:
            ipyclient.shutdown(block=1)
            ipyclient.close()

        ## pickle the data obj
        self._save()

    ## TODO: make a step Class object
    def step2(self, samples="", preview=0, force=False):
        """ step 2: edit raw reads. Takes dictionary keys (sample names)
        either individually, or as a list, or it takes no argument to 
        select all samples in the Assembly object. Only samples in state
        =1 will be edited, all others are skipped. To overwrite data
        use the argument force=True. 

        """

        ## launch parallel client within guarded statement
        ipyclient = ipp.Client(cluster_id=self.__ipname__)
        try:

            if samples:
                ## if sample key, replace with sample obj
                assert isinstance(samples, list), \
                "to subselect samples enter as a list, e.g., [A, B]."
                for sample in samples:
                    ## get sample from dict key
                    sample = self.samples[sample]
                    assemble.rawedit.run(self, sample, ipyclient, force)
            else:
                ## TODO: Remove return of client
                assert self.samples, "No Samples in " + self.name
                for _, sample in self.samples.items():
                    assemble.rawedit.run(self, sample, ipyclient, force)

        except (KeyboardInterrupt, AttributeError, SystemExit):
            LOGGER.error("assembly step2 interrupted!")
            raise

        ## close parallel client if done or interrupted
        finally:
            logging.info("assembly step2 cleaning up.")
            ipyclient.shutdown(block=1)
            ipyclient.close()

        ## checkpoint the data obj
        self._save()
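    ## Hedged example of subselecting samples for a step (sample names are
    ## hypothetical):
    ##
    ##     data1.step2(["sample_1", "sample_2"], force=True)
    ##     data1.step2()    ## or run on every sample linked to the Assembly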

    def step3(self, samples=None, preview=0, noreverse=0, force=False):
        """ step 3: clustering within samples """

        ## Require reference seq for reference-based methods
        if self.paramsdict['assembly_method'] != "denovo":
            assert self.paramsdict['reference_sequence'], \
            "Reference or hybrid assembly requires a value for the "+\
            "reference_sequence parameter."

            ## index the reference sequence
            index_reference_sequence(self)

        ## launch parallel client
        ipyclient = ipp.Client(cluster_id=self.__ipname__)

        try:
            ## sampling
            if samples:

                ## if string make a list(tuple)
                assert isinstance(samples, list), \
                "to subselect samples enter as a list, e.g., [A, B]."

                ## make into a tuple list with (key, sample)
                ## allows for names as keys or Sample objects
                subsamples = []
                for sample in samples:
                    if self.samples.get(sample):
                        subsamples.append((sample, self.samples[sample]))

                if subsamples:
                    print("Clustering {} samples using {} engines per job.".\
                      format(len(samples), self.paramsdict["engines_per_job"]))
                    ## run
                    assemble.cluster_within.run(self, subsamples, ipyclient,
                                                preview, noreverse, force)
                else:
                    print("No samples found. Check that names are correct")
            else:
                ## if no samples selected and no samples exist
                assert self.samples, "no Samples found in {}".format(self.name)

                ## print to screen
                print("clustering {} samples using {} engines per job".\
                  format(len(self.samples), self.paramsdict["engines_per_job"]))
                ## run
                assemble.cluster_within.run(self, self.samples.items(),
                                            ipyclient, preview, noreverse,
                                            force)

        except (KeyboardInterrupt, SystemExit):
            print("assembly step3 interrupted")
            raise
        ## close parallel client if done or interrupted
        finally:
            ipyclient.close()
            if preview:
                print(".")

        ## pickle the data object
        self._save()

    def step4(self, samples=None, preview=0, force=False, subsample=None):
        """ step 4: Joint estimation of error rate and heterozygosity. 
        If you want to overwrite data for a file, first set its state to 3:
        data.samples['sample'].stats['state'] = 3 """

        ## launch parallel client
        ipyclient = ipp.Client(cluster_id=self.__ipname__)

        try:
            ## sampling
            if samples:
                ## make a list of keys or samples
                if isinstance(samples, str):
                    samples = list([samples])
                else:
                    samples = list(samples)

                ## if keys are in list
                if any([isinstance(i, str) for i in samples]):
                    ## make into a subsampled sample dict
                    subsamples = {i: self.samples[i] for i in samples}
                else:
                    ## otherwise assume Sample objects were passed in directly
                    subsamples = {i.name: i for i in samples}

                ## send to function
                assemble.jointestimate.run(self, subsamples.values(),
                                           ipyclient, force, subsample)
            else:
                ## if no sample, then do all samples
                if not self.samples:
                    ## if no samples in data, try linking edits from working dir
                    #self.link_clustfiles()
                    if not self.samples:
                        print("Assembly object has no samples in state 3.")
                ## run clustering for all samples
                assemble.jointestimate.run(self, self.samples.values(),
                                           ipyclient, force, subsample)

        except (KeyboardInterrupt, SystemExit):
            print("assembly step4 interrupted")
            raise
        ## close parallel client if done or interrupted
        finally:
            ipyclient.close()
            if preview:
                print(".")

        ## pickle the data object
        self._save()

    def step5(self, samples="", preview=0):
        """ step 5: Consensus base calling from clusters within samples.
        If you want to overwrite data for a file, first set its state to 
        3 or 4. e.g., data.samples['sample'].stats['state'] = 3 """

        ## sampling
        if samples:
            ## make a list of keys or samples
            if isinstance(samples, str):
                samples = list([samples])
            else:
                samples = list(samples)

            ## if keys are in list
            if any([isinstance(i, str) for i in samples]):
                ## make into a subsampled sample dict
                subsamples = {i: self.samples[i] for i in samples}
            else:
                ## otherwise assume Sample objects were passed in directly
                subsamples = {i.name: i for i in samples}

            ## send to function
            assemble.consens_se.run(self, subsamples.values())
        else:
            ## if no sample, then do all samples
            if not self.samples:
                ## if no samples in data, try linking edits from working dir
                #self.link_clustfiles()
                if not self.samples:
                    print("Assembly object has no samples in state=3")
            ## run clustering for all samples
            assemble.consens_se.run(self, self.samples.values())

        ## pickle the data object
        self._save()

    def run(self, steps=0, force=False, preview=False):
        """ Select steps of an analysis. If no steps are entered then all
        steps are run. Enter steps as a string, e.g., "1", "123", "12345" """
        if not steps:
            steps = "12345"
        else:
            steps = str(steps)
        if '1' in steps:
            self.step1(preview=preview)
        if '2' in steps:
            self.step2(force=force, preview=preview)
        if '3' in steps:
            self.step3(force=force, preview=preview)
        if '4' in steps:
            self.step4(force=force, preview=preview)
        if '5' in steps:
            self.step5(preview=preview)
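        ## Hedged usage sketch for run() (object name is illustrative):
        ##
        ##     data1.run("123")          ## steps 1-3 only
        ##     data1.run(force=True)     ## all steps, overwriting finished samples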
Exemplo n.º 6
0
def consensus(args):
    """
    from a clust file handle, reads in all copies at a locus and sorts
    bases at each site, tests for errors at the site according to error 
    rate, calls consensus.
    """

    ## unpack args
    data, sample, tmpchunk, point = args

    ## read in cluster file 2 lines at a time
    infile = gzip.open(tmpchunk)  #sample.files["clusters"])
    duo = itertools.izip(*[iter(infile)] * 2)

    ## store read depth info for later output files
    datadict = {}

    ## counters
    locus = 0
    minsamp_filtered = 0
    nheteros = 0

    ## iterate over clusters
    while 1:
        try:
            first = duo.next()
        except StopIteration:
            break
        itera = [first[0], first[1]]
        fname = itera[0].split(";")[0]

        ## local containers and counters for this locus"
        locus += 1  ## recording n loci
        sloc = []  ## list for sequence data
        nloc = []  ## list for names used for gbs filters

        ## grab seqs until end of cluster
        while itera[0] != "//\n":
            ## append sequence * number of dereps "
            nreps = int(itera[0].split(";")[-2].split("=")[1])
            for _ in xrange(nreps):
                sloc.append(tuple(itera[1].strip()))
                nloc.append(itera[0])
            ## move on to the next sequence
            itera = duo.next()

        ## now that all seqs in this loc are read in
        ## check that none overlap leftjust overhang if gbs
        if data.paramsdict["datatype"] in ['gbs', 'merged']:
            ## TODO: test these new changes to gbs filter
            ## edge filters
            leftjust = rightjust = None
            rights = []

            ## get leftjust and rights
            for i, j in zip(nloc, sloc):
                leftjust, rights = gbs_edgefilter(i, j, leftjust, rights)
            if rights:
                ## record in name that there was a reverse hit"
                fname = fname[:-2] + "c1"
                try:
                    rightjust = min([min(i) for i in rights])
                except ValueError:
                    sloc = ""

            for seq in xrange(len(sloc)):
                sloc[seq] = sloc[seq][leftjust:]
                if rightjust:
                    sloc[seq] = sloc[seq][:rightjust + 1]

        ## Apply depth filter
        if (len(sloc) >= data.paramsdict["mindepth_majrule"]) and \
           (len(sloc) <= data.paramsdict["max_stack_size"]):

            ## this loc passed the minsamp filter
            minsamp_filtered += 1

            ## get stacks of bases at each site
            arrayed = numpy.array(sloc)
            stacked = [Counter(seq) for seq in arrayed.T]
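            ## e.g., for reads "AATG" and "AACG", iterating over arrayed.T
            ## (the columns) gives stacked = [{A:2}, {A:2}, {T:1, C:1}, {G:2}]
            ## (shown here as plain dicts for brevity)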

            ## apply functions to list of sites in stacked
            ## filter by site for paralogs and make consens calls
            consens = [filter2(data, site) for site in stacked]

            ## filtered by locus for paralog
            if "@" not in consens:
                ## get hetero sites
                heteros = [i[0] for i in enumerate(consens) \
                                  if i[1] in list("RKSYWM")]

                ## filter for max number of hetero sites
                exceedmaxploid = 0
                if len(heteros) <= data.paramsdict["max_Hs_consens"]:
                    ## filter for more than x alleles given ploidy. Only
                    ## relevant if locus is polymorphic at more than one site
                    if len(heteros) > 1:
                        consens, exceedmaxploid = filter3(
                            data, consens, heteros, sloc)

                    ## if the locus passed paralog filtering
                    if not exceedmaxploid:

                        consens = "".join(consens).replace("-", "N")
                        ## if a site is stripped then I need to remove the site
                        ## from the site counter (stacked)
                        shortconl = consens.lstrip("N")
                        if len(shortconl) < len(consens):
                            stacked = stacked[-len(shortconl):]
                        shortcon = consens.rstrip("N")
                        if len(shortcon) < len(shortconl):
                            stacked = stacked[:len(shortcon)]

                        ## removerepeat_Ns removes low coverage sites next to
                        ## poly repeats that are likely sequencing errors;
                        ## it also edits 'stacked'
                        shortcon, stacked = removerepeat_Ns(shortcon, stacked)

                        ## only allow maxN internal "N"s in a locus
                        if shortcon.count("N") <= int(
                                data.paramsdict["max_Ns_consens"]):
                            ## minimum length for clustering in vsearch
                            if len(shortcon) >= 32:
                                ## keep for counter
                                nheteros += len(heteros)

                                ## store the consens seq
                                #consdic[fname] = shortcon

                                ## create dataobj w/ name fname
                                dataobj = ObjDict()
                                ## store qual and depth data
                                dataobj.seq = shortcon  #[len(cut1):]
                                dataobj.Cs = [i["C"] for i in stacked]
                                dataobj.As = [i["A"] for i in stacked]
                                dataobj.Ts = [i["T"] for i in stacked]
                                dataobj.Gs = [i["G"] for i in stacked]
                                #Cs = [i["C"] for i in stacked]
                                #As = [i["A"] for i in stacked]
                                #Ts = [i["T"] for i in stacked]
                                #Gs = [i["G"] for i in stacked]
                                #dfconsens = pd.DataFrame([list(shortcon),
                                #                          Cs, As, Ts, Gs])
                                tag = "_".join(fname.split("_")[-2:])
                                datadict[tag] = dataobj
                                #datadict[tag] = dfconsens
                        else:
                            pass  #print "maxN filtered loc", locus
                    else:
                        pass  #print "ploid filtered loc", locus
                else:
                    pass  #print "maxH filtered loc", locus
            else:
                pass  #print "third base filtered loc", locus
        else:
            pass  #print "mindepth filtered loc", locus

    data.dirs.consens = os.path.join(data.dirs.clusts, "consens")

    #if not os.path.exists(os.path.join(...)):
    #    os.mkdir(consensdir)
    ## get filename
    consenshandle = ""  #os.path.join([consensdir, sample.name+"consens.gz"])

    ## write to file
    with gzip.open(consenshandle, 'wb') as outfile:
        outfile.write("\n".join([">"+sample.name+"_"+obj+"\n"+\
                                 datadict[obj].seq for obj in datadict]))
        #for obj in datadict:
        #    outfile.write(">"+sample.name+"_"+obj+"\n"+datadict[obj].seq+"\n")

    ## count the number of polymorphic sites
    if 'ddrad' in data.paramsdict["datatype"]:
        if 'pair' in data.paramsdict["datatype"]:
            sub = 4 + len(data.paramsdict["restriction_overhang"][0])
        else:
            sub = len(data.paramsdict["restriction_overhang"][0])
    elif 'gbs' in data.paramsdict["datatype"]:
        if 'pair' in data.paramsdict["datatype"]:
            sub = 4 + len(data.paramsdict["restriction_overhang"][0]) * 2
            #  (len(params["cut"])*2)
        else:
            sub = len(data.paramsdict["restriction_overhang"][0])
    elif data.paramsdict["datatype"] == "merged":
        sub = len(data.paramsdict["restriction_overhang"][0]) * 2
    else:
        sub = len(data.paramsdict["restriction_overhang"][0])
    nsites = sum([len(datadict[i].seq) - sub for i in datadict])
    ldic = len(datadict)
    try:
        poly = nheteros / float(nsites)
    except ZeroDivisionError:
        poly = 0.

    ## dump the quality score and depth info into a pickle
    #pickleout = gzip.open(handle.replace("clustS.gz", "bindata"), 'wb')
    #pickle.dump(datadict, pickleout)
    #pickleout.close()

    return [sample.name, locus, minsamp_filtered, ldic, nsites, nheteros, poly]
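## Hedged note on the return value above: the list unpacks (by position) as
## sample name, loci read, loci passing the depth filter, consensus loci kept,
## sites counted, heterozygous sites, and the polymorphism rate. A caller could,
## for example, collect the per-chunk rows into a DataFrame (argslist below is
## a hypothetical list of (data, sample, tmpchunk, point) tuples):
##
##     import pandas as pd
##     rows = [consensus(args) for args in argslist]
##     stats = pd.DataFrame(rows, columns=["name", "loci", "depth_passed",
##                                         "kept", "nsites", "nheteros", "poly"])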