Example #1
    def get_short_seq_from_concat(self, percentage=0.37):
        """Finds short sequences, all below a certain threshold will be removed,
        to avoid having really low coverage in the aln. Default = 0.37.

        Note percentage is a bit misleading, the cutoff is 37% of the whole concatenated
        alignment, but the sequences length is calculated without gaps present.
        The default is so low, as I want to keep taxa that have only a single locus
        and which is not the longest among the loci within the aln.
        """
        physcraper.debug("get_short_seq_from_concat")
        seq_len = {}
        num_tax = 0
        for tax, seq in self.concatenated_aln.items():
            seq = seq.symbols_as_string().replace("-", "").replace("?", "")
            seq_len[tax] = len(seq)
            num_tax += 1
        total_len = 0
        for tax, seq in self.concatenated_aln.items():
            total_len = len(seq)
            break
        assert total_len != 0
        min_len = total_len * percentage
        prune_shortest = []
        for tax, len_seq in seq_len.items():
            if len_seq < min_len:
                prune_shortest.append(tax)
        self.short_concat_seq = prune_shortest
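
To make the cutoff arithmetic above concrete, here is a tiny worked sketch with made-up numbers (not values from an actual run):

    total_len = 1500                                 # columns in the concatenated aln, gaps included
    min_len = total_len * 0.37                       # = 555.0
    seq_len = {"taxon_a": 1400, "taxon_b": 320}      # gap-free sequence lengths
    prune_shortest = [tax for tax, n in seq_len.items() if n < min_len]
    # prune_shortest == ["taxon_b"]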
Example #2
 def make_sp_gene_dict(self):
     """Is the build around to make the dicts that are used to make it into a dendropy aln
     """
     physcraper.debug("make_sp_gene_dict")
     if self.concatfile is not None:
         self.user_defined_concat()
     else:
         sp_to_keep = self.sp_to_keep()
         self.tmp_dict = deepcopy(self.sp_acc_comb)
         while len(self.tmp_dict.keys()) >= 1:
             del_acc = {}
             for spn in list(self.tmp_dict.keys()):  # copy the keys; entries are deleted during iteration
                 sp_to_keep_list = sp_to_keep.keys()
                 if spn.replace(" ", "_") in sp_to_keep_list:
                     tmp_gene = deepcopy(self.genes_present)
                     for gene in self.tmp_dict[spn]:
                         tmp_gene.remove(gene)
                         del_acc = self.select_rnd_seq(spn, gene, del_acc)
                     for item in tmp_gene:
                         self.make_empty_seq(spn, item)
                     self.rm_rnd_sp(del_acc)
                     del self.tmp_dict[spn]
                 else:
                     for gene in self.tmp_dict[spn]:
                         del_acc = self.select_rnd_seq(spn, gene, del_acc)
                     self.rm_rnd_sp(del_acc)
                 self.rm_empty_spn_entries(del_acc)
     self.rename_drop_tips()
Example #3
    def sp_seq_counter(self):
        """Counts how many seq per sp and genes there are -is used by sp_to_keep.

        Note: has test

        :return: builds self.sp_counter
        """
        physcraper.debug("sp_seq_counter")
        for spn in self.sp_acc_comb:
            tmp_gene = deepcopy(self.genes_present)
            spn_new = spn.replace(" ", "_")  # define once, so it also exists for the zero-count loop below
            for gene in self.sp_acc_comb[spn]:
                tmp_gene.remove(gene)
                if spn_new in self.sp_counter:
                    self.sp_counter[spn_new][gene] = len(
                        self.sp_acc_comb[spn][gene])
                else:
                    self.sp_counter[spn_new] = {
                        gene: len(self.sp_acc_comb[spn][gene])
                    }
            for item in tmp_gene:
                if spn_new in self.sp_counter:
                    self.sp_counter[spn_new][item] = 0
                else:
                    self.sp_counter[spn_new] = {item: 0}
        physcraper.debug(self.sp_counter)
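
For orientation, self.sp_counter ends up mapping each species name (underscored) to a per-gene sequence count, with 0 for genes that have no sequence; a hypothetical example:

    self.sp_counter = {
        "Senecio_vulgaris": {"ITS": 3, "ETS": 1},
        "Senecio_scopolii": {"ITS": 1, "ETS": 0},   # no ETS sequence available
    }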
Example #4
 def est_full_tree(self):
     """Full raxml run from the placement tree as starting tree.
     """
     physcraper.debug("run full tree")
     cwd = os.getcwd()
     os.chdir(self.workdir)
     if os.path.exists("place_resolve.tre"):
         starting_fn = "place_resolve.tre"
     else:
         starting_fn = "starting_red.tre"
     if os.path.exists("concat_red.fasta.reduced"):
         aln = "concat_red.fasta.reduced"
         partition = "partition.reduced"
     else:
         aln = "concat_red.fasta"
         partition = "partition"
     try:
         num_threads = int(self.config.num_threads)
         print(num_threads)
         subprocess.call([
             "raxmlHPC-PTHREADS", "-T", "{}".format(num_threads), "-m",
             "GTRCAT", "-s", aln, "--print-identical-sequences", "-t",
             "{}".format(starting_fn), "-p", "1", "-q", partition, "-n",
             "concat"
         ])
     except Exception:  # fall back to the serial raxmlHPC binary if the threaded call fails
         subprocess.call([
             "raxmlHPC", "-m", "GTRCAT", "-s", aln,
             "--print-identical-sequences", "-t", "{}".format(starting_fn),
             "-p", "1", "-q", partition, "-n", "concat"
         ])
     os.chdir(cwd)
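
For reference, with num_threads = 4 and assuming place_resolve.tre and the .reduced files exist, the PTHREADS branch above amounts to running raxmlHPC-PTHREADS -T 4 -m GTRCAT -s concat_red.fasta.reduced --print-identical-sequences -t place_resolve.tre -p 1 -q partition.reduced -n concat inside self.workdir; if that call fails, the same arguments minus -T are passed to the serial raxmlHPC binary.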
Example #5
 def make_alns_dict(self):
     """Makes dendropy aln out of dict self.comb_seq for all genes.
     """
     physcraper.debug("make_alns_dict")
     firstelement = True
     count = 0
     for gene in self.comb_seq.keys():
         if count == 0:
             len1 = len(self.comb_seq[gene].keys())
             len2 = len1
             count = 1
         else:
             len2 = len(self.comb_seq[gene].keys())
         assert len1 == len2
     for gene in self.comb_seq.keys():
         if firstelement:
             aln1 = DnaCharacterMatrix.from_dict(self.comb_seq[gene])
             firstelement = False
             self.aln_all[count] = aln1
             aln1.write(path="{}/aln_0.fas".format(self.workdir),
                        schema="fasta")
         else:
             aln = DnaCharacterMatrix.from_dict(
                 self.comb_seq[gene], taxon_namespace=aln1.taxon_namespace)
             self.aln_all[count] = aln
             aln.write(path="{}/aln_{}.fas".format(self.workdir, count),
                       schema="fasta")
         count += 1
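
A minimal standalone sketch of the dendropy calls used above (taxon labels and sequences are made up): the first per-gene matrix defines the taxon namespace, and every later matrix is built against that same namespace so that the matrices can be concatenated later.

    from dendropy import DnaCharacterMatrix

    its = DnaCharacterMatrix.from_dict({"taxon_a": "ACGT", "taxon_b": "ACGA"})
    ets = DnaCharacterMatrix.from_dict({"taxon_a": "TTGA", "taxon_b": "TTGC"},
                                       taxon_namespace=its.taxon_namespace)
    assert its.taxon_namespace is ets.taxon_namespace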
Example #6
    def select_rnd_seq(self, spn, gene, del_acc):
        """Select a random seq from spn and gene to combine it with a random other one from another gene,
        but same spn. Is used if the user does not give a concatenation input file.

        Note: has test

        :param spn: taxon name
        :param gene:  gene name
        :param del_acc: dictionary that contains gene name: dict(spn: concat_id of random seq)
        :return: del_acc
        """
        physcraper.debug("select_rnd_seq")
        count = 2
        random_gen = random.choice(list(self.tmp_dict[spn][gene]))
        self.sp_acc_comb[spn][gene][random_gen][
            "concat:status"] = "used in concat"
        seq = str(self.tmp_dict[spn][gene][random_gen]["seq"])
        spn_ = spn.replace(" ", "_")
        spn_ = spn_.replace(".", "").replace("'", "")
        if gene in self.comb_seq.keys():
            if spn_ not in self.comb_seq[gene].keys():
                self.comb_seq[gene][spn_] = seq
                if gene in self.comb_acc:
                    self.comb_acc[gene][spn_] = random_gen
                else:
                    self.comb_acc[gene] = {spn_: random_gen}
                # record the chosen accession; del_acc is keyed by gene and the original spn,
                # which is what rm_rnd_sp() looks up later
                del_acc.setdefault(gene, {})[spn] = random_gen
            else:
                spn_new = "{}_{}".format(spn_, count)
                while spn_new in self.comb_seq[gene].keys():
                    count += 1
                    spn_new = "{}_{}".format(spn_, count)
                self.comb_seq[gene][spn_new] = seq
                self.comb_acc[gene][spn_new] = random_gen
                self.sp_acc_comb[spn][gene][random_gen][
                    "new tipname"] = spn_new
                del_acc.setdefault(gene, {})[spn] = random_gen
        else:
            self.comb_seq[gene] = {spn_: seq}
            self.comb_acc[gene] = {spn_: random_gen}
            del_acc.setdefault(gene, {})[spn] = random_gen
        self.otu_to_spn(spn, gene, del_acc[gene][spn])
        return del_acc
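
For orientation, the del_acc dict that is passed around maps gene name to {taxon name: concat id of the randomly chosen seq}; a hypothetical example:

    del_acc = {"ITS": {"Senecio vulgaris": "concat_12"},
               "ETS": {"Senecio vulgaris": "concat_31"}}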
Example #7
 def rm_rnd_sp(self, del_acc):
     """Removes the random selected seq from the tmp_dict, so that it cannot be selected again.
     """
     physcraper.debug("rm_rnd sp")
     for spn2 in self.tmp_dict:
         for gene2 in self.tmp_dict[spn2]:
             if gene2 in del_acc:
                 if spn2 in del_acc[gene2]:
                     key = del_acc[gene2][spn2]
                     if key in self.tmp_dict[spn2][gene2]:
                         del self.tmp_dict[spn2][gene2][key]
Example #8
    def combine(self):
        """Combines several PhyScraper objects to make a concatenated run dict.

        Is a wrapper function around make_concat_id_dict(). It produces the parameters needed for the function.
        """
        physcraper.debug("combine")
        self.num_of_genes = len(self.single_runs)
        concat_id_counter = 1
        for genename in self.single_runs:
            self.genes_present.append(genename)
            for otu in self.single_runs[genename].data.aln.taxon_namespace:
                concat_id = "concat_{}".format(concat_id_counter)
                self.make_concat_id_dict(otu.label, genename, concat_id)
                concat_id_counter += 1
        return
Example #9
    def load_single_genes(self, workdir, pickle_fn, genename):
        """Load PhyScraper class objects and make a single dict per run.

        Removes abandoned nodes first.

        :param workdir: directory of single gene run
        :param pickle_fn: path to pickled file of the Physcraper run
        :param genename: string, name for locus provided by user
        :return: self.single_runs
        """
        physcraper.debug("load_single_genes: {}".format(genename))
        scrape = pickle.load(open("{}/{}".format(workdir, pickle_fn), "rb"))
        scrape = remove_aln_tre_leaf(scrape)
        self.single_runs[genename] = deepcopy(scrape)
        return
Example #10
 def rename_drop_tips(self):
     """ Removes tips from tre as start that are not present in the concatenated aln
     and renames tips that are present.
     """
     physcraper.debug("rename_drop_tips")
     # tips are matched by taxon label; prune every tip whose label is not in concat_tips
     for leaf in self.tre_as_start.leaf_nodes():
         if leaf.taxon.label not in self.concat_tips.keys():
             self.tre_as_start.prune_taxa([leaf.taxon])
             self.tre_as_start.prune_taxa_with_labels([leaf.taxon.label])
             self.tre_as_start.taxon_namespace.remove_taxon_label(
                 leaf.taxon.label)
         else:
             leaf.taxon.label = self.concat_tips[leaf.taxon.label]
Example #11
 def rm_empty_spn_entries(self, del_acc):
     """Removes keys from tmp dict, if the key/sp has no value anymore. Helper function.
     """
     physcraper.debug("rm_empty_spn_entries")
     del_sp = None
     for spn2 in self.tmp_dict:
         for gene2 in self.tmp_dict[spn2]:
             if gene2 in del_acc:
                 if spn2 in del_acc[gene2]:
                     if len(self.tmp_dict[spn2][gene2]) == 0:
                         del_sp = spn2
     if del_sp is not None:
         for item in self.sp_acc_comb[del_sp]:
             for otu in self.sp_acc_comb[del_sp][item]:
                 if self.sp_acc_comb[del_sp][item][otu][
                         "concat:status"] != "used in concat":
                     self.sp_acc_comb[del_sp][item][otu][
                         "concat:status"] = "deleted, because not enough seq are present"
         del self.tmp_dict[del_sp]
Example #12
def filter_data_run(seqaln,
                    mattype,
                    trfn,
                    schema_trf,
                    workdir,
                    threshold,
                    id_to_spn,
                    configfi,
                    selectby="blast",
                    downtorank=None,
                    blacklist=None,
                    add_unpubl_seq=None,
                    id_to_spn_addseq_json=None,
                    ingroup_mrca=None,
                    shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs 
    new analysis for as long as new seqs are found. 
    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    license_print()
    debug("Debugging mode is on")
    print(workdir)
    print(os.path.exists(workdir))
    if not os.path.exists(workdir):
        print("make wd")
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = load_ids_obj(conf, workdir)

    make_otujsondict(id_to_spn, workdir, ids)
    # make json file for unpublished database
    if add_unpubl_seq is not None:
        make_otujsondict(id_to_spn_addseq_json, workdir, ids, local=True)

    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir,
                             ingroup_mrca)
    filteredScrape = PS_filter_run(add_unpubl_seq, blacklist, data_obj,
                                   downtorank, id_to_spn_addseq_json, ids,
                                   selectby, shared_blast_folder, threshold,
                                   ingroup_mrca)
    save_copy_code(workdir)
    return filteredScrape
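
A hypothetical invocation sketch; all file names, the threshold, and the tip-to-species mapping file are placeholders, not values from the original project:

    filtered = filter_data_run(seqaln="data/its.fas", mattype="fasta",
                               trfn="data/its.tre", schema_trf="newick",
                               workdir="runs/its_filtered", threshold=2,
                               id_to_spn="data/tip_to_species.csv",
                               configfi="config/example.config")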
Example #13
def filter_OTOL(
        study_id,
        tree_id,
        seqaln,
        mattype,
        workdir,
        configfi,
        threshold,
        selectby="blast",
        downtorank=None,
        blacklist=None,
        add_unpubl_seq=None,  # path to local seq
        id_to_spn_addseq_json=None,
        ingroup_mrca=None,
        shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs
    new analysis for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output using data from OToL."""
    license_print()

    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    # read the config file into a configuration object
    conf = ConfigObj(configfi)
    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_otol_data(conf, ingroup_mrca, mattype, seqaln, study_id,
                              tree_id, workdir)
    ids = load_ids_obj(conf, workdir)

    # make json file for unpublished database
    if add_unpubl_seq is not None:
        make_otujsondict(id_to_spn_addseq_json, workdir, ids, local=True)

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PS_filter_run(add_unpubl_seq, blacklist, data_obj,
                                   downtorank, id_to_spn_addseq_json, ids,
                                   selectby, shared_blast_folder, threshold,
                                   ingroup_mrca)
    save_copy_code(workdir)
    return filteredScrape
Example #14
 def concatenate_alns(self):
     """Concatenate all alns into one aln.
     """
     physcraper.debug("concat alns")
     count = 0
     for gene in self.aln_all:
         if count == 0:
             aln1 = self.aln_all[gene]
             aln1.write(path="{}/aln1.fas".format(self.workdir),
                        schema="fasta")
             count = 1
         else:
             aln2 = self.aln_all[gene]
             count += 1
             aln2.write(path="{}/aln{}.fas".format(self.workdir, count),
                        schema="fasta")
             assert aln1.taxon_namespace == aln2.taxon_namespace
             aln1 = DnaCharacterMatrix.concatenate([aln1, aln2])
     aln1.write(path="{}/concat.fas".format(self.workdir), schema="fasta")
     self.concatenated_aln = aln1
Example #15
    def sp_to_keep(self):
        """Uses the sp_counter to make a list of sp that should be kept in concatenated alignment,
        because they are the only representative of the sp.

        Note: has test

        :return: dictionary with taxon name and number saying how many genes are missing
        """
        physcraper.debug("sp to keep")
        sp_to_keep = {}
        for spn in self.sp_counter:
            seq_counter = True
            not_present = 0
            for gene in self.sp_counter[spn]:
                if self.sp_counter[spn][gene] == 0:
                    seq_counter = False
                    not_present += 1
            if not seq_counter:
                sp_to_keep[spn] = not_present
        # physcraper.debug(sp_to_keep)
        return sp_to_keep
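
The returned dict simply records, per species, how many genes lack a sequence; species that have at least one seq for every gene are not listed. A hypothetical return value:

    sp_to_keep = {"Senecio_scopolii": 1}   # one of the loci has no sequence for this species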
Example #16
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 id_to_spn,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper standard run with your own data.
    You need:
         seqaln = path to sequence alignment file
         mattype = the format name of your alignment
         trfn = path to file with the phylogeny to update
         schema_trf = format type of your phylogeny
         workdir = define where your analysis files shall be stored
         id_to_spn = a file that maps the tip labels of your tree/alignment to species names,
                            used to generate the otu json dict (see the example scripts).
         configfi = path to your config file
         ingroup_mrca = not necessary, if you want to limit your run to a certain clade, give the OpenTree ID here,
                        can be obtained by running: python scripts/get_ott.py ingroup_name
         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """
    license_print()
    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    conf = ConfigObj(configfi)
    ids = load_ids_obj(conf, workdir)

    make_otujsondict(id_to_spn, workdir, ids)
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir,
                             ingroup_mrca)
    # Mapping identifiers between original data and NCBI requires an identifier dict object
    # scraper = PhyscraperScrape(data_obj, ids)
    scraper = PS_standard_run(data_obj, ids, shared_blast_folder)
    save_copy_code(workdir)
    return 1
Example #17
 def remove_short_seq(self):
     """Removes short seq that were found with get_short_seq
     and write it to file.
     """
     physcraper.debug("remove_short_seq")
     self.concatenated_aln.remove_sequences(self.short_concat_seq)
     for leaf in self.tre_as_start.leaf_nodes():
         for tax in self.short_concat_seq:
             if tax.label == leaf.taxon.label.replace(" ", "_"):
                  self.tre_as_start.prune_taxa([leaf.taxon])
                  self.tre_as_start.prune_taxa_with_labels(
                      [leaf.taxon.label])
                  self.tre_as_start.taxon_namespace.remove_taxon_label(
                      leaf.taxon.label)
             else:
                 leaf.taxon.label = leaf.taxon.label.replace(" ", "_")
     tre_as_start_str = self.tre_as_start.as_string(
         schema="newick",
         # preserve_underscores=True,
         unquoted_underscores=True,
         suppress_rooting=True)
     fi = open("{}/{}".format(self.workdir, "starting_red.tre"), "w")
     fi.write(tre_as_start_str)
     fi.close()
     for tax in self.concatenated_aln.taxon_namespace:
         tax.label = tax.label.replace(" ", "_")
     self.concatenated_aln.write(path="{}/{}".format(
         self.workdir, "concat_red.fasta"),
                                 schema="fasta")
     tre_ids = set()
     for tax in self.tre_as_start.taxon_namespace:
         tre_ids.add(tax.label)
     aln_ids = set()
     for tax in self.concatenated_aln.taxon_namespace:
         aln_ids.add(tax.label)
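      # tre_ids and aln_ids are collected but not compared here; presumably a sanity check
      # (e.g. asserting that both label sets are identical) follows in the full method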
Example #18
 def write_partition(self):
     """Write the partitioning file for RAxML.
     """
     physcraper.debug("write_partition")
     count = 0
     len_gene = 0
     for gene in self.single_runs:
         for tax, seq in self.single_runs[gene].data.aln.items():
             len_gene = len(seq.symbols_as_string())
             break
         if count == 0:
             with open("{}/partition".format(self.workdir),
                       "w") as partition:
                 partition.write("DNA, {} = 1-{}\n".format(gene, len_gene))
             self.part_len = len_gene
             count = 1
         else:
             start = self.part_len + 1
             end = self.part_len + len_gene
             self.part_len = self.part_len + len_gene
             with open("{}/partition".format(self.workdir),
                       "a") as partition:
                 partition.write("DNA, {} = {}-{}\n".format(
                     gene, start, end))
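
With two hypothetical loci of 900 and 650 aligned columns, the partition file written above would contain:

    DNA, ITS = 1-900
    DNA, ETS = 901-1550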
Example #19
 def get_largest_tre(self):
     """Find the single gene tree with the most tips, which will be used as
     starting tree for concat phylo reconstruction.
     """
     physcraper.debug("get_largest_tre")
     first = True
     len_all_taxa = {}
     for gene in self.single_runs:
         len_aln_taxa = len(self.single_runs[gene].data.aln.taxon_namespace)
         len_all_taxa[gene] = len_aln_taxa
     len_max = 0
     gene_max = 0
     for gene, len_item in len_all_taxa.items():
         if first:
             len_max = len_item
             gene_max = gene
             assert len_max != 0
             assert gene_max != 0
             first = False
         if len_item > len_max:
             len_max = len_item
             gene_max = gene
     self.tre_as_start = self.single_runs[gene_max].data.tre
     self.tre_start_gene = gene_max
Example #20
    def make_concat_id_dict(self, otu, genename, concat_id):
        """Makes a concat_id entry with all information

        Note: has test

        :param otu: otu_id
        :param genename: name of single gene run
        :param concat_id: unique identifier in the concat class
        :return: modified self.sp_acc_comb
        """
        data = self.single_runs[genename].data.otu_dict[otu]
        seq = str(self.single_runs[genename].data.aln[otu])
        spn = None
        if "^ot:ottTaxonName" in data:
            spn = self.get_taxon_info("^ot:ottTaxonName", data)
            if spn not in self.sp_acc_comb:
                self.sp_acc_comb[spn] = {}
            if genename not in self.sp_acc_comb[spn]:
                self.sp_acc_comb[spn][genename] = {}
        elif "^user:TaxonName" in data:
            spn = self.get_taxon_info("^user:TaxonName", data)
            if spn not in self.sp_acc_comb:
                self.sp_acc_comb[spn] = {}
            if genename not in self.sp_acc_comb[spn]:
                self.sp_acc_comb[spn][genename] = {}
        else:
            # we should never get here....
            physcraper.debug("THERE IS A SERIOUS PROBLEM....")
        assert spn is not None
        if concat_id not in self.sp_acc_comb[spn][genename]:
            if "^ncbi:accession" in data:
                unique_id = data["^ncbi:accession"]
            elif u"^ot:originalLabel" in data:
                unique_id = data[u"^ot:originalLabel"]
            concat_dict = {
                "unique_id": unique_id,
                "seq": seq,
                "spn": spn,
                "original_PS_id": otu,
                "concat:status": "single run",
            }
            self.sp_acc_comb[spn][genename][concat_id] = concat_dict
        else:
            physcraper.debug(
                "something goes wrong, you should not try to add the same id several times...."
            )
        if concat_dict["spn"] is None:
            # we should never get here....
            sys.stderr.write(
                "There is no species name for the seq. Do not know how to concatenate then. "
                "Please remove seq from aln: {}.".format(
                    data["^ncbi:accession"]))
            physcraper.debug("THERE IS A SERIOUS PROBLEM....spn is none")
            spn = self.get_taxon_info("^ot:ottTaxonName", data)
            self.sp_acc_comb[spn] = self.sp_acc_comb[unique_id]
            del self.sp_acc_comb[unique_id]
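
Each entry added to self.sp_acc_comb therefore has the following shape (all values below are made up for illustration):

    self.sp_acc_comb["Senecio vulgaris"]["ITS"]["concat_7"] = {
        "unique_id": "AB123456.1",        # accession, or the original tip label
        "seq": "ACGTACGT...",
        "spn": "Senecio vulgaris",
        "original_PS_id": "otu420",
        "concat:status": "single run",
    }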
Example #21
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """looks for a json file to continue run, or builds and runs
    new analysis for as long as new seqs are found

    This is the wrapper function to start a PhyScraper run with tree and alignment ids from Open Tree of Life.
    You need:
         seqaln = ID of alignment file
         mattype = the format name of your alignment
         tree_id = id of the phylogeny to update (from Open Tree of Life)
         workdir = define where your analysis files shall be stored
         configfi = path to your config file
         ingroup_mrca = define the mrca, by supplying the Open Tree of Life identifier of the clade of interest

         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """
    license_print()
    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    conf = ConfigObj(configfi)
    data_obj = load_otol_data(conf, ingroup_mrca, mattype, seqaln, study_id,
                              tree_id, workdir)
    # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
    ids = load_ids_obj(conf, workdir, ingroup_mrca)
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    scraper = PS_standard_run(data_obj, ids, shared_blast_folder)
    save_copy_code(workdir)
    return scraper
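
A hypothetical call sketch; the study/tree ids and the paths are placeholders:

    scraper = standard_run(study_id="pg_55", tree_id="tree5864",
                           seqaln="data/aln.fas", mattype="fasta",
                           workdir="runs/otol_update",
                           configfi="config/example.config")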
Example #22
    def place_new_seqs(self):
        """Places the new seqs (that are only found in loci which is not the starting tree)
        onto one of the single run trees.
        """
        physcraper.debug("place_new_seqs")
        if len(self.concatenated_aln.taxon_namespace) - len(
                self.short_concat_seq) > len(self.tre_as_start.leaf_nodes()):
            if os.path.exists("RAxML_labelledTree.PLACE"):
                os.rename("RAxML_labelledTree.PLACE",
                          "RAxML_labelledTreePLACE.tmp")
            cwd = os.getcwd()
            os.chdir(self.workdir)

            physcraper.debug("make place-tree")
            try:
                num_threads = int(self.config.num_threads)
                print(num_threads)
                subprocess.call([
                    "raxmlHPC-PTHREADS", "-T", "{}".format(num_threads), "-m",
                    "GTRCAT", "-f", "v", "-q", "partition", "-s",
                    "concat_red.fasta", "-t", "starting_red.tre", "-n", "PLACE"
                ])
            except Exception:  # fall back to the serial raxmlHPC binary if the threaded call fails
                subprocess.call([
                    "raxmlHPC", "-m", "GTRCAT", "-f", "v", "-q", "partition",
                    "-s", "concat_red.fasta", "-t", "starting_red.tre", "-n",
                    "PLACE"
                ])
            os.chdir(cwd)
            physcraper.debug("read place tree")
            placetre = Tree.get(path="{}/starting_red.tre".format(
                self.workdir),
                                schema="newick",
                                preserve_underscores=True,
                                suppress_internal_node_taxa=True,
                                suppress_leaf_node_taxa=True)
            physcraper.debug("resolve polytomies")
            placetre.resolve_polytomies()
            placetre.write(path="{}/place_resolve.tre".format(self.workdir),
                           schema="newick",
                           unquoted_underscores=True)
Example #23
def filter_data_run(seqaln,
                    mattype,
                    trfn,
                    schema_trf,
                    workdir,
                    threshold,
                    spInfoDict,
                    configfi,
                    selectby="blast",
                    downtorank=None,
                    blacklist=None,
                    add_unpubl_seq=None,
                    id_to_spn_addseq_json=None,
                    ingroup_mrca=None,
                    shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs 
    new analysis for as long as new seqs are found. 
    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    debug("Debugging mode is on")

    # debug(shared_blast_folder)
    # debug(some)
    # if _DEBUG_MK == 1:
    #     random.seed(3269235691)
    print(workdir)
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
        filteredScrape.repeat = 1   
    else:   
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=True)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln, 
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=spInfoDict,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)

        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        filteredScrape = FilterBlast(data_obj, ids)
        filteredScrape.add_setting_to_self(downtorank, threshold)
        filteredScrape.blacklist = blacklist
        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.unpublished = True
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper(delay=14)
            print("add unpubl otu json")
            filteredScrape.data.unpubl_otu_json = id_to_spn_addseq_json
            print(filteredScrape.data.unpubl_otu_json)

            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
        else:
            # run the analysis
            sys.stdout.write("BLASTing input sequences\n")
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:

                filteredScrape.sp_dict(downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
                filteredScrape.replace_new_seq()
            sys.stdout.write("Calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.data.write_otus("otu_info", schema="table")
            filteredScrape.write_otu_info(downtorank)

            filteredScrape.dump()
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        sys.stdout.write("BLASTing input sequences\n")
        if shared_blast_folder:
            filteredScrape.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        filteredScrape.run_blast_wrapper(delay=14)
        filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        sys.stdout.write("Filter the sequences\n")
        if threshold is not None:
            filteredScrape.sp_dict(downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.data.prune_short(0.75)
        sys.stdout.write("calculate the phylogeny\n")
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(downtorank)
        # print(some)
    filteredScrape.write_otu_info(downtorank)
    return filteredScrape
Example #24
def add_different_rank(seqaln,
                       mattype,
                       trfn,
                       schema_trf,
                       workdir,
                       threshold,
                       id_to_spn,
                       new_confifi,
                       selectby="blast",
                       downtorank=None,
                       blacklist=None,
                       add_unpubl_seq=None,
                       id_to_spn_addseq_json=None,
                       ingroup_mrca=None,
                       shared_blast_folder=None,
                       backbone=False):
    """looks for pickeled file to continue run, or builds and runs 
    new analysis for as long as new seqs are found. 
    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    license_print()
    debug("Debugging mode is on")

    dump_fn = "add_different_rank{}_{}.run".format(ingroup_mrca, downtorank)
    # if the file does not exist, this step has not been run yet; if it exists, go to the next
    if os.path.isfile("{}/{}".format(workdir, dump_fn)):
        filteredScrape = pickle.load(
            open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
    else:

        assert os.path.isfile("{}/scrape_checkpoint.p".format(workdir))

        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(
            open("{}/scrape_checkpoint.p".format(workdir), 'rb'))

        # copy previous files to different folder
        count = 1
        while os.path.exists("{}/update_{}".format(workdir, count)):
            count += 1
        os.mkdir("{}/update_{}".format(workdir, count))
        old_runs = "{}/update_{}".format(workdir, count)

        src_files = os.listdir(workdir)
        for file_name in src_files:
            full_file_name = os.path.join(workdir, file_name)
            if os.path.isfile(full_file_name):
                shutil.copy(full_file_name, old_runs)

        filteredScrape.repeat = 1
        conf = ConfigObj(new_confifi)
        # add new config
        assert filteredScrape.config != conf
        filteredScrape.config = conf
        assert filteredScrape.config == conf

        # set new ingroup_mrca
        filteredScrape.data.ott_mrca = ingroup_mrca  # same attribute that is read two lines below
        filteredScrape.mrca_ncbi = filteredScrape.ids.ott_to_ncbi[
            filteredScrape.data.ott_mrca]
        assert filteredScrape.data.ott_mrca == ingroup_mrca

        with open(filteredScrape.logfile, "a") as log:
            log.write(
                "You run 'add_different_rank' with the following settings: rank: {} and ingroup_mrca: {}. \n"
                .format(downtorank, ingroup_mrca))

        # here the filter standard function continues...
        if backbone is True:
            filteredScrape.backbone = backbone
            filteredScrape.data.write_files(treepath="backbone.tre",
                                            alnpath="backbone.fas")
        else:
            filteredScrape.backbone = False
        # set new downtorank and numbers:
        filteredScrape.add_setting_to_self(downtorank, threshold)
        filteredScrape.blacklist = blacklist

        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.data.unpubl_otu_json = json.load(
                open("{}/otu_dict_localseq.json".format(workdir)))
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper()
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
            if backbone:
                filteredScrape.repeat = 1
        else:
            sys.stdout.write("BLASTing input sequences\n")
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper()
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:
                if len(filteredScrape.new_seqs_otu_id) > 0:
                    filteredScrape.sp_dict(downtorank)
                    filteredScrape.make_sp_seq_dict()
                    filteredScrape.how_many_sp_to_keep(selectby=selectby)
                    filteredScrape.replace_new_seq()
            sys.stdout.write("Calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
            filteredScrape.data.write_otus("otu_info", schema="table")
            write_out_files(filteredScrape, downtorank)
            if backbone:
                filteredScrape.repeat = 0
            # set back to normal - only used to reassess formerly discarded seq in first round
        while filteredScrape.repeat == 1:
            filteredScrape.data.write_labelled(label="^ot:ottTaxonName",
                                               add_gb_id=True)
            filteredScrape.data.write_otus("otu_info", schema="table")
            sys.stdout.write("BLASTing input sequences\n")
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper()
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:
                if len(filteredScrape.new_seqs_otu_id) > 0:
                    filteredScrape.sp_dict(downtorank)
                    filteredScrape.make_sp_seq_dict()
                    filteredScrape.how_many_sp_to_keep(selectby=selectby)
                    filteredScrape.replace_new_seq()
            filteredScrape.data.prune_short()
            sys.stdout.write("calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
            write_out_files(filteredScrape, downtorank)
            if backbone:
                filteredScrape.repeat = 0
        writeinfofiles.get_additional_GB_info(filteredScrape)
        filteredScrape.dump()
    dump_fn = "add_different_rank{}_{}.run".format(ingroup_mrca, downtorank)
    fn = open(dump_fn, "w")
    fn.write(
        "add different rank with following settings {} and {} finished".format(
            ingroup_mrca, downtorank))
    fn.close()
    return filteredScrape
Example #25
def test_blacklist():

    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # make one run without blacklist
    debug("run without blacklist")
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))

    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    noblackScrape = FilterBlast(data_obj, ids)
    noblackScrape._blasted = 1
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        # print(dest)
        full_file_name = os.path.join(src, file_name)
        if (os.path.isfile(full_file_name)):
            shutil.copy(full_file_name, dest)
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    noblackScrape.generate_streamed_alignment()

    # one run with blacklist
    debug("run with blacklist")

    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if (os.path.isfile(full_file_name)):
            shutil.copy(full_file_name, dest)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    filteredScrape.generate_streamed_alignment()

    print("RUN TESTS!")
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        # print(filteredScrape.data.otu_dict[tax.label])
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)
    for item in blacklist:
        assert item not in gi_l
        print("RUN TESTS2!")
        assert item in gi_l_2

        #     # print("seq was not added in blacklist run")
        #     print("inbetween step works")
# test if it removes blacklist gi from already added aln:
    print("run with later blacklist")

    # else:
    #     print("blacklist gi was added in previous run")
    # print("now we want to remove it.")
    len_before = (len(noblackScrape.data.tre.taxon_namespace))
    noblackScrape.blacklist = blacklist
    noblackScrape.generate_streamed_alignment()
    assert len_before - 1 == len(noblackScrape.data.tre.taxon_namespace)
Example #26
def run_with_settings(settings):
    """looks for pickeled file to continue run, or builds and runs
    new analysis for as long as new seqs are found. 
    This uses the FilterBlast subclass to be able to filter the blast output."""
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(settings.workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(
            open("{}/scrape_checkpoint.p".format(settings.workdir), "rb")
        )
        filteredScrape.repeat = 1
    else:
        conf = ConfigObj(settings.configfi)
        # print("config")
        debug(dir(conf))
        debug(conf.email)

        # Generate an linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=settings.seqaln, 
                                           mattype=settings.mattype,
                                           workdir=settings.workdir,
                                           treefile=settings.trfn,
                                           schema_trf=settings.schema_trf,
                                           otu_json=settings.spInfoDict,
                                           ingroup_mrca=None)

        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()

        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        ids = IdDicts(conf, workdir=settings.workdir)

        filteredScrape = FilterBlast(data_obj, ids, settings)
        filteredScrape.add_setting_to_self(settings.downtorank, settings.threshold)

        filteredScrape.write_otu_info(settings.downtorank)

        if settings.add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.write_unpubl_blastdb(settings.add_unpubl_seq)
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.local_otu_json = settings.id_to_spn_addseq_json
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False

        # run the analyses
        if filteredScrape.unpublished is not True:
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            if settings.threshold is not None:
                filteredScrape.sp_dict(settings.downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=settings.threshold, selectby=settings.selectby)
                filteredScrape.replace_new_seq()
            debug("from replace to streamed aln")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        filteredScrape.run_blast_wrapper(settings.delay)
        filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        if settings.threshold is not None:
            filteredScrape.sp_dict(settings.downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=settings.threshold, selectby=settings.selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(settings.downtorank)
    return filteredScrape
Example #27
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """looks for a json file to continue run, or builds and runs
    new analysis for as long as new seqs are found

    This is the wrapper function to start a PhyScraper run with tree and alignment ids from Open Tree of Life.
    You need:
         seqaln = path to the alignment file
         mattype = the format name of your alignment
         tree_id = id of the phylogeny to update (from Open Tree of Life)
         workdir = define where your analysis files shall be stored
         configfi = path to your config file
         ingroup_mrca = define the mrca, by supplying the Open Tree of Life identifier of the clade of interest

         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """
    debug("Debugging mode is on")

    conf = ConfigObj(configfi, interactive=False)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = pickle.load(open("{}/att_checkpoint.p".format(workdir), "rb"))
#        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
        # ids = IdDicts(conf, workdir="example")
        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = pickle.load(open(conf.id_pickle, "rb"))
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)
    # run the analyses
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    # scraper.write_otu_info()

    return scraper
Example #28
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 sp_info_jsonfi,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper run with your own data.
    You need:
         seqaln = path to sequence alignment file
         mattype = the format name of your alignment
         trfn = path to file with the phylogeny to update
         schema_trf = format type of your phylogeny
         workdir = define where your analysis files shall be stored
         sp_info_jsonfi = a json file which has the otu_dict stored, which is generated by the OtuJsonDict function
                            (usually, just leave it like it is in the example scripts.).
         configfi = path to your config file
         ingroup_mrca = not necessary, if you want to limit your run to a certain clade, give the OpenTree ID here,
                        can be obtained by running: python scripts/get_ott.py ingroup_name
         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """

    debug("Debugging mode is on")

    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), "rb"))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln, 
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=sp_info_jsonfi,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        scraper = PhyscraperScrape(data_obj, ids)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        # run the analyses
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast_wrapper(delay=14)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return 1
Example #29
 def user_defined_concat(self):
     """If a user gave an input file to concatenate data. Fills in the data for self.comb_seq, self.comb_acc
     (is the replacement function for select_rnd_seq).
     """
     physcraper.debug("user_defined_concat")
     with open("{}/{}".format(self.workdir, self.concatfile),
               mode="r") as infile:
         reader = csv.reader(infile)
         sp_concat = dict((rows[0], rows[1]) for rows in reader)
     for otu in sp_concat.keys():
         global_spn = None
         concat_l = sp_concat[otu]
         if concat_l[:1] == "[":
             concat_l = concat_l[1:-1]
         concat_l = concat_l.split(", ")
         for item in concat_l:
             gene_l = []
             if item[:1] == "'":
                 item = item[1:-1]
             item = item.encode("utf-8")
             for gene in self.single_runs:
                 spn = None
                 for key, val in self.single_runs[gene].data.otu_dict.items(
                 ):
                     if item.isdigit():
                         if "^ncbi:accession" in val:
                             if item == val["^ncbi:accession"]:
                                 spn = val["^ot:ottTaxonName"]
                                 gene_l.append(gene)
                     else:
                         if "^ncbi:accession" in val:
                             if item == val["^ncbi:accession"]:
                                 spn = val["^ot:ottTaxonName"]
                                 gene_l.append(gene)
                         elif u"^ot:originalLabel" in val:
                             if item == val[u"^ot:originalLabel"]:
                                 spn = val["^ot:ottTaxonName"]
                                 gene_l.append(gene)
                     if spn is not None:
                         global_spn = spn.replace(".", "").replace("'", "")
                         spn = spn.replace(".", "").replace("'", "")
                         for key2, val2 in self.sp_acc_comb[spn][
                                 gene].items():
                             cond = val2["unique_id"] == item
                             if cond:
                                 concat_id = key2
                                 self.sp_acc_comb[spn][gene][concat_id][
                                     "concat:status"] = "used in concat"
                                 seq = str(self.sp_acc_comb[spn][gene]
                                           [concat_id]["seq"])
                                 otu_ = otu.replace(" ", "_")
                                 otu_ = otu_.replace(".",
                                                     "").replace("'", "")
                                 if gene in self.comb_seq.keys():
                                     if otu_ not in self.comb_seq[
                                             gene].keys():
                                         self.comb_seq[gene][otu_] = seq
                                         if gene in self.comb_acc:
                                             self.comb_acc[gene][
                                                 otu_] = concat_id
                                         else:
                                             self.comb_acc[gene] = {
                                                 otu_: concat_id
                                             }
                                     else:
                                         self.comb_seq[gene][otu_] = seq
                                         self.comb_acc[gene][
                                             otu_] = concat_id
                                 else:
                                     self.comb_seq[gene] = {otu_: seq}
                                     self.comb_acc[gene] = {otu_: concat_id}
                                 if spn != otu:
                                     self.sp_acc_comb[spn][gene][concat_id][
                                         "new tipname"] = otu_
                                 self.otu_to_spn(spn, gene, concat_id)
                                 break
                     if spn is not None:
                         break
             if len(gene_l) == len(concat_l):
                 missing_gene = [
                     item for item in self.genes_present
                     if item not in gene_l
                 ]
                 for genes in missing_gene:
                     self.make_empty_seq(global_spn, genes)
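
The concatfile read above is expected to be a two-column csv: column one holds the tip name to use in the concatenated alignment, column two a python-style list of the accessions or original labels (one per locus) that belong to that tip. A hypothetical file:

    Senecio_vulgaris,"['AB123456.1', 'AB654321.1']"
    Senecio_scopolii,"['KY000001.1']"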