Exemplo n.º 1
0
    def __init__(self,
                 workbench,
                 data_path="/home/moritz/DataBases/genomes/RefSeq/",
                 clean=False):
        """Build the genome list from NCBI's bacterial RefSeq assembly summary.

        The summary table is downloaded over FTP and cached in
        ``self.metadata_file``; later instantiations reload that cache unless
        ``clean`` is true.  One ``Genome`` is then appended to
        ``self.genomes`` for every metadata entry.

        Args:
            workbench: forwarded to ``Database.__init__``; every ``Genome``
                receives ``self.workbench``.
            data_path: root directory of the database.  Each genome is
                expected under ``<data_path>/<assembly_level>/<key>/``, where
                ``key`` is the row index of the summary table.
            clean: when true, fetch the metadata again even if a cached copy
                already exists.
        """
        Database.__init__(self, workbench=workbench, data_path=data_path)

        if not os.path.exists(self.metadata_file) or clean:
            ftp = FTP(ncbi)

            print("Getting metadata from ncbi")

            info = StringIO.StringIO()
            # Release the connection even when login or the transfer fails.
            try:
                ftp.login()
                ftp.cwd('genomes/refseq/bacteria/')
                ftp.retrbinary("RETR " + "assembly_summary.txt", info.write)
            finally:
                ftp.close()
            info.seek(0)
            # header=1: the column names sit on the second line of the file.
            # NOTE: DataFrame.from_csv only exists in pandas < 1.0.
            self.metadata = DataFrame.from_csv(info, sep="\t", header=1)
            self.metadata['assembly_level'] = self.metadata[
                'assembly_level'].apply(lambda x: x.replace(" ", "_"))
            self.metadata = self.metadata.transpose().to_dict()

            # The cache is written with one column per genome, so the reload
            # branch below can call to_dict() without transposing.
            DataFrame.from_dict(self.metadata).to_csv(self.metadata_file)

        else:
            print("Loading metadata")
            self.metadata = DataFrame.from_csv(self.metadata_file).to_dict()

        print("Loading genomes")
        for k, v in tqdm(self.metadata.items()):
            genome_path = pjoin(self.data_path,
                                v['assembly_level'].replace(" ", "_"), k)
            genome_file = pjoin(genome_path, k + ".fna")
            self.genomes.append(
                Genome(k,
                       genome_path,
                       ref=genome_file,
                       manual_metadata=v,
                       taxDb=self.taxDb,
                       workbench=self.workbench))
Exemplo n.º 2
0
# Hand-curated metadata sheet; after the transpose the dict is keyed by the
# sheet's index values, which are looked up by genome directory name below.
google_data = pjoin(root, "OD1s and more - Sheet1.csv")
manual_metadata = DataFrame.from_csv(google_data).transpose().to_dict()
cpus = 1
all_genomes = []

# data_root is walked two levels deep: <data_root>/<group>/<genome>/.
# The loop variable is not called `dir` so the builtin stays usable.
for group in os.listdir(data_root):
    group_dir = pjoin(data_root, group)
    for g in os.listdir(group_dir):
        g_dir = pjoin(group_dir, g)
        # Keep the FASTA itself, skip derived files such as "x.fasta.<ext>".
        fasta = [
            f for f in os.listdir(g_dir)
            if ".fasta" in f and ".fasta." not in f
        ]
        assert len(fasta) == 1, (
            "expected exactly one FASTA file in %s, found %d" %
            (g_dir, len(fasta)))
        all_genomes.append(
            Genome(g, g_dir, pjoin(g_dir, fasta[0]), manual_metadata[g]))

# Largest genomes first.
all_genomes.sort(key=lambda x: x.size, reverse=True)
# Genomes whose name equals their cluster attribute.
all_clusters = [g for g in all_genomes if g.name == g.cluster]

# name_map: original name -> converted name (only where they differ), plus an
# identity entry for every converted name.  rev_name_map is the inverse, plus
# an identity entry for every original name.
name_map = {
    g.name: g.conv_name()
    for g in all_genomes if g.name != g.conv_name()
}
rev_name_map = {v: k for k, v in name_map.items()}
name_map.update({g.conv_name(): g.conv_name() for g in all_genomes})
rev_name_map.update({g.name: g.name for g in all_genomes})

mcl = orthoMCL(pjoin(analyses_root, "orthoMCL/"), all_genomes,
               "big_clustering")
Exemplo n.º 3
0
    def __init__(self,
                 workbench,
                 data_path="/home/moritz/people/MoreData/genomes/TOBG/",
                 clean=False):
        """Build the genome list from the TOBG spreadsheets and FASTA files.

        Metadata is read from two workbooks located relative to the current
        working directory: ``metadata/Table3_GenomeStats.xlsx`` (one row per
        genome) and ``metadata/Table4_Phylogeny.xlsx`` (sheet ``Hug set``,
        from which a taxon name per genome is derived).  Each genome gets a
        ``species_taxid`` and a ``region`` column, the result is stored in
        ``self.metadata`` and one ``Genome`` per entry is appended to
        ``self.genomes``.

        Args:
            workbench: forwarded to ``Database.__init__``; every ``Genome``
                receives ``self.workbench``.
            data_path: root directory of the database.  Genomes end up under
                ``<data_path>/<region>/<genome id>/original_files/``.
            clean: accepted for signature compatibility; not used here.

        Raises:
            subprocess.CalledProcessError: if unpacking the genome archive
                fails (the archive is then left in place).
        """
        import subprocess  # local import: only the tar step needs it

        Database.__init__(self, workbench=workbench, data_path=data_path)

        # Sheet1 layout: row 0 is skipped, row 1 holds the column names and
        # the data starts at row 2.  The sheet is read once.
        wb = load_workbook("metadata/Table3_GenomeStats.xlsx")
        rows = list(wb['Sheet1'].values)
        t_metadata = DataFrame(rows[2:], columns=rows[1])

        # Spelling fixes / replacements applied to taxon names before they are
        # handed to the taxonomy database.
        corrected = {
            u'\xc2Gemmatimonadetes': 'Gemmatimonadetes',
            'marinegroup': 'Puniceicoccaceae',
            'Urania1B19': 'Phycisphaerae',
            'Thalassopira': 'Thalassospira',
            'SM1A02': 'Phycisphaerae',
            'SAR324cluster': 'SAR324 cluster',
            'unclassifiedAlphaproteobacteria': 'Alphaproteobacteria',
            'SAR202-2': 'SAR202 cluster',
            'SAR202-1': 'SAR202 cluster',
            'SAR116cluster': 'SAR116 cluster',
            'OPB35soil': 'unidentified Verrucomicrobium group OPB35',
            'Pla3': 'Planctomycetes',
            'OM190': 'Planctomycetes',
            'NovelClass_B': 'Ignavibacteriae',
            'Nitropelagicus': 'Candidatus Nitrosopelagicus',
            'Nanoarchaoeta': 'Nanoarchaeota',
            'Methylobacterum': 'Methylobacterium',
            'JL-ENTP-F27': 'Phycisphaerae',
            'FS140-16B-02marinegroup': 'Phycisphaerae',
            'Epsilonbacteraeota': 'Bacteria',
            'DEV007': 'Verrucomicrobiales',
            'CandidatusPuniceispirillum': 'Candidatus Puniceispirillum',
            'CandidatePhylaRadiation': 'Bacteria candidate phyla',
            'CaThioglobus': 'Candidatus Thioglobus',
            'CaAtelocyanobacterium': 'Candidatus Atelocyanobacterium',
            '0319-6G20': 'Bdellovibrionales',
            'Euryarcheota': 'Euryarchaeota',
            'SBR1093': 'Bacteria',
            'Euryarcheoata': 'Euryarchaeota'
        }

        # Region code embedded in the genome id -> directory-friendly name.
        regions = {
            'NP': 'North_Pacific',
            'NAT': 'North_Atlantic',
            'MED': 'Mediterranean',
            'ARS': 'Arabian_Sea',
            'RS': 'Red_Sea',
            'IN': 'Indian_Ocean',
            'EAC': 'East_Africa_Coastal',
            'SAT': 'South_Atlantic',
            'CPC': 'Chile_Peru_Coastal',
            'SP': 'South_Pacific'
        }

        # Per row: key is the first cell; value is the last cell (ignoring the
        # final column) that is neither 'null' nor starts with "nove".
        wb2 = load_workbook("metadata/Table4_Phylogeny.xlsx")
        taxos = {
            l[0]:
            [v for v in l[:-1] if v != 'null' and v[0:4] != "nove"][-1]
            for l in wb2['Hug set'].values
        }
        taxos = {k: corrected.get(v, v) for k, v in taxos.items()}

        tax_2_id = self.taxDb.get_name_translator(list(taxos.values()))
        # NOTE(review): a name missing from the translator makes .get() return
        # None and the [0] raise TypeError -- presumably the cue to extend
        # `corrected`; confirm before softening this.
        tax_ids = {
            g: tax_2_id.get(taxos[g])[0]
            for g in t_metadata['Genome ID'] if g in taxos
        }
        # Genomes absent from the phylogeny sheet fall back to taxid 131567
        # (presumably NCBI "cellular organisms" -- confirm).
        t_metadata['species_taxid'] = [
            tax_ids.get(g, 131567) for g in t_metadata['Genome ID']
        ]
        t_metadata.index = Index(t_metadata['Genome ID'])
        # Region code = second "_"-separated field of the id, up to its first
        # "-".
        t_metadata['region'] = [
            regions[g.split("_")[1].split("-")[0]]
            for g in t_metadata['Genome ID']
        ]
        self.metadata = t_metadata.transpose().to_dict()

        print("Loading genomes")
        archive = pjoin(self.data_path, 'TOBGGENOMES.tar.gz')
        if os.path.exists(archive):
            # An argument list avoids shell quoting problems, and check_call
            # raises on failure so the archive is only deleted after a
            # successful extraction.
            # NOTE(review): tar unpacks into the current working directory,
            # while the loop below looks for the .fna files in data_path --
            # confirm the two are meant to coincide.
            subprocess.check_call(["tar", "xzvf", archive])
            os.remove(archive)

        for k, v in tqdm(self.metadata.items()):
            genome_path = pjoin(self.data_path, v['region'], k)
            genome_file = pjoin(genome_path, k + ".fna")
            original_dir = pjoin(genome_path, 'original_files')
            ref_file = pjoin(original_dir, k + ".fna")
            if not os.path.exists(genome_file):
                # Tolerate a re-run after the raw file was already moved.
                if not os.path.isdir(original_dir):
                    os.makedirs(original_dir)
                if not os.path.exists(ref_file):
                    # pjoin instead of string concatenation: also correct when
                    # data_path has no trailing slash.
                    shutil.move(pjoin(self.data_path, k + ".fna"),
                                original_dir)
            self.genomes.append(
                Genome(k,
                       genome_path,
                       ref=ref_file,
                       manual_metadata=v,
                       taxDb=self.taxDb,
                       workbench=self.workbench))
Exemplo n.º 4
0
# Test fixture layout.
root = "test_data/"
data_root = pjoin(root, "data/")
analyses_root = pjoin(root, "analyses/")
google_data = pjoin(root, "metadata.csv")
manual_metadata = DataFrame.from_csv(google_data).transpose().to_dict()
cpus = 1
all_genomes = []

# Every path below data_root, as reported by `find`.
# NOTE: split() cuts on any whitespace, so paths containing spaces break.
all_files = check_output(["find", data_root]).split()
for g in manual_metadata:
    # The FASTA of genome g: its path contains the genome name and ".fasta",
    # but is not a derived file such as "x.fasta.<ext>".  The last test must
    # look at the path `f`, not at the genome name.
    fasta = [
        f for f in all_files
        if ".fasta" in f and g in f and ".fasta." not in f
    ]
    assert len(fasta) == 1, (
        "expected exactly one FASTA file for %s, found %d" % (g, len(fasta)))
    all_genomes.append(
        Genome(g, os.path.dirname(fasta[0]), fasta[0], manual_metadata[g]))

if not core:
    try:
        print("testing genome clustering with NICsimilarity")
        # Clustering call currently disabled; only the message is printed:
        # cluster_genomes(all_genomes,pjoin(analyses_root,"rifle_clusters.tsv"),cutoff=0.95)
        print("testing annotation")
        annotation(
            all_genomes,
            cpus=8,
        )
    except Exception:
        printerr("non-core functions broken")
    else:
        # Report success only when nothing above raised.
        print("non-core functions work")
Exemplo n.º 5
0
from micompy.common.utils.iotl_annotations import *
from dendropy import *
import csv

# Project layout and the hand-curated metadata sheet.
root = "/home/moritz/people/moritz/CDs/"
data_root = pjoin(root, "data/")
analyses_root = pjoin(root, "analyses")
google_data = pjoin(root, "OD1s and more - Sheet1.csv")
manual_metadata = DataFrame.from_csv(google_data).transpose().to_dict()
cpus = 16

# Every genome named in the metadata must have a raw "<name>.fasta" file.
raw_dir = pjoin(data_root, "raw")
all_raws = set(os.listdir(raw_dir))
assert all(name + ".fasta" in all_raws for name in manual_metadata)

# One Genome per metadata entry, reading from raw/ and working in genomes/.
all_genomes = []
for g, m in manual_metadata.items():
    all_genomes.append(
        Genome(g, pjoin(data_root, "genomes", g),
               pjoin(raw_dir, g + ".fasta"), m))

# Compute sizes only for genomes that do not have one yet.
for g in tqdm(all_genomes):
    if g.size:
        continue
    g.compute_size()

# Highest completeness first.
all_genomes.sort(key=lambda genome: genome.completness(), reverse=True)

annotation(all_genomes, cpus)

# Keep only the genomes that pass is_good().
all_genomes = list(filter(lambda genome: genome.is_good(), all_genomes))

derep_clusters = cluster_genomes_mash(
    genomes=all_genomes,
Exemplo n.º 6
0
# Make sure the output directory exists.
if not os.path.exists(checkm_dir):
    os.makedirs(checkm_dir)

cpus = 11

# Genome files of the two input collections.
closed_dir = pjoin(data_root, "closed")
cultfree_dir = pjoin(data_root, "cultivation_frees")
all_closed = os.listdir(closed_dir)
all_cultfree = os.listdir(cultfree_dir)

# Tab-separated taxon table; the index is turned into strings so it can be
# compared with names taken from the file names.
metadata = DataFrame.from_csv(
    pjoin(data_root, "taxontable18602_09-jun-2017.xls"), sep="\t")
metadata.index = Index(metadata.index.to_series().apply(str))
# Placeholder column; the taxon_oid-based value is disabled.
metadata['short_name'] = nan
# Every genome file (name minus its last six characters) must have a row in
# the metadata sheet.
assert all(fname[:-6] in metadata.index
           for fname in all_closed + all_cultfree)

metadata = metadata.transpose().to_dict()


def _genomes_from(folder, file_names):
    """Return one Genome per file of `folder`, named after the file minus its
    last six characters and worked on under processed_genomes/."""
    genomes = []
    for fname in file_names:
        name = fname[:-6]
        genomes.append(
            Genome(name, pjoin(data_root, "processed_genomes", name),
                   pjoin(data_root, folder, fname), metadata[name]))
    return genomes


all_closed_genomes = _genomes_from('closed', all_closed)
all_cultfree_genomes = _genomes_from('cultivation_frees', all_cultfree)

all_genomes = all_closed_genomes + all_cultfree_genomes

def run():
    """Run the analysis pipeline on ``all_genomes``.

    Calls, in order, the project helpers ``annotation``, ``checkm`` and
    ``phylophlan``, writes a copy of the resulting tree with genomes relabelled
    as ``Class|Order|Family|Genus``, and finally writes two annotation files
    (by class and by culture type).  All outputs go to ``checkm_dir``.
    """
    annotation(all_genomes, cpus=cpus)
    checkm(all_genomes, cpus=cpus, output=pjoin(checkm_dir, "checkm_all.txt"))

    tree_file = pjoin(checkm_dir, "phyluo.txt")
    phylophlan(all_genomes, cpus=cpus, output=tree_file)

    # Relabel the tree once (the call used to be pasted twice on one line,
    # which did not even parse).
    taxonomy_labels = {
        g.name: "|".join([
            g.metadata['Class'],
            g.metadata['Order'],
            g.metadata['Family'],
            g.metadata['Genus'],
        ])
        for g in all_genomes
    }
    renaming_tree(tree_file, pjoin(checkm_dir, "phyluo_renamed.txt"),
                  taxonomy_labels)

    clade_data(all_genomes,
               {g.name: g.metadata['Class'] for g in all_genomes},
               pjoin(checkm_dir, "phylophlan_class.txt"))
    # x == x is False only for NaN, so genomes without a culture type are
    # left out of the mapping.
    class_data(all_genomes,
               {
                   g.name: g.metadata['Culture Type']
                   for g in all_genomes
                   if g.metadata['Culture Type'] == g.metadata['Culture Type']
               },
               pjoin(checkm_dir, "phylophlan_type.txt"))
Exemplo n.º 7
0
import os, sys
from os.path import join as pjoin
from tqdm import tqdm

home = os.environ['HOME']
sys.path.append(pjoin(home, "repos/moritz/MiComPy/"))
from micompy.databases.database import Database
from micompy.common.genome import Genome
from micompy.common.tools.workbench import WorkBench

# Smoke test: one genome located in the current directory is wrapped in a
# database, processed, and then passed to the workbench's HMMer tool.
accession = "GCF_000005845.2"

bench = WorkBench()
bench.default_bench()
g = Genome(accession, ".", accession + ".fna", workbench=bench)
Db = Database("test_db", [g])
Db.process()
hmmer = bench['HMMer']
test = hmmer.hmmsearch_pfam_presence(g)
Exemplo n.º 8
0
from micompy.gene_clusterings.orthomcl.clustering import Clustering as MCLClustering
from micompy.gene_clusterings.clustering import Clustering
from micompy.gene_clusterings.pfam_clusters.clustering import PfamClustering
from itertools import groupby
from pylab import *
from micompy.common.utils.iotl_annotations import *

# Project layout and the hand-curated metadata sheet.
root = "/home/moritz/people/sarahi/all_enrichmentss/"
data_root = pjoin(root, "all_AGs/")
analyses_root = pjoin(root, "")
google_data = pjoin(root, "ag_metadata.csv")
manual_metadata = DataFrame.from_csv(google_data).transpose().to_dict()
cpus = 16

# One Genome per metadata entry; the 'genomes' field names its input file
# relative to data_root.
all_genomes = []
for g, m in manual_metadata.items():
    all_genomes.append(
        Genome(g, pjoin(data_root, g), pjoin(data_root, m['genomes']), m))

# Compute sizes only for genomes that do not have one yet.
for g in tqdm(all_genomes):
    if g.size:
        continue
    g.compute_size()

# Largest genomes first.
all_genomes.sort(key=lambda genome: genome.size, reverse=True)

annotation(all_genomes, cpus)

# Concatenate the GFF that sits next to each proteome, then keep the lines
# containing "CDS".
gff_files = [g.proteom.replace(".faa", ".gff") for g in all_genomes]
sh.cat(*gff_files, _out="temp.gff")
sh.grep("CDS", "temp.gff", _out=pjoin(analyses_root, "all_gff.gff"))

#checkm(all_genomes, pjoin(analyses_root,"checkm"), cpus)