#os.system('gunzip dataflow/01-nucl/mags/*.gz')

# rename the headers and concatenate all of the files

# Derive one '<stem>_rename.fasta' output name per unzipped MAG file, set up a
# Fasta object for each input, then concatenate the renamed files.
mags_dir = 'dataflow/01-nucl/mags/'
files = df_mags['file_unzip'].tolist()
files_rename = [name.split('.fa')[0] + '_rename.fasta' for name in files]

for fasta_name, renamed in zip(files, files_rename):
    file_obj = sc.Fasta(fasta_name, mags_dir)
    file_obj.setOutputName(renamed)
    file_obj.setOutputLocation(mags_dir)
    # Header renaming is currently disabled:
    #file_obj.headerrename()

sg.concat(
    inputfolder=mags_dir,
    outputpath='dataflow/01-nucl/stewart2019_mags.fasta',
    filenames=files_rename,
)

# predict the ORFs (nucl and prot) for all the MAGS

# Two runprodigal passes over the concatenated MAGs: the first requests
# type='nucl' and writes next to the input; the second uses the method's
# default type (presumably protein, given the output name) and writes to the
# protein folder.
mags_fasta = 'stewart2019_mags.fasta'
nucl_dir = 'dataflow/01-nucl/'

file_obj = sc.Fasta(mags_fasta, nucl_dir)
file_obj.setOutputName('stewart2019_mags_genes.fasta')
file_obj.setOutputLocation(nucl_dir)
file_obj.runprodigal(type='nucl')

file_obj = sc.Fasta(mags_fasta, nucl_dir)
file_obj.setOutputName('stewart2019_mags_prot.fasta')
file_obj.setOutputLocation('dataflow/01-prot/')
file_obj.runprodigal()
# Example #2
# 0
    '4309680-submission.assembly_59', '3964017-submission.assembly_7',
    '3643350-assembly_6', '3394949-submission.assembly_17', 'RUG117_52'
]

# Subset rumen_genomes.fasta by the identifiers in `genes` and write the
# result to subclade_island.fasta in the same folder.
nucl_dir = 'dataflow/01-nucl/'

file_obj = sc.Fasta('rumen_genomes.fasta', nucl_dir)
file_obj.setOutputName('subclade_island.fasta')
file_obj.setOutputLocation(nucl_dir)
file_obj.subsetfasta(seqlist=genes, headertag='none')

# 4309680-submission.assembly_59 was then blasted against the
# NCBI nucleotide collection (nr/nt) using web-based blastn, and the full-length sequence for each of the top 50 hits
# was downloaded and concatenated into island2_pathogens.fasta. These were then combined with the contigs selected
# above into rumen_genomes_island2_pathogens.fasta.

# Merge the downloaded pathogen hits with the subclade contigs into one FASTA.
files = ['island2_pathogens.fasta', 'subclade_island.fasta']
sg.concat(
    inputfolder='dataflow/01-nucl/',
    outputpath='dataflow/01-nucl/rumen_genomes_island2_pathogens.fasta',
    filenames=files,
)

# a blast database was then made with the contigs of interest, including 4309680-submission.assembly_59

# Folder layout used by this step.
indir, blastdir, blastdbdir = (
    'dataflow/01-nucl/',
    'dataflow/02-blast/',
    'dataflow/02-blast-db/',
)
file = "subclade_island.fasta"

# Nucleotide BLAST database, named after the input file and written to the
# database folder.
file_obj = sc.Fasta(file, indir)
file_obj.setOutputName(file)
file_obj.setOutputLocation(blastdbdir)
file_obj.runmakeblastdb(dbtype='nucl')

# Next input file; the statements that use it are not visible in this excerpt.
file = "rumen_genomes_island2_pathogens.fasta"
from modules import seq_gen_lin as sg

# Genomes that were downloaded (Figure 1 and Figure 2) were combined into fig1_fig3_ncbi_nucl_hits.fasta
# Note the file name was made before fig 3 was moved to fig 2. These were then combined
# with the rumen genomes (pathogens_rumen.fasta) and made into a blast database.

# Run runprodigal (default type) on the downloaded NCBI hits. The output keeps
# the input file name but is written to the protein folder.
file = "fig1_fig3_ncbi_nucl_hits.fasta"
nucl_dir = 'dataflow/01-nucl/'
prot_dir = 'dataflow/01-prot/'

file_obj = sc.Fasta(file, nucl_dir)
file_obj.setOutputName(file)
file_obj.setOutputLocation(prot_dir)
file_obj.runprodigal()

# Combine the rumen and NCBI-hit files found in the protein folder.
seqs_concatn = ['rumen_genomes.fasta', 'fig1_fig3_ncbi_nucl_hits.fasta']

sg.concat(
    inputfolder='dataflow/01-prot/',
    outputpath='dataflow/01-prot/pathogens_rumen.fasta',
    filenames=seqs_concatn,
)

# Protein BLAST database for the combined pathogen + rumen file, named after
# the input and written to the database folder.
blastdbdir = 'dataflow/02-blast-db/'
file = "pathogens_rumen.fasta"

file_obj = sc.Fasta(file, 'dataflow/01-prot/')
file_obj.setOutputName(file)
file_obj.setOutputLocation(blastdbdir)
file_obj.runmakeblastdb(dbtype='prot')

# The two versions of ANT6 (v1_v2_4309680.fasta) were then blasted against the pathogen and rumen genomes.

# Run extractORFs_gff3 on the input FASTA using the contig table in 00-meta.
# NOTE(review): `indir` and `blastdir` are assigned but not used by the calls
# in this step, and the output name derived from `file` does not match the
# "rumen_genomes_extractedCONTIGs_all_rumen.fasta" consumed by the following
# concatenation -- confirm that `file` is the intended input.
indir = 'dataflow/01-prot/'
blastdir = 'dataflow/02-blast/'
file = "v1_v2_4309680.fasta"
nucl_dir = 'dataflow/01-nucl/'

file_obj = sc.Fasta(file, nucl_dir)

# Output name: everything before the first ".f" plus a fixed suffix.
stem = file.split(".f")[0]
outputfilename = stem + '_extractedCONTIGs_all_rumen' + '.fasta'

file_obj.setOutputName(outputfilename)
file_obj.setOutputLocation(nucl_dir)
file_obj.extractORFs_gff3(
    gff3_table_loc='dataflow/00-meta/resistance_blast_hit_cotigs_all_rumen.csv',
)

# Join the two extracted-contig FASTA files into a single file.
files = [
    "resistance_island_blast_hits_concatenated_extractedCONTIGs_3rumen.fasta",
    "rumen_genomes_extractedCONTIGs_all_rumen.fasta",
]

sg.concat(inputfolder='dataflow/01-nucl/',
          outputpath='dataflow/01-nucl/rumen_genomes_extractedCONTIGs_all.fasta',
          filenames=files)

# Inputs and outputs for the BLAST run of the extracted contigs.
indir = 'dataflow/01-nucl/'
blastdir = 'dataflow/02-blast/'
file = "rumen_genomes_extractedCONTIGs_all.fasta"
blastdb = "rumen_genomes_resistance_genes.fasta"
outputfilename = "resistance_island_mapping2.txt"

# Results go to the blast folder under the mapping file name.
file_obj = sc.Fasta(file, indir)
file_obj.setOutputLocation(blastdir)
file_obj.setOutputName(outputfilename)
file_obj.runblast(blast='blastn',
                  db=blastdb,
# Example #5
# 0
from modules import seq_gen_lin as sg

# concatenate all of the downloaded pathogen assemblies and then make blast DBs for each

# One sub-directory of head_dir per pathogen species, each holding the
# downloaded assemblies for that species.
dirs = [
    'staphylococcus_aureus', 'campylobacter_jejuni', 'campylobacter_coli',
    'clostridioides_difficile', 'acinetobacter_baumannii',
    'streptococcus_pneumoniae'
]
head_dir = 'dataflow/01-nucl/'

# `species` rather than `dir`, so the builtin dir() is not shadowed.
for species in dirs:
    path_dir = head_dir + species + '/'

    # Decompress the downloaded archives in place. The exit status is
    # deliberately ignored, so a folder with nothing left to unzip does not
    # stop the run.
    unzip_command = 'gunzip ' + path_dir + '*.gz'
    os.system(unzip_command)

    # os.listdir() returns entries in arbitrary order; sort them so the
    # concatenated FASTA has the same record order on every run and machine.
    lis = sorted(f for f in os.listdir(path_dir) if f.endswith(".fna"))

    # All .fna files of one species end up in '<head_dir><species>.fasta'.
    output_file = head_dir + species + '.fasta'
    sg.concat(inputfolder=path_dir, outputpath=output_file, filenames=lis)

# Species whose concatenated assemblies get a nucleotide BLAST database.
files = [
    'staphylococcus_aureus', 'campylobacter_jejuni', 'campylobacter_coli',
    'clostridioides_difficile', 'acinetobacter_baumannii',
    'streptococcus_pneumoniae'
]

for file in files:
    # The concatenation step writes each species to '<name>.fasta' inside
    # dataflow/01-nucl/; the bare name is the species directory, not a FASTA
    # file, so the input name must carry the extension.
    file_obj = sc.Fasta(file + '.fasta', 'dataflow/01-nucl/')
    # The database name stays the bare species name, as before.
    file_obj.setOutputName(file)
    file_obj.setOutputLocation('dataflow/02-blast-db/')
    file_obj.runmakeblastdb(dbtype='nucl')