def get_patric_annotation_files():
    """Prepare PATRIC download/processing commands for every good species.

    Creates ``patric_directory`` (via ``mkdir -p``), then, for each species
    returned by ``parse_midas_data.parse_good_species_list()``, derives the
    genome ids of its core genes and formats the shell commands that fetch
    the PATRIC subsystem, pathway, features and ``.frn`` files and that
    extract/compress the KEGG columns of the features table.

    NOTE(review): in the portion of the function shown here the command
    strings are only built, never executed -- presumably they are passed to
    ``os.system`` further on; confirm against the full file.
    """
    #pathway_directory = '%spatric_pathway/' % (parse_midas_data.data_directory)
    # Make sure the download target directory exists.
    os.system("mkdir -p %s" % patric_directory)
    #os.system("mkdir -p %s" % pathway_directory)
    #intermediate_filename = intermediate_filename_template % (pairwise_directory, species_name)
    # get a list of species to run this script on.
    good_species_list = parse_midas_data.parse_good_species_list()
    for species_name in good_species_list:
        core_genes = core_gene_utils.parse_core_genes(species_name)
        # A genome id is the first two dot-separated fields of a gene name;
        # the set removes the duplicates shared by genes of one genome.
        genome_ids = set([
            ".".join(core_gene.split(".", 2)[:2]) for core_gene in core_genes
        ])
        for genome_id in genome_ids:
            print(species_name, genome_id)
            # curl commands: one per PATRIC annotation table, saved under
            # patric_directory with the remote file name.
            cmnd_subsystem = "curl ftp://ftp.patricbrc.org/genomes/%s/%s.PATRIC.subsystem.tab -o %s/%s.PATRIC.subsystem.tab" % (
                genome_id, genome_id, patric_directory, genome_id)
            cmnd_pathway = "curl ftp://ftp.patricbrc.org/genomes/%s/%s.PATRIC.pathway.tab -o %s/%s.PATRIC.pathway.tab" % (
                genome_id, genome_id, patric_directory, genome_id)
            cmnd_features = "curl ftp://ftp.patricbrc.org/genomes/%s/%s.PATRIC.features.tab -o %s/%s.PATRIC.features.tab" % (
                genome_id, genome_id, patric_directory, genome_id)
            # Keep only columns 6 and 21 of the features table in
            # <genome_id>.kegg.txt, then bzip2 it (-k keeps the plain file).
            cmnd_cat = "cat %s/%s.PATRIC.features.tab | cut -f6,21 > %s/%s.kegg.txt" % (
                patric_directory, genome_id, patric_directory, genome_id)
            cmnd_bzip2 = "bzip2 -k %s/%s.kegg.txt" % (patric_directory, genome_id)
            # The .frn file is the one later read by the 16S extraction code.
            cmnd_rRNAs = "curl ftp://ftp.patricbrc.org/genomes/%s/%s.PATRIC.frn -o %s/%s.PATRIC.frn" % (
                genome_id, genome_id, patric_directory, genome_id)
def load_centroid_gene_map(desired_species_name=None):
    """Map each pangenome centroid id to a preferred gene id.

    For every row of ``pan_genomes/<species>/gene_info.txt.gz`` (under
    ``config.midas_directory``) column 0 is read as the gene id and column 3
    as its centroid id.  A centroid maps to itself by default; when the
    centroid is not a reference gene but one of its member genes is, the
    centroid maps to that reference gene instead.

    Args:
        desired_species_name: species to load.  When ``None``, every species
            from ``parse_midas_data.parse_good_species_list()`` is processed.

    Returns:
        dict mapping centroid id -> gene id.  Empty when there was no species
        to process.

    NOTE(review): the map is reset for every species, so when several
    species are processed only the last one's map is returned -- confirm
    whether a merged map was intended.
    """
    if desired_species_name is None:
        # Imported lazily, as in the original, to avoid a module-level
        # dependency on parse_midas_data.
        import parse_midas_data
        desired_speciess = parse_midas_data.parse_good_species_list()
    else:
        desired_speciess = [desired_species_name]

    # Defined up front so an empty species list returns {} instead of
    # failing on an unbound name.
    centroid_gene_map = {}

    for desired_species_name in desired_speciess:
        # First load reference genes
        reference_genes = load_reference_genes(desired_species_name)

        gene_info_filename = "%span_genomes/%s/gene_info.txt.gz" % (
            config.midas_directory, desired_species_name)

        # The context manager closes the file even if a malformed row raises.
        with gzip.open(gene_info_filename, 'r') as gene_info_file:
            gene_info_file.readline()  # header

            centroid_gene_map = {}
            for line in gene_info_file:
                items = line.split("\t")
                gene_id = items[0].strip()
                centroid_id = items[3].strip()

                # Default: a centroid represents itself.
                if centroid_id not in centroid_gene_map:
                    centroid_gene_map[centroid_id] = centroid_id

                # Prefer a reference-genome member over a non-reference
                # centroid.
                if (gene_id in reference_genes) and (
                        centroid_id not in reference_genes):
                    centroid_gene_map[centroid_id] = gene_id

    return centroid_gene_map
################################################################################
#
# Analysis parameters (thresholds come from the shared config module)
#
################################################################################
min_sample_size = config.between_host_min_sample_size  # 46 gives at least 1000
low_divergence_threshold = config.between_low_divergence_threshold

# Only four-fold degenerate sites are used.
allowed_variant_types = {'4D'}

# Species highlighted in the main panels, with their plotting colors.
# Earlier choices, kept for reference:
#focal_speciess = ['Bacteroides_vulgatus_57955', 'Roseburia_inulinivorans_61943']
#focal_speciess = ['Bacteroides_vulgatus_57955', 'Faecalibacterium_prausnitzii_62201']
focal_speciess = [
    'Bacteroides_vulgatus_57955',
    'Akkermansia_muciniphila_55290',
]
focal_colors = ['b', 'g']

# Species shown in the supplemental panels.
#supplemental_focal_species = ['Bacteroides_fragilis_54507', 'Alistipes_putredinis_61533', 'Eubacterium_rectale_56927']
supplemental_focal_species = [
    'Bacteroides_fragilis_54507',
    'Parabacteroides_distasonis_56985',
    'Alistipes_shahii_62199',
]  # 'Ruminococcus_bromii_62047']

# In debug mode only the first two species are analysed.
good_species_list = parse_midas_data.parse_good_species_list()
if debug:
    good_species_list = good_species_list[:2]

sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = sample_utils.parse_subject_sample_map()
sample_continent_map = sample_utils.parse_sample_continent_map()
sys.stderr.write("Done!\n")

####################################################
#
# Set up Figure (2 panels, arranged in 2x1 grid)
#
####################################################
pylab.figure(1, figsize=(5, 6))
import parse_midas_data

# The script needs a species selector (argv[1]) and at least one word of the
# command to run (argv[2:]); otherwise print the usage line and stop.
if len(sys.argv) < 3:
    sys.stderr.write(
        "Usage: python loop_over_species_wrapper.py all|debug|species command...\n"
    )
    sys.exit(1)

# First argument is either 'all', 'debug', or a species name
# NOTE(review): debug_flag is only assigned in the part of the script shown
# here -- presumably it is appended to the command in the loop below; confirm.
debug_flag = ""
if sys.argv[1] == 'debug':
    # Debug mode: run on the single designated debug species.
    species_names = [parse_midas_data.debug_species_name]
    debug_flag = "--debug"
elif sys.argv[1] == 'all':
    species_names = parse_midas_data.parse_good_species_list()
else:
    # Anything else is treated as a prefix: keep every good species whose
    # name starts with it (so a full species name selects just that species,
    # plus any other species sharing the prefix).
    good_species_names = parse_midas_data.parse_good_species_list()
    species_names = []
    pattern = sys.argv[1]
    for species_name in good_species_names:
        if species_name.startswith(pattern):
            species_names.append(species_name)

# Remaining arguments are command to run, with species name appended as last argument
command = " ".join(sys.argv[2:])
sys.stderr.write("Running command: %s\n" % command)
sys.stderr.write("for %d species...\n\n" % len(species_names))
for species_name in species_names:
f = float(items[1]) gene_freq_map[gene_name] = f file.close() return gene_freq_map # Actually calculate the core genes if __name__ == '__main__': import parse_midas_data os.system('mkdir -p %s' % core_genes_directory) os.system('mkdir -p %s' % external_core_genes_directory) pangenome_species = parse_midas_data.parse_good_species_list() cmin = config.core_genome_min_copynum cmax = config.core_genome_max_copynum shared_cmin = config.shared_genome_min_copynum min_good_fraction = config.core_genome_min_prevalence min_coverage = 5 # (for assessing core genome, we'll use a lower coverage value than when we look at real changes) output_filename = default_core_gene_filename output_file = gzip.GzipFile(output_filename, "w") stringent_output_filename = default_stringent_core_gene_filename stringent_output_file = gzip.GzipFile(stringent_output_filename, "w") shared_output_file = gzip.GzipFile(default_shared_gene_filename, "w")
def _write_fasta_record(handle, header, sequence, width=80):
    """Write one FASTA record: '>' header, sequence wrapped at *width*, blank line."""
    handle.write('>%s\n' % header)
    for start in range(0, len(sequence), width):
        handle.write('%s\n' % sequence[start:start + width])
    handle.write('\n')


def get_16S_fasta():
    """Collect one 16S sequence per good species and align them with MUSCLE.

    For each species from ``parse_midas_data.parse_good_species_list()`` the
    PATRIC ``.frn`` file of one of its genomes is read from
    ``patric_directory``, and the first record whose name contains
    ``'ssuRNA'`` and whose sequence is at least 1200 characters long is
    written to ``frn_path`` under the header ``<species_name>|<id>``, where
    ``<id>`` is the second ``|``-separated field of the record name's first
    word.  Species without such a record contribute nothing.  The first
    record of ``outgroup_path`` is appended last, and ``muscle`` is then run
    on ``frn_path``, writing the alignment to ``frn_aligned_path``.

    Returns:
        None.  Output goes to ``frn_path`` and ``frn_aligned_path``.
    """
    good_species_list = parse_midas_data.parse_good_species_list()

    # The context manager guarantees the FASTA file is flushed and closed
    # before muscle reads it, and also when an error interrupts the loop.
    with open(frn_path, 'w') as frn:
        for species_name in good_species_list:
            core_genes = core_gene_utils.parse_core_genes(species_name)
            # Genome id = first two dot-separated fields of a gene name.
            genome_ids = list(
                set([
                    ".".join(core_gene.split(".", 2)[:2])
                    for core_gene in core_genes
                ]))
            # Set order is arbitrary, so which genome is used is unspecified
            # when a species has core genes from several genomes.
            genome_id = genome_ids[0]

            species_name_frn_path = "%s/%s.PATRIC.frn" % (patric_directory,
                                                          genome_id)
            species_name_frn = classFASTA(species_name_frn_path).readFASTA()

            for record_name, record_seq in species_name_frn:
                if 'ssuRNA' not in record_name:
                    continue
                # Skip partial 16S genes.
                if len(record_seq) < 1200:
                    continue
                frn_header = species_name + '|' + record_name.split()[0].split(
                    '|')[1]
                _write_fasta_record(frn, frn_header, record_seq)
                # Only the first qualifying record per species is kept.
                break

        # same for outgroup
        outgroup = classFASTA(outgroup_path).readFASTA()
        print(outgroup[0][0])
        _write_fasta_record(frn, outgroup[0][0], outgroup[0][1])

    os.system('muscle -in %s -out %s' % (frn_path, frn_aligned_path))