help="File with the genome list. Format GenomeID, FullName, ShortName", required=True) parser.add_argument("-a", "--annotation_folder", type=str, help="Folder with the annotation files from JGI", required=True) parser.add_argument("-c", "--cluster_file", type=str, help="Cluster file", required=True) parser.add_argument("-o", "--output_directory", type=str, help="Output folder", required=True) args = parser.parse_args() #Create the output directory if not os.path.exists(args.output_directory): os.makedirs(args.output_directory) #####Read the genome list genome_id_dictionary, genome_count = ClusterTools.read_genome_list(args.genome_list_index) ###Read the annotation information protein_annotation, function_definitions = AnnotationTools.parse_annotation_folder(genome_id_dictionary.keys(), args.annotation_folder) ##Read the cluster information total_clusters = ClusterTools.get_cluster_information(args.cluster_file) ##Print log file logfile = open(args.output_directory + "/logfile.txt", 'w') ##Total number of clusters logfile.write("Total number of analyzed clusters: %d" % len(total_clusters) + "\n") features_to_annotate = ["COG", "KO", "PFAM", "Product"]
parser.add_argument("-c", "--cluster_file", type=str,
                    help="Ortholog file, generated by OrthoMCL", required=True)
parser.add_argument("-f", "--fasta_aa_directory", type=str,
                    help="Directory with the fasta files", required=True)
parser.add_argument("-g", "--group_information", type=str, help="Group file")
parser.add_argument("-o", "--output_directory", type=str, help="Output directory", required=True)
args = parser.parse_args()

# Create the output directory if it does not exist yet.
if not os.path.exists(args.output_directory):
    os.makedirs(args.output_directory)

# Create a log file inside the output directory. It is left open here because
# the script appears to continue past this excerpt -- TODO confirm it is closed.
run_summary = open(args.output_directory + "/logfile.txt", 'w')

# Read the genome list; the helper returns a dictionary and a genome count.
# NOTE(review): args.genome_list_index is not declared by the add_argument
# calls visible here; presumably it is declared just before this excerpt.
genome_id_dictionary, genome_count = ClusterTools.read_genome_list(args.genome_list_index)
run_summary.write("Genomes in the genome list: %d" % genome_count + "\n")

# Read the cluster information, and check that everything is ok.
# Earlier variant, kept for reference: it passed the dictionary values
# (via the Python 2 itervalues()) instead of the keys.
#cluster_information, set_of_proteins_in_clusters, unique_cluster_count, total_clusters, removed_clusters = \
#    get_orthomcl_results(args.cluster_file, [i for i in genome_id_dictionary.itervalues()])
cluster_information, set_of_proteins_in_clusters, unique_cluster_count, total_clusters, removed_clusters = \
    get_orthomcl_results(args.cluster_file, genome_id_dictionary.keys())

# Summarise what was read from the cluster file.
run_summary.write("Total number of clusters: %d" % len(cluster_information) + "\n")
run_summary.write("Total number of protein in clusters: %d" % len(set_of_proteins_in_clusters) + "\n")
run_summary.write("Total number of removed clusters (not present in the genome file): %d" % removed_clusters + "\n")

# Check the counts, to see if everything is going ok
# Output sub-folders for the aligned DNA and for the protein / DNA trees.
dna_aligned_folder = args.output_directory + "/dna_aligned"
protein_tree_folder = args.output_directory + "/protein_trees"
dna_tree_folder = args.output_directory + "/dna_trees"

# Every folder the script writes to. The first three sub-folder names are
# defined before this excerpt.
folder_list = [
    args.output_directory,
    protein_unaligned_folder,
    protein_alignment_folder,
    dna_unaligned_folder,
    dna_aligned_folder,
    protein_tree_folder,
    dna_tree_folder,
]

# Create any folder that does not exist yet.
for folder in folder_list:
    if not os.path.exists(folder):
        os.makedirs(folder)

# Get the cluster information.
cluster_information = ClusterTools.get_cluster_information(args.cluster_file)

# Create the sequence dictionary from the nucleotide fasta directory.
dna_sequence_dic = create_sequence_dictionary(args.fasta_nuc_directory)

# Accumulators initialised before the loop; presumably filled by the part of
# the loop body that lies past this excerpt -- TODO confirm.
frameshift_cases = []
inframe_stops = []
clusters_too_short = []
nucleotide_not_found = []

# Iterate over each cluster and generate the alignments.
# NOTE(review): the loop body may continue beyond this excerpt.
for cluster in cluster_information:
    # Each cluster entry is a comma-separated string of proteins.
    protein_list = cluster_information[cluster].split(",")
    curated_protein_list = {}
# Output sub-folders, all located directly under the output directory.
protein_unaligned_folder = args.output_directory + "/protein_unaligned"
protein_alignment_folder = args.output_directory + "/protein_alignment"
dna_unaligned_folder = args.output_directory + "/dna_unaligned"
dna_aligned_folder = args.output_directory + "/dna_aligned"
protein_tree_folder = args.output_directory + "/protein_trees"
dna_tree_folder = args.output_directory + "/dna_trees"

# Every folder the script writes to, with the output directory itself first.
folder_list = [
    args.output_directory,
    protein_unaligned_folder,
    protein_alignment_folder,
    dna_unaligned_folder,
    dna_aligned_folder,
    protein_tree_folder,
    dna_tree_folder,
]

# Create any folder that does not exist yet.
for folder in folder_list:
    if not os.path.exists(folder):
        os.makedirs(folder)

# Get the cluster information.
cluster_information = ClusterTools.get_cluster_information(args.cluster_file)

# Create the sequence dictionary from the nucleotide fasta directory.
dna_sequence_dic = create_sequence_dictionary(args.fasta_nuc_directory)

# Accumulators initialised before the loop; presumably filled by the part of
# the loop body that lies past this excerpt -- TODO confirm.
frameshift_cases = []
inframe_stops = []
clusters_too_short = []
nucleotide_not_found = []

# Iterate over each cluster and generate the alignments.
# NOTE(review): the loop body may continue beyond this excerpt.
for cluster in cluster_information:
    # Each cluster entry is a comma-separated string of proteins.
    protein_list = cluster_information[cluster].split(",")
    curated_protein_list = {}