def parallel_predict(self, species, genome_file, output, strand="both", gene_model=None,
                     output_gff3=True, other_options="", split_dir="splited_input",
                     splited_output_dir="splited_output_dir", config_dir=None,
                     combine_output_to_single_file=True, use_softmasking=None, hints_file=None,
                     extrinsicCfgFile=None, predict_UTR=None, external_process_pool=None,
                     async_run=False, min_intron_len=None, parsing_mode="parse"):
    # Build the AUGUSTUS options shared by all chunks; the per-chunk input
    # file is appended below, so genome_file is left empty here.
    common_options = self.parse_options(species, genome_file="", strand=strand,
                                        gene_model=gene_model, output_gff3=output_gff3,
                                        other_options=other_options, config_dir=config_dir,
                                        use_softmasking=use_softmasking, hints_file=hints_file,
                                        extrinsicCfgFile=extrinsicCfgFile,
                                        predict_UTR=predict_UTR, min_intron_len=min_intron_len)

    splited_dir = FileRoutines.check_path(split_dir)
    splited_out_dir = FileRoutines.check_path(splited_output_dir)
    FileRoutines.safe_mkdir(splited_dir)
    FileRoutines.safe_mkdir(splited_out_dir)

    # Split the genome into chunks and build one command line per chunk.
    self.split_fasta_by_seq_len(genome_file, splited_dir, parsing_mode=parsing_mode)

    input_list_of_files = sorted(os.listdir(splited_dir))
    list_of_output_files = []
    options_list = []

    for filename in input_list_of_files:
        input_file = "%s%s" % (splited_dir, filename)
        output_file = "%s%s.gff" % (splited_out_dir, filename)
        list_of_output_files.append(output_file)

        options = common_options
        options += " %s" % input_file
        options += " > %s" % output_file
        options_list.append(options)

    self.parallel_execute(options_list, external_process_pool=external_process_pool,
                          async_run=async_run)

    if combine_output_to_single_file:
        CGAS.cat(list_of_output_files, output=output)
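# Usage sketch (hedged): "augustus" below stands for whatever wrapper
# instance in this package exposes parallel_predict; the species model
# and file names are hypothetical. The method splits the genome, runs
# one AUGUSTUS job per chunk, and concatenates the per-chunk GFFs.
#
#     augustus.threads = 16
#     augustus.parallel_predict("human", "genome.fasta", "augustus.gff",
#                               strand="both", use_softmasking=True)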
def parallel_blast(self, blast_command, seqfile, database, outfile=None, blast_options=None,
                   split_dir="splited_fasta", splited_output_dir="splited_output_dir",
                   evalue=None, output_format=None, threads=None, num_of_seqs_per_scan=None,
                   combine_output_to_single_file=True, async_run=False,
                   external_process_pool=None):
    splited_dir = FileRoutines.check_path(split_dir)
    splited_out_dir = FileRoutines.check_path(splited_output_dir)
    self.safe_mkdir(splited_dir)
    self.safe_mkdir(splited_out_dir)

    # Default to five chunks per thread if the chunk count was not given explicitly.
    number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan \
        else (5 * threads if threads else 5 * self.threads)
    self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
    input_list_of_files = sorted(os.listdir(splited_dir))

    list_of_files = []
    for filename in input_list_of_files:
        filename_prefix = FileRoutines.split_filename(filename)[1]
        input_file = "%s%s" % (splited_dir, filename)
        output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)
        list_of_files.append((input_file, output_file))

    options_list = []
    out_files = []

    for in_file, out_filename in list_of_files:
        options = " -out %s" % out_filename
        options += " -db %s" % database
        options += " -query %s" % in_file
        options += (" %s" % blast_options) if blast_options else ""
        options += (" -evalue %s" % evalue) if evalue else ""
        options += (" -outfmt %i" % output_format) if output_format else ""
        options_list.append(options)
        out_files.append(out_filename)

    self.parallel_execute(options_list, cmd=blast_command, threads=threads,
                          async_run=async_run, external_process_pool=external_process_pool)

    if combine_output_to_single_file:
        CGAS.cat(out_files, output=outfile)
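# Usage sketch (hedged): wrapper instance, file and database names below
# are hypothetical; runs blastp over the query chunks and merges the
# tabular hit files into one output.
#
#     blast_tools.parallel_blast("blastp", "queries.fasta", "swissprot",
#                                outfile="queries.hits", evalue=1e-5,
#                                output_format=6, threads=16)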
def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file,
                                             output_dir="./", seq_format="fasta",
                                             out_prefix=None, create_dir_for_each_cluster=False,
                                             skip_cluster_if_no_sequence_for_element=True):
    from Routines import SequenceRoutines, FileRoutines

    cluster_id_list = IdList()
    cluster_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    # A shared out_prefix would make per-cluster files collide in a single
    # directory, so force one directory per cluster in that case.
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx",
                                  FileRoutines.make_list_of_path_to_files(seq_file),
                                  format=seq_format)

    number_of_skipped_clusters = 0
    for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
        # Ids absent from the cluster file are silently skipped; checking this
        # first also avoids a KeyError in the absent-element check below.
        if fam_id not in cluster_dict:
            continue

        if skip_cluster_if_no_sequence_for_element:
            absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id],
                                                                     protein_dict)
            if absent_elements:
                print("Skipping cluster %s due to absent element(s): %s" % (fam_id,
                                                                            ",".join(absent_elements)))
                number_of_skipped_clusters += 1
                continue

        if create_directory_for_each_cluster:
            fam_dir = "%s%s/" % (out_dir, fam_id)
            FileRoutines.safe_mkdir(fam_dir)
            out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
        else:
            out_file = "%s%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)

        SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, cluster_dict[fam_id],
                                                            verbose=True),
                    out_file, format=seq_format)

    os.remove("tmp.idx")
    print("%i of %i clusters were skipped due to absent elements" % (number_of_skipped_clusters,
                                                                     len(cluster_dict)))
    return number_of_skipped_clusters
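# Usage sketch (hedged, hypothetical file names): writes one fasta per
# cluster listed in selected_clusters.ids, skipping clusters whose members
# are missing from proteins.fasta.
#
#     routines.extract_sequences_from_selected_clusters(
#         "selected_clusters.ids", "clusters.tab", "proteins.fasta",
#         output_dir="cluster_seqs/", create_dir_for_each_cluster=True)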
def extract_proteins_from_alignments(dir_with_alignments, output_dir):
    out_dir = FileRoutines.check_path(output_dir)
    input_files = FileRoutines.make_list_of_path_to_files([dir_with_alignments]
                                                          if isinstance(dir_with_alignments, str)
                                                          else dir_with_alignments)
    FileRoutines.safe_mkdir(out_dir)

    from Routines import MultipleAlignmentRoutines
    for filename in input_files:
        filename_list = FileRoutines.split_filename(filename)
        # Keep each alignment's basename and extension for the output file.
        output_file = "%s%s%s" % (out_dir, filename_list[1], filename_list[2])
        MultipleAlignmentRoutines.extract_sequences_from_alignment(filename, output_file)
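# Usage sketch (hedged, hypothetical directories): recovers the sequences
# from every alignment file in alignments/, keeping each file's basename.
#
#     extract_proteins_from_alignments("alignments/", "protein_seqs/")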
def read_cluster_files_from_dir(dir_with_cluster_files):
    cluster_files_list = sorted(os.listdir(dir_with_cluster_files))
    clusters_dict = OrderedDict()
    for filename in cluster_files_list:
        filepath = "%s%s" % (FileRoutines.check_path(dir_with_cluster_files), filename)
        filename_list = FileRoutines.split_filename(filepath)
        # Key each file's clusters by its basename (without extension).
        clusters_dict[filename_list[1]] = SynDict()
        clusters_dict[filename_list[1]].read(filepath, header=False, separator="\t",
                                             allow_repeats_of_key=False, split_values=True,
                                             values_separator=",", key_index=0, value_index=1,
                                             comments_prefix="#")
    return clusters_dict
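# Usage sketch (hedged, hypothetical directory): returns one SynDict per
# cluster file, keyed by the file's basename without extension.
#
#     clusters_dict = read_cluster_files_from_dir("cluster_files/")
#     print(clusters_dict.keys())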
def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file,
                                            output_dir="./", pep_format="fasta",
                                            out_prefix=None, create_dir_for_each_family=False):
    from Routines import SequenceRoutines, FileRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    # A shared out_prefix would make per-family files collide in a single
    # directory, so force one directory per family in that case.
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, fam_dict[fam_id],
                                                                verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)

    os.remove("tmp.idx")
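# Usage sketch (hedged, hypothetical file names): writes one .pep file per
# family listed in families.ids, pulling records from proteins.pep.
#
#     extract_proteins_from_selected_families("families.ids", "families.tab",
#                                             "proteins.pep",
#                                             output_dir="family_peps/")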
def split_proteins_per_species(dir_with_proteins, output_dir, input_format="fasta",
                               output_format="fasta"):
    input_files = FileRoutines.make_list_of_path_to_files([dir_with_proteins]
                                                          if isinstance(dir_with_proteins, str)
                                                          else dir_with_proteins)

    out_dir = FileRoutines.check_path(output_dir)
    FileRoutines.safe_mkdir(out_dir)

    protein_dict = SeqIO.index_db("temp.idx", input_files, format=input_format)

    # Group record ids by the taxon prefix before the first dot.
    syn_dict = SynDict()
    for protein_id in protein_dict:
        taxa_id = protein_id.split(".")[0]
        if taxa_id not in syn_dict:
            syn_dict[taxa_id] = []
        syn_dict[taxa_id].append(protein_id)

    def renamed_records_generator(record_dict, taxa_id):
        # Yield copies with the taxon prefix stripped from the record id.
        for record_id in syn_dict[taxa_id]:
            record = deepcopy(record_dict[record_id])
            record.id = ".".join(record_id.split(".")[1:])
            yield record

    for taxa_id in syn_dict:
        out_file = "%s%s.pep" % (out_dir, taxa_id)
        SeqIO.write(renamed_records_generator(protein_dict, taxa_id), out_file,
                    format=output_format)
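# Usage sketch (hedged, hypothetical paths): assumes record ids look like
# "<taxon>.<protein_id>"; produces one <taxon>.pep per taxon with the
# taxon prefix stripped from each record id.
#
#     split_proteins_per_species("all_proteins/", "per_species/")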
parser.add_argument("-i", "--input_vcf", action="store", dest="input_vcf", required=True, help="Input vcf file") parser.add_argument("-o", "--output_vcf", action="store", dest="output_vcf", required=True, help="Output vcf file") parser.add_argument("-r", "--reference", action="store", dest="reference", required=True, help="Fasta with reference genome") parser.add_argument("-g", "--gatk_directory", action="store", dest="gatk_dir", default="", help="Directory with GATK jar") args = parser.parse_args() SelectVariants.jar_path = FileRoutines.check_path(args.gatk_dir) SelectVariants.remove_entries_with_filters(args.reference, args.input_vcf, args.output_vcf)
action="store", dest="max_memory_per_thread", default="1G", help="Maximum memory per thread. Default - 1G") args = parser.parse_args() if args.prepare_bam and ((not args.prepared_bam_prefix) or (not args.temp_dir)): raise ValueError( "Options -e/--prepared_bam_prefix and -m/--temp_dir must be set if -p/--prepare_bam option is used" ) SamtoolsV1.threads = args.threads if args.prepare_bam or args.mix_ends: FileRoutines.safe_mkdir(FileRoutines.check_path(args.temp_dir)) prepared_pe_bam_file = "%s.bam" % args.prepared_bam_prefix prepared_unpaired_bam_file = ( "%s.unpaired.bam" % args.prepared_bam_prefix) if args.mix_ends else None """ SamtoolsV1.prepare_bam_for_read_extraction(args.input, args.prepared_bam, temp_file_prefix=args.temp_dir, max_memory_per_thread=args.max_memory_per_thread) """ SamtoolsV1.prepare_bam_for_read_extraction( args.input, prepared_pe_bam_file, temp_file_prefix=args.temp_dir, max_memory_per_thread=args.max_memory_per_thread, bam_file_to_write_unpaired_reads=prepared_unpaired_bam_file) if args.paired:
def extract_sequences_by_clusters(self, dir_with_cluster_files, dir_with_sequence_files,
                                  output_dir, file_with_white_list_cluster_ids=None,
                                  mode="families", sequence_file_extension="fasta",
                                  sequence_file_format="fasta", label_species=False,
                                  separator_for_labeling="@", species_label_first=True):
    """
    Basenames of cluster and sequence files must be the same.

    mode: "families" - write the sequences of each cluster/family to a separate file,
          "species" - write the sequences of each species to a separate file
    """
    white_list_ids = None
    if file_with_white_list_cluster_ids:
        white_list_ids = IdSet()
        white_list_ids.read(file_with_white_list_cluster_ids)

    clusters_dict = self.read_cluster_files_from_dir(dir_with_cluster_files)
    cluster_names = self.get_cluster_names(clusters_dict, white_list_ids=white_list_ids)

    sequence_super_dict = OrderedDict()
    out_dir = FileRoutines.check_path(output_dir)

    for species in clusters_dict:
        idx_file = "%s_tmp.idx" % species
        sequence_file = "%s%s.%s" % (FileRoutines.check_path(dir_with_sequence_files),
                                     species, sequence_file_extension)
        sequence_super_dict[species] = SeqIO.index_db(idx_file, sequence_file,
                                                      format=sequence_file_format)

    if mode == "species":
        sequence_names = self.get_sequence_names(clusters_dict, write_ids=False,
                                                 out_prefix=None, white_list_ids=white_list_ids)
        for species in sequence_names:
            out_file = "%s%s.%s" % (out_dir, species, sequence_file_extension)
            SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_super_dict[species],
                                                                sequence_names[species]),
                        out_file, format=sequence_file_format)
    elif mode == "families":
        def per_family_record_generator(seq_super_dict, clust_dict, cluster_id):
            # Optionally prefix or suffix each record id with its species label.
            if species_label_first:
                label_sequence = lambda label, name: "%s%s%s" % (label, separator_for_labeling, name)
            else:
                label_sequence = lambda label, name: "%s%s%s" % (name, separator_for_labeling, label)

            for species in seq_super_dict:
                for record_id in clust_dict[species][cluster_id]:
                    if label_species:
                        record = deepcopy(seq_super_dict[species][record_id])
                        record.id = label_sequence(species, record_id)
                        yield record
                    else:
                        yield seq_super_dict[species][record_id]

        for cluster_name in cluster_names:
            out_file = "%s%s.%s" % (out_dir, cluster_name, sequence_file_extension)
            SeqIO.write(per_family_record_generator(sequence_super_dict, clusters_dict,
                                                    cluster_name),
                        out_file, format=sequence_file_format)

    for species in clusters_dict:
        os.remove("%s_tmp.idx" % species)
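# Usage sketch (hedged, hypothetical directories): cluster and sequence
# files must share basenames, e.g. clusters/cat.tab with sequences/cat.fasta.
#
#     routines.extract_sequences_by_clusters(
#         "clusters/", "sequences/", "family_fasta/", mode="families",
#         label_species=True, separator_for_labeling="@")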
parser.add_argument("--indel_InbreedingCoeff", action="store", dest="indel_InbreedingCoeff", type=float, default=-0.8, help="Indel InbreedingCoeff threshold. Default - -0.8") parser.add_argument("--indel_FS", action="store", dest="indel_FS", type=float, default=200.0, help="Indel FS threshold. Default - 200.0") args = parser.parse_args() VariantFiltration.jar_path = FileRoutines.check_path(args.gatk_dir) VariantFiltration.filter_bad_variants( args.reference, args.input_vcf, args.output_prefix, snp_filter_name=args.snp_filter_name, snp_QD=args.snp_QD, snp_FS=args.snp_FS, snp_MQ=args.snp_MQ, snp_HaplotypeScore=args.snp_HaplotypeScore, snp_MappingQualityRankSum=args.snp_MappingQualityRankSum, snp_ReadPosRankSum=args.snp_ReadPosRankSum, indel_filter_name=args.indel_filter_name, indel_QD=args.indel_QD, indel_ReadPosRankSum=args.indel_ReadPosRankSum,