def get_all_gene_names(in_files):
    """
    Collects the gene names shared by a set of htseq-count output files.

    Input: list of paths to htseq-count output files (the argument is
        iterated file by file; it is not a folder path)
    Output: list of unique gene names, in the order in which they first
        appear in the first file

    Lines starting with "__" are skipped (presumably the htseq-count summary
    rows -- confirm against the input format). The gene name is the first
    tab-separated field of every other line.

    The first file defines the reference set of gene names. The script
    exits with status 1 if a later file contains a gene name that is not in
    that set, or if the number of gene lines in any file differs from the
    number of unique gene names collected (which also catches duplicated
    gene names within a file).
    """
    genes_list = []
    # Mirror of genes_list used only for membership tests, so that each
    # lookup is O(1) instead of a scan over the whole list.
    seen_genes = set()
    for file_counter, in_file in enumerate(in_files):
        # NOTE(review): hpf.l presumably returns the file content as a list
        # of lines -- confirm against the hpf module.
        in_data = hpf.l(in_file)
        file_gene_counter = 0
        for line in in_data:
            if line.startswith("__"):
                continue
            gene_name = line.split("\t")[0]
            if gene_name not in seen_genes:
                if file_counter != 0:
                    # Only the first file is allowed to introduce new names.
                    sys.stderr.write(
                        "Error: gene name " + gene_name + " in file " + in_file
                        + " mismatches gene names in previously read file(s) in the same folder\n"
                    )
                    sys.exit(1)
                seen_genes.add(gene_name)
                genes_list.append(gene_name)
            file_gene_counter += 1
        if file_gene_counter != len(genes_list):
            sys.stderr.write(
                "Error: mismatch in the number of genes between input files (encountered when reading "
                + in_file + ")\n"
            )
            sys.exit(1)
    return genes_list
def main(input_vcf_file, out_path, window_step, exclude_samples):
    """
    Counts non-reference calls per sample in coordinate windows of a VCF
    file and writes the resulting table to a CSV file.

    Input:
        input_vcf_file: path to the VCF file
        out_path: path of the CSV file to write
        window_step: window size; a record at coordinate c falls into
            window floor(c / window_step)
        exclude_samples: space-separated sample names to leave out, or None
    Output: none (the table is written to out_path; columns are the windows,
        labelled "bin_" + zero-padded window index, and rows are the samples)

    A sample entry is counted when it is not the literal ".:0,0:0:.:0,0" and
    its first colon-separated field is neither "0" nor ".".
    """
    excluded = exclude_samples.split(" ") if exclude_samples is not None else []

    # Keep the "#CHROM" header line and all records; drop the other "#" lines.
    # NOTE(review): hpf.l presumably returns the file as a list of lines.
    vcf_lines = [
        row for row in hpf.l(input_vcf_file)
        if row.startswith("#CHROM") or not row.startswith("#")
    ]

    # Sample names are the header columns from the 10th column onwards.
    sample_names = vcf_lines[0].split("\t")[9:]

    counts_dict = defaultdict(dict)
    for record in vcf_lines[1:]:
        fields = record.split()
        sample_entries = fields[9:]
        window_index = math.floor(int(fields[1]) / window_step)
        window_label = "bin_" + str(window_index).zfill(3)
        for column, sample_name in enumerate(sample_names):
            if sample_name in excluded:
                continue
            sample_entry = sample_entries[column]
            window_counts = counts_dict[window_label]
            # Every non-excluded sample gets an entry, even with zero calls.
            if sample_name not in window_counts:
                window_counts[sample_name] = 0
            if sample_entry == ".:0,0:0:.:0,0":
                continue
            allele = sample_entry.split(":")[0]
            if allele not in ("0", "."):
                window_counts[sample_name] += 1

    pd.DataFrame(counts_dict).to_csv(out_path)
def main(in_folder, out_path):
    """
    Merges the htseq-count output files of a folder into one table.

    Input:
        in_folder: folder that is searched for files via
            hpf.get_file_paths(in_folder, "txt")
        out_path: path of the tab-separated table to write
    Output: none (the table is written to out_path; one column per sample,
        one row per gene)

    Exits with status 1 if no input files are found. Lines starting with
    "__" are skipped; for all other lines the first tab-separated field is
    the gene name and the second is the integer count.
    """
    in_files = hpf.get_file_paths(in_folder, "txt")
    if len(in_files) == 0:
        sys.stderr.write(
            "Error: no files with .txt extension was found in the input folder\n"
        )
        sys.exit(1)

    gene_names = get_all_gene_names(in_files)

    counts_by_sample = {}
    for in_file in in_files:
        sample_name = get_sample_name_from_file_name(in_file)
        # Start every gene at zero so that all samples share the same keys.
        sample_counts = dict.fromkeys(gene_names, 0)
        for line in hpf.l(in_file):
            if line.startswith("__"):
                continue
            fields = line.split("\t")
            read_count = int(fields[1])
            sample_counts[fields[0]] = read_count
        counts_by_sample[sample_name] = sample_counts

    merged_table = pd.DataFrame(counts_by_sample)
    merged_table.to_csv(out_path, sep="\t", quoting=csv.QUOTE_NONE)
def load_dn_data(dn_path):
    """
    Loads the dN results table.

    Input: path to the dN results table
    Output: list of the table lines, without the first (header) line and
        without the lines whose second whitespace-separated field is "NA"
    """
    # NOTE(review): hpf.l presumably returns the file as a list of lines.
    table_lines = hpf.l(dn_path)
    kept_lines = []
    for table_line in table_lines[1:]:
        if table_line.split()[1] == "NA":
            continue
        kept_lines.append(table_line)
    return kept_lines
def write_temp_alignment_file(alignment_file_path, temp_folder):
    """
    Writes a copy of an alignment file to the temporary files folder.

    Input:
        alignment_file_path: path to the FASTA alignment file
        temp_folder: folder in which "temp_seq.fa" is written
    Output: none (the file temp_folder + "/temp_seq.fa" is written)

    Header lines (starting with ">") are truncated at the first underscore;
    all other lines are written unchanged.
    """
    # NOTE(review): hpf.l presumably returns the lines without their line
    # endings, since a newline is appended to each line on output.
    fasta_lines = hpf.l(alignment_file_path)
    with open(temp_folder + "/temp_seq.fa", "w") as out_file:
        for fasta_line in fasta_lines:
            is_header = fasta_line.startswith(">")
            out_text = fasta_line.split("_")[0] if is_header else fasta_line
            out_file.write(out_text + "\n")
def main(input_vcf_file, genome_size, exclude_samples):
    """
    Prints the number of non-reference calls per 10 kb for every sample of a
    VCF file, followed by the mean, median and standard deviation across
    the samples.

    Input:
        input_vcf_file: path to the VCF file
        genome_size: genome size used to scale the counts to "per 10 kb"
        exclude_samples: space-separated sample names to leave out of the
            report, or None
    Output: none (the report is printed to standard output)

    A sample entry is counted when it is not the literal ".:0,0:0:.:0,0" and
    its first colon-separated field is neither "0" nor ".". Excluded samples
    are still counted but are left out of the printed report and statistics.
    """
    excluded = [] if exclude_samples is None else exclude_samples.split(" ")

    # Keep the "#CHROM" header line and all records; drop the other "#" lines.
    # NOTE(review): hpf.l presumably returns the file as a list of lines.
    vcf_lines = [
        row for row in hpf.l(input_vcf_file)
        if row.startswith("#CHROM") or not row.startswith("#")
    ]

    # Sample names are the header columns from the 10th column onwards.
    sample_names = vcf_lines[0].split("\t")[9:]
    counts_dict = {name: 0 for name in sample_names}

    print("Sample\tMean_nr_of_variants_per_10_kb")
    for record in vcf_lines[1:]:
        sample_entries = record.split()[9:]
        for column, sample_name in enumerate(sample_names):
            sample_entry = sample_entries[column]
            if sample_entry == ".:0,0:0:.:0,0":
                continue
            allele = sample_entry.split(":")[0]
            if allele not in ("0", "."):
                counts_dict[sample_name] += 1

    freq_list = []
    for sample, sample_count in counts_dict.items():
        if sample in excluded:
            continue
        sample_freq = sample_count / (genome_size / 10000)
        print(sample + "\t" + str(sample_freq))
        freq_list.append(sample_freq)

    print("-----")
    print("Averaged numbers across all samples")
    print("Mean number of variants per 10 kb:", np.mean(freq_list))
    print("Median number of variants per 10 kb:", np.median(freq_list))
    print("Standard deviation of the number of variants per 10 kb:", np.std(freq_list))
def main(in_path, out_folder, fasta_path, deselected_scaffolds_path):
    """
    Splits the first record of an EMBL file into one EMBL file per scaffold.

    Input:
        in_path: path to the input EMBL file
        out_folder: folder for the output files (created if missing)
        fasta_path: path to a FASTA file used to derive the scaffold
            coordinates; if "", the coordinates are derived from the source
            features of the EMBL file instead
        deselected_scaffolds_path: path to a file listing scaffold names to
            skip, or "" to process all scaffolds
    Output: none (one file out_folder/<scaffold id>.embl is written per
        selected scaffold)

    Exits with status 1 if the output folder cannot be created.
    """
    if fasta_path == "":
        coords_df = get_scaffold_coords_by_source_features(in_path)
    else:
        coords_df = get_scaffold_coords_by_fasta(in_path, fasta_path)

    deselected_scaffolds = []
    if deselected_scaffolds_path != "":
        # NOTE(review): hpf.l presumably returns the file as a list of lines,
        # i.e. one scaffold name per line -- confirm against the hpf module.
        deselected_scaffolds = hpf.l(deselected_scaffolds_path)

    records = list(SeqIO.parse(in_path, "embl"))

    # Create the folder without going through a shell, so that folder names
    # containing spaces or shell metacharacters are handled correctly.
    try:
        os.makedirs(out_folder, exist_ok=True)
    except OSError:
        sys.stderr.write(
            "Error occurred when checking for the presence of output folder or creating the output folder ("
            + out_folder + ")\n")
        sys.exit(1)

    # The sequence of the first record is converted to a string only once,
    # on first use, instead of once per scaffold.
    union_seq = None
    for row_index in range(coords_df.shape[0]):
        coords_df_entry = coords_df.iloc[row_index]
        scaff_name = coords_df_entry["header"]
        scaff_id = coords_df_entry["id"]
        if scaff_name in deselected_scaffolds:
            continue

        query_start_coord = int(coords_df_entry.start_coord)
        query_end_coord = int(coords_df_entry.end_coord)
        out_path = out_folder + "/" + scaff_id + ".embl"

        if union_seq is None:
            union_seq = str(records[0].seq)
        # NOTE(review): if end_coord is a 1-based inclusive coordinate, this
        # slice leaves out the last base of the scaffold -- confirm against
        # the functions that build coords_df.
        seq = union_seq[query_start_coord - 1:query_end_coord - 1]

        my_sequence_record = SeqRecord(
            Seq(seq),
            id=scaff_id,
            name=scaff_name,
            description="unknown_description",
            dbxrefs=[])
        my_sequence_record.seq.alphabet = generic_dna
        my_sequence_record.accession = "unknown_accession"
        my_sequence_record = process_record_features(
            records, coords_df, query_start_coord, query_end_coord,
            my_sequence_record)
        SeqIO.write(my_sequence_record, out_path, "embl")
def load_pfam_domains(pfam_domains_path):
    """
    Loads a table where column 1 contains Hepatocystis gene names and
    column 2 contains the PFAM domains in the corresponding genes.

    Input: path to the table. The first line is treated as a header and
        skipped. Fields are split on any whitespace; column 2 is either "NA"
        or a comma-separated list of PFAM domains.
    Output: tuple (domains_list, pfam_domains_dict)
        domains_list: list of the unique PFAM domains, in order of first
            appearance
        pfam_domains_dict: dictionary where the keys are gene names and the
            values are the lists of PFAM domains of the genes (genes with
            "NA" in column 2 are not included)
    """
    domains_list = []
    # Mirror of domains_list used only for membership tests, so that each
    # lookup is O(1) instead of a scan over the whole list.
    seen_domains = set()
    pfam_domains_dict = {}
    # NOTE(review): hpf.l presumably returns the file as a list of lines.
    domains_data = hpf.l(pfam_domains_path)
    for line in domains_data[1:]:
        split_line = line.split()
        pfam_domain_entry = split_line[1]
        if pfam_domain_entry == "NA":
            continue
        pfam_domains = pfam_domain_entry.split(",")
        pfam_domains_dict[split_line[0]] = pfam_domains
        for pfam_domain in pfam_domains:
            if pfam_domain not in seen_domains:
                seen_domains.add(pfam_domain)
                domains_list.append(pfam_domain)
    return domains_list, pfam_domains_dict
def main(alignments_folder, output_folder, temp_folder, treefile_path):
    """
    Runs codeml on every alignment file of a folder and collects the results.

    Input:
        alignments_folder: folder with the alignment files
        output_folder: folder for the codeml result files (created if
            missing)
        temp_folder: folder for temporary files (created if missing); the
            working directory of the process is changed to this folder
        treefile_path: path to the tree file used for all alignments
    Output: none (for every alignment file, the codeml output is copied to
        output_folder as <alignment file name up to ".fa">_codeml.txt)

    If codeml produces no output file for an alignment, a warning is written
    to standard error and the alignment is skipped.
    """
    import shutil
    import sys

    # NOTE(review): hpf.l presumably returns the file as a list of lines.
    treefile_content = hpf.l(treefile_path)

    # Resolve the folders before changing the working directory: relative
    # paths would otherwise be interpreted relative to temp_folder after
    # the os.chdir call below.
    alignments_folder = os.path.abspath(alignments_folder)
    output_folder = os.path.abspath(output_folder)
    temp_folder = os.path.abspath(temp_folder)

    os.makedirs(temp_folder, exist_ok=True)
    os.makedirs(output_folder, exist_ok=True)
    # codeml is run from inside the temporary folder, where its control
    # file and input files are written.
    os.chdir(temp_folder)

    temp_out_path = os.path.join(temp_folder, "temp_out.txt")
    for alignment_file in os.listdir(alignments_folder):
        alignment_file_path = os.path.join(alignments_folder, alignment_file)
        write_temp_alignment_file(alignment_file_path, temp_folder)
        write_codeml_ctl_file("temp_seq.fa", "temp_tree.treefile", "temp_out.txt", temp_folder)
        write_temp_treefile(treefile_content, temp_folder)

        # Remove the output of the previous run, so that a failed run cannot
        # lead to stale results being stored under this alignment's name.
        if os.path.exists(temp_out_path):
            os.remove(temp_out_path)

        os.system("codeml codeml.ctl")

        if not os.path.exists(temp_out_path):
            sys.stderr.write(
                "Warning: codeml produced no output file for " + alignment_file
                + ", skipping\n")
            continue

        results_file_name = alignment_file.split(".fa")[0] + "_codeml.txt"
        results_file_path = os.path.join(output_folder, results_file_name)
        shutil.copy(temp_out_path, results_file_path)
def extract_sequences_from_fasta_by_id(args):
    """
    Function for extracting sequences from a FASTA file by their names.
    The sequence names are truncated at the first whitespace before they are
    compared to the query string. The function allows extracting sequences
    based on 1 query string and also reading a list of query strings from a
    text file. There is an 'invert' mode to extract all sequences that do
    not match the query string(s).

    Input: args, an object with the following attributes
        string_query: if True, args.query is the query string itself;
            otherwise args.query is the path to a file with query strings
        query: the query string or the path to the file with query strings
        fasta_data: iterable of (header, sequence) pairs
        invert: if False, the matching sequences are extracted; otherwise
            the non-matching sequences are extracted
    Output: none (the selected sequences are passed to print_header_and_seq)
    """
    # The explicit comparisons with True and False are kept on purpose, so
    # that values other than real booleans are treated exactly as before.
    if args.string_query == True:
        selected_seq_ids = {args.query}
    else:
        # NOTE(review): hpf.l presumably returns the file as a list of lines,
        # i.e. one query string per line -- confirm against the hpf module.
        selected_seq_ids = set(hpf.l(args.query))
    invert_mode = not (args.invert == False)

    # The queries are held in a set, so that each lookup is O(1) instead of
    # a scan over the whole list of queries for every sequence.
    for header, seq in args.fasta_data:
        fasta_seq_id = header.split()[0]
        is_match = fasta_seq_id in selected_seq_ids
        # Output the matches in normal mode and the non-matches in invert mode.
        if is_match != invert_mode:
            print_header_and_seq(header, seq)