def add_bc_umi_to_sam(path_to_bc, path_to_umi, sam_in_path, sam_out_path, file_name, cbc_range, umi_range,
                      umi_cbc_file_type="txt"):
    """Copy an alignment file, tagging every read with its barcode and UMI.

    The i-th read of the input alignment receives an "XC" tag taken from the
    i-th barcode entry and an "XM" tag taken from the i-th UMI entry.

    :param path_to_bc: file holding one cellular-barcode entry per read
    :param path_to_umi: file holding one UMI entry per read
    :param sam_in_path: alignment file to read (opened in mode "r")
    :param sam_out_path: output directory; "/" + file_name is appended to it
    :param file_name: name of the output file inside sam_out_path
    :param cbc_range: (start, end) of the barcode, 1-based and inclusive
    :param umi_range: (start, end) of the UMI, 1-based and inclusive
    :param umi_cbc_file_type: file type handed to read_from_file for both lists
    :raises IndexError: if either list has fewer entries than there are reads
    """
    sam_out_path += "/" + file_name
    # Convert the 1-based inclusive ranges into Python slices once.
    cbc_slice = slice(cbc_range[0] - 1, cbc_range[1])
    umi_slice = slice(umi_range[0] - 1, umi_range[1])
    i = 0
    # Context managers guarantee both handles are closed (and the output is
    # flushed) even when tagging fails part-way through.
    with pysam.AlignmentFile(sam_in_path, "r") as sam_in, \
            pysam.AlignmentFile(sam_out_path, "w", template=sam_in) as sam_out:
        bc_list = read_from_file(file_type=umi_cbc_file_type, input_file=path_to_bc)
        umi_list = read_from_file(file_type=umi_cbc_file_type, input_file=path_to_umi)
        for read in sam_in.fetch():
            read.set_tag("XC", bc_list[i][cbc_slice])
            read.set_tag("XM", umi_list[i][umi_slice])
            sam_out.write(read)
            i += 1
    print("Number of reads in .bam file: " + str(i))
def write_gene_names_to_file(path_to_sam, accepted_function, out_dir, file_name):
    """Write one gene name (or 'no_gene') per read to "<out_dir>/<file_name>.txt".

    Reads are loaded from path_to_sam and ordered with sort_sam_built_in. A
    read contributes its 'gn' tag only when that tag names exactly one gene
    and at least one value of its 'gf' tag is an accepted function; every
    other read is written as 'no_gene'.

    :param path_to_sam: alignment file to read
    :param accepted_function: comma-separated list of accepted 'gf' values
    :param out_dir: directory the text file is written to
    :param file_name: output file name without the ".txt" extension
    """
    read_list = read_from_file(file_type="sam", input_file=path_to_sam)
    sorted_sam = sort_sam_built_in(read_list)
    output = out_dir + "/" + file_name
    accepted_functions = accepted_function.split(",")
    total = 0
    accepted_genes = 0
    # The with-block closes the file even if a read raises while being inspected.
    with open(output + ".txt", "w") as handler:
        for entry in sorted_sam:
            total += 1
            gene = 'no_gene'
            if entry.has_tag('gn'):  # did this read align to an annotated gene?
                gene_tag = entry.get_tag('gn')
                gene_functions = entry.get_tag('gf').split(',')
                # some gn tags have two genes; those are rejected, and
                # intronic mappings can be ignored via accepted_function
                single_gene = len(gene_tag.split(',')) < 2
                if single_gene and not set(gene_functions).isdisjoint(accepted_functions):
                    gene = gene_tag
                    accepted_genes += 1
            handler.write(gene)
            handler.write("\n")
    print("accepted reads: " + str(accepted_genes))
    print("total reads: " + str(total))
def read_in_clusters(input):
    """Read a text file and split each entry on tab characters.

    :param input: path handed to read_from_file (file type "txt")
    :return: list with one list of tab-separated fields per entry
    """
    rows = []
    for line in read_from_file(input_file=input, file_type="txt"):
        rows.append(line.split("\t"))
    return rows
def read_in_clusters(path_to_cluster_file):
    """Build one Cluster object per entry of a tab-separated cluster file.

    Each entry is split on tabs. An entry with exactly three fields is passed
    to Cluster.add_reads as (bc_seq, cluster_size, gene_umi_list); any other
    entry only contributes its first field, with a size of 0 and an empty
    gene/UMI list.

    :param path_to_cluster_file: path handed to read_from_file (file type "txt")
    :return: list of populated Cluster objects
    """
    raw_lines = read_from_file(input_file=path_to_cluster_file, file_type="txt")
    cluster_count = get_number_of_clusters(raw_lines)
    print("number of clusters: " + str(cluster_count))

    clusters = [Cluster() for _ in range(cluster_count)]
    print("empty clusters formed")

    for index, cluster in enumerate(clusters):
        fields = raw_lines[index].split('\t')
        if len(fields) != 3:
            # Malformed or size-less entry: keep the barcode only.
            cluster.add_reads(bc_seq=fields[0], cluster_size=0, gene_umi_list="")
            continue
        cluster.add_reads(bc_seq=fields[0], cluster_size=fields[1], gene_umi_list=fields[2])
    print("clusters populated")
    return clusters
def get_bcs_umis_queryname(path_to_bc_reads, mode="SPLiT"):
    """Split every barcode read into its cellular barcode and its UMI.

    Layout per mode (0-based, end-exclusive positions in the read):

    * "SPLiT": UMI = [0:10]; barcode = [10:18] + [48:56] + [86:94]
    * "10X":   barcode = [0:16]; UMI = [16:]

    The same positions are applied to the quality string.

    :param path_to_bc_reads: FASTQ file handed to read_from_file ("fastq_all");
        each record is indexed as (query_name, sequence, quality)
    :param mode: "SPLiT" or "10X"
    :return: (bc_list, umi_list), where bc_list[i] is
        [query_name, barcode_seq, barcode_qual] and umi_list[i] is
        [query_name, umi_seq, umi_qual]
    :raises ValueError: if mode is not one of the supported layouts
    """
    # Reject unknown modes before reading the file; previously they produced
    # two lists filled with None and no error.
    if mode not in ("SPLiT", "10X"):
        raise ValueError("unsupported mode: " + repr(mode) + " (expected 'SPLiT' or '10X')")
    split_seq_layout = mode == "SPLiT"

    bc_reads_list = read_from_file(input_file=path_to_bc_reads, file_type="fastq_all")
    bc_list = []
    umi_list = []
    for read in bc_reads_list:
        query_name, seq, qual = read[0], read[1], read[2]
        if split_seq_layout:
            bc = seq[10:18] + seq[48:56] + seq[86:94]
            bc_qual = qual[10:18] + qual[48:56] + qual[86:94]
            umi = seq[0:10]
            umi_qual = qual[0:10]
        else:
            bc = seq[0:16]
            bc_qual = qual[0:16]
            umi = seq[16:]
            umi_qual = qual[16:]
        bc_list.append([query_name, bc, bc_qual])
        umi_list.append([query_name, umi, umi_qual])
    return bc_list, umi_list
def make_barcode_combinations(bc1_path, bc2_path, bc3_path):
    """Return every concatenation of a round-1, round-2 and round-3 barcode.

    Each file is read as "txt" and passed through extract_barcodes; the third
    file is extracted with True as the second positional argument.

    :param bc1_path: path to the first barcode file
    :param bc2_path: path to the second barcode file
    :param bc3_path: path to the third barcode file
    :return: list of bc1 + bc2 + bc3, with bc1 varying slowest and bc3 fastest
    """
    first_round = extract_barcodes(read_from_file(input_file=bc1_path, file_type="txt"))
    second_round = extract_barcodes(read_from_file(input_file=bc2_path, file_type="txt"))
    third_round = extract_barcodes(read_from_file(input_file=bc3_path, file_type="txt"), True)
    return [
        first + second + third
        for first in first_round
        for second in second_round
        for third in third_round
    ]
def select_bcs_umis_gens(path_to_umis_fastq, path_to_gen_fastq, aligned_sel_bcs):
    """Zero out reads whose UMI is low quality or whose barcode was rejected.

    A position is rejected when its UMI quality string has more than one
    low-quality base (per get_number_of_low_quality_bases) or when its entry
    in aligned_sel_bcs is already 0. Rejected positions become 0 in both
    aligned_sel_bcs and the gene list; accepted positions keep their values
    and get their UMI sequence recorded.

    Note that aligned_sel_bcs is modified in place and also returned.

    :param path_to_umis_fastq: FASTQ file with the UMI reads
    :param path_to_gen_fastq: FASTQ file with the gene reads
    :param aligned_sel_bcs: per-read barcode selection (0 marks a rejected read)
    :return: (aligned_sel_bcs, selected UMIs, gene reads), index-aligned
    """
    # read in umis and genes
    umi_records = read_from_file(input_file=path_to_umis_fastq, file_type="fastq_all")
    gene_records = read_from_file(input_file=path_to_gen_fastq, file_type="fastq_all")

    # 0 stays in place for every rejected read
    kept_umis = [0] * len(aligned_sel_bcs)
    for idx, umi_record in enumerate(umi_records):
        low_quality_count = get_number_of_low_quality_bases(umi_record[2])
        if low_quality_count > 1 or aligned_sel_bcs[idx] == 0:
            aligned_sel_bcs[idx] = 0
            gene_records[idx] = 0
            continue
        kept_umis[idx] = umi_record[1]
    return aligned_sel_bcs, kept_umis, gene_records
def barcodes_txt_to_FASTA(input_file_destination, output_directory, output_file_name):
    """Strip the oligo modification from a barcode list and write the result.

    type_remove decides which modification the list carries; "5Biosg" and
    "5Phos" are handled by their respective remove_* helper. For any other
    value an empty list is written.

    :param input_file_destination: barcode text file to read
    :param output_directory: directory handed to write_to_txt
    :param output_file_name: file name handed to write_to_txt
    """
    raw_barcodes = read_from_file(file_type="txt", input_file=input_file_destination)
    modification = type_remove(raw_barcodes)
    if modification == "5Biosg":
        cleaned = remove_5Biosg(raw_barcodes)
    elif modification == "5Phos":
        cleaned = remove_5Phos(raw_barcodes)
    else:
        cleaned = []
    write_to_txt(cleaned, output_directory, output_file_name)
def get_bcs_from_fastq(path_to_barcode_fastq):
    """Extract the combined three-part barcode from every barcode read.

    The barcode is the concatenation of positions [10:18], [48:56] and [86:94]
    of the read; the same positions of the quality string are concatenated and
    converted with QSanger_to_Phred33.

    :param path_to_barcode_fastq: FASTQ file handed to read_from_file ("fastq_all")
    :return: list of (read_name, barcode_sequence, converted_quality) tuples
    """
    def _barcode_part(text):
        # Join the three barcode windows of a sequence or quality string.
        return text[10:18] + text[48:56] + text[86:94]

    records = read_from_file(input_file=path_to_barcode_fastq, file_type="fastq_all")
    return [
        (record[0], _barcode_part(record[1]), QSanger_to_Phred33(_barcode_part(record[2])))
        for record in records
    ]
def merge_fastq(input_directories, out_dir, output_file_name):
    """Append the contents of several FASTQ files to one output file.

    All inputs are read first; only then are they written, in input order,
    through write_to_fastq in "append" mode.

    :param input_directories: value resolved by get_input_locations into the
        list of FASTQ files to merge
    :param out_dir: directory handed to write_to_fastq
    :param output_file_name: file name handed to write_to_fastq
    """
    locations = get_input_locations(input_directories)
    # Load every input before writing anything.
    loaded = [
        read_from_file(input_file=location, file_type='fastq_all')
        for location in locations
    ]
    for records in loaded:
        write_to_fastq(records, out_dir, output_file_name, mode="append")
def main(cmd_args):
    """Load barcode, UMI and gene-name files and build the cluster/UMI file.

    :param cmd_args: mapping with the keys 'cbc_clusters', 'umi_clusters',
        'gene_names' (input text files), 'file_name' and 'out_dir' (output)
    """
    def _load(key):
        # Read the text file whose path is stored under `key`.
        return read_from_file(input_file=cmd_args[key], file_type="txt")

    barcodes = _load('cbc_clusters')
    umis = _load('umi_clusters')
    genes = _load('gene_names')

    target_name = cmd_args["file_name"]
    target_dir = cmd_args["out_dir"]
    construct_cluster_umi_file(barcodes, umis, genes, out_dir=target_dir, file_name=target_name)
def get_aligned_selected_reads(path_to_sam_file, barcode_comb_list):
    '''
    Map every read of an alignment file to the barcode combination it aligned to.

    A read is selected when its first element carries an "AS" tag and its 'NM'
    tag is below 1. Selected reads are replaced by the entry of
    barcode_comb_list at the read's reference index (tid).

    :param path_to_sam_file: alignment file handed to read_from_file ("sam")
    :param barcode_comb_list: barcode combinations, indexed by reference id
    :return: a list with one entry per read: the matching barcode combination
             for selected reads and 0 for all others.
    '''
    def _selected_barcode(record):
        # record[0] is the aligned segment inspected for tags and tid.
        segment = record[0]
        if segment.has_tag("AS") and segment.get_tag('NM') < 1:
            return barcode_comb_list[segment.tid]
        return 0

    return [
        _selected_barcode(record)
        for record in read_from_file(file_type="sam", input_file=path_to_sam_file)
    ]
handler.write("\n") handler.write(str(comb)) handler.write("\n") chromosome += 1 def main(cmd_args): bc1_path = cmd_args["bc1"] bc2_path = cmd_args["bc2"] bc3_path = cmd_args["bc3"] out_put_dir = cmd_args["out_dir"] barcode_combinations = make_barcode_combinations(bc1_path, bc2_path, bc3_path) write_to_txt(barcode_combinations, out_put_dir, "barcode_combinations") bc_combinations_to_fasta(barcode_combinations, out_put_dir, "barcode_combinations") # if __name__ == "__main__": # main(get_cmd_args()) #### just for one time use bc_path = "/Users/manuel/Desktop/10xv2_whitelist.txt" bcs = read_from_file(input_file=bc_path, file_type="txt") out_dir = "/Users/manuel/Desktop/" filename = "10X_bc_combinations" bc_combinations_to_fasta(barcode_combinations=bcs, out_put_dir=out_dir, out_filename=filename)
# This script can be used to extract the barcode or UMI from the original oligo
from create_bc_comb import extract_barcodes
from tools.file_input_output import read_from_file, write_to_txt


def _isolate(oligo_path, target_dir, target_name, **extract_options):
    # Read one oligo file, extract its barcodes and write them to target_dir.
    cleaned = read_from_file(input_file=oligo_path, file_type="txt")
    extracted = extract_barcodes(cleaned, **extract_options)
    write_to_txt(extracted, target_dir, target_name)
    return cleaned, extracted


# round 1
bc1_path = "/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper/r1barcodes.txt"
out_put_dir = "/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper"
bc1_cleaned, bc1_extracted = _isolate(bc1_path, out_put_dir, "bc1_isolated")

# round 2
bc2_path = "/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper/r2barcodes.txt"
out_put_dir = "/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper"
bc2_cleaned, bc2_extracted = _isolate(bc2_path, out_put_dir, "bc2_isolated")

# round 3 (extracted with UMI=True)
bc3_path = "/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper/r3barcodes.txt"
out_put_dir = "/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper"
bc3_cleaned, bc3_extracted = _isolate(bc3_path, out_put_dir, "bc3_isolated", UMI=True)
def same_dimension(path_to_bc, path_to_umi, sam_in_path):
    """Load the barcode and UMI text files named by the first two arguments.

    NOTE(review): the visible body stops after loading both lists. Nothing is
    compared or returned and sam_in_path is never used, so the rest of the
    function may be missing from this view. Confirm against the full file.

    :param path_to_bc: text file read into bc_list
    :param path_to_umi: text file read into umi_list
    :param sam_in_path: not used by the visible code
    """
    # Both files go through the shared reader with file type "txt".
    bc_list = read_from_file(file_type="txt", input_file=path_to_bc)
    umi_list = read_from_file(file_type="txt", input_file=path_to_umi)
# import numpy as np # import gc # import sys # from scipy.sparse import csr_matrix # A = [[1, 0, 0, 1, 0, 0], [0, 0, 2, 0, 0, 1], [0, 0, 0, 2, 0, 0]] # S = csr_matrix(A) # print(sys.getsizeof(A)) # print(sys.getsizeof(S)) # del A # process = psutil.Process(os.getpid()) # print("The memory usage was: " + str(process.memory_info().rss/1000000000) + " GB") from tools.file_input_output import read_from_file sam_file = read_from_file( input_file="/Users/manuel/Desktop/bowtie_strategy/star_aligned.sam", file_type="sam") # test uniqueness # read_names = [] # for read in sam_file: # read_names.append(read[1].split(".")[1]) # import numpy as np # # print(np.unique(read_names).size) total = 0 have_both = 0 nh = 0 for read in sam_file: