示例#1
0
def add_bc_umi_to_sam(path_to_bc,
                      path_to_umi,
                      sam_in_path,
                      sam_out_path,
                      file_name,
                      cbc_range,
                      umi_range,
                      umi_cbc_file_type="txt"):
    """
    Copy an alignment file and tag every read with its barcode and UMI.

    The i-th read returned by the input file is paired with the i-th entry of
    the barcode list and of the UMI list. The selected part of each entry is
    stored in the read's "XC" (barcode) and "XM" (UMI) tag before the read is
    written to the output file.

    :param path_to_bc: file with the barcode entries, one per read
    :param path_to_umi: file with the UMI entries, one per read
    :param sam_in_path: alignment file to read (opened with pysam mode "r")
    :param sam_out_path: directory the output file is written to
    :param file_name: name of the output file inside sam_out_path
    :param cbc_range: (start, end) of the barcode inside a barcode entry;
        start is 1-based and end is inclusive
    :param umi_range: (start, end) of the UMI inside a UMI entry, same
        convention as cbc_range
    :param umi_cbc_file_type: file type handed to read_from_file for both the
        barcode and the UMI file
    :raises IndexError: if the alignment file holds more reads than there are
        barcode or UMI entries
    """
    sam_out_path += "/" + file_name

    # Convert the 1-based, end-inclusive ranges to Python slices once.
    bc_slice = slice(cbc_range[0] - 1, cbc_range[1])
    umi_slice = slice(umi_range[0] - 1, umi_range[1])

    # The context managers guarantee that both files are closed (and the
    # output flushed) even if tagging fails part-way through.
    with pysam.AlignmentFile(sam_in_path, "r") as sam_in, \
            pysam.AlignmentFile(sam_out_path, "w", template=sam_in) as sam_out:
        bc_list = read_from_file(file_type=umi_cbc_file_type,
                                 input_file=path_to_bc)
        umi_list = read_from_file(file_type=umi_cbc_file_type,
                                  input_file=path_to_umi)

        n_reads = 0
        for idx, read in enumerate(sam_in.fetch()):
            read.set_tag("XC", bc_list[idx][bc_slice])
            read.set_tag("XM", umi_list[idx][umi_slice])
            sam_out.write(read)
            n_reads = idx + 1

    print("Number of reads in .bam file: " + str(n_reads))
示例#2
0
def write_gene_names_to_file(path_to_sam, accepted_function, out_dir, file_name):
    """
    Write one line per read to ``<out_dir>/<file_name>.txt``.

    The line holds the read's 'gn' tag when that tag names exactly one gene
    (no comma) and at least one value of the read's 'gf' tag is listed in
    accepted_function. Every other read, including reads without a 'gn' tag,
    is written as 'no_gene'.

    :param path_to_sam: alignment file read through read_from_file
        (file_type "sam")
    :param accepted_function: comma-separated 'gf' values that are accepted
    :param out_dir: directory of the output file
    :param file_name: output file name without the ".txt" extension
    """
    read_list = read_from_file(file_type="sam", input_file=path_to_sam)
    sorted_sam = sort_sam_built_in(read_list)

    output = out_dir + "/" + file_name
    accepted_functions = accepted_function.split(",")

    total = 0
    accepted_genes = 0

    # 'with' closes the file even if a tag lookup raises inside the loop.
    with open(output + ".txt", "w") as handler:
        for entry in sorted_sam:
            total += 1
            label = 'no_gene'
            if entry.has_tag('gn'):  # did this read align anywhere in the genome?
                gene_tag = entry.get_tag('gn')
                gene_names = gene_tag.split(',')
                gene_function = entry.get_tag('gf').split(',')
                # some gn tags have two genes; those reads and reads whose
                # function is not accepted (e.g. intronic mappings) are ignored
                has_accepted_function = bool(
                    set(gene_function).intersection(accepted_functions))
                if has_accepted_function and len(gene_names) < 2:
                    label = gene_tag
                    accepted_genes += 1
            handler.write(label + "\n")

    print("accepted reads: " + str(accepted_genes))
    print("total reads: " + str(total))
def read_in_clusters(input):
    '''
    Read a text file and split each of its entries on tab characters.

    :param input: path handed to read_from_file as input_file
    :return: list holding, for every entry, the list of its tab-separated fields
    '''
    rows = []
    for line in read_from_file(file_type="txt", input_file=input):
        rows.append(line.split("\t"))
    return rows
def read_in_clusters(path_to_cluster_file):
    """
    Build one Cluster object per cluster listed in the cluster file.

    The number of clusters comes from get_number_of_clusters. The i-th entry
    of the file is split on tabs; an entry with exactly three fields fills
    the i-th cluster with (bc_seq, cluster_size, gene_umi_list), any other
    entry registers only its first field with size 0 and an empty list.

    :param path_to_cluster_file: path handed to read_from_file (file_type "txt")
    :return: list of populated Cluster objects
    """
    raw_lines = read_from_file(file_type="txt", input_file=path_to_cluster_file)
    n_clusters = get_number_of_clusters(raw_lines)
    print("number of clusters: " + str(n_clusters))

    clusters = [Cluster() for _ in range(n_clusters)]
    print("empty clusters formed")

    for idx, cluster in enumerate(clusters):
        fields = raw_lines[idx].split('\t')
        if len(fields) != 3:
            # incomplete entry: keep the barcode, mark the cluster as empty
            cluster.add_reads(bc_seq=fields[0], cluster_size=0, gene_umi_list="")
            continue
        cluster.add_reads(bc_seq=fields[0], cluster_size=fields[1], gene_umi_list=fields[2])
    print("clusters populated")

    return clusters
示例#5
0
def get_bcs_umis_queryname(path_to_bc_reads, mode="SPLiT"):
    """
    Read the barcode-read FASTQ file and split every read into its cellular
    barcode and its UMI.

    Positions below are 0-based. For mode "SPLiT" the UMI is bases 0-9 and
    the cellular barcode is the concatenation of the three barcodes found at
    bases 10-17, 48-55 and 86-93. For mode "10X" the barcode is bases 0-15
    and the UMI is the remainder of the read. The quality string is cut with
    the same coordinates as the sequence.

    :param path_to_bc_reads: path handed to read_from_file
        (file_type "fastq_all"); every record is indexed as
        [0] query name, [1] sequence, [2] quality
    :param mode: read layout, either "SPLiT" or "10X"
    :return: (bc_list, umi_list) with one entry per read:
        bc_list[i] = [query_name, barcode_seq, barcode_qual] and
        umi_list[i] = [query_name, umi_seq, umi_qual]
    :raises ValueError: if mode is neither "SPLiT" nor "10X"
    """
    # Fail early instead of silently returning lists filled with None.
    if mode not in ("SPLiT", "10X"):
        raise ValueError(
            "unsupported mode " + repr(mode) + "; expected 'SPLiT' or '10X'")

    bc_reads_list = read_from_file(input_file=path_to_bc_reads, file_type="fastq_all")
    bc_list = []
    umi_list = []

    for read in bc_reads_list:
        query_name, seq, qual = read[0], read[1], read[2]
        if mode == "SPLiT":
            bc = seq[10:18] + seq[48:56] + seq[86:94]
            bc_qual = qual[10:18] + qual[48:56] + qual[86:94]
            umi = seq[0:10]
            umi_qual = qual[0:10]
        else:  # "10X"
            bc = seq[0:16]
            bc_qual = qual[0:16]
            umi = seq[16:]
            umi_qual = qual[16:]
        bc_list.append([query_name, bc, bc_qual])
        umi_list.append([query_name, umi, umi_qual])

    return bc_list, umi_list
def make_barcode_combinations(bc1_path, bc2_path, bc3_path):
    """
    Concatenate every barcode of file 1 with every barcode of file 2 and
    every barcode of file 3.

    Each file is read as text and run through extract_barcodes; the third
    file is extracted with the helper's second positional argument set to
    True (its meaning is defined by extract_barcodes).

    :return: list of bc1 + bc2 + bc3 for all combinations, with bc1 varying
        slowest and bc3 fastest
    """
    first_round = extract_barcodes(
        read_from_file(file_type="txt", input_file=bc1_path))
    second_round = extract_barcodes(
        read_from_file(file_type="txt", input_file=bc2_path))
    third_round = extract_barcodes(
        read_from_file(file_type="txt", input_file=bc3_path), True)

    return [
        first + second + third
        for first in first_round
        for second in second_round
        for third in third_round
    ]
示例#7
0
def select_bcs_umis_gens(path_to_umis_fastq, path_to_gen_fastq,
                         aligned_sel_bcs):
    """
    Drop reads whose UMI quality is too low or whose barcode was rejected.

    A read is kept when its UMI quality string has at most one low-quality
    base (as counted by get_number_of_low_quality_bases) and its entry in
    aligned_sel_bcs is not 0. For every other read the barcode entry and the
    gene record are set to 0.

    :param path_to_umis_fastq: FASTQ file with the UMI reads
    :param path_to_gen_fastq: FASTQ file with the gene reads
    :param aligned_sel_bcs: per-read barcodes, 0 for rejected reads;
        modified in place
    :return: (aligned_sel_bcs, selected UMI sequences with 0 for dropped
        reads, gene records with 0 for dropped reads)
    """
    umi_records = read_from_file(file_type="fastq_all",
                                 input_file=path_to_umis_fastq)
    gene_records = read_from_file(file_type="fastq_all",
                                  input_file=path_to_gen_fastq)
    # starts as all zeros; kept reads get their UMI sequence filled in
    kept_umis = [0] * len(aligned_sel_bcs)

    for idx, umi_record in enumerate(umi_records):
        low_quality = get_number_of_low_quality_bases(umi_record[2])
        rejected = low_quality > 1 or aligned_sel_bcs[idx] == 0
        if not rejected:
            kept_umis[idx] = umi_record[1]
            continue
        aligned_sel_bcs[idx] = 0
        gene_records[idx] = 0

    return aligned_sel_bcs, kept_umis, gene_records
def barcodes_txt_to_FASTA(input_file_destination, output_directory, output_file_name):
    """
    Read a barcode text file, strip its modification prefix and write the
    result through write_to_txt.

    type_remove decides which prefix the list carries: "5Biosg" entries are
    cleaned with remove_5Biosg, "5Phos" entries with remove_5Phos. For any
    other answer an empty list is written.
    """
    raw_barcodes = read_from_file(input_file=input_file_destination, file_type="txt")
    modification = type_remove(raw_barcodes)

    if modification == "5Biosg":
        cleaned_barcodes = remove_5Biosg(raw_barcodes)
    elif modification == "5Phos":
        cleaned_barcodes = remove_5Phos(raw_barcodes)
    else:
        cleaned_barcodes = []

    write_to_txt(cleaned_barcodes, output_directory, output_file_name)
def get_bcs_from_fastq(path_to_barcode_fastq):
    """
    Extract the combined barcode of every read in a barcode FASTQ file.

    The barcode is the concatenation of positions 10-17, 48-55 and 86-93
    (0-based) of the read; the same positions of the quality string are
    passed through QSanger_to_Phred33.

    :return: list of (read_name, barcode_sequence, converted_quality) tuples
    """
    records = read_from_file(file_type="fastq_all", input_file=path_to_barcode_fastq)

    def barcode_part(text):
        # the three barcode windows, joined in read order
        return text[10:18] + text[48:56] + text[86:94]

    return [
        (record[0], barcode_part(record[1]), QSanger_to_Phred33(barcode_part(record[2])))
        for record in records
    ]
示例#10
0
def merge_fastq(input_directories, out_dir, output_file_name):
    """
    Append the content of several FASTQ files to one output file.

    All input files (resolved by get_input_locations) are read first; their
    contents are then written one after another with write_to_fastq in
    "append" mode.
    """
    input_paths = get_input_locations(input_directories)
    # load every input before the first write
    loaded = [
        read_from_file(file_type='fastq_all', input_file=path)
        for path in input_paths
    ]

    for content in loaded:
        write_to_fastq(content, out_dir, output_file_name, mode="append")
def main(cmd_args):
    """
    Load the barcode, UMI and gene-name text files named in cmd_args and
    hand them to construct_cluster_umi_file.

    :param cmd_args: mapping with the keys 'cbc_clusters', 'umi_clusters',
        'gene_names', 'file_name' and 'out_dir'
    """
    barcodes = read_from_file(file_type="txt", input_file=cmd_args['cbc_clusters'])
    umis = read_from_file(file_type="txt", input_file=cmd_args['umi_clusters'])
    genes = read_from_file(file_type="txt", input_file=cmd_args['gene_names'])

    target_name = cmd_args["file_name"]
    target_dir = cmd_args["out_dir"]

    construct_cluster_umi_file(barcodes, umis, genes,
                               out_dir=target_dir,
                               file_name=target_name)
示例#12
0
def get_aligned_selected_reads(path_to_sam_file, barcode_comb_list):
    '''
    Replace every accepted read by the barcode combination it aligned to.

    A read is accepted when it carries an "AS" tag and its "NM" tag is
    smaller than 1. Accepted reads are represented by the entry of
    barcode_comb_list at the index given by the read's tid; all other reads
    are represented by 0.
    :param path_to_sam_file: alignment file read through read_from_file
        (file_type "sam")
    :param barcode_comb_list: barcode combinations, indexed by reference id
    :return: a list that contains 0 for reads that did not pass the selection
    criteria and a barcode sequence for those reads that did pass them.
    '''
    sam_records = read_from_file(input_file=path_to_sam_file, file_type="sam")

    def corrected_barcode(record):
        alignment = record[0]
        if not alignment.has_tag("AS"):
            return 0
        if alignment.get_tag('NM') < 1:
            return barcode_comb_list[alignment.tid]
        return 0

    return [corrected_barcode(record) for record in sam_records]
        handler.write("\n")
        handler.write(str(comb))
        handler.write("\n")
        chromosome += 1


def main(cmd_args):
    """
    Build all barcode combinations from the three barcode files named in
    cmd_args and write them out twice: through write_to_txt and through
    bc_combinations_to_fasta, both under the name "barcode_combinations".

    :param cmd_args: mapping with the keys "bc1", "bc2", "bc3" and "out_dir"
    """
    round_paths = [cmd_args[key] for key in ("bc1", "bc2", "bc3")]
    target_dir = cmd_args["out_dir"]

    combinations = make_barcode_combinations(*round_paths)
    write_to_txt(combinations, target_dir, "barcode_combinations")
    bc_combinations_to_fasta(combinations, target_dir, "barcode_combinations")


# if __name__ == "__main__":
#     main(get_cmd_args())

#### just for one time use
# One-off conversion that runs whenever this module is executed or imported
# (the __main__ guard above is commented out): the entries of a hard-coded
# whitelist text file are read and handed to bc_combinations_to_fasta.
bc_path = "/Users/manuel/Desktop/10xv2_whitelist.txt"
bcs = read_from_file(input_file=bc_path, file_type="txt")

# hard-coded output location and file name
out_dir = "/Users/manuel/Desktop/"
filename = "10X_bc_combinations"
bc_combinations_to_fasta(barcode_combinations=bcs,
                         out_put_dir=out_dir,
                         out_filename=filename)
示例#14
0
# This script can be used to extract the barcode or UMI from the original oligo

from create_bc_comb import extract_barcodes
from tools.file_input_output import read_from_file, write_to_txt

# The same three steps are repeated for each of the three barcode files:
# read the text file, run extract_barcodes on its entries and write the
# result to out_put_dir under a fixed name. All paths are hard-coded.

# r1barcodes.txt -> "bc1_isolated"
bc1_path="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper/r1barcodes.txt"
out_put_dir="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper"
bc1_cleaned = read_from_file(input_file=bc1_path, file_type="txt")
bc1_extracted = extract_barcodes(bc1_cleaned)
write_to_txt(bc1_extracted, out_put_dir, "bc1_isolated")

# r2barcodes.txt -> "bc2_isolated"
bc2_path="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper/r2barcodes.txt"
out_put_dir="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper"
bc2_cleaned = read_from_file(input_file=bc2_path, file_type="txt")
bc2_extracted = extract_barcodes(bc2_cleaned)
write_to_txt(bc2_extracted, out_put_dir, "bc2_isolated")

# r3barcodes.txt -> "bc3_isolated"; unlike the first two files this one is
# extracted with UMI=True (meaning defined by extract_barcodes)
bc3_path="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper/r3barcodes.txt"
out_put_dir="/Users/manuel/OneDrive/SPLiT-seq/SPLiT-seq_suite/DGE_matrix_generation/metadata/barcodes_paper"
bc3_cleaned = read_from_file(input_file=bc3_path, file_type="txt")
bc3_extracted = extract_barcodes(bc3_cleaned, UMI=True)
write_to_txt(bc3_extracted, out_put_dir, "bc3_isolated")
示例#15
0
def same_dimension(path_to_bc, path_to_umi, sam_in_path):
    """
    Load the barcode and the UMI text file.

    NOTE(review): as written, the loaded lists are discarded, sam_in_path is
    never used and the function returns None -- the body looks unfinished.
    """
    barcodes = read_from_file(input_file=path_to_bc, file_type="txt")
    umis = read_from_file(input_file=path_to_umi, file_type="txt")
示例#16
0
# import numpy as np
# import gc
# import sys
# from scipy.sparse import csr_matrix
# A = [[1, 0, 0, 1, 0, 0], [0, 0, 2, 0, 0, 1], [0, 0, 0, 2, 0, 0]]
# S = csr_matrix(A)
# print(sys.getsizeof(A))
# print(sys.getsizeof(S))
# del A
# process = psutil.Process(os.getpid())
# print("The memory usage was: " + str(process.memory_info().rss/1000000000) + " GB")
from tools.file_input_output import read_from_file

# Load a hard-coded SAM file (named star_aligned.sam) through read_from_file;
# the loop further down iterates over its reads.
sam_file = read_from_file(
    input_file="/Users/manuel/Desktop/bowtie_strategy/star_aligned.sam",
    file_type="sam")

# test uniqueness
# read_names = []
# for read in sam_file:
#     read_names.append(read[1].split(".")[1])

# import numpy as np
#
# print(np.unique(read_names).size)

total = 0
have_both = 0
nh = 0
for read in sam_file: