Пример #1
0
    def __init__(
        self,
        sample,
        outdir,
        assay,
        match_dir,
        step,
    ):
        """Store the run settings, load matched results and create ``outdir``.

        Args:
            sample: sample name, stored as ``self.sample``.
            outdir: output directory; created (with parents) if missing.
            assay: assay name, stored as ``self.assay``.
            match_dir: directory handed to ``parse_match_dir``. When falsy,
                no matched data is loaded and ``tsne_df``, ``marker_df`` and
                ``cluster_tsne`` are left unset.
            step: step name, stored as ``self.step``.

        Raises:
            OSError: if ``outdir`` cannot be created.
        """
        self.sample = sample
        self.outdir = outdir
        self.assay = assay
        self.match_dir = match_dir
        self.step = step

        if self.match_dir:
            match_dict = parse_match_dir(match_dir)
            tsne_df_file = match_dict['tsne_coord']
            marker_df_file = match_dict['markers']
            # Both files are read as tab-separated tables.
            self.tsne_df = pd.read_csv(tsne_df_file, sep="\t")
            self.marker_df = pd.read_csv(marker_df_file, sep="\t")
            # pandas labels a header-less first column "Unnamed: 0";
            # expose it under the name "barcode".
            self.tsne_df.rename(columns={"Unnamed: 0": "barcode"}, inplace=True)
            self.cluster_tsne = cluster_tsne_list(self.tsne_df)

        # Create the directory in-process instead of shelling out to
        # `mkdir -p`: no quoting problems with unusual paths, and a failure
        # raises instead of being silently ignored.
        os.makedirs(outdir, exist_ok=True)
Пример #2
0
    def __init__(self, args, display_title='Filtering'):
        """Prepare the filtering step.

        Loads the raw read-count JSON named by ``args.raw_read_count_file``,
        initialises the UMI counters and threshold tables, fetches the
        ``'df_tsne'`` entry from ``utils.parse_match_dir(args.match_dir)``
        and derives the output file names from ``self.out_prefix``
        (presumably set by the parent initialiser - confirm in the base
        class).
        """
        super().__init__(args, display_title)

        # --- input data ---
        with open(args.raw_read_count_file) as handle:
            self.count_dict = json.load(handle)

        # UMI counters all start from zero
        self.raw_umi = self.total_corrected_umi = self.del_umi = 0

        # per-key thresholds, filled in later
        self.read_threshold_dict = dict()
        # original note: if not set explicitly, use 1 as default
        self.umi_threshold_dict = dict()

        # two-level nested dictionaries from the project helper
        self.barcode_ref_umi_dict = utils.genDict(dim=2)
        self.ref_barcode_umi_dict = utils.genDict(dim=2)

        # t-SNE table of the matched sample, plus a working copy to filter
        self.df_tsne = utils.parse_match_dir(args.match_dir)['df_tsne']
        self.df_filter_tsne = self.df_tsne.copy()

        # --- output files ---
        self.corrected_read_count_file = f'{self.out_prefix}_corrected_read_count.json'
        self.filter_read_count_file = f'{self.out_prefix}_filtered_read_count.json'
        self.filter_tsne_file = f'{self.out_prefix}_filtered_UMI_tsne.csv'
Пример #3
0
    def __init__(self, args, display_title):
        """Load the read-count table and matched barcodes; name the output.

        ``args.read_count_file`` is read as a tab-separated table whose first
        column becomes the index. ``args.match_dir`` is parsed with
        ``utils.parse_match_dir`` and its ``'match_barcode'`` entry is kept.
        """
        super().__init__(args, display_title)

        read_count_path = args.read_count_file
        self.df_read_count = pd.read_csv(read_count_path, sep='\t', index_col=0)

        parsed = utils.parse_match_dir(args.match_dir)
        self.tsne_dict = parsed
        self.match_barcode = parsed['match_barcode']

        # output matrix path
        self.mtx = f'{self.out_prefix}_citeseq.mtx.gz'
Пример #4
0
 def read_match_dir(self):
     """
     Load the t-SNE table and marker file path from ``self.match_dir``.

     Does nothing when ``self.match_dir`` is falsy.

     Original note: when match_dir is not this sample itself it should be
     read at init; when it is this sample itself it is read at
     run_analysis, because seurat has to run first.
     """
     if not self.match_dir:
         return

     parsed = utils.parse_match_dir(self.match_dir)

     # tab-separated coordinates; the unnamed first column holds barcodes
     tsne_table = pd.read_csv(parsed['tsne_coord'], sep="\t")
     self.df_tsne = tsne_table.rename(columns={"Unnamed: 0": "barcode"})

     self.df_marker_file = parsed['markers']
     self.read_format_df_marker()
Пример #5
0
def analysis_cite(args):
    """Run the ``analysis_cite.R`` script for one sample.

    Reads ``args.outdir``, ``args.match_dir``, ``args.citeseq_mtx`` and
    ``args.sample``. The ``'rds'`` entry returned by
    ``parse_match_dir(args.match_dir)`` is passed to the R script together
    with the CITE-seq matrix, the output directory and the sample name.

    Raises:
        OSError: if ``args.outdir`` cannot be created.
    """
    # Create the directory in-process rather than via `mkdir -p` in a shell:
    # safe for paths with spaces/metacharacters, and failures are reported.
    os.makedirs(args.outdir, exist_ok=True)

    rds = parse_match_dir(args.match_dir)['rds']
    app = CITESEQ_DIR + "/analysis_cite.R"
    # NOTE(review): the values are interpolated unquoted into a shell
    # command line; paths containing spaces will be split by the shell.
    cmd = (f'Rscript {app} '
           f'--rds {rds} '
           f'--citeseq_mtx {args.citeseq_mtx} '
           f'--outdir {args.outdir} '
           f'--sample {args.sample} ')
    os.system(cmd)
Пример #6
0
    def __init__(self, args, display_title=None):
        """Configure the step from ``args`` and resolve its barcode source.

        Barcodes, their count, the t-SNE file and the matrix directory come
        from ``utils.parse_match_dir(args.match_dir)`` when ``--match_dir``
        is given; otherwise from ``args.matrix_dir`` (its ``barcodes.tsv``)
        and ``args.tsne_file``.

        Raises:
            ValueError: if neither ``args.match_dir`` nor ``args.matrix_dir``
                is set.
        """
        Step.__init__(self, args, display_title=display_title)

        # options copied from the command line
        self.read_count_file = args.read_count_file
        self.UMI_min = args.UMI_min
        self.SNR_min = args.SNR_min
        self.combine_cluster = args.combine_cluster
        # numeric options are converted once, here
        self.dim = int(args.dim)
        self.coefficient = float(args.coefficient)

        # read-count table: tab-separated, first column is the index
        self.df_read_count = pd.read_csv(
            self.read_count_file, sep="\t", index_col=0)

        # at least one barcode source must be supplied
        if not args.match_dir and not args.matrix_dir:
            raise ValueError("--match_dir or --matrix_dir is required.")

        if args.match_dir:
            parsed = utils.parse_match_dir(args.match_dir)
            self.match_barcode = parsed['match_barcode']
            self.n_match_barcode = parsed['n_match_barcode']
            self.tsne_file = parsed['tsne_coord']
            self.matrix_dir = parsed['matrix_dir']
        else:
            # header-less barcode list; the first column holds the barcodes
            barcode_table = pd.read_csv(f'{args.matrix_dir}/barcodes.tsv',
                                        header=None)
            self.match_barcode = barcode_table[0].tolist()
            self.n_match_barcode = len(self.match_barcode)
            self.tsne_file = args.tsne_file
            self.matrix_dir = args.matrix_dir

        # state flag, starts unset
        self.no_noise = False

        # output files
        self.UMI_tag_file = f'{self.outdir}/{self.sample}_umi_tag.tsv'
        self.tsne_tag_file = f'{self.outdir}/{self.sample}_tsne_tag.tsv'
        self.cluster_count_file = f'{self.outdir}/{self.sample}_cluster_count.tsv'
        self.cluster_plot = f'{self.outdir}/{self.sample}_cluster_plot.pdf'
        # the combined-cluster outputs exist only when the option is on
        if self.combine_cluster:
            self.combine_cluster_count_file = f'{self.outdir}/{self.sample}_combine_cluster_count.tsv'
            self.combine_cluster_plot = f'{self.outdir}/{self.sample}_combine_cluster_plot.pdf'
Пример #7
0
    def __init__(self, args, display_title='Count'):
        """Prepare the counting step.

        Stores the minimum query length and capture BAM from ``args``, loads
        the matched barcodes via ``utils.parse_match_dir(args.match_dir)``,
        reports the matched-barcode number as a metric and initialises the
        counters and the output file name.
        """
        super().__init__(args, display_title)

        # settings
        self.min_query_length = int(args.min_query_length)
        self.capture_bam = args.capture_bam

        # matched barcodes and how many there are
        parsed = utils.parse_match_dir(args.match_dir)
        self.match_barcode = parsed['match_barcode']
        self.n_match_barcode = parsed['n_match_barcode']

        # report the number of matched barcodes
        metric_help = HELP_INFO_DICT['matched_barcode_number']
        self.add_metric(
            name=metric_help['display'],
            value=self.n_match_barcode,
            help_info=metric_help['info'],
        )

        # counters
        self.total_corrected_umi = 0
        self.count_dict = utils.genDict(dim=3)

        # output
        self.raw_read_count_file = f'{self.out_prefix}_raw_read_count.json'
Пример #8
0
def count_fusion(args):
    """Count fusion-supporting reads per barcode and plot them on the t-SNE.

    Reads from ``args``: ``outdir``, ``sample``, ``bam``, ``flanking_base``,
    ``fusion_pos``, ``match_dir`` and ``UMI_min``.

    Reads of ``args.bam`` are kept when their reference name is listed in
    the fusion position file, their barcode (first ``_``-separated field of
    the read name) is a matched barcode, and ``is_fusion`` accepts them.
    Kept reads are written to ``<outdir>/<sample>_fusion.bam`` and counted
    per barcode / tag / UMI (second field of the read name).

    Output tables (tab-separated, prefix ``<outdir>/<sample>``):
        ``_fusion_read_count.tsv``     reads per barcode, tag and UMI
        ``_fusion_UMI_count.tsv``      UMIs per barcode and tag, >= UMI_min
        ``_fusion_barcode_count.tsv``  barcodes per tag and their fraction
                                       of all matched barcodes
        ``_fusion_tsne.tsv``           t-SNE table joined with UMI counts

    When no read qualifies, only the BAM and the (header-only) read-count
    table are written and an error is logged. Otherwise ``plot_fusion.R``
    is run on the t-SNE table.

    Raises:
        OSError: if ``args.outdir`` cannot be created.
    """
    outdir = args.outdir
    sample = args.sample
    bam = args.bam
    flanking_base = int(args.flanking_base)
    fusion_pos_file = args.fusion_pos
    match_dir = args.match_dir
    UMI_min = int(args.UMI_min)

    # Create the directory in-process instead of `mkdir -p` through a shell.
    os.makedirs(outdir, exist_ok=True)

    fusion_pos = read_pos(fusion_pos_file)
    out_prefix = outdir + "/" + sample
    # barcode
    match_barcode, _n_barcode = read_barcode_file(match_dir)
    n_match_barcode = len(match_barcode)
    # Build a set once so the per-read membership test is O(1).
    matched = set(match_barcode)
    # tsne
    match_tsne_file = parse_match_dir(match_dir)['tsne_coord']
    df_tsne = pd.read_csv(match_tsne_file, sep="\t", index_col=0)
    # out
    out_read_count_file = out_prefix + "_fusion_read_count.tsv"
    out_umi_count_file = out_prefix + "_fusion_UMI_count.tsv"
    out_barcode_count_file = out_prefix + "_fusion_barcode_count.tsv"
    out_tsne_file = out_prefix + "_fusion_tsne.tsv"

    # process bam; context managers close both files even on error
    count_dic = genDict(dim=3)
    with pysam.AlignmentFile(bam, "rb") as samfile:
        with pysam.AlignmentFile(out_prefix + "_fusion.bam",
                                 "wb",
                                 header=samfile.header) as new_bam:
            for read in samfile:
                # Cheap filters first: most reads are rejected here, before
                # the read name or sequence is touched.
                tag = read.reference_name
                if tag not in fusion_pos:
                    continue
                attr = read.query_name.split('_')
                barcode = attr[0]
                if barcode not in matched:
                    continue
                umi = attr[1]
                if is_fusion(pos=fusion_pos[tag],
                             read_start=int(read.reference_start),
                             read_length=len(read.query_sequence),
                             flanking_base=flanking_base):
                    new_bam.write(read)
                    count_dic[barcode][tag][umi] += 1

    # flatten the nested counts into one row per (barcode, tag, UMI)
    rows = [
        [barcode, tag, umi, n_reads]
        for barcode, tag_counts in count_dic.items()
        for tag, umi_counts in tag_counts.items()
        for umi, n_reads in umi_counts.items()
    ]
    # Naming the columns up front keeps the header even when rows is empty.
    df_read = pd.DataFrame(rows,
                           columns=["barcode", "tag", "UMI", "read_count"])
    df_read.to_csv(out_read_count_file, sep="\t", index=False)

    if not rows:
        count_fusion.logger.error('***** NO FUSION FOUND! *****')
        return

    df_umi = df_read.groupby(["barcode", "tag"]).agg({"UMI": "count"})
    df_umi = df_umi[df_umi["UMI"] >= UMI_min]
    df_umi.to_csv(out_umi_count_file, sep="\t")

    df_umi = df_umi.reset_index()
    df_barcode = df_umi.groupby(["tag"]).agg({"barcode": "count"})
    # add zero count tag: df_barcode is indexed by tag, so test the index
    # explicitly and extend it with reindex (DataFrame.append no longer
    # exists in pandas >= 2.0).
    missing_tags = [tag for tag in fusion_pos if tag not in df_barcode.index]
    df_barcode = df_barcode.reindex(list(df_barcode.index) + missing_tags,
                                    fill_value=0)
    df_barcode["percent"] = df_barcode["barcode"] / n_match_barcode
    df_barcode.to_csv(out_barcode_count_file, sep="\t")

    # one column per tag, aligned to the t-SNE table by barcode
    df_pivot = df_umi.pivot(index="barcode", columns="tag", values="UMI")
    df_pivot = df_pivot.fillna(0)
    df_tsne_fusion = pd.merge(df_tsne,
                              df_pivot,
                              right_index=True,
                              left_index=True,
                              how="left")
    # barcodes without any fusion UMI get 0 instead of NaN
    df_tsne_fusion = df_tsne_fusion.fillna(0)
    df_tsne_fusion.to_csv(out_tsne_file, sep="\t")

    # plot
    count_fusion.logger.info("plot fusion...!")
    app = fusionDir + "/plot_fusion.R"
    # NOTE(review): paths are interpolated unquoted into a shell command.
    cmd = f"Rscript {app} --tsne_fusion {out_tsne_file} --outdir {outdir}"
    os.system(cmd)
    count_fusion.logger.info("plot done.")
Пример #9
0
    def __init__(self, args, display_title):
        """Initialise the parent step and keep the parsed ``args.match_dir``.

        The result of ``utils.parse_match_dir`` is stored as
        ``self.tsne_dict``.
        """
        super().__init__(args, display_title)

        # matched-sample data
        match_dir = args.match_dir
        self.tsne_dict = utils.parse_match_dir(match_dir)
Пример #10
0
def count_mut(args):
    """Collect insertions from a BAM, validate them and plot on the t-SNE.

    Reads from ``args``: ``outdir``, ``sample``, ``bam``, ``shift_base``,
    ``mut_file`` and ``match_dir``.

    Every insertion found in a read's CIGAR is recorded with its barcode and
    UMI (first and second ``_``-separated fields of the read name), the
    reference name, the inserted sequence and its reference/read positions.
    An insertion is valid when its gene and type are present in the table
    returned by ``read_mut`` and both its position offset and the edit
    distance of its sequence are within ``shift_base`` of the expected ones.

    Output tables (tab-separated, prefix ``<outdir>/<sample>``):
        ``_mut_read.tsv``                     all insertions + ``is_valid``
        ``_mut_read_count.tsv``               valid reads per gene/type/
                                              barcode/UMI
        ``_mut_UMI_count.tsv``                UMIs per gene/type/barcode
        ``_mut_insertion_barcode_count.tsv``  barcode x gene UMI counts
        ``_mut_tsne.tsv``                     t-SNE table joined with counts

    The last two tables and the plot are produced only when at least one
    valid insertion exists; otherwise a warning is logged.

    Raises:
        OSError: if ``args.outdir`` cannot be created.
    """
    outdir = args.outdir
    sample = args.sample
    bam = args.bam
    shift_base = int(args.shift_base)
    mut_file = args.mut_file
    match_dir = args.match_dir

    # Create the directory in-process instead of `mkdir -p` through a shell.
    os.makedirs(outdir, exist_ok=True)

    mut_dic = read_mut(mut_file)
    out_prefix = outdir + "/" + sample

    # tsne
    match_dict = parse_match_dir(match_dir)
    df_tsne = pd.read_csv(match_dict['tsne_coord'], sep="\t", index_col=0)

    # out
    out_read_file = out_prefix + "_mut_read.tsv"
    out_read_count_file = out_prefix + "_mut_read_count.tsv"
    out_umi_count_file = out_prefix + "_mut_UMI_count.tsv"
    out_insertion_barcode_count_file = out_prefix + "_mut_insertion_barcode_count.tsv"
    out_tsne_file = out_prefix + "_mut_tsne.tsv"

    def is_valid(row):
        """Return True when the insertion in *row* matches an expected one."""
        gene = str(row["gene"])
        ref_pos = int(row["ref_pos"])
        mut_type = str(row["type"])
        seq = str(row["seq"])
        if gene not in mut_dic:
            return False
        if mut_type not in mut_dic[gene]:
            return False
        expected = mut_dic[gene][mut_type]
        if abs(ref_pos - expected["pos"]) > shift_base:
            return False
        if editdistance.eval(seq, expected["seq"]) > shift_base:
            return False
        return True

    # process bam; context managers close both files even on error
    rows = []
    with pysam.AlignmentFile(bam, "rb") as samfile:
        # NOTE(review): the output BAM is created with the input header but
        # no read is ever written to it (same as before); it is now at least
        # closed properly. Confirm whether reads were meant to be written.
        with pysam.AlignmentFile(out_prefix + "_mut.bam", "wb",
                                 header=samfile.header):
            for read in samfile:
                tag = read.reference_name
                attr = read.query_name.split('_')
                barcode = attr[0]
                umi = attr[1]
                seq = read.query_sequence
                ref_pos = read.reference_start
                read_pos = read.query_alignment_start
                for (cigar_type, cigar_length) in read.cigar:
                    if cigar_type in (0, 7, 8):
                        # M / = / X: consumes reference and read
                        ref_pos += cigar_length
                        read_pos += cigar_length
                    elif cigar_type == 1:  # insertion: consumes read only
                        insert_seq = seq[read_pos:read_pos + cigar_length]
                        rows.append({
                            "barcode": barcode,
                            "UMI": umi,
                            "gene": tag,
                            "type": "insertion",
                            "seq": insert_seq,
                            "ref_pos": ref_pos,
                            "read_pos": read_pos,
                            "seq_length": len(insert_seq)
                        })
                        read_pos += cigar_length
                    elif cigar_type in (2, 3):
                        # D / N: consumes reference only
                        ref_pos += cigar_length

    # Explicit columns keep the frame well-formed when no insertion exists
    # (selecting the columns from an empty frame would raise KeyError).
    df = pd.DataFrame(rows, columns=["barcode", "UMI", "gene", "type",
                                     "ref_pos", "read_pos", "seq",
                                     "seq_length"])
    # A bool Series (rather than DataFrame.apply) stays a valid boolean mask
    # for an empty frame too; rows and df share the same order.
    df["is_valid"] = pd.Series([is_valid(row) for row in rows],
                               index=df.index, dtype=bool)
    df.to_csv(out_read_file, sep="\t")

    df_valid = df[df["is_valid"]]
    # reads per (gene, type, barcode, UMI)
    df_read_count = df_valid.groupby(
        ["gene", "type", "barcode", "UMI"]).size().reset_index(
            name="read_count")
    df_read_count.to_csv(out_read_count_file, sep="\t")

    # distinct UMIs per (gene, type, barcode)
    df_UMI_count = df_read_count.groupby(
        ["gene", "type", "barcode"]).agg({"UMI": "count"})
    df_UMI_count.columns = ["UMI_count"]
    df_UMI_count.to_csv(out_umi_count_file, sep="\t")

    df_temp = df_UMI_count.reset_index()
    if df_temp.shape[0] == 0:
        count_mut.logger.warning('NO VALID INSERTION FOUND!')
        return

    df_insertion = df_temp[df_temp["type"] == "insertion"]
    df_insertion_barcode_count = df_insertion.pivot(
        index="barcode", columns="gene", values="UMI_count")
    df_insertion_barcode_count.to_csv(
        out_insertion_barcode_count_file, sep="\t")

    df_tsne_mut = pd.merge(df_tsne, df_insertion_barcode_count,
                           right_index=True, left_index=True, how="left")
    # barcodes without a valid insertion get 0 instead of NaN
    df_tsne_mut = df_tsne_mut.fillna(0)
    df_tsne_mut.to_csv(out_tsne_file, sep="\t")

    # plot
    app = parentDir + "/plot.R"
    # NOTE(review): paths are interpolated unquoted into a shell command.
    cmd = f"Rscript {app} --tsne_mut {out_tsne_file} --outdir {outdir}"
    count_mut.logger.info(cmd)
    os.system(cmd)
    count_mut.logger.info("plot done.")