Python calculate_rpkm示例

编程语言: Python

命名空间/包名称: excel_utils

方法/功能: calculate_rpkm

hotexamples.com的示例: 3

Python calculate_rpkm - 共找到3个示例。以下是从开源项目中提取的、评价最高的excel_utils.calculate_rpkm真实Python代码示例。您可以对示例进行评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： generate_overview_excel.py 项目： RickGelhausen/HRIBO

def create_cds_excel_sheet(args, excel_sheet_dict, genome_dict,
                           total_mapped_dict, wildcards, conditions, contrasts,
                           te_header):
    """
    Assemble the "all" CDS overview sheet and write its TSV and GFF companions.

    Rows are produced for the union of keys found in the annotation, the
    reparation predictions and the deepribo predictions. Keys are expected to
    look like ``chromosome:start-stop:strand``.

    Args:
        args: Parsed options; ``annotation_path``, ``xtail_path``,
            ``reads_deepribo``, ``reads_reparation`` and ``output_path`` are
            read. An empty string for one of the three optional paths skips
            that input.
        excel_sheet_dict: Mapping of sheet name to DataFrame; the ``"all"``
            entry is set here.
        genome_dict: Per-chromosome entries handed to
            ``eu.get_genome_information``.
        total_mapped_dict: Values looked up by ``(wildcard, chromosome)`` and
            passed to ``eu.calculate_rpkm``.
        wildcards: Sample names; position ``i`` corresponds to the ``i``-th
            read count of an entry.
        conditions: Forwarded to ``eu.calculate_te``.
        contrasts: Contrast names used to look up xtail results.
        te_header: Names used for the ``*_TE`` column headers.

    Returns:
        The same ``excel_sheet_dict`` that was passed in, with ``"all"`` set.
    """
    # Load the annotation and the interval lookup used for overlap detection.
    annotation_dict = eu.generate_annotation_dict(args.annotation_path)
    inter_dict = create_interlap(annotation_dict)

    # Optional inputs stay empty when their path is "".
    xtail_dict = (eu.generate_xtail_dict(args.xtail_path)
                  if args.xtail_path != "" else {})
    deepribo_dict = (eu.generate_deepribo_dict(args.reads_deepribo)
                     if args.reads_deepribo != "" else {})
    reparation_dict = (eu.generate_reparation_dict(args.reads_reparation)
                       if args.reads_reparation != "" else {})

    # NOTE: the iteration order of this set decides the row order of the GFF
    # file, so the expression is intentionally left exactly as it was.
    keys_union = list(set().union(deepribo_dict.keys(), reparation_dict.keys(),
                                  annotation_dict.keys()))

    header = [
        "Identifier", "Genome", "Start", "Stop", "Strand", "Locus_tag",
        "Overlapping_genes", "Old_locus_tag", "Name", "Gene_name", "Length",
        "Codon_count", "Start_codon", "Stop_codon"
    ]
    header += ["15nt upstream", "Nucleotide_seq", "Aminoacid_seq"]
    header += [f"{cond}_TE" for cond in te_header]
    header += [f"{card}_rpkm" for card in wildcards]
    header += [
        "Evidence_reparation", "Reparation_probability", "Evidence_deepribo",
        "Deepribo_rank", "Deepribo_score"
    ]
    header += [
        f"{contrast}_{item}"
        for contrast in contrasts
        for item in ["xtail_pvalue", "xtail_pvalue_adjusted", "xtail_log2FC"]
    ]

    # Row containers; the namedtuples also enforce the expected field count.
    nTuple = collections.namedtuple(
        'Pandas', [f"s{x}" for x in range(len(header))])
    gffTuple = collections.namedtuple(
        'Pandas', ["s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9"])

    all_sheet = []
    gff_file_rows = []

    for key in keys_union:
        chromosome, span, strand = key.split(":")
        start, stop = span.split("-")
        start_pos, stop_pos = int(start), int(stop)

        # Locus tags of every annotated interval overlapping this entry.
        if (chromosome, strand) in inter_dict:
            overlapping_intervals = list(
                inter_dict[(chromosome, strand)].find((start_pos, stop_pos)))
        else:
            overlapping_intervals = []
        locus_tag_overlap = ",".join(
            [interval[2] for interval in overlapping_intervals])

        # Later sources overwrite read_list: annotation, reparation, deepribo.
        read_list = []
        locus_tag = name = gene_name = old_locus_tag = ""
        if key in annotation_dict:
            (_gene_id, locus_tag, name, gene_name, old_locus_tag,
             read_list) = annotation_dict[key]

        length = stop_pos - start_pos + 1
        codon_count = length / 3

        reparation_probability = 0
        evidence_reparation = []
        if key in reparation_dict:
            reparation_probability, raw_evidence, read_list = reparation_dict[
                key]
            # Prefix evidence tags that do not already name the tool.
            evidence_reparation = sorted(
                tag if "reparation" in tag else "reparation-" + tag
                for tag in raw_evidence.split(" "))

        deepribo_rank, deepribo_score = 0, 0
        evidence_deepribo = []
        if key in deepribo_dict:
            deepribo_rank, deepribo_score, raw_evidence, read_list = deepribo_dict[
                key]
            evidence_deepribo = sorted(
                tag if "deepribo" in tag else "deepribo-" + tag
                for tag in raw_evidence.split(" "))

        # Three xtail columns per contrast, None-filled when no result exists.
        xtail_list = []
        for contrast in contrasts:
            xtail_key = (key, contrast)
            if xtail_key not in xtail_dict:
                xtail_list += [None, None, None]
                continue
            xtail_log2FC, xtail_pvalue, xtail_pvalue_adjusted = xtail_dict[
                xtail_key]
            xtail_list += [xtail_pvalue, xtail_pvalue_adjusted, xtail_log2FC]

        # Both coordinates are shifted down by one before the lookup.
        (start_codon, stop_codon, nucleotide_seq, aa_seq,
         nt_window) = eu.get_genome_information(genome_dict[chromosome],
                                                start_pos - 1, stop_pos - 1,
                                                strand)

        rpkm_list = [
            eu.calculate_rpkm(total_mapped_dict[(wildcards[idx], chromosome)],
                              val, length)
            for idx, val in enumerate(read_list)
        ]
        te_list = eu.calculate_te(rpkm_list, wildcards, conditions)

        identifier = f"{chromosome}:{start}-{stop}:{strand}"
        evidence_reparation = " ".join(evidence_reparation)
        evidence_deepribo = " ".join(evidence_deepribo)
        # A rank of 0 is replaced by the placeholder string "999999".
        if deepribo_rank == 0:
            deepribo_rank = "999999"

        position_columns = [
            identifier, chromosome, start, stop, strand, locus_tag,
            locus_tag_overlap, old_locus_tag, name, gene_name, length,
            codon_count, start_codon, stop_codon
        ]
        sequence_columns = [nt_window, nucleotide_seq, aa_seq]
        prediction_columns = [
            evidence_reparation, reparation_probability, evidence_deepribo,
            deepribo_rank, deepribo_score
        ]
        result = (position_columns + sequence_columns + te_list + rpkm_list +
                  prediction_columns + xtail_list)

        # GFF attributes: Name falls back from gene_name to name to the ID.
        if gene_name != "":
            display_name = gene_name
        elif name != "":
            display_name = name
        else:
            display_name = identifier

        attribute_parts = [f"ID={identifier};"]
        if locus_tag != "":
            attribute_parts.append(f"locus_tag={locus_tag};")
        if old_locus_tag != "":
            attribute_parts.append(f"old_locus_tag={old_locus_tag};")
        attribute_parts.append(f"Name={display_name};")
        attributes = "".join(attribute_parts)

        gff_file_rows.append(
            gffTuple(chromosome, "HRIBO", "CDS", start, stop, ".", strand, ".",
                     attributes))
        all_sheet.append(nTuple(*result))

    all_df = pd.DataFrame.from_records(all_sheet, columns=list(header))
    all_df = all_df.astype({"Start": "int32", "Stop": "int32"})
    all_df = all_df.sort_values(by=["Genome", "Start", "Stop"])

    all_df.to_csv(args.output_path.replace(".xlsx", ".tsv"),
                  sep="\t",
                  index=False,
                  quoting=csv.QUOTE_NONE)

    excel_sheet_dict["all"] = all_df

    gff_df = pd.DataFrame.from_records(gff_file_rows,
                                       columns=[0, 1, 2, 3, 4, 5, 6, 7, 8])

    # Version pragma first, then the unsorted GFF rows without a header line.
    with open(args.output_path.replace(".xlsx", ".gff"), "w") as f:
        f.write("##gff-version 3\n")
        gff_df.to_csv(f,
                      sep="\t",
                      header=None,
                      index=False,
                      quoting=csv.QUOTE_NONE)

    return excel_sheet_dict

示例#2

显示文件

文件： generate_overview_excel.py 项目： RickGelhausen/HRIBO

def create_misc_excel_sheet(args, excel_sheet_dict, genome_dict,
                            total_mapped_dict, wildcards, conditions,
                            contrasts, te_header):
    """
    Create reduced sheets for all other features except CDS.

    One sheet is produced per feature group and every entry is also written
    to ``<output>_misc.gff``. Keys of the non-CDS annotation are expected to
    look like ``chromosome:start-stop:strand``.

    Args:
        args: Parsed options; ``annotation_path``, ``xtail_path`` and
            ``output_path`` are read. An empty ``xtail_path`` skips xtail.
        excel_sheet_dict: Mapping of sheet name to DataFrame; one entry per
            feature group found is added or replaced here.
        genome_dict: Per-chromosome entries handed to
            ``eu.get_genome_information``.
        total_mapped_dict: Values looked up by ``(wildcard, chromosome)`` and
            passed to ``eu.calculate_rpkm``.
        wildcards: Sample names; position ``i`` corresponds to the ``i``-th
            read count of an entry.
        conditions: Forwarded to ``eu.calculate_te``.
        contrasts: Contrast names used to look up xtail results.
        te_header: Names used for the ``*_TE`` column headers.

    Returns:
        The same ``excel_sheet_dict`` that was passed in.
    """
    annotation_dict = eu.generate_non_cds_dict(args.annotation_path)

    xtail_dict = {}
    if args.xtail_path != "":
        xtail_dict = eu.generate_xtail_dict(args.xtail_path)

    sheet_row_dict = {}
    gff_rows = []

    header = ["Identifier", "Genome", "Start", "Stop", "Strand", "Feature", "Locus_tag", "Old_locus_tag", "Name", "Gene_name", "Length", "Codon_count", "Start_codon", "Stop_codon"] +\
             ["15nt upstream", "Nucleotide_seq", "Aminoacid_seq"] +\
             [f"{cond}_TE" for cond in te_header] +\
             [f"{card}_rpkm" for card in wildcards] +\
             [f"{contrast}_{item}" for contrast in contrasts for item in ["xtail_pvalue", "xtail_pvalue_adjusted", "xtail_log2FC"]]

    name_list = [f"s{x}" for x in range(len(header))]
    nTuple = collections.namedtuple('Pandas', name_list)
    gffTuple = collections.namedtuple(
        'Pandas', ["s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9"])

    # Features that get their own sheet (named through FEATURE_MAP); anything
    # else is collected in the "misc" sheet.
    dedicated_features = (
        "ncrna", "srna", "rrna", "trna", "start_codon", "stop_codon",
        "repeat_region", "3'-utr", "5'-utr"
    )

    for key in annotation_dict:
        chromosome, mid, strand = key.split(":")
        start, stop = mid.split("-")

        # gene_id is part of the annotation entry but is not reported.
        feature, _gene_id, locus_tag, name, gene_name, old_locus_tag, read_list = annotation_dict[
            key]

        length = int(stop) - int(start) + 1
        codon_count = length / 3

        # Three xtail columns per contrast, None-filled when no result exists.
        xtail_list = []
        for contrast in contrasts:
            if (key, contrast) in xtail_dict:
                xtail_log2FC, xtail_pvalue, xtail_pvalue_adjusted = xtail_dict[
                    (key, contrast)]
                xtail_list += [
                    xtail_pvalue, xtail_pvalue_adjusted, xtail_log2FC
                ]
            else:
                xtail_list += [None, None, None]

        # Both coordinates are shifted down by one before the lookup.
        start_codon, stop_codon, nucleotide_seq, aa_seq, nt_window = eu.get_genome_information(
            genome_dict[chromosome],
            int(start) - 1,
            int(stop) - 1, strand)

        rpkm_list = [
            eu.calculate_rpkm(total_mapped_dict[(wildcards[idx], chromosome)],
                              val, length)
            for idx, val in enumerate(read_list)
        ]

        te_list = eu.calculate_te(rpkm_list, wildcards, conditions)

        identifier = f"{chromosome}:{start}-{stop}:{strand}"

        result = [identifier, chromosome, start, stop, strand, feature, locus_tag, old_locus_tag, name, gene_name, length, codon_count, start_codon, stop_codon] +\
                 [nt_window, nucleotide_seq, aa_seq] + te_list + rpkm_list + xtail_list

        attributes = f"ID={identifier};"
        if locus_tag != "":
            attributes += f"locus_tag={locus_tag};"
        if old_locus_tag != "":
            attributes += f"old_locus_tag={old_locus_tag};"
        # Name is chosen independently of old_locus_tag. It used to hang off
        # the old_locus_tag check as an elif, which dropped the Name attribute
        # for every feature that had an old locus tag.
        if name != "":
            attributes += f"Name={name};"
        else:
            attributes += f"Name={identifier};"

        gff_result = [
            chromosome, "HRIBO", feature, start, stop, ".", strand, ".",
            attributes
        ]
        gff_rows.append(gffTuple(*gff_result))

        feature_key = feature.lower()
        if feature_key in dedicated_features:
            sheet_name = FEATURE_MAP[feature_key]
        else:
            sheet_name = "misc"
        sheet_row_dict.setdefault(sheet_name, []).append(nTuple(*result))

    gff_df = pd.DataFrame.from_records(gff_rows,
                                       columns=[0, 1, 2, 3, 4, 5, 6, 7, 8])

    # Version pragma first, then the GFF rows without a header line.
    with open(args.output_path.replace(".xlsx", "_misc.gff"), "w") as f:
        f.write("##gff-version 3\n")
        gff_df.to_csv(f,
                      sep="\t",
                      header=None,
                      index=False,
                      quoting=csv.QUOTE_NONE)

    for sheet_name, sheet_rows in sheet_row_dict.items():
        cur_df = pd.DataFrame.from_records(sheet_rows, columns=list(header))

        cur_df = cur_df.astype({"Start": "int32", "Stop": "int32"})
        cur_df = cur_df.sort_values(by=["Genome", "Start", "Stop"])

        excel_sheet_dict[sheet_name] = cur_df

    return excel_sheet_dict

示例#3

显示文件

文件： generate_excel_reparation.py 项目： RickGelhausen/HRIBO

def create_excel_file(args):
    """
    Build the "CDS" sheet from a reparation read table and write the Excel file.

    Reads the FASTA genome (``args.genome``), the tab-separated total mapped
    reads file (``args.total_mapped``, three fields per line: wildcard,
    chromosome, value) and the headerless read table (``args.reads``), then
    hands the sorted sheet to ``eu.excel_writer`` for ``args.output_path``.
    """
    # Map each FASTA record id to (sequence, complement) as plain strings.
    genome_dict = {
        str(record.id): (str(record.seq), str(record.seq.complement()))
        for record in SeqIO.parse(args.genome, "fasta")
    }

    # Total mapped reads per (wildcard, chromosome), plus the sample names
    # in order of appearance.
    total_mapped_dict = {}
    wildcards = []
    with open(args.total_mapped, "r") as handle:
        for line in handle:
            wildcard, chromosome, value = line.strip().split("\t")
            total_mapped_dict[(wildcard, chromosome)] = int(value)
            wildcards.append(wildcard)

    wildcards = eu.get_unique(wildcards)
    te_header = eu.get_te_header(wildcards)

    # The condition is the second "-"-separated part of each wildcard.
    conditions = eu.get_unique([card.split("-")[1] for card in wildcards])

    # Read table: '#' comment lines are skipped and there is no header row.
    read_df = pd.read_csv(args.reads, comment="#", header=None, sep="\t")

    header = [
        "Identifier", "Genome", "Source", "Feature", "Start", "Stop", "Strand",
        "Pred_probability", "Locus_tag", "Old_locus_tag", "Name", "Length",
        "Codon_count"
    ]
    header += [cond + "_TE" for cond in te_header]
    header += [card + "_rpkm" for card in wildcards]
    header += [
        "Evidence", "Start_codon", "Stop_codon", "15nt upstream",
        "Nucleotide_seq", "Aminoacid_seq"
    ]

    # The trailing len(wildcards) columns are treated as per-sample counts.
    prefix_columns = len(read_df.columns) - len(wildcards)
    nTuple = collections.namedtuple(
        'Pandas', [f"s{x}" for x in range(len(header))])

    cds_sheet = []
    for row in read_df.itertuples(index=False, name='Pandas'):
        # Integer column labels surface as positional fields _0, _1, ...
        chromosome, source, feature = row._0, row._1, row._2
        start, stop = row._3, row._4
        strand = row._6
        attributes = row._8

        # Both coordinates are shifted down by one before the lookup.
        (start_codon, stop_codon, nucleotide_seq, aa_seq,
         nt_window) = eu.get_genome_information(genome_dict[chromosome],
                                                start - 1, stop - 1, strand)
        (pred_value, name, product, note, evidence, locus_tag,
         old_locus_tag) = eu.retrieve_column_information(attributes)

        length = stop - start + 1
        codon_count = int(length / 3)

        read_list = [
            getattr(row, f"_{x}") for x in range(prefix_columns, len(row))
        ]
        rpkm_list = [
            eu.calculate_rpkm(total_mapped_dict[(wildcards[idx], chromosome)],
                              val, length)
            for idx, val in enumerate(read_list)
        ]
        te_list = eu.calculate_te(rpkm_list, wildcards, conditions)

        identifier = f"{chromosome}:{start}-{stop}:{strand}"
        leading_columns = [
            identifier, chromosome, "reparation", feature, start, stop, strand,
            pred_value, locus_tag, old_locus_tag, name, length, codon_count
        ]
        trailing_columns = [
            evidence, start_codon, stop_codon, nt_window, nucleotide_seq,
            aa_seq
        ]
        result = leading_columns + te_list + rpkm_list + trailing_columns

        cds_sheet.append(nTuple(*result))

    cds_df = pd.DataFrame.from_records(cds_sheet, columns=list(header))
    cds_df = cds_df.sort_values(by=["Genome", "Start", "Stop"])

    dataframe_dict = {"CDS": cds_df}

    eu.excel_writer(args.output_path, dataframe_dict, wildcards)