def create_cds_excel_sheet(args, excel_sheet_dict, genome_dict,
                           total_mapped_dict, wildcards, conditions,
                           contrasts, te_header):
    """
    Create the excel sheet for CDS.

    One row is built for every key present in the annotation, the
    reparation predictions or the deepribo predictions. Keys have the
    form ``chromosome:start-stop:strand``.

    Side effects: writes ``<output>.tsv`` (the full table) and
    ``<output>.gff`` (one CDS line per key), where ``<output>`` is
    ``args.output_path`` with ``.xlsx`` replaced.

    Args:
        args: namespace providing ``annotation_path``, ``xtail_path``,
            ``reads_deepribo``, ``reads_reparation`` and ``output_path``.
            An empty string disables the corresponding optional input.
        excel_sheet_dict: dict that receives the table under ``"all"``.
        genome_dict: per-chromosome entry handed to
            ``eu.get_genome_information``.
        total_mapped_dict: totals keyed by ``(wildcard, chromosome)``.
        wildcards: sample names, index-aligned with each read list.
        conditions: conditions forwarded to ``eu.calculate_te``.
        contrasts: contrast names used to look up xtail results.
        te_header: names used for the ``*_TE`` columns.

    Returns:
        ``excel_sheet_dict`` with the ``"all"`` data frame added.
    """

    def tag_evidence(raw_evidence, tool):
        # Prefix each space-separated token with "<tool>-" unless the token
        # already mentions the tool.
        return [
            token if tool in token else f"{tool}-{token}"
            for token in raw_evidence.split(" ")
        ]

    # read annotation from file
    annotation_dict = eu.generate_annotation_dict(args.annotation_path)
    inter_dict = create_interlap(annotation_dict)

    # optional inputs stay empty when no path was supplied
    xtail_dict = (eu.generate_xtail_dict(args.xtail_path)
                  if args.xtail_path != "" else {})
    deepribo_dict = (eu.generate_deepribo_dict(args.reads_deepribo)
                     if args.reads_deepribo != "" else {})
    reparation_dict = (eu.generate_reparation_dict(args.reads_reparation)
                       if args.reads_reparation != "" else {})

    all_keys = set().union(deepribo_dict.keys(), reparation_dict.keys(),
                           annotation_dict.keys())

    xtail_columns = ["xtail_pvalue", "xtail_pvalue_adjusted", "xtail_log2FC"]
    header = [
        "Identifier", "Genome", "Start", "Stop", "Strand", "Locus_tag",
        "Overlapping_genes", "Old_locus_tag", "Name", "Gene_name", "Length",
        "Codon_count", "Start_codon", "Stop_codon"
    ]
    header += ["15nt upstream", "Nucleotide_seq", "Aminoacid_seq"]
    header += [f"{cond}_TE" for cond in te_header]
    header += [f"{card}_rpkm" for card in wildcards]
    header += [
        "Evidence_reparation", "Reparation_probability", "Evidence_deepribo",
        "Deepribo_rank", "Deepribo_score"
    ]
    header += [
        f"{contrast}_{item}" for contrast in contrasts
        for item in xtail_columns
    ]

    nTuple = collections.namedtuple(
        'Pandas', [f"s{x}" for x in range(len(header))])
    gffTuple = collections.namedtuple(
        'Pandas', ["s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9"])

    table_rows = []
    gff_rows = []
    for key in all_keys:
        chromosome, span, strand = key.split(":")
        start, stop = span.split("-")

        # annotated genes overlapping this interval on the same strand
        if (chromosome, strand) in inter_dict:
            overlaps = list(inter_dict[(chromosome, strand)].find(
                (int(start), int(stop))))
        else:
            overlaps = []
        locus_tag_overlap = ",".join(interval[2] for interval in overlaps)

        locus_tag, name, gene_name, old_locus_tag = "", "", "", ""
        read_list = []
        if key in annotation_dict:
            (_gene_id, locus_tag, name, gene_name, old_locus_tag,
             read_list) = annotation_dict[key]

        length = int(stop) - int(start) + 1
        codon_count = length / 3

        # a later source overrides the read list of an earlier one
        reparation_probability = 0
        evidence_reparation = ""
        if key in reparation_dict:
            reparation_probability, raw_evidence, read_list = \
                reparation_dict[key]
            evidence_reparation = " ".join(
                sorted(tag_evidence(raw_evidence, "reparation")))

        deepribo_rank, deepribo_score = 0, 0
        evidence_deepribo = ""
        if key in deepribo_dict:
            deepribo_rank, deepribo_score, raw_evidence, read_list = \
                deepribo_dict[key]
            evidence_deepribo = " ".join(
                sorted(tag_evidence(raw_evidence, "deepribo")))

        # three xtail columns per contrast, None when there is no result
        xtail_list = []
        for contrast in contrasts:
            if (key, contrast) not in xtail_dict:
                xtail_list.extend([None, None, None])
                continue
            log2fc, pvalue, pvalue_adjusted = xtail_dict[(key, contrast)]
            xtail_list.extend([pvalue, pvalue_adjusted, log2fc])

        (start_codon, stop_codon, nucleotide_seq, aa_seq,
         nt_window) = eu.get_genome_information(genome_dict[chromosome],
                                                int(start) - 1,
                                                int(stop) - 1, strand)

        rpkm_list = [
            eu.calculate_rpkm(total_mapped_dict[(wildcards[idx], chromosome)],
                              val, length)
            for idx, val in enumerate(read_list)
        ]
        te_list = eu.calculate_te(rpkm_list, wildcards, conditions)

        identifier = f"{chromosome}:{start}-{stop}:{strand}"

        # a rank of 0 is replaced by a large placeholder
        if deepribo_rank == 0:
            deepribo_rank = "999999"

        row = [
            identifier, chromosome, start, stop, strand, locus_tag,
            locus_tag_overlap, old_locus_tag, name, gene_name, length,
            codon_count, start_codon, stop_codon
        ]
        row = row + [nt_window, nucleotide_seq, aa_seq]
        row = row + te_list + rpkm_list
        row = row + [
            evidence_reparation, reparation_probability, evidence_deepribo,
            deepribo_rank, deepribo_score
        ]
        row = row + xtail_list

        # GFF attribute column: gene name first, then name, then identifier
        attribute_parts = [f"ID={identifier};"]
        if locus_tag != "":
            attribute_parts.append(f"locus_tag={locus_tag};")
        if old_locus_tag != "":
            attribute_parts.append(f"old_locus_tag={old_locus_tag};")
        if gene_name != "":
            display_name = gene_name
        elif name != "":
            display_name = name
        else:
            display_name = identifier
        attribute_parts.append(f"Name={display_name};")
        attributes = "".join(attribute_parts)

        gff_rows.append(
            gffTuple(chromosome, "HRIBO", "CDS", start, stop, ".", strand,
                     ".", attributes))
        table_rows.append(nTuple(*row))

    all_df = pd.DataFrame.from_records(table_rows, columns=header)
    all_df = all_df.astype({"Start": "int32", "Stop": "int32"})
    all_df = all_df.sort_values(by=["Genome", "Start", "Stop"])
    all_df.to_csv(args.output_path.replace(".xlsx", ".tsv"),
                  sep="\t",
                  index=False,
                  quoting=csv.QUOTE_NONE)
    excel_sheet_dict["all"] = all_df

    gff_df = pd.DataFrame.from_records(gff_rows, columns=list(range(9)))
    with open(args.output_path.replace(".xlsx", ".gff"), "w") as f:
        f.write("##gff-version 3\n")
        gff_df.to_csv(f,
                      sep="\t",
                      header=None,
                      index=False,
                      quoting=csv.QUOTE_NONE)

    return excel_sheet_dict
def create_misc_excel_sheet(args, excel_sheet_dict, genome_dict,
                            total_mapped_dict, wildcards, conditions,
                            contrasts, te_header):
    """
    Create reduced sheets for all other features except CDS.

    Every key of the non-CDS annotation (``chromosome:start-stop:strand``)
    becomes one table row and one GFF line. Rows are grouped into sheets:
    features in the known list below go to the sheet named by
    ``FEATURE_MAP``, everything else goes to ``"misc"``.

    Side effects: writes ``<output>_misc.gff``, where ``<output>`` is
    ``args.output_path`` with ``.xlsx`` removed.

    Args:
        args: namespace providing ``annotation_path``, ``xtail_path`` and
            ``output_path``. An empty ``xtail_path`` disables xtail columns.
        excel_sheet_dict: dict that receives one data frame per sheet name.
        genome_dict: per-chromosome entry handed to
            ``eu.get_genome_information``.
        total_mapped_dict: totals keyed by ``(wildcard, chromosome)``.
        wildcards: sample names, index-aligned with each read list.
        conditions: conditions forwarded to ``eu.calculate_te``.
        contrasts: contrast names used to look up xtail results.
        te_header: names used for the ``*_TE`` columns.

    Returns:
        ``excel_sheet_dict`` with the per-feature data frames added.
    """
    annotation_dict = eu.generate_non_cds_dict(args.annotation_path)

    xtail_dict = {}
    if args.xtail_path != "":
        xtail_dict = eu.generate_xtail_dict(args.xtail_path)

    header = ["Identifier", "Genome", "Start", "Stop", "Strand", "Feature", "Locus_tag", "Old_locus_tag", "Name", "Gene_name", "Length", "Codon_count", "Start_codon", "Stop_codon"] +\
             ["15nt upstream", "Nucleotide_seq", "Aminoacid_seq"] +\
             [f"{cond}_TE" for cond in te_header] +\
             [f"{card}_rpkm" for card in wildcards] +\
             [f"{contrast}_{item}" for contrast in contrasts for item in ["xtail_pvalue", "xtail_pvalue_adjusted", "xtail_log2FC"]]

    nTuple = collections.namedtuple(
        'Pandas', [f"s{x}" for x in range(len(header))])
    gffTuple = collections.namedtuple(
        'Pandas', ["s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9"])

    # Features that get their own sheet; hoisted out of the loop.
    mapped_features = (
        "ncrna", "srna", "rrna", "trna", "start_codon", "stop_codon",
        "repeat_region", "3'-utr", "5'-utr"
    )

    sheet_row_dict = {}
    gff_rows = []
    for key, entry in annotation_dict.items():
        chromosome, mid, strand = key.split(":")
        start, stop = mid.split("-")

        (feature, _gene_id, locus_tag, name, gene_name, old_locus_tag,
         read_list) = entry

        length = int(stop) - int(start) + 1
        codon_count = length / 3

        # three xtail columns per contrast, None when there is no result
        xtail_list = []
        for contrast in contrasts:
            if (key, contrast) in xtail_dict:
                xtail_log2FC, xtail_pvalue, xtail_pvalue_adjusted = \
                    xtail_dict[(key, contrast)]
                xtail_list += [
                    xtail_pvalue, xtail_pvalue_adjusted, xtail_log2FC
                ]
            else:
                xtail_list += [None, None, None]

        (start_codon, stop_codon, nucleotide_seq, aa_seq,
         nt_window) = eu.get_genome_information(genome_dict[chromosome],
                                                int(start) - 1,
                                                int(stop) - 1, strand)

        rpkm_list = [
            eu.calculate_rpkm(total_mapped_dict[(wildcards[idx], chromosome)],
                              val, length)
            for idx, val in enumerate(read_list)
        ]
        te_list = eu.calculate_te(rpkm_list, wildcards, conditions)

        identifier = f"{chromosome}:{start}-{stop}:{strand}"

        result = [identifier, chromosome, start, stop, strand, feature, locus_tag, old_locus_tag, name, gene_name, length, codon_count, start_codon, stop_codon] +\
                 [nt_window, nucleotide_seq, aa_seq] + te_list + rpkm_list + xtail_list

        attributes = f"ID={identifier};"
        if locus_tag != "":
            attributes += f"locus_tag={locus_tag};"
        if old_locus_tag != "":
            attributes += f"old_locus_tag={old_locus_tag};"
        # The Name attribute is written for every feature. It was previously
        # chained with `elif` onto the old_locus_tag test, which dropped Name
        # whenever an old locus tag existed.
        if name != "":
            attributes += f"Name={name};"
        else:
            attributes += f"Name={identifier};"

        gff_rows.append(
            gffTuple(chromosome, "HRIBO", feature, start, stop, ".", strand,
                     ".", attributes))

        lowered = feature.lower()
        sheet_name = (FEATURE_MAP[lowered]
                      if lowered in mapped_features else "misc")
        sheet_row_dict.setdefault(sheet_name, []).append(nTuple(*result))

    gff_df = pd.DataFrame.from_records(gff_rows, columns=list(range(9)))
    # one handle for both the version pragma and the feature lines
    with open(args.output_path.replace(".xlsx", "_misc.gff"), "w") as f:
        f.write("##gff-version 3\n")
        gff_df.to_csv(f,
                      sep="\t",
                      header=None,
                      index=False,
                      quoting=csv.QUOTE_NONE)

    for sheet_name, sheet_rows in sheet_row_dict.items():
        cur_df = pd.DataFrame.from_records(sheet_rows, columns=header)
        cur_df = cur_df.astype({"Start": "int32", "Stop": "int32"})
        cur_df = cur_df.sort_values(by=["Genome", "Start", "Stop"])
        excel_sheet_dict[sheet_name] = cur_df

    return excel_sheet_dict
def create_excel_file(args):
    """
    Build the "CDS" table from the read file and hand it to the excel writer.

    Reads the genome FASTA (``args.genome``), the per-sample totals
    (``args.total_mapped``, tab-separated ``wildcard, chromosome, value``)
    and the tab-separated read table (``args.reads``). One row is computed
    per read-table line, the rows are sorted by genome, start and stop, and
    the result is passed to ``eu.excel_writer`` with ``args.output_path``.
    """
    # read the genome file: id -> (sequence, complement)
    genome_dict = {
        str(entry.id): (str(entry.seq), str(entry.seq.complement()))
        for entry in SeqIO.parse(args.genome, "fasta")
    }

    # get the total mapped reads for each bam file
    total_mapped_dict = {}
    wildcards = []
    with open(args.total_mapped, "r") as f:
        for line in f:
            wildcard, chromosome, value = line.strip().split("\t")
            total_mapped_dict[(wildcard, chromosome)] = int(value)
            wildcards.append(wildcard)

    wildcards = eu.get_unique(wildcards)
    te_header = eu.get_te_header(wildcards)

    # the condition is the second dash-separated field of each wildcard
    conditions = eu.get_unique([card.split("-")[1] for card in wildcards])

    # read bed file
    read_df = pd.read_csv(args.reads, comment="#", header=None, sep="\t")

    header = [
        "Identifier", "Genome", "Source", "Feature", "Start", "Stop",
        "Strand", "Pred_probability", "Locus_tag", "Old_locus_tag", "Name",
        "Length", "Codon_count"
    ]
    header = header + [cond + "_TE" for cond in te_header]
    header = header + [card + "_rpkm" for card in wildcards]
    header = header + [
        "Evidence", "Start_codon", "Stop_codon", "15nt upstream",
        "Nucleotide_seq", "Aminoacid_seq"
    ]

    # the trailing len(wildcards) columns of the table hold the read counts
    prefix_columns = len(read_df.columns) - len(wildcards)

    nTuple = collections.namedtuple(
        'Pandas', [f"s{x}" for x in range(len(header))])

    cds_sheet = []
    for row in read_df.itertuples(index=False, name='Pandas'):
        chromosome = row._0
        feature = row._2
        start = row._3
        stop = row._4
        strand = row._6
        attributes = row._8

        (start_codon, stop_codon, nucleotide_seq, aa_seq,
         nt_window) = eu.get_genome_information(genome_dict[chromosome],
                                                start - 1, stop - 1, strand)
        (pred_value, name, _product, _note, evidence, locus_tag,
         old_locus_tag) = eu.retrieve_column_information(attributes)

        length = stop - start + 1
        codon_count = int(length / 3)

        read_list = [
            getattr(row, "_%s" % x) for x in range(prefix_columns, len(row))
        ]
        rpkm_list = [
            eu.calculate_rpkm(total_mapped_dict[(wildcards[idx], chromosome)],
                              val, length)
            for idx, val in enumerate(read_list)
        ]
        te_list = eu.calculate_te(rpkm_list, wildcards, conditions)

        identifier = "%s:%s-%s:%s" % (chromosome, start, stop, strand)

        leading = [
            identifier, chromosome, "reparation", feature, start, stop,
            strand, pred_value, locus_tag, old_locus_tag, name, length,
            codon_count
        ]
        trailing = [
            evidence, start_codon, stop_codon, nt_window, nucleotide_seq,
            aa_seq
        ]
        cds_sheet.append(nTuple(*(leading + te_list + rpkm_list + trailing)))

    cds_df = pd.DataFrame.from_records(cds_sheet, columns=header)
    cds_df = cds_df.sort_values(by=["Genome", "Start", "Stop"])

    eu.excel_writer(args.output_path, {"CDS": cds_df}, wildcards)