def Cufflinks_mask_GTF(assembly):
    """Build the lines of a Cufflinks mask GTF from the UCSC ``rmsk`` tables.

    Reads ``chromInfo`` for *assembly*, fetches one ``<chrom>_rmsk`` table per
    chromosome listed there plus the genome-wide ``rmsk`` table, and keeps the
    records whose 12th column is ``tRNA`` or ``rRNA``.  Records located on
    ``chrM`` are dropped; instead the whole of ``chrM`` (``0`` up to the size
    read from ``chromInfo``, or ``0`` if ``chrM`` is not listed) is emitted
    once per strand.

    Args:
        assembly: Assembly name, passed through to ``kent.fetch_ucsc_table``.

    Returns:
        A list of tab-separated GTF lines (without trailing newline).  Each
        line carries a sequentially numbered ``MASK%06d`` gene/transcript id.

    Raises:
        IndexError: If a non-comment record has fewer columns than are read.
        ValueError: If the ``chrM`` size in ``chromInfo`` is not an integer.
    """
    chrM_size = 0
    rmsk = []
    for line in kent.fetch_ucsc_table(assembly, "chromInfo"):
        line = line.strip()
        # Skip blanks and "#" header/comment rows; otherwise a header row
        # would make us request a table called "#chrom_rmsk".
        if not line or line.startswith("#"):
            continue
        fields = line.split("\t")
        chrom = fields[0]
        if chrom == "chrM":
            chrM_size = int(fields[1])
        # Extend in place: sum(list_of_lists, []) re-copies everything
        # accumulated so far for each additional table.
        rmsk.extend(kent.fetch_ucsc_table(assembly, chrom + "_rmsk"))
    rmsk.extend(kent.fetch_ucsc_table(assembly, "rmsk"))

    # Synthetic whole-chrM records (one per strand) laid out like an rmsk
    # row.  The temporary "real_chrM" name lets them survive the filter below
    # that discards ordinary chrM records.
    for strand in ("+", "-"):
        rmsk.append("\t".join(
            ["."] * 5 +
            ["real_chrM", "0", str(chrM_size), ".", strand, ".", "rRNA"]))

    output = []
    maskid = 0
    for line in rmsk:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        fields = line.split("\t")
        if fields[11] not in ("tRNA", "rRNA"):
            continue
        chrom = fields[5]
        if chrom == "chrM":
            continue
        if chrom == "real_chrM":
            chrom = "chrM"
        maskid += 1
        # NOTE(review): columns 7/8 are copied verbatim into the GTF
        # start/end columns, and the synthetic chrM record starts at 0.
        # GTF start coordinates are 1-based -- confirm whether a +1 shift
        # of the start is wanted here.
        gtf_fields = [
            chrom,
            "rmsk",
            "exon",
            fields[6],
            fields[7],
            "0.000000",
            fields[9],
            ".",
            'gene_id "MASK%06d"; transcript_id "MASK%06d";' % (maskid, maskid),
        ]
        output.append("\t".join(gtf_fields))
    return output
def Cufflinks_mask_GTF(assembly):
    """Build the lines of a Cufflinks mask GTF from the UCSC ``rmsk`` tables.

    Reads ``chromInfo`` for *assembly*, fetches one ``<chrom>_rmsk`` table per
    chromosome listed there plus the genome-wide ``rmsk`` table, and keeps the
    records whose 12th column is ``tRNA`` or ``rRNA``.  Records located on
    ``chrM`` are dropped; instead the whole of ``chrM`` (``0`` up to the size
    read from ``chromInfo``, or ``0`` if ``chrM`` is not listed) is emitted
    once per strand.

    Args:
        assembly: Assembly name, passed through to ``kent.fetch_ucsc_table``.

    Returns:
        A list of tab-separated GTF lines (without trailing newline).  Each
        line carries a sequentially numbered ``MASK%06d`` gene/transcript id.

    Raises:
        IndexError: If a non-comment record has fewer columns than are read.
        ValueError: If the ``chrM`` size in ``chromInfo`` is not an integer.
    """
    chrM_size = 0
    rmsk = []
    for line in kent.fetch_ucsc_table(assembly, "chromInfo"):
        line = line.strip()
        # Skip blanks and "#" header/comment rows; otherwise a header row
        # would make us request a table called "#chrom_rmsk".
        if not line or line.startswith("#"):
            continue
        fields = line.split("\t")
        chrom = fields[0]
        if chrom == "chrM":
            chrM_size = int(fields[1])
        # Extend in place: sum(list_of_lists, []) re-copies everything
        # accumulated so far for each additional table.
        rmsk.extend(kent.fetch_ucsc_table(assembly, chrom + "_rmsk"))
    rmsk.extend(kent.fetch_ucsc_table(assembly, "rmsk"))

    # Synthetic whole-chrM records (one per strand) laid out like an rmsk
    # row.  The temporary "real_chrM" name lets them survive the filter below
    # that discards ordinary chrM records.
    for strand in ("+", "-"):
        rmsk.append("\t".join(
            ["."] * 5 +
            ["real_chrM", "0", str(chrM_size), ".", strand, ".", "rRNA"]))

    output = []
    maskid = 0
    for line in rmsk:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        fields = line.split("\t")
        if fields[11] not in ("tRNA", "rRNA"):
            continue
        chrom = fields[5]
        if chrom == "chrM":
            continue
        if chrom == "real_chrM":
            chrom = "chrM"
        maskid += 1
        # NOTE(review): columns 7/8 are copied verbatim into the GTF
        # start/end columns, and the synthetic chrM record starts at 0.
        # GTF start coordinates are 1-based -- confirm whether a +1 shift
        # of the start is wanted here.
        gtf_fields = [
            chrom,
            "rmsk",
            "exon",
            fields[6],
            fields[7],
            "0.000000",
            fields[9],
            ".",
            'gene_id "MASK%06d"; transcript_id "MASK%06d";' % (maskid, maskid),
        ]
        output.append("\t".join(gtf_fields))
    return output
def Cufflinks_knownGene_GTF(assembly):
    """Rewrite the UCSC knownGene GTF attributes for use with Cufflinks.

    Fetches the ``kgXref`` and ``knownIsoforms`` tables and the ``knownGene``
    GTF for *assembly*.  For every GTF record the ninth (attribute) column is
    replaced with ``gene_id "CLUST<cluster id>"``, ``transcript_id``,
    ``gene_name``, ``transcript_name`` and, when kgXref supplies a non-empty
    one, ``protein_id``.  The transcript id is taken from the first quoted
    value of the original attribute column.

    Args:
        assembly: Assembly name, passed through to the ``kent`` fetch helpers.

    Returns:
        A list of tab-separated GTF lines (without trailing newline).

    Raises:
        KeyError: If a transcript id from the GTF is missing from ``kgXref``
            or ``knownIsoforms``.
    """
    kgXref = kent.fetch_ucsc_table(assembly, "kgXref")
    knownIsoforms = kent.fetch_ucsc_table(assembly, "knownIsoforms")
    knownGene = kent.fetch_ucsc_gtf(assembly, "knownGene")

    ucscid_to_xref = {}
    for line in kgXref:
        if line.startswith("#") or not line.strip():
            continue
        # Remove only the line terminator: strip() would also eat trailing
        # tabs, i.e. trailing *empty* columns, and the protein id column is
        # allowed to be empty.
        fields = line.rstrip("\r\n").split("\t")
        # Columns missing at the end of the row are treated as empty, in
        # case the terminator-adjacent tabs were already trimmed upstream.
        if len(fields) < 7:
            fields.extend([""] * (7 - len(fields)))
        ucscid_to_xref[fields[0]] = {
            "mRNA": fields[1],
            "gene_symbol": fields[4],
            "protein_id": fields[6],
        }

    ucscid_to_clusterid = {}
    for line in knownIsoforms:
        if line.startswith("#") or not line.strip():
            continue
        fields = line.strip().split("\t")
        ucscid_to_clusterid[fields[1]] = int(fields[0])

    output = []
    for line in knownGene:
        record = line.strip()
        if not record:
            continue
        # Split off at most nine columns so the attribute text stays intact.
        columns = record.split("\t", 8)
        ucscid = columns[8].split('"')[1]
        xref = ucscid_to_xref[ucscid]
        clusterid = ucscid_to_clusterid[ucscid]
        attributes = [
            'gene_id "CLUST%05d";' % clusterid,
            'transcript_id "%s";' % ucscid,
            'gene_name "%s";' % xref["gene_symbol"],
            'transcript_name "%s";' % xref["mRNA"],
        ]
        # Append protein_id only when present, so an empty value does not
        # leave a dangling separator at the end of the line.
        if xref["protein_id"]:
            attributes.append('protein_id "%s";' % xref["protein_id"])
        output.append("\t".join(columns[:8]) + "\t" + " ".join(attributes))
    return output
def Cufflinks_knownGene_GTF(assembly):
    """Rewrite the UCSC knownGene GTF attributes for use with Cufflinks.

    Fetches the ``kgXref`` and ``knownIsoforms`` tables and the ``knownGene``
    GTF for *assembly*.  For every GTF record the ninth (attribute) column is
    replaced with ``gene_id "CLUST<cluster id>"``, ``transcript_id``,
    ``gene_name``, ``transcript_name`` and, when kgXref supplies a non-empty
    one, ``protein_id``.  The transcript id is taken from the first quoted
    value of the original attribute column.

    Args:
        assembly: Assembly name, passed through to the ``kent`` fetch helpers.

    Returns:
        A list of tab-separated GTF lines (without trailing newline).

    Raises:
        KeyError: If a transcript id from the GTF is missing from ``kgXref``
            or ``knownIsoforms``.
    """
    kgXref = kent.fetch_ucsc_table(assembly, "kgXref")
    knownIsoforms = kent.fetch_ucsc_table(assembly, "knownIsoforms")
    knownGene = kent.fetch_ucsc_gtf(assembly, "knownGene")

    ucscid_to_xref = {}
    for line in kgXref:
        if line.startswith("#") or not line.strip():
            continue
        # Remove only the line terminator: strip() would also eat trailing
        # tabs, i.e. trailing *empty* columns, and the protein id column is
        # allowed to be empty.
        fields = line.rstrip("\r\n").split("\t")
        # Columns missing at the end of the row are treated as empty, in
        # case the terminator-adjacent tabs were already trimmed upstream.
        if len(fields) < 7:
            fields.extend([""] * (7 - len(fields)))
        ucscid_to_xref[fields[0]] = {
            "mRNA": fields[1],
            "gene_symbol": fields[4],
            "protein_id": fields[6],
        }

    ucscid_to_clusterid = {}
    for line in knownIsoforms:
        if line.startswith("#") or not line.strip():
            continue
        fields = line.strip().split("\t")
        ucscid_to_clusterid[fields[1]] = int(fields[0])

    output = []
    for line in knownGene:
        record = line.strip()
        if not record:
            continue
        # Split off at most nine columns so the attribute text stays intact.
        columns = record.split("\t", 8)
        ucscid = columns[8].split('"')[1]
        xref = ucscid_to_xref[ucscid]
        clusterid = ucscid_to_clusterid[ucscid]
        attributes = [
            'gene_id "CLUST%05d";' % clusterid,
            'transcript_id "%s";' % ucscid,
            'gene_name "%s";' % xref["gene_symbol"],
            'transcript_name "%s";' % xref["mRNA"],
        ]
        # Append protein_id only when present, so an empty value does not
        # leave a dangling separator at the end of the line.
        if xref["protein_id"]:
            attributes.append('protein_id "%s";' % xref["protein_id"])
        output.append("\t".join(columns[:8]) + "\t" + " ".join(attributes))
    return output