# Module-level imports required by the functions below.
import gc
import operator
import os
import sys
from collections import defaultdict

import pandas as pd
from pybedtools import BedTool

import pygtftk.utils
from pygtftk.biomart import Biomart
from pygtftk.bwig.bw_coverage import bw_cov_mp
from pygtftk.gtf_interface import GTF
from pygtftk.utils import (chrom_info_as_dict, close_properly,
                           make_tmp_file, message)


def tss_dist(inputfile=None, outputfile=None):
    """Computes the distances between the TSSs of gene transcripts."""
    gtf = GTF(inputfile, check_ensembl_format=True)

    gn_tss_dist = defaultdict(dict)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id", "gene_id"], as_dict=True)

    for k in tss:
        tx_id, gn_id = k.split("|")
        gn_tss_dist[gn_id][tx_id] = int(tss[k])

    # Maps gene_id -> transcript_id -> TSS number
    # (1 for the most 5' TSS, then 2...).
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Computing distances.")

    outputfile.write("\t".join(["gene_id",
                                "transcript_id_1",
                                "transcript_id_2",
                                "dist",
                                "tss_num_1",
                                "tss_num_2"]) + "\n")
    try:
        for gn_id in sorted(gn_tss_dist.keys()):
            tx_list = sorted(list(gn_tss_dist[gn_id].keys()))
            # Compare all unordered pairs of transcripts of the gene.
            for i in range(len(tx_list) - 1):
                for j in range(i + 1, len(tx_list)):
                    dist = str(abs(gn_tss_dist[gn_id][tx_list[i]] -
                                   gn_tss_dist[gn_id][tx_list[j]]))
                    tss_1 = gn_to_tx_to_tss[gn_id][tx_list[i]]
                    tss_2 = gn_to_tx_to_tss[gn_id][tx_list[j]]

                    # Write the transcript with the lowest TSS number first.
                    if tss_1 < tss_2:
                        str_out = "\t".join([gn_id,
                                             tx_list[i],
                                             tx_list[j],
                                             dist,
                                             str(tss_1),
                                             str(tss_2)]) + "\n"
                    else:
                        str_out = "\t".join([gn_id,
                                             tx_list[j],
                                             tx_list[i],
                                             dist,
                                             str(tss_2),
                                             str(tss_1)]) + "\n"
                    outputfile.write(str_out)

    except (BrokenPipeError, IOError):
        # Silence further writes once the downstream reader has gone away.
        def _void_f(*args, **kwargs):
            pass

        message("Received a broken pipe signal", type="WARNING")
        sys.stdout.write = _void_f
        sys.stdout.flush = _void_f

    close_properly(outputfile, inputfile)
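# Usage sketch (illustrative, not part of the original module): tss_dist()
# expects open file handles, as supplied by the gtftk CLI through argparse.
# The file names below are hypothetical.
#
#   with open("input.gtf") as gtf_in, open("tss_dist.tsv", "w") as out:
#       tss_dist(inputfile=gtf_in, outputfile=out)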
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """Numbers the TSSs of each gene (1 for the most 5' TSS, then 2...) and
    adds this number as a new transcript key/value. Optionally also adds the
    distance to the gene's first TSS and the number of TSSs per gene."""
    gtf = GTF(inputfile, check_ensembl_format=True)

    gn_tss_dist = defaultdict(dict)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)
    tx_to_gn = gtf.get_tx_to_gn()

    for k in tss:
        gn_id = tx_to_gn[k]
        gn_tss_dist[gn_id][k] = int(tss[k])

    # If as_dict_of_dict is True, get_gn_to_tx() returns a dict of dicts
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for the most 5' TSS, then 2...). Transcripts sharing
    # the same TSS get the same TSS number.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    gn_how_many_tss = dict()

    for gn_id in gn_to_tx_to_tss:
        for tx_id in gn_to_tx_to_tss[gn_id]:
            tss_num = str(gn_to_tx_to_tss[gn_id][tx_id])
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")

            # Track the highest TSS number seen per gene, i.e. the number
            # of distinct TSSs of that gene.
            if gn_id not in gn_how_many_tss:
                gn_how_many_tss[gn_id] = tss_num
            else:
                if int(tss_num) > int(gn_how_many_tss[gn_id]):
                    gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    gtf = gtf.add_attr_from_file(feat='transcript',
                                 key='transcript_id',
                                 new_key=key_name,
                                 inputfile=open(tss_number_file.name),
                                 has_header=False)

    if add_nb_tss_to_gene:

        gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                             suffix='.txt')

        for a_key, a_val in gn_how_many_tss.items():
            gn_how_many_tss_file.write(a_key + "\t" + a_val + "\n")

        gn_how_many_tss_file.close()

        gtf = gtf.add_attr_from_file(feat='gene',
                                     key='gene_id',
                                     new_key=gene_key,
                                     inputfile=open(gn_how_many_tss_file.name),
                                     has_header=False)

    if compute_dist:
        gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)
        tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                      suffix='.txt')

        for gn_id in gn_to_tx_to_tss:
            tx_list = gn_to_tx_ordered_by_tss[gn_id]
            tx_first = tx_list.pop(0)
            # The first TSS has distance 0 to the first TSS...
            tss_dist_file.write(tx_first + "\t0\n")
            for tx_id in tx_list:
                dist_to_first = abs(int(tss[tx_first]) - int(tss[tx_id]))
                tss_dist_file.write(tx_id + "\t" + str(dist_to_first) + "\n")

        tss_dist_file.close()

        gtf = gtf.add_attr_from_file(feat='transcript',
                                     key='transcript_id',
                                     new_key=key_name_dist,
                                     inputfile=open(tss_dist_file.name),
                                     has_header=False)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
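# Usage sketch (illustrative, not part of the original module; file names
# are hypothetical): annotate each transcript with its TSS number and its
# distance to the gene's first TSS, and each gene with its number of TSSs.
#
#   with open("input.gtf") as gtf_in, open("numbered.gtf", "w") as out:
#       tss_numbering(inputfile=gtf_in, outputfile=out,
#                     compute_dist=True, add_nb_tss_to_gene=True)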
def coverage(inputfile=None,
             outputfile=None,
             bw_list=None,
             labels=None,
             pseudo_count=1,
             nb_window=1,
             ft_type="promoter",
             n_highest=None,
             downstream=1000,
             key_name="cov",
             zero_to_na=False,
             name_column=None,
             upstream=1000,
             chrom_info=None,
             nb_proc=1,
             matrix_out=False,
             stat='mean'):
    """Compute transcript coverage with one or several bigWig files."""

    # -------------------------------------------------------------------------
    # Create a list of labels.
    # Take user input into account.
    # -------------------------------------------------------------------------

    bw_list = [x.name for x in bw_list]

    if len(bw_list) != len(set(bw_list)):
        message("Found the same bigWig file several times.", type="ERROR")

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bw_list):
            message("The number of labels should be the same as the number of"
                    " bigwig files.", type="ERROR")
        # Ensure labels are non-redundant.
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        labels = []
        for i in range(len(bw_list)):
            labels += [os.path.splitext(os.path.basename(bw_list[i]))[0]]

    # -------------------------------------------------------------------------
    # Check the number of windows
    # -------------------------------------------------------------------------

    if n_highest is None:
        n_highest = nb_window

    message('Number of bins: %d' % nb_window)
    message('N highest values: %d' % n_highest)

    if n_highest > nb_window:
        message('The number of windows used for computing the score'
                ' (-n) can not be greater than the number of'
                ' windows (-w)', type="ERROR")
        sys.exit()

    # -------------------------------------------------------------------------
    # Check whether the input file is in BED or GTF format
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        if region_bo.file_type == 'gff':
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

    # -------------------------------------------------------------------------
    # Get regions of interest
    # -------------------------------------------------------------------------

    name_column = name_column.split(",")

    if is_gtf:

        message("Getting regions of interest...")

        if ft_type.lower() == "intergenic":
            region_bo = gtf.get_intergenic(chrom_info,
                                           0,
                                           0).slop(s=True,
                                                   l=upstream,
                                                   r=downstream,
                                                   g=chrom_info.name).sort()

        elif ft_type.lower() == "intron":
            region_bo = gtf.get_introns().slop(s=True,
                                               l=upstream,
                                               r=downstream,
                                               g=chrom_info.name).sort()

        elif ft_type == "intron_by_tx":
            region_bo = gtf.get_introns(by_transcript=True,
                                        name=name_column).slop(s=True,
                                                               l=upstream,
                                                               r=downstream,
                                                               g=chrom_info.name).sort()

        elif ft_type.lower() in ["promoter", "tss"]:
            region_bo = gtf.get_tss(name=name_column).slop(s=True,
                                                           l=upstream,
                                                           r=downstream,
                                                           g=chrom_info.name).sort()

        elif ft_type.lower() in ["tts", "terminator"]:
            region_bo = gtf.get_tts(name=name_column).slop(s=True,
                                                           l=upstream,
                                                           r=downstream,
                                                           g=chrom_info.name).sort()

        else:
            region_bo = gtf.select_by_key(
                "feature",
                ft_type,
                0).to_bed(name=name_column).slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).sort()

        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

    else:
        region_bo = region_bo.slop(s=True,
                                   l=upstream,
                                   r=downstream,
                                   g=chrom_info.name).sort()

    region_bed = make_tmp_file(prefix="region", suffix=".bed")
    region_bo.saveas(region_bed.name)

    # -------------------------------------------------------------------------
    # Compute coverage
    # -------------------------------------------------------------------------

    result_bed = bw_cov_mp(bw_list=bw_list,
                           region_file=open(region_bed.name),
                           labels=labels,
                           bin_nb=nb_window,
                           pseudo_count=pseudo_count,
                           zero_to_na=zero_to_na,
                           nb_proc=nb_proc,
                           n_highest=n_highest,
                           stat=stat,
                           verbose=pygtftk.utils.VERBOSITY)

    if matrix_out:
        result_bed.close()

        df_first = pd.read_csv(result_bed.name, sep="\t", header=None)

        # Reorder the columns to chrom, start, end, name, strand, value.
        # (The deprecated .ix accessor is replaced with .loc; the integer
        # column labels come from header=None above.)
        df_first = df_first.loc[:, [0, 1, 2, 3, 5, 4]]

        df_list = []

        for i in range(len(labels)):
            # Create a sub data frame containing the coverage values of the
            # current bigWig (its label is prepended to the name column).
            str_to_find = r"^" + labels[i] + r"\|"
            tmp_df = df_first[df_first[3].str.match(str_to_find)].copy()
            to_replace = r"^" + labels[i] + r"\|"
            tmp_df.iloc[:, 3] = tmp_df.iloc[:, 3].replace(to_replace,
                                                          r"",
                                                          regex=True)
            df_list += [tmp_df]

        df_final = df_list.pop(0)

        for i in df_list:
            # Add columns to df_final by joining on
            # chrom, start, end, name, strand.
            df_final = df_final.merge(i.iloc[:, list(range(6))],
                                      on=[0, 1, 2, 3, 5])

        df_final.columns = ["chrom", "start", "end", "name", "strand"] + labels

        df_final.to_csv(outputfile, sep="\t", index=False)

    else:
        nb_line = 0
        for i in result_bed:
            outputfile.write(i)
            nb_line += 1

        if nb_line == 0:
            message("No line available in output...", type="ERROR")

    gc.disable()
    close_properly(inputfile, outputfile)
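# Usage sketch (illustrative, not part of the original module; file names
# are hypothetical): coverage() takes open handles for the GTF/BED file,
# the bigWig files and the chromosome-size file, and with matrix_out=True
# writes one row per region with one coverage column per bigWig.
#
#   with open("input.gtf") as gtf_in, open("cov.tsv", "w") as out, \
#           open("sample.bw", "rb") as bw, open("chrom_info.txt") as chrom:
#       coverage(inputfile=gtf_in, outputfile=out, bw_list=[bw],
#                chrom_info=chrom, name_column="transcript_id",
#                ft_type="promoter", upstream=1000, downstream=1000,
#                nb_window=10, matrix_out=True)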
def great_reg_domains(inputfile=None,
                      outputfile=None,
                      go_id="GO:0003700",
                      species="hsapiens",
                      upstream=1000,
                      downstream=1000,
                      chrom_info=None,
                      distal=1000000,
                      mode='basal_plus_extension',
                      http_proxy=None,
                      https_proxy=None):
    """Given a GTF and a GO term, attempts to compute labeled regulatory
    regions using the GREAT 'association rule'."""

    # -------------------------------------------------------------------------
    # chrom_len will store the chromosome sizes.
    # -------------------------------------------------------------------------

    chrom_len = chrom_info_as_dict(chrom_info)

    # -------------------------------------------------------------------------
    # Read the GTF
    # -------------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -------------------------------------------------------------------------
    # Get the TSSs and extend them by upstream/downstream
    # -------------------------------------------------------------------------

    message("Defining basal regulatory domains.", type="INFO")
    basal_reg_bed = gtf.get_tss(name=['gene_id', 'gene_name']).slop(
        s=True,
        l=upstream,
        r=downstream,
        g=chrom_info.name).sort()

    basal_reg_bed_file = make_tmp_file(prefix='basal_reg', suffix='.bed')
    basal_reg_bed.saveas(basal_reg_bed_file.name)

    if mode == 'basal_plus_extension':
        # ---------------------------------------------------------------------
        # Search for the upstream limit of each basal_reg_bed region.
        # Here we ignore overlapping basal_reg_bed regions, as the way they
        # are processed is, to our knowledge, not documented in GREAT.
        # ---------------------------------------------------------------------

        message("Defining regulatory domains upstream regions.", type="INFO")

        regulatory_region_start = dict()
        regulatory_region_end = dict()
        chroms = dict()
        strands = dict()

        basal_reg_bed_upstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A.
            io=True,
            # In addition to the closest feature in B, report the distance;
            # negative distances denote upstream features. Distances are
            # reported with respect to A: when A is on the '-' strand,
            # "upstream" means B has higher (start, stop) coordinates.
            D="a",
            # Ignore features in B that are downstream of features in A.
            id=True,
            # How ties are handled: "first" reports the first tie.
            t="first",
            # Require that the query and the closest hit have different
            # names/gene_ids.
            N=True)

        basal_reg_bed_upstream_file = make_tmp_file(
            prefix='basal_reg_bed_upstream', suffix='.bed')
        basal_reg_bed_upstream.saveas(basal_reg_bed_upstream_file.name)

        for line in basal_reg_bed_upstream:
            gene_id = line.name
            strand = line.strand
            end = line.end
            start = line.start
            gene_id = "|".join([gene_id, str(start), str(end), strand])
            chroms[gene_id] = line.chrom
            strands[gene_id] = strand

            if strand == '+':
                # If the chromosome of the closest feature in B is '.',
                # we have reached the start of the chromosome.
                if line.fields[6] == '.':
                    regulatory_region_start[gene_id] = max(
                        0, line.start - distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_start[gene_id] = line.start - padding
            elif strand == '-':
                # If the chromosome of the closest feature in B is '.',
                # we have reached the end of the chromosome.
                if line.fields[6] == '.':
                    regulatory_region_end[gene_id] = min(
                        int(chrom_len[line.chrom]), line.end + distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_end[gene_id] = line.end + padding
            else:
                message("Cannot process genes without strand", type="WARNING")
                message("Please check: " + gene_id, type="ERROR")

        # ---------------------------------------------------------------------
        # Search for the downstream limit of each basal_reg_bed region.
        # Here we ignore overlapping basal_reg_bed regions, as the way they
        # are processed is, to our knowledge, not documented in GREAT.
        # ---------------------------------------------------------------------

        message("Defining regulatory domains downstream regions.", type="INFO")

        basal_reg_bed_downstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A.
            io=True,
            # Report distances with respect to A (see above).
            D="a",
            # Ignore features in B that are upstream of features in A.
            iu=True,
            # How ties are handled: "first" reports the first tie.
            t="first",
            # Require that the query and the closest hit have different
            # names/gene_ids.
            N=True)

        basal_reg_bed_downstream_file = make_tmp_file(
            prefix='basal_reg_bed_downstream', suffix='.bed')
        basal_reg_bed_downstream.saveas(basal_reg_bed_downstream_file.name)

        for line in basal_reg_bed_downstream:
            gene_id = line.name
            strand = line.strand
            end = line.end
            start = line.start
            gene_id = "|".join([gene_id, str(start), str(end), strand])
            chroms[gene_id] = line.chrom
            strands[gene_id] = strand

            if strand == '+':
                # If the chromosome of the closest feature in B is '.',
                # we have reached the end of the chromosome.
                if line.fields[6] == '.':
                    regulatory_region_end[gene_id] = min(
                        int(chrom_len[line.chrom]), line.end + distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_end[gene_id] = line.end + padding
            elif strand == '-':
                # If the chromosome of the closest feature in B is '.',
                # we have reached the start of the chromosome.
                if line.fields[6] == '.':
                    regulatory_region_start[gene_id] = max(
                        0, line.start - distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_start[gene_id] = max(
                        0, line.start - padding)
            else:
                message("Cannot process genes without strand", type="WARNING")
                message("Please check: " + gene_id, type="ERROR")

    else:
        message("Only the 'basal_plus_extension' association rule is "
                "currently supported.", type='ERROR')

    # -------------------------------------------------------------------------
    # Print the regulatory regions of all genes
    # (by default, i.e. without a GO term, print all genes).
    # -------------------------------------------------------------------------

    if go_id is None:
        for gene_id in regulatory_region_start:
            outlist = [chroms[gene_id],
                       str(regulatory_region_start[gene_id]),
                       str(regulatory_region_end[gene_id]),
                       gene_id.split("|")[0],
                       "0",
                       strands[gene_id]]
            outputfile.write("\t".join(outlist) + "\n")
    else:

        # ---------------------------------------------------------------------
        # Get the list of genes/transcripts associated with the GO term.
        # ---------------------------------------------------------------------

        message("Getting Gene Ontology annotations.")

        if not go_id.startswith("GO:"):
            go_id = "GO:" + go_id

        is_associated = set()

        bm = Biomart(http_proxy=http_proxy, https_proxy=https_proxy)
        bm.get_datasets('ENSEMBL_MART_ENSEMBL')

        if species + "_gene_ensembl" not in bm.datasets:
            message("Unknown dataset/species.", type="ERROR")

        # XML is the Biomart XML query template defined at module level in
        # this plugin.
        bm.query({'query': XML.format(species=species, go=go_id)})

        for i in bm.response.content.decode().split("\n"):
            i = i.rstrip("\n")
            if i != '':
                is_associated.add(i)

        for gene_id in regulatory_region_start:
            gene_id_short = gene_id.split("|")[0]
            if gene_id_short in is_associated:
                outlist = [chroms[gene_id],
                           str(regulatory_region_start[gene_id]),
                           str(regulatory_region_end[gene_id]),
                           gene_id.split("|")[0],
                           "0",
                           strands[gene_id]]
                outputfile.write("\t".join(outlist) + "\n")
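# Usage sketch (illustrative, not part of the original module; file names
# are hypothetical): write a BED file of GREAT-style regulatory domains for
# the genes associated with a GO term. The 5 kb / 1 kb values mirror GREAT's
# default basal domain for the basal_plus_extension rule.
#
#   with open("input.gtf") as gtf_in, open("domains.bed", "w") as out, \
#           open("chrom_info.txt") as chrom:
#       great_reg_domains(inputfile=gtf_in, outputfile=out, chrom_info=chrom,
#                         go_id="GO:0003700", species="hsapiens",
#                         upstream=5000, downstream=1000, distal=1000000)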
def rm_dup_tss(inputfile=None, outputfile=None):
    """If several transcripts of a gene share the same TSS, keep only one."""

    # ----------------------------------------------------------------------
    # Get the TSSs
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    tss_bo = gtf.get_tss(["gene_id", "transcript_id"])

    # ----------------------------------------------------------------------
    # Sort the file by name (4th column) to ensure reproducibility
    # between calls.
    # ----------------------------------------------------------------------

    with open(tss_bo.fn) as f:
        lines = [line.split('\t') for line in f]

    tmp_file = make_tmp_file(prefix="tss_sorted_by_tx_id", suffix=".bed")

    for line in sorted(lines, key=operator.itemgetter(3)):
        tmp_file.write('\t'.join(line))

    tmp_file.close()

    tss_bo = BedTool(tmp_file.name)

    # ----------------------------------------------------------------------
    # Get the list of non-redundant TSSs
    # ----------------------------------------------------------------------

    gene_dict = defaultdict(dict)
    to_delete = []

    message("Looking for redundant TSS (gene-wise).")

    for line in tss_bo:
        tss = line.start
        name = line.name
        gene_id, tx_id = name.split("|")

        # Keep the first transcript seen for each (gene, TSS) pair; mark
        # any later transcript with the same TSS for deletion.
        if gene_id in gene_dict:
            if tss not in gene_dict[gene_id]:
                gene_dict[gene_id][tss] = tx_id
            else:
                to_delete += [tx_id]
        else:
            gene_dict[gene_id][tss] = tx_id

    message("Deleted transcripts: " +
            ",".join(to_delete[:10]) +
            "...",
            type="DEBUG")

    # ----------------------------------------------------------------------
    # Write
    # ----------------------------------------------------------------------

    gtf.select_by_key("feature",
                      "gene",
                      invert_match=True
                      ).select_by_key("transcript_id",
                                      ",".join(to_delete),
                                      invert_match=True
                                      ).write(outputfile, gc_off=True)
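# Usage sketch (illustrative, not part of the original module; file names
# are hypothetical): drop redundant same-TSS transcripts from a GTF.
#
#   with open("input.gtf") as gtf_in, open("dedup.gtf", "w") as out:
#       rm_dup_tss(inputfile=gtf_in, outputfile=out)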