Exemplo n.º 1
0
def nb_transcripts(inputfile=None,
                   outputfile=None,
                   text_format=False,
                   key_name=""):
    """
    Count the transcripts attached to each gene and report the counts.

    :param inputfile: The GTF input, handed as-is to ``GTF()``.
    :param outputfile: The writable output stream.
    :param text_format: If True, write one "gene_id<TAB>count" line per
        gene to outputfile. Otherwise write the GTF to outputfile with the
        count joined onto 'gene' features (on 'gene_id') under key_name.
    :param key_name: Name of the attribute key receiving the count (only
        used when text_format is False).
    """

    gtf = GTF(inputfile)

    message("Computing the number of transcript per gene in input GTF file.")

    # Per the original author's note: the transcript count is derived from
    # exon lines, in case some transcript lines are missing from the GTF
    # (which should not happen).
    gene_to_transcripts = gtf.get_gn_to_tx()

    # Pick the destination of the two-column table once: the final output
    # in text mode, or a temporary file that is joined back onto the GTF.
    if text_format:
        sink = outputfile
    else:
        sink = make_tmp_file(prefix="nb_tx", suffix=".txt")

    for gene_id in gene_to_transcripts:
        tx_count = len(gene_to_transcripts[gene_id])
        sink.write(gene_id + "\t" + str(tx_count) + "\n")

    if not text_format:
        # The temporary table must be closed (flushed) before it is read
        # back by its path.
        sink.close()
        annotated_gtf = gtf.add_attr_from_file(feat="gene",
                                               key="gene_id",
                                               new_key=key_name,
                                               inputfile=sink.name)
        annotated_gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemplo n.º 2
0
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """
    Number the TSSs of each gene and add that number to its transcripts.

    Every transcript receives its TSS number under key_name. Optionally,
    the highest TSS number of each gene is added to 'gene' features and
    the distance of each transcript TSS to the first TSS of its gene is
    added to 'transcript' features. The resulting GTF is written to
    outputfile.

    :param inputfile: The GTF input, handed as-is to ``GTF()``.
    :param outputfile: The writable output stream.
    :param compute_dist: If True, add the absolute distance between each
        transcript TSS and the TSS of the first transcript of the gene
        (as ordered by ``get_gn_to_tx(ordered_5p=True)``).
    :param key_name: Attribute key receiving the TSS number.
    :param key_name_dist: Attribute key receiving the distance (only used
        when compute_dist is True).
    :param add_nb_tss_to_gene: If True, add the highest TSS number of each
        gene to its 'gene' feature.
    :param gene_key: Attribute key receiving the gene-level value (only
        used when add_nb_tss_to_gene is True).
    """

    gtf = GTF(inputfile, check_ensembl_format=True)

    message("Getting TSSs.")
    # Maps transcript_id to its TSS coordinate. Only consulted when
    # compute_dist is set, but fetched up front as in the original flow.
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)

    # When as_dict_of_dict is true, get_gn_to_tx() returns a dict of dict
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for most 5', then 2...). For transcripts having
    # the same TSSs, the tss number will be the same.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    # gene_id -> highest TSS number seen for that gene (kept as a string,
    # ready to be written to the gene-level table).
    gn_how_many_tss = dict()

    for gn_id in gn_to_tx_to_tss:
        tx_to_tss_num = gn_to_tx_to_tss[gn_id]
        for tx_id in tx_to_tss_num:
            tss_num = str(tx_to_tss_num[tx_id])
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")
            if (gn_id not in gn_how_many_tss
                    or int(tss_num) > int(gn_how_many_tss[gn_id])):
                gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    # The tables are re-opened for reading inside 'with' blocks so the
    # handles are released once the join is done.
    # NOTE(review): assumes add_attr_from_file() consumes its input before
    # returning - confirm against its implementation.
    with open(tss_number_file.name) as tss_number_handle:
        gtf = gtf.add_attr_from_file(feat='transcript',
                                     key='transcript_id',
                                     new_key=key_name,
                                     inputfile=tss_number_handle,
                                     has_header=False)

    if add_nb_tss_to_gene:

        gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                             suffix='.txt')

        for a_key, a_val in gn_how_many_tss.items():
            gn_how_many_tss_file.write(a_key + "\t" + a_val + "\n")

        gn_how_many_tss_file.close()

        with open(gn_how_many_tss_file.name) as gn_how_many_tss_handle:
            gtf = gtf.add_attr_from_file(feat='gene',
                                         key='gene_id',
                                         new_key=gene_key,
                                         inputfile=gn_how_many_tss_handle,
                                         has_header=False)

    if compute_dist:
        gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)
        tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                      suffix='.txt')

        for gn_id in gn_to_tx_to_tss:
            tx_list = gn_to_tx_ordered_by_tss[gn_id]
            tx_first = tx_list[0]
            # The first tss has distance 0 to the
            # first tss...
            tss_dist_file.write(tx_first + "\t0\n")
            for tx_id in tx_list[1:]:
                dist_to_first = abs(int(tss[tx_first]) - int(tss[tx_id]))
                tss_dist_file.write(tx_id + "\t" + str(dist_to_first) + "\n")

        tss_dist_file.close()

        with open(tss_dist_file.name) as tss_dist_handle:
            gtf = gtf.add_attr_from_file(feat='transcript',
                                         key='transcript_id',
                                         new_key=key_name_dist,
                                         inputfile=tss_dist_handle,
                                         has_header=False)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemplo n.º 3
0
def join_attr(inputfile=None,
              outputfile=None,
              join_file=None,
              has_header=False,
              new_key=None,
              target_feature=None,
              key_to_join=None,
              matrix=None):
    """
    Add attributes read from a tabulated file to the features of a GTF.

    :param inputfile: The GTF input, handed as-is to ``GTF()``.
    :param outputfile: The writable output stream.
    :param join_file: File object of the table to join; only its ``name``
        (path) is used.
    :param has_header: Forwarded to ``add_attr_from_file`` (ignored in
        matrix mode).
    :param new_key: Name of the attribute key to create. Required unless
        matrix is True, in which case it must be left to None.
    :param target_feature: Comma-separated feature names to annotate
        ("*" is accepted). Defaults to every feature found in the GTF.
    :param key_to_join: Name of the GTF attribute key used for joining.
    :param matrix: If True, the table is joined with
        ``add_attr_from_matrix_file`` instead of ``add_attr_from_file``.
    """

    # -----------------------------------------------------------
    #  Validate the --new-key / --matrix combination
    #  NOTE(review): message(type="ERROR") presumably aborts - confirm.
    # -----------------------------------------------------------

    if matrix is True:
        if new_key is not None:
            message("--new-key and --matrix are mutually exclusive.",
                    type="ERROR")
    elif new_key is None:
        message("--new-key is required when --matrix is False.",
                type="ERROR")

    # -----------------------------------------------------------
    #  Read the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Validate (or default) the target features
    # -----------------------------------------------------------

    feat_list = gtf.get_feature_list(nr=True)

    if target_feature is None:
        # No restriction requested: annotate every feature of the GTF.
        target_feature = ",".join(feat_list)
    else:
        allowed_features = feat_list + ["*"]

        for requested in target_feature.split(","):
            if requested not in allowed_features:
                message("Feature " + requested + " not found.", type="ERROR")

    # -----------------------------------------------------------
    #  Join and write
    # -----------------------------------------------------------

    if matrix:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=join_file.name)
    else:
        gtf = gtf.add_attr_from_file(feat=target_feature,
                                     key=key_to_join,
                                     new_key=new_key,
                                     inputfile=join_file.name,
                                     has_header=has_header)

    gtf.write(outputfile, gc_off=True)

    # Garbage collection is switched off once the output has been written.
    # NOTE(review): presumably to speed up interpreter exit - confirm.
    gc.disable()
    close_properly(outputfile, inputfile)