Exemplo n.º 1
0
def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna, pre_mrna):
    """

    Generates transcript data structures to call peaks on

    Allows for either predefined files (from the data directory)
    or custom files

    species - the species to run on; selects the predefined files and is
              mutually exclusive with gene_bed
    gene_bed - an arbitrary bed file of locations to search for peaks
               (should be gene locations)
    gene_mrna - the effective length of the mrna of a gene
                (unmappable regions removed)
    gene_pre_mrna - the effective length of the pre-mrna
                    (unmappable regions removed)
    pre_mrna - flag, only the literal True selects pre-mRNA lengths instead
               of mRNA lengths

    returns the tuple (genes, lengths) produced by build_geneinfo(gene_bed)
    and build_lengths(<selected length file>)

    raises ValueError if neither species nor gene_bed is given, if both are
           given, or if clipper.data_file raises ValueError for the species
    raises IOError if the length file selected by pre_mrna was not supplied

    """

    # Looked up before validation so the error messages can list the defaults.
    acceptable_species = get_acceptable_species()

    # A gene bed is mandatory whenever no species supplies the defaults.
    # The previous check also required a length file to be missing, which let
    # species=None / gene_bed=None slip through to build_geneinfo(None).
    if species is None and gene_bed is None:
        raise ValueError("You must set either \"species\" or \"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")

    if species is not None and gene_bed is not None:
        raise ValueError("You shouldn't set both geneBed and species, defaults exist for %s" % (acceptable_species))

    # Now actually assign values: a species overrides all three file arguments
    if species is not None:
        try:
            gene_bed = clipper.data_file(species + ".AS.STRUCTURE_genes.BED.gz")
            gene_mrna = clipper.data_file(species + ".AS.STRUCTURE_mRNA.lengths")
            gene_pre_mrna = clipper.data_file(species + ".AS.STRUCTURE_premRNA.lengths")
        except ValueError:
            raise ValueError("Defaults don't exist for your species: %s. Please choose from: %s or supply \"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"" % (species, acceptable_species))

    # Selects mRNA or preMRNA lengths (identity test kept: only True counts)
    lenfile = gene_pre_mrna if pre_mrna is True else gene_mrna

    if lenfile is None:
        raise IOError("""didn't pass correct mRNA length file option 
                    with given length file""")

    # builds dicts to do processing on
    genes = build_geneinfo(gene_bed)
    lengths = build_lengths(lenfile)

    return genes, lengths
Exemplo n.º 2
0
def build_transcript_data_gtf_as_structure(species, pre_mrna):
    """

    Builds a BedTool of per-gene intervals carrying an effective length,
    read from the <species>.AS.STRUCTURE.COMPILED.gff data file.

    species - species name used to locate the compiled gff via clipper.data_file
    pre_mrna - if true uses pre mRNA length instead of mRNA length

    returns a pybedtools.BedTool with one "mRNA" interval per input record;
    the attribute column holds gene_id, transcript_ids (when the record has
    them) and effective_length

    """
    bedtoolintervals = []
    x = clipper.data_file(species + ".AS.STRUCTURE.COMPILED.gff")
    gtf_file = pybedtools.BedTool(x)
    for gene in gtf_file:
        effective_length = gene.attrs[
            'premrna_length'] if pre_mrna else gene.attrs['mrna_length']
        attrs = "gene_id=%s;" % (gene.attrs['gene_id'])
        if "transcript_ids" in gene.attrs:
            attrs += "transcript_ids=%s;" % (gene.attrs['transcript_ids'])
        attrs += "effective_length=%s" % (str(effective_length))

        # Build a real list: under Python 3 map() returns a lazy iterator,
        # which create_interval_from_list does not accept.
        # NOTE(review): start and stop are both shifted by one; confirm that
        # the end coordinate really needs the shift for GFF input.
        fields = [
            str(field) for field in (
                gene['chrom'], "AS_STRUCTURE", "mRNA",
                gene.start + 1,
                gene.stop + 1, "0", gene['strand'], ".", attrs)
        ]
        bedtoolintervals.append(pybedtools.create_interval_from_list(fields))

    return pybedtools.BedTool(bedtoolintervals)
Exemplo n.º 3
0
def build_transcript_data_gtf_as_structure(species, pre_mrna):

    """

    Reads the <species>.AS.STRUCTURE.COMPILED.gff data file and re-emits
    every record as an "mRNA" interval annotated with its effective length.

    species - species name used to locate the compiled gff via clipper.data_file
    pre_mrna - if true uses pre mRNA length instead of mRNA length

    returns a pybedtools.BedTool built from the generated intervals

    """
    results = []
    x = clipper.data_file(species + ".AS.STRUCTURE.COMPILED.gff")
    gtf_file = pybedtools.BedTool(x)
    for gene in gtf_file:

        effective_length = gene.attrs['premrna_length'] if pre_mrna else gene.attrs['mrna_length']
        attrs = "gene_id=%s;" % (gene.attrs['gene_id'])
        if "transcript_ids" in gene.attrs:
            attrs += "transcript_ids=%s;" % (gene.attrs['transcript_ids'])
        attrs += "effective_length=%s" % (str(effective_length))

        # list(...) is required on Python 3, where map() yields an iterator
        # instead of the list that create_interval_from_list expects.
        # NOTE(review): stop is shifted by one like start; confirm this is
        # intended for GFF input.
        columns = list(map(str, [gene['chrom'],
                                 "AS_STRUCTURE",
                                 "mRNA",
                                 gene.start + 1,
                                 gene.stop + 1,
                                 "0",
                                 gene['strand'],
                                 ".",
                                 attrs,
                                 ]))
        results.append(pybedtools.create_interval_from_list(columns))

    return pybedtools.BedTool(results)
Exemplo n.º 4
0
def build_transcript_data_gtf_as_structure(species, pre_mrna):
    """
    Attach an effective length to every record of the pre-created
    ``<species>.AS.STRUCTURE.COMPILED.gff`` file found through
    ``clipper.data_file``.

    :param species: (str) genome name
    :param pre_mrna: (bool) if true uses pre-mRNA length instead of mRNA length
    :return: BedTool with one "mRNA" interval per input record
    :rtype: pybedtools.BedTool

    """
    source_path = clipper.data_file(species + ".AS.STRUCTURE.COMPILED.gff")
    records = pybedtools.BedTool(source_path)

    intervals = []
    for record in records:
        # pick the length attribute requested by the caller
        if pre_mrna:
            length = record.attrs['premrna_length']
        else:
            length = record.attrs['mrna_length']

        # assemble the attribute column piece by piece
        pieces = ["gene_id=%s;" % (record.attrs['gene_id'])]
        if "transcript_ids" in record.attrs:
            pieces.append("transcript_ids=%s;" % (record.attrs['transcript_ids']))
        pieces.append("effective_length=%s" % (str(length)))
        attrs = "".join(pieces)

        # every column must be a string before the interval is created
        # NOTE(review): start and stop are both shifted by one; confirm the
        # end coordinate shift is intended for GFF input.
        columns = [
            str(column) for column in (
                record['chrom'],
                "AS_STRUCTURE",
                "mRNA",
                record.start + 1,
                record.stop + 1,
                "0",
                record['strand'],
                ".",
                attrs,
            )
        ]
        intervals.append(pybedtools.create_interval_from_list(columns))

    return pybedtools.BedTool(intervals)
Exemplo n.º 5
0
def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna,
                          pre_mrna):
    """

    Generates transcript data structures to call peaks on

    Allows for either predefined files (from the data directory)
    or custom files

    species - the species to run on; selects the predefined files and is
              mutually exclusive with gene_bed
    gene_bed - an arbitrary bed file of locations to search for peaks
               (should be gene locations)
    gene_mrna - the effective length of the mrna of a gene
                (unmappable regions removed)
    gene_pre_mrna - the effective length of the pre-mrna
                    (unmappable regions removed)
    pre_mrna - flag, only the literal True selects pre-mRNA lengths instead
               of mRNA lengths

    returns a pybedtools.BedTool with one "mRNA" interval per gene whose
    attribute column carries gene_id and effective_length; every gene from
    build_geneinfo must also be present in the lengths lookup

    raises ValueError if neither species nor gene_bed is given, if both are
           given, or if clipper.data_file raises ValueError for the species
    raises IOError if the length file selected by pre_mrna was not supplied

    """

    # Looked up before validation so the error messages can list the defaults.
    acceptable_species = get_acceptable_species()

    # A gene bed is mandatory whenever no species supplies the defaults.
    # The previous check also required a length file to be missing, which let
    # species=None / gene_bed=None slip through to build_geneinfo(None).
    if species is None and gene_bed is None:
        raise ValueError(
            "You must set either \"species\" or \"geneBed\"+\"geneMRNA\"+\"genePREMRNA\""
        )

    if species is not None and gene_bed is not None:
        raise ValueError(
            "You shouldn't set both geneBed and species, defaults exist for %s"
            % (acceptable_species))

    # Now actually assign values: a species overrides all three file arguments
    if species is not None:
        try:
            gene_bed = clipper.data_file(species +
                                         ".AS.STRUCTURE_genes.BED.gz")
            gene_mrna = clipper.data_file(species +
                                          ".AS.STRUCTURE_mRNA.lengths")
            gene_pre_mrna = clipper.data_file(species +
                                              ".AS.STRUCTURE_premRNA.lengths")

        except ValueError:
            raise ValueError(
                "Defaults don't exist for your species: %s. Please choose from: %s or supply \"geneBed\"+\"geneMRNA\"+\"genePREMRNA\""
                % (species, acceptable_species))

    # Selects mRNA or preMRNA lengths (identity test kept: only True counts)
    lenfile = gene_pre_mrna if pre_mrna is True else gene_mrna

    if lenfile is None:
        raise IOError("""didn't pass correct mRNA length file option 
                    with given length file""")

    # builds dicts to do processing on
    genes = build_geneinfo(gene_bed)
    lengths = build_lengths(lenfile)

    # this is a stopgap until it can be fully factored out, returning a gtf
    # file of genes and effective lengths, eventually this is the file we
    # want to pass in
    gtf_list = []

    for gene in genes:
        info = genes[gene]
        # info[0] is written as the first column, info[2]/info[3] as the
        # coordinates and info[4] as the strand column.
        gtf_list.append(
            pybedtools.create_interval_from_list([
                info[0], "AS_STRUCTURE", "mRNA",
                str(info[2]),
                str(info[3]), ".",
                str(info[4]), ".",
                "gene_id=" + gene + "; effective_length=" + str(lengths[gene])
            ]))

    return pybedtools.BedTool(gtf_list)
Exemplo n.º 6
0
def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna, pre_mrna):

    """

    Generates transcript data structures to call peaks on

    Allows for either predefined files (from the data directory)
    or custom files

    species - the species to run on; selects the predefined files and is
              mutually exclusive with gene_bed
    gene_bed - an arbitrary bed file of locations to search for peaks
               (should be gene locations)
    gene_mrna - the effective length of the mrna of a gene
                (unmappable regions removed)
    gene_pre_mrna - the effective length of the pre-mrna
                    (unmappable regions removed)
    pre_mrna - flag, only the literal True selects pre-mRNA lengths instead
               of mRNA lengths

    returns a pybedtools.BedTool holding one "mRNA" interval per gene, with
    gene_id and effective_length in the attribute column; each gene returned
    by build_geneinfo is looked up in the lengths mapping

    raises ValueError if neither species nor gene_bed is given, if both are
           given, or if clipper.data_file raises ValueError for the species
    raises IOError if the length file selected by pre_mrna was not supplied

    """

    # error checking

    # Looked up before validation so the error messages can list the defaults.
    acceptable_species = get_acceptable_species()

    # Without a species there are no defaults, so a gene bed is required no
    # matter which length files were passed. The old check only fired when a
    # length file was missing too, letting gene_bed=None reach build_geneinfo.
    if species is None and gene_bed is None:
        raise ValueError("You must set either \"species\" or \"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")

    if species is not None and gene_bed is not None:
        raise ValueError("You shouldn't set both geneBed and species, defaults exist for %s" % (acceptable_species))

    # Now actually assign values: a species overrides all three file arguments
    if species is not None:
        try:
            gene_bed = clipper.data_file(species + ".AS.STRUCTURE_genes.BED.gz")
            gene_mrna = clipper.data_file(species + ".AS.STRUCTURE_mRNA.lengths")
            gene_pre_mrna = clipper.data_file(species + ".AS.STRUCTURE_premRNA.lengths")

        except ValueError:
            raise ValueError("Defaults don't exist for your species: %s. Please choose from: %s or supply \"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"" % (species, acceptable_species))

    # Selects mRNA or preMRNA lengths (identity test kept: only True counts)
    if pre_mrna is True:
        lenfile = gene_pre_mrna
    else:
        lenfile = gene_mrna

    if lenfile is None:
        raise IOError("""didn't pass correct mRNA length file option 
                    with given length file""")

    # builds dicts to do processing on
    genes = build_geneinfo(gene_bed)
    lengths = build_lengths(lenfile)

    # this is a stopgap until it can be fully factored out, returning a gtf
    # file of genes and effective lengths, eventually this is the file we
    # want to pass in
    gtf_list = []

    for gene in genes:
        record = genes[gene]
        attributes = "gene_id=" + gene + "; effective_length=" + str(lengths[gene])
        gtf_list.append(pybedtools.create_interval_from_list([record[0],
                        "AS_STRUCTURE",
                        "mRNA",
                        str(record[2]),
                        str(record[3]),
                        ".",
                        str(record[4]),
                        ".",
                        attributes]))

    return pybedtools.BedTool(gtf_list)
Exemplo n.º 7
0
def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna,
                          pre_mrna):
    """

    Generates transcript data structures to call peaks on

    Allows for either predefined files (from the data directory)
    or custom files

    species - the species to run on; selects the predefined files and is
              mutually exclusive with gene_bed
    gene_bed - an arbitrary bed file of locations to search for peaks
               (should be gene locations)
    gene_mrna - the effective length of the mrna of a gene
                (unmappable regions removed)
    gene_pre_mrna - the effective length of the pre-mrna
                    (unmappable regions removed)
    pre_mrna - flag, only the literal True selects pre-mRNA lengths instead
               of mRNA lengths

    returns the tuple (genes, lengths) produced by build_geneinfo(gene_bed)
    and build_lengths(<selected length file>)

    raises ValueError if neither species nor gene_bed is given, if both are
           given, or if clipper.data_file raises ValueError for the species
    raises IOError if the length file selected by pre_mrna was not supplied

    """

    # error checking

    # Looked up before validation so the error messages can list the defaults.
    acceptable_species = get_acceptable_species()

    # A gene bed is mandatory whenever no species supplies the defaults.
    # The previous check also required a length file to be missing, which let
    # species=None / gene_bed=None slip through to build_geneinfo(None).
    if species is None and gene_bed is None:
        raise ValueError(
            "You must set either \"species\" or \"geneBed\"+\"geneMRNA\"+\"genePREMRNA\""
        )

    if species is not None and gene_bed is not None:
        raise ValueError(
            "You shouldn't set both geneBed and species, defaults exist for %s"
            % (acceptable_species))

    # Now actually assign values: a species overrides all three file arguments
    if species is not None:
        try:
            gene_bed = clipper.data_file(species +
                                         ".AS.STRUCTURE_genes.BED.gz")
            gene_mrna = clipper.data_file(species +
                                          ".AS.STRUCTURE_mRNA.lengths")
            gene_pre_mrna = clipper.data_file(species +
                                              ".AS.STRUCTURE_premRNA.lengths")

        except ValueError:
            raise ValueError(
                "Defaults don't exist for your species: %s. Please choose from: %s or supply \"geneBed\"+\"geneMRNA\"+\"genePREMRNA\""
                % (species, acceptable_species))

    # Selects mRNA or preMRNA lengths (identity test kept: only True counts)
    lenfile = gene_pre_mrna if pre_mrna is True else gene_mrna

    if lenfile is None:
        raise IOError("""didn't pass correct mRNA length file option 
                    with given length file""")

    # builds dicts to do processing on
    genes = build_geneinfo(gene_bed)
    lengths = build_lengths(lenfile)

    return genes, lengths