示例#1
0
def justConvertFilenames(species, outputdir):
    """Rename plot images in *outputdir* so they start with a gene symbol.

    Every directory entry whose name contains '.pdf' or '.png' is inspected.
    The text before the first '__' (after removing those extensions) is
    looked up in the 'Ensembl-Symbol' relationship table for *species*; when
    it is found, the file is renamed to
    ``<first symbol>-SashimiPlot_<remainder>``.

    Args:
        species: species code handed to gene_associations.getGeneToUid.
        outputdir: directory whose image files are renamed in place.

    Renaming is best effort: a failed rename is ignored and processing
    continues with the next file.
    """
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species,
                                                    ('hide', 'Ensembl-Symbol'))

    for filename in os.listdir(outputdir):
        if '.pdf' not in filename and '.png' not in filename:
            continue

        # Drop the image extensions, then take the leading identifier
        stem = filename.replace('.pdf', '').replace('.png', '')
        gene_id = stem.split('__')[0]
        if gene_id not in gene_to_symbol:
            continue

        new_filename = str(filename)
        if '__' in filename:
            # NOTE(review): only the segment between the first and second
            # '__' is kept; anything after a second '__' is dropped.
            new_filename = filename.split('__')[1]
        elif '\\' in filename:
            new_filename = filename.split('\\')[1]
        elif '/' in filename:
            new_filename = filename.split('/')[1]

        renamed = gene_to_symbol[gene_id][0] + '-SashimiPlot_' + new_filename
        try:
            os.rename(os.path.join(outputdir, filename),
                      os.path.join(outputdir, renamed))
        except Exception:
            # Deliberately best effort: leave the file under its old name
            pass
示例#2
0
def importGeneSymbols(species):
    """Load the 'Ensembl-Symbol' relationships for *species* in both directions.

    Returns:
        A 2-tuple: the mapping returned by gene_associations.getGeneToUid,
        followed by the result of passing that mapping through
        OBO_import.swapKeyValues (presumably the symbol -> gene inverse).
    """
    import gene_associations
    relationship = ('hide', 'Ensembl-Symbol')
    forward_map = gene_associations.getGeneToUid(species, relationship)

    from import_scripts import OBO_import
    reverse_map = OBO_import.swapKeyValues(forward_map)

    return forward_map, reverse_map
示例#3
0
def sashmi_plot_list(bamdir,
                     eventsToVisualizeFilename,
                     PSIFilename,
                     events=None):
    try:
        import gene_associations
        gene_to_symbol = gene_associations.getGeneToUid(
            species, ('hide', 'Ensembl-Symbol'))
        from import_scripts import OBO_import
        symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    except Exception:
        symbol_to_gene = {}

    if events == None:
        splicing_events, expandedSearch = importSplicingEventsToVisualize(
            eventsToVisualizeFilename)
    else:
        ### Replace any ":" from the input events
        #for i in range(len(events)): events[i] = string.replace(events[i],':','__')
        expandedSearch = True

        for i in range(len(events)):
            gene = string.split(events[i], '__')[0]
            if gene in gene_to_symbol:
                symbol = gene_to_symbol[gene][0]
            elif 'ENS' not in gene or 'G0000' in gene:
                if gene in symbol_to_gene:
                    ensID = symbol_to_gene[gene][0]
                    symbol = gene
                    events[
                        i] = ensID  ### translate this ID to an Ensembl gene ID for propper SashimiPlot lookup
        splicing_events = events  ### optionally get from supplied variable

    if len(splicing_events) == 0:
        print eventsToVisualizeFilename
        forceNoCompatibleEventsInFile

    print 'Exporting plots',

    ### Determine Groups for Coloring
    groups_file = 'None'
    dir_list = unique.read_directory(root_dir + '/ExpressionInput')

    for file in dir_list:
        if 'groups.' in file:
            groups_file = root_dir + '/ExpressionInput/' + file

    if groups_file != None:
        try:
            import ExpressionBuilder
            sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file)
            groups = []
            for sample in sample_group_db:
                if sample_group_db[sample] not in groups:
                    groups.append(sample_group_db[sample]
                                  )  ### create an ordered list of unique group
        except Exception:
            groups = ['None']
            #print traceback.format_exc()
            pass

    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(
        PSIFilename, bamdir, splicing_events, sample_group_db, groups, False)
    mopup_events = getMopUpEvents(splicing_events, processed_events)

    ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide
    #print len(splicing_events),len(processed_events),len(mopup_events)
    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(
        steady_state_exp_file, bamdir, mopup_events, sample_group_db, groups,
        expandedSearch)
    if len(processed_events) > 0:
        mopup_events = getMopUpEvents(mopup_events, processed_events)
        processed_events = formatAndSubmitSplicingEventsToSashimiPlot(
            PSIFilename, bamdir, mopup_events, sample_group_db, groups, True)
    return gene_to_symbol
示例#4
0
def importDataSimple(filename, input_type, MOD=None, Species=None):
    """Read a tab-delimited ID/value table into a dictionary.

    Leading lines that start with '#' are skipped, the next line is treated
    as the header, and every following line is a data row. Empty lines are
    ignored.

    * input_type == 'LineageProfiler': each row is stored as
      ``id_db[first column] = [float(v) for v in remaining columns]`` and the
      header (minus its first column) is returned as a list.
    * any other input_type: the second column of the first data row is a
      system code looked up via GO_Elite.getSourceData(). If that source is
      the MOD itself, ``id_db[first column] = float(third column)``;
      otherwise the ID is translated through the '<MOD>-<source>.txt'
      relationship table and the value is stored under the first mapped ID
      only. '+' and '-' are read as 1 and -1 on the translated path; values
      that are missing or not numeric are stored as 'NA'. A MOD ID reached by
      several rows keeps the value from the last such row. The returned
      header is the third header column, or 'NA' when it is absent.

    Args:
        filename: path resolved through filepath().
        input_type: 'LineageProfiler' or anything else (see above).
        MOD: MOD name used when the module-level ``mod`` global is not set.
        Species: species code used when the module-level ``species_code``
            global is not set.

    Returns:
        (id_db, column_headers)

    Note:
        A file without a header line leaves column_headers unassigned, so
        the final return raises UnboundLocalError (unchanged behaviour).
    """
    id_db = {}
    fn = filepath(filename)
    x = 0
    # The with-block guarantees the handle is closed, even on a parse error
    with open(fn, 'rU') as input_file:
        for line in input_file:
            data = cleanUpLine(line)
            if not data:
                continue  # an empty line would make data[0] raise IndexError
            t = data.split('\t')

            if x == 0:
                if data[0] == '#':
                    continue  # comment lines ahead of the header
                column_headers = t[1:]
                if input_type != 'LineageProfiler':
                    try:
                        ### exclude the ID, system code and p-value column headers
                        column_headers = t[2]
                    except IndexError:
                        column_headers = 'NA'
                x = 1
                continue

            if input_type == 'LineageProfiler':
                id_db[t[0]] = [float(value) for value in t[1:]]
            else:
                if x == 1:
                    ### get system conversions (first data row only)
                    system_code = t[1]
                    import GO_Elite
                    from import_scripts import OBO_import
                    system_codes, source_types, mod_types = GO_Elite.getSourceData(
                    )
                    source_data = system_codes[system_code]
                    try:
                        Mod = mod  ### global established in upstream functions
                        speciescode = species_code
                    except NameError:
                        Mod = MOD
                        speciescode = Species
                    source_is_mod = source_data == Mod
                    if not source_is_mod:
                        mod_source = Mod + '-' + source_data + '.txt'
                        gene_to_source_id = gene_associations.getGeneToUid(
                            speciescode, ('hide', mod_source))
                        source_to_gene = OBO_import.swapKeyValues(
                            gene_to_source_id)

                if source_is_mod:
                    try:
                        id_db[t[0]] = float(t[2])
                    except (IndexError, ValueError):
                        id_db[t[0]] = 'NA'
                elif t[0] in source_to_gene:
                    for mod_id in source_to_gene[t[0]]:
                        try:
                            value = t[2]
                        except IndexError:
                            value = 'NA'
                        if value == '+':
                            value = 1
                        elif value == '-':
                            value = -1
                        try:
                            ### If multiple rows map to this ID, only the last fold change is kept
                            id_db[mod_id] = float(value)
                        except ValueError:
                            id_db[mod_id] = 'NA'
                        break  # only the first mapped MOD ID is recorded
            x += 1
    return id_db, column_headers
示例#5
0
def reformatPolyAdenylationCoordinates(species, force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version = {}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for', species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations
    from import_scripts import OBO_import
    from build_scripts import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,
                                                     'Ensembl-UniGene')
        print len(ens_unigene), 'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,
                                                    'Ensembl-EntrezGene')
        print len(ens_entrez), 'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print 'exporting', export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
    export_data.write(header)

    fn = filepath(filename)
    x = 0
    not_found = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if x == 0: x = 1
        else:
            siteid, llid, chr, sitenum, position, supporting_EST, cleavage = string.split(
                data, '\t')
            if chr == 'chrM':
                chr = 'chrMT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M':
                chr = 'MT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr' + chr
                strand = '+'
                geneid = siteid
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    external_geneid = string.join(
                        string.split(siteid, '.')[:2], '.')
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chr, strand, start, end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid] = []
                    bed_format = string.join(
                        [chr, pos_start, pos_end, geneid, '0', '-'], '\t'
                    ) + '\n'  ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join(
                    [chr, pos_start, pos_end, geneid, '0', strand],
                    '\t') + '\n'
                export_data.write(bed_format)
    export_data.close()