Example #1
def importPGF(dir,species,filename):
    fn=filepath(filename); probe_db = {}; x=0
    psr_file = dir+'/'+species+'/'+array_type+'/'+species+'_probeset-psr.txt'
    psr_file = string.replace(psr_file,'affymetrix/LibraryFiles/','')
    try: eo = export.ExportFile(filepath(psr_file))
    except Exception: eo = export.ExportFile(filepath(psr_file[1:]))
    for line in open(fn,'rU').xreadlines():
        if line[0] != '#':
            data = cleanUpLine(line); x+=1
            t = string.split(data,'\t')
            if len(t)==2 or len(t)==3:
                if len(t[0])>0:
                    probeset = t[0]; type = t[1]
                    eo.write(probeset+'\t'+t[-1]+'\n') ### Used for HTA array where we need to have PSR to probeset IDs
            else:
                try:
                    probe = t[2]
                    #if probeset == '10701621': print probe
                    try: probe_db[probeset].append(probe)
                    except KeyError: probe_db[probeset] = [probe]
                except Exception: null=[]
    eo.close() 
    new_file = dir+'/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
    new_file = string.replace(new_file,'affymetrix/LibraryFiles/','')
    headers = 'probeset\t' + 'probe\n'; n=0
    try: data = export.ExportFile(filepath(new_file))
    except Exception: data = export.ExportFile(filepath(new_file[1:]))
    data.write(headers)
    for probeset in probe_db:
        for probe in probe_db[probeset]:
            data.write(probeset+'\t'+probe+'\n'); n+=1
    data.close()
    print n, 'Entries exported for', new_file
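
All of these examples rely on the AltAnalyze helpers export.ExportFile and filepath, which are defined elsewhere in the package: filepath resolves a relative path against the application directory, and ExportFile returns a writable handle, creating any missing parent folders. A minimal sketch of that assumed contract (a hypothetical stand-in, not the actual AltAnalyze source):

import os

def ExportFile(filename):
    ### Assumed behavior: create missing parent directories, then open for writing
    parent = os.path.dirname(filename)
    if parent != '' and not os.path.exists(parent):
        os.makedirs(parent)
    return open(filename, 'w')

eo = ExportFile('output/demo_probeset-probes.txt')
eo.write('probeset\tprobe\n')
eo.close()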
Example #2
def unzipFiles(filename,dir):
    import zipfile
    output_filepath = filepath(dir+'/'+filename)
    try:
        zfile = zipfile.ZipFile(output_filepath)
        for name in zfile.namelist():
            if name.endswith('/'):null=[] ### Don't need to export
            else:
                if 'EnsMart' in name and 'EnsMart' in dir:
                    dir = export.findParentDir(dir[:-1]) ### Remove EnsMart suffix directory
                try: outfile = export.ExportFile(filepath(dir+name))
                except Exception: outfile = export.ExportFile(filepath(dir+name[1:]))
                outfile.write(zfile.read(name)); outfile.close()
        #print 'Zip extracted to:',output_filepath
        status = 'completed'
    except Exception, e:
        try:
            ### Use the operating system's unzip if all else fails
            extracted_path = string.replace(output_filepath,'.zip','')
            try: os.remove(extracted_path) ### This is necessary, otherwise the empty file created above will require user authorization to delete
            except Exception: null=[]
            subprocessUnzip(dir,output_filepath)
            status = 'completed'
        except IOError:
            print e
            print 'WARNING!!!! The zip file',output_filepath,'does not appear to be a valid zip archive or is corrupt.'
            status = 'failed'
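
subprocessUnzip is defined elsewhere in the module; a plausible sketch of the operating-system fallback it performs, assuming a Unix-style unzip binary on the PATH (hypothetical, for illustration). Raising IOError on failure matches the except IOError handler above:

import subprocess

def subprocessUnzip(dir, output_filepath):
    ### Shell out to the system unzip when the zipfile module cannot read the archive
    retcode = subprocess.call(['unzip', '-o', output_filepath, '-d', dir])
    if retcode != 0:
        raise IOError('unzip exited with code ' + str(retcode))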
Example #3
def exportCorrelationResults(exp_input):
    input_file = export.findFilename(exp_input)
    if '.txt' in exp_output_file:
        corr_output_file = string.replace(exp_output_file, 'DATASET',
                                          'LineageCorrelations')
    else:  ### Occurs when processing a non-standard AltAnalyze file
        corr_output_file = exp_output_file + '/' + input_file
    corr_output_file = string.replace(
        corr_output_file, '.txt',
        '-' + coding_type + '-' + compendiumPlatform + '.txt')
    if analysis_type == 'AltExon':
        corr_output_file = string.replace(corr_output_file, coding_type,
                                          'AltExon')
    filename = export.findFilename(corr_output_file)
    score_data = export.ExportFile(corr_output_file)
    zscore_output_dir = None  ### defined up front so the final return works when use_scipy is False
    if use_scipy:
        zscore_output_dir = string.replace(corr_output_file, '.txt',
                                           '-zscores.txt')
        probability_data = export.ExportFile(zscore_output_dir)
        #adjustPValues()
        replacePearsonPvalueWithZscore()
    ### Make title row
    headers = ['Sample_name']
    for tissue in tissue_comparison_scores:
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            headers.append(sample)
        break
    title_row = string.join(headers, '\t') + '\n'
    score_data.write(title_row)
    if use_scipy:
        probability_data.write(title_row)
    ### Export correlation data
    tissue_scores = {}
    tissue_probabilities = {}
    tissue_score_list = []  ### store and rank tissues according to max(score)
    for tissue in tissue_comparison_scores:
        scores = []
        probabilities = []
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            scores.append(r)
            probabilities.append(p)
        tissue_score_list.append((max(scores), tissue))
        tissue_scores[tissue] = string.join(map(str, [tissue] + scores),
                                            '\t') + '\n'  ### export line
        if use_scipy:
            tissue_probabilities[tissue] = string.join(
                map(str, [tissue] + probabilities), '\t') + '\n'

    tissue_score_list.sort()
    tissue_score_list.reverse()
    for (score, tissue) in tissue_score_list:
        score_data.write(tissue_scores[tissue])
        if use_scipy:
            probability_data.write(tissue_probabilities[tissue])
    score_data.close()
    if use_scipy:
        probability_data.close()
    print filename, 'exported...'
    return zscore_output_dir
Example #4
def statisticallyFilterTransposedFile(input_file,
                                      output_file,
                                      threshold,
                                      minGeneCutoff=499,
                                      binarize=True):
    """ The input file is a large expression matrix with the rows as cells and the columns as genes to filter """

    if 'exp.' in input_file:
        counts_file = string.replace(input_file, 'exp.', 'geneCount.')
    else:
        counts_file = input_file[:-4] + '-geneCount.txt'

    import export
    eo = export.ExportFile(counts_file)
    eo.write('Sample\tGenes Expressed(threshold:' + str(threshold) + ')\n')
    eo_full = export.ExportFile(output_file)

    sample_expressed_genes = {}
    header = True
    count_sum_array = []
    cells_retained = 0
    for line in open(input_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            t = string.split(data, ',')
        else:
            t = string.split(data, '\t')
        if header:
            eo_full.write(line)
            gene_len = len(t)
            genes = t[1:]
            header = False
        else:
            cell = t[0]
            values = map(float, t[1:])
            binarized_values = []
            for v in values:
                if v > threshold:
                    if binarize:  ### do not count the individual read counts, only if a gene is expressed or not
                        binarized_values.append(1)
                    else:
                        binarized_values.append(v)  ### When summarizing counts and not genes expressed
                else:
                    binarized_values.append(0)
            genes_expressed = sum(binarized_values)
            if genes_expressed > minGeneCutoff:
                eo_full.write(line)
                cells_retained += 1
                eo.write(cell + '\t' + str(genes_expressed) + '\n')
    eo.close()
    eo_full.close()
    print cells_retained, 'Cells with genes expressed above the threshold'
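
A typical invocation, with illustrative paths: cells with more than minGeneCutoff genes above the expression threshold are copied to output_file, and a per-cell count of expressed genes is written to a parallel geneCount. file:

input_file = '/data/exp.cells-x-genes.txt'    ### rows = cells, columns = genes (illustrative)
output_file = '/data/exp.cells-x-genes-filtered.txt'
statisticallyFilterTransposedFile(input_file, output_file, 1.0)  ### keeps cells with >499 genes above 1.0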
Example #5
def annotateMetaProbesetGenes(summary_exp_file, expression_file,
                              metaprobeset_file, species):
    metaprobeset_cv_file = string.replace(metaprobeset_file, species + '_',
                                          species + '_Conversion_')
    metaprobeset_cv_file = string.replace(metaprobeset_cv_file, '.mps', '.txt')

    fn = filepath(metaprobeset_cv_file)
    uid_db = {}
    for line in open(fn, 'rU').xreadlines():
        data = UI.cleanUpLine(line)
        uid, ens_gene = string.split(data, '\t')
        uid_db[uid] = ens_gene

    export_data = export.ExportFile(expression_file)
    fn = filepath(summary_exp_file)
    x = 0
    for line in open(fn, 'rU').xreadlines():
        if line[0] == '#': null = []
        elif x == 0:
            export_data.write(line)
            x += 1
        else:
            data = cleanUpLine(line)
            t = string.split(data, '\t')
            uid = t[0]
            ens_gene = uid_db[uid]
            export_data.write(string.join([ens_gene] + t[1:], '\t') + '\n')
    export_data.close()
Example #6
def getColoredPathwayTest():
    fileType = 'png'  ### svg, pdf
    graphId = "ffffff90"
    wpid = "WP2062"
    color = "0000ff"
    graphId = ["ffffff90", "ffffffe5"]
    color = ["0000ff", "0000ff"]
    ### revision = 0 is the most current version
    #file = client.service.getColoredPathway(pwId=wpid,revision=0,graphId=graphId,color=color,fileType=fileType)
    kwargs = {
        'identifier': 'WP2062',
        'version': 0,
        'element_identifiers': ["ffffff90", "ffffffe5"],
        'colors': ["#0000FF", "#0000FF"],
        'file_format': 'image/svg+xml'
    }
    file = wikipathways_api_client_instance.get_colored_pathway(
        identifier=wpid,
        version=0,
        element_identifiers=graphId,
        colors=color,
        file_format=fileType)
    #file = base64.b64decode(file) ### decode this file
    outfile = export.ExportFile(wpid + '.png')
    outfile.write(file)
    outfile.close()
Example #7
def getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type):
    """ Adapted from compareProteinFeatures but for one isoform and returns genomic coordinates for each feature
    This function is designed to export all unique isoforms rather than just comparison isoforms """
    
    import export
    export_file = 'AltDatabase/ensembl/'+species+'/ProteinFeatureIsoform_complete.txt'                
    export_data = export.ExportFile(export_file)

    failed = 0
    worked = 0
    failed_ac=[]
    for gene in protein_ft_db:
        transcript_feature_db={}
        for ft in protein_ft_db[gene]:
            try:
                ft_name = ft.PrimaryAnnot(); annotation = ft.SecondaryAnnot()
                for (mRNA,type) in gene_transcript_db[gene]:
                    try:
                        protein,protein_seq = mRNA_protein_seq_db[mRNA]
                        error = False
                    except Exception:
                        failed_ac.append(mRNA)
                        error = True
                    if error == False:
                        if ft.DomainSeq() in protein_seq:
                            #if coordinate_type == 'genomic':
                            pos1_genomic = ft.GenomicStart(); pos2_genomic = ft.GenomicStop()
                            #else:
                            pos1 = str(ft.DomainStart()); pos2 = str(ft.DomainEnd())
    
                            ### There are often many features that overlap within a transcript, so consistently pick just one
                            if mRNA in transcript_feature_db:
                                db = transcript_feature_db[mRNA]
                                if (pos1,pos2) in db:
                                    db[pos1, pos2].append([pos1_genomic, pos2_genomic, protein,ft_name,annotation])
                                else:
                                    db[pos1, pos2]=[[pos1_genomic, pos2_genomic, protein,ft_name,annotation]]
                            else:
                                db={}
                                db[pos1, pos2]=[[pos1_genomic, pos2_genomic, protein,ft_name,annotation]]
                                transcript_feature_db[mRNA] = db
                                
                            #values = [mRNA, protein, pos1, pos2,ft_name,annotation]; unique_entries.append(values)
                            worked+=1
            except IOError:
                failed+=1

        for transcript in transcript_feature_db:
            db = transcript_feature_db[transcript]
            for (pos1,pos2) in db:
                db[pos1,pos2].sort() ### Pick the alphabetically listed first feature
                pos1_genomic, pos2_genomic, protein,ft_name,annotation = db[pos1,pos2][0]
                values = [transcript, protein, pos1, pos2,pos1_genomic, pos2_genomic, ft_name,annotation]
                export_data.write(string.join(values,'\t')+'\n')
                
    export_data.close()
    print failed,'features failed to have corresponding aligned genomic locations out of', worked+failed
    failed_ac = unique.unique(failed_ac)
    print len(failed_ac),'mRNAs without identified/in silico derived proteins'  ### Appear to be ncRNAs without ATGs
    print failed_ac[:20]
Example #8
def exportCorrelationResults():
    corr_output_file = string.replace(exp_output_file, 'DATASET',
                                      'TissueCorrelations')
    corr_output_file = string.replace(corr_output_file, '.txt',
                                      '-' + coding_type + '.txt')
    if analysis_type == 'exonLevel':
        corr_output_file = string.replace(corr_output_file, coding_type,
                                          'AltExon')
    data = export.ExportFile(corr_output_file)
    ### Make title row
    headers = ['Sample_name']
    for tissue in tissue_comparison_scores:
        for (r, sample) in tissue_comparison_scores[tissue]:
            headers.append(sample)
        break
    title_row = string.join(headers, '\t') + '\n'
    data.write(title_row)
    ### Export correlation data
    for tissue in tissue_comparison_scores:
        values = [tissue]
        for (r, sample) in tissue_comparison_scores[tissue]:
            values.append(str(r))
        values = string.join(values, '\t') + '\n'
        data.write(values)
    data.close()
    print corr_output_file, 'exported...'
Example #9
def reformatHeatmapFile(input_file):
    import unique
    export_file = string.replace(input_file, 'Clustering-', 'Input-')
    eo = export.ExportFile(export_file)
    first_row = True
    fn = filepath(input_file)
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if first_row == True:
            if 'column_clusters-flat' not in t:
                array_names = []
                for i in t[2:]:
                    array_names.append(string.replace(i, ':', '-'))
                    #array_names.append(i)
            elif 'column_clusters-flat' in t:
                array_clusters = t[2:]
                unique_clusters = unique.unique(array_clusters)
                ind = 0
                headers = []
                for c in array_clusters:
                    headers.append(c + '::' + array_names[ind])
                    ind += 1
                headers = string.join(['uid'] + headers, '\t') + '\n'
                eo.write(headers)
                first_row = False
        else:
            values = string.join([t[0]] + t[2:], '\t') + '\n'
            eo.write(values)
    return export_file, len(unique_clusters)
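
The reshaping is easiest to see on a toy input: the row-cluster column is dropped, ':' in sample names becomes '-', and the flat column-cluster labels are folded into the header as cluster::sample (values illustrative):

### Clustering- input (tab-delimited):
###   UID                   row_clusters-flat   S1:a   S2:b
###   column_clusters-flat  -                   1      2
###   gene1                 4                   0.5    0.7
### Reformatted Input- output:
###   uid     1::S1-a   2::S2-b
###   gene1   0.5       0.7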
Example #10
def checkForDuplicateIDs(input_file):
    first_row = True
    key_db = {}
    key_list = []
    fn = filepath(input_file)

    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if first_row == True:
            headers = line
            first_row = False
        else:
            key = t[0]
            key_list.append(key)
            key_db[key] = t

    if len(key_db) != len(key_list):
        print 'Duplicate IDs present... writing a cleaned-up version of the input file:'
        ### Duplicate IDs present
        input_file = input_file[:-4] + '-clean.txt'
        export_text = export.ExportFile(
            input_file)  ### create a new input file
        export_text.write(headers)  ### Header is the same for each file
        for key in key_db:
            t = key_db[key]
            export_text.write(string.join(t, '\t') +
                              '\n')  ### Write z-score values and row names
        export_text.close()
        print 'File written...'
    return input_file
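
Callers should always use the returned path, since it changes when duplicates are found; note that only the last row seen for each duplicated ID survives, because key_db[key] is overwritten on every occurrence. For example (path illustrative):

input_file = checkForDuplicateIDs('/data/zscores.txt')
### input_file is now '/data/zscores.txt' or, if duplicates were found, '/data/zscores-clean.txt'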
Example #11
def translateToEntrezGene(species,filename):
    x=0; type = 'pathway'
    try: ens_to_entrez = gene_associations.getGeneToUid(species,('hide','Ensembl-EntrezGene'))
    except Exception: ens_to_entrez ={}

    if len(ens_to_entrez)>0:
        export_file = string.replace(filename,'Ensembl','EntrezGene')
        export_data = export.ExportFile(export_file)
        export_data.write('EntrezGene\tOntologyID\n')
        fn = filepath(filename)
        for line in open(fn,'rU').xreadlines():
            if x==0: x=1
            else:
                data = cleanUpLine(line)
                try:
                    ensembl,pathway = string.split(data,'\t')
                    type = 'ontology'
                except Exception:
                    ensembl,null,pathway = string.split(data,'\t')
                try:
                    entrezs = ens_to_entrez[ensembl]
                    for entrez in entrezs:
                        if type == 'ontology':
                            export_data.write(entrez+'\t'+pathway+'\n')
                        else:
                            export_data.write(entrez+'\tEn\t'+pathway+'\n')
                except Exception:
                    null=[]
        export_data.close()
Example #12
def filterRows(input_file, output_file, filterDB=None, logData=False):
    orderlst = {}
    counter = []
    export_object = export.ExportFile(output_file)
    firstLine = True
    Flag = 0
    #print len(filterDB)
    for line in open(input_file, 'rU').xreadlines():
        flag1 = 0
        data = cleanUpLine(line)
        values = string.split(data, '\t')
        if firstLine:
            firstLine = False
            k = values.index('UID')
            if Flag == 0:
                export_object.write(line)
        else:
            if values[k] in filterDB:
                counter = [
                    index for index, value in enumerate(filterDB)
                    if value == values[k]
                ]
                for it in range(0, len(counter)):
                    orderlst[counter[it]] = line
    try:
        for i in range(0, len(orderlst)):
            export_object.write(orderlst[i])
    except Exception:
        print i, filterDB[i] ### a KeyError above means an ID in filterDB was never matched in the input

    export_object.close()
    print 'Filtered rows printed to:', output_file
Example #13
def matchTranscriptExonIDsToJunctionIDs(species, array_type, gene_junction_db):
    """ Matches junctionIDs to precomputed transcript-level exonID strings - simpler and more accurate than importEnsemblTranscriptSequence"""

    output_file = 'AltDatabase/' + species + '/SequenceData/output/' + array_type + '_coordinte-mRNA_alignments.txt' ### 'coordinte' spelling preserved from the original code; renaming the file could break downstream readers
    dataw = export.ExportFile(output_file)

    filename = 'AltDatabase/ensembl/' + species + '/mRNA-ExonIDs.txt'
    fn = filepath(filename)
    x = 0
    for line in open(fn, 'rU').xreadlines():
        data = line.strip()
        gene, transcript, protein, exonIDs = string.split(data, '\t')
        exonIDs += '|'  ### such that the last exon is properly searchable
        if gene in gene_junction_db:
            junctions_data = gene_junction_db[gene]
            for jd in junctions_data:
                junctionIDs = string.split(jd.Probeset() + '|', ':')[-1]
                junctionIDs = string.replace(
                    junctionIDs, '-', '|'
                )  ### this is the format of the transcript ExonID string
                if x == 0:
                    x = 1
                    print junctionIDs, exonIDs
                if junctionIDs in exonIDs:
                    dataw.write(
                        string.join([jd.Probeset(), '1', transcript], '\t') +
                        '\n')
                else:
                    dataw.write(
                        string.join([jd.Probeset(), '0', transcript], '\t') +
                        '\n')
    dataw.close()
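
The alignment test is a plain substring match against the pipe-delimited exon strings; a worked example with illustrative IDs:

### mRNA-ExonIDs.txt row:   exonIDs = 'E1.1|E2.1|E3.1' + '|'   ->  'E1.1|E2.1|E3.1|'
### junction probeset:      'ENSG00000000003:E1.1-E2.1'
### split(probeset+'|', ':')[-1]       ->  'E1.1-E2.1|'
### replace('-', '|')                  ->  'E1.1|E2.1|'
### 'E1.1|E2.1|' in 'E1.1|E2.1|E3.1|'  ->  True, so this junction is written with a '1' flag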
Example #14
def exportExpressionData(filename, sample_db):
    export_text = export.ExportFile(filename)
    all_genes_db = {}
    sample_list = []
    for sample in sample_db:
        sample_list.append(sample)
        gene_db = sample_db[sample]
        for geneid in gene_db:
            all_genes_db[geneid] = []
    sample_list.sort()  ### Organize these alphabetically rather than randomly
    column_header = string.join(
        ['ProbeName'] + sample_list,
        '\t') + '\n'  ### format column-names for export
    export_text.write(column_header)

    for geneid in all_genes_db:
        values = []
        for sample in sample_list:
            try:
                values.append(
                    sample_db[sample][geneid])  ### protein_expression
            except Exception:
                values.append(0)
        export_text.write(
            string.join([geneid] + map(str, values), '\t') + '\n')
    export_text.close()
Example #15
def getPathwayAs(pathway_db, species_code, mod):
    begin_time = time.time()
    try:
        export.deleteFolder(
            'BuildDBs/WPs')  ### clear any remaining pathway files
    except Exception:
        null = []
    for wpid in pathway_db:
        file_type = 'gpml'
        wp_id_data = client.service.getPathwayAs(fileType=file_type,
                                                 pwId=wpid,
                                                 revision=0)
        wp_id_data = base64.b64decode(wp_id_data)
        gpml_path = filepath('BuildDBs/WPs/' + wpid + '.gpml')
        outfile = export.ExportFile(gpml_path)
        outfile.write(wp_id_data)
        outfile.close()
        gene_system_list = string.split(wp_id_data, '\n')
        parent_path = export.findParentDir(gpml_path)
        pathway_db = gene_associations.getGPMLGraphData(
            parent_path, species_code, mod)  ### get GPML data back
        os.remove(gpml_path)  ### Only store the file temporarily

    end_time = time.time()
    time_diff = float(end_time - begin_time)
    """
    try: print "WikiPathways data imported in %d seconds" % time_diff
    except Exception: null=None ### Occurs when transitioning back from the Official Database download window (not sure why) -- TclError: can't invoke "update" command
    """
    return pathway_db
Example #16
def unzipFiles(filename,dir):
    import zipfile
    output_filepath = filepath(dir+filename)
    try:
        zfile = zipfile.ZipFile(output_filepath)
        for name in zfile.namelist():
            if name.endswith('/'):null=[] ### Don't need to export
            else: 
                try: outfile = export.ExportFile(dir+name)
                except Exception: outfile = export.ExportFile(dir+name[1:])
                outfile.write(zfile.read(name)); outfile.close()
        #print 'Zip extracted to:',output_filepath
        status = 'completed'
    except Exception, e:
        print e
        print 'WARNING!!!! The zip file',output_filepath,'does not appear to be a valid zip archive or is corrupt.'
        status = 'failed'
Example #17
def exportANOVAStats(filename,matrix_pvalues_list):
    import export
    export_name = filename[:-4]+'-stats.txt'
    ee=export.ExportFile(export_name)
    ee.write('SplicingEvent\tANOVA rawp\tANOVA adjp\tDriving Group(s)\n')
    for ls in matrix_pvalues_list:
        ee.write(string.join(ls,'\t')+'\n')
    ee.close()
Example #18
def exportEnsemblUniprot(filename):
    import export
    export_data = export.ExportFile(filename)
    export_data.write(string.join(['ensembl', 'uniprot'], '\t') + '\n')
    for uniprot in uniprot_ensembl_db:
        for ensembl in uniprot_ensembl_db[uniprot]:
            export_data.write(string.join([ensembl, uniprot], '\t') + '\n')
    export_data.close()
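
The function reads the module-level uniprot_ensembl_db, assumed to map a UniProt accession to a list of Ensembl gene IDs; a toy illustration (TP53 identifiers, output filename hypothetical):

uniprot_ensembl_db = {'P04637': ['ENSG00000141510']}  ### assumed shape: uniprot -> [ensembl, ...]
exportEnsemblUniprot('Hs_Ensembl-UniProt.txt')        ### writes a header row, then ensembl<TAB>uniprot lines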
Example #19
def createSchemaTextFile(species, platform, schema_text, DBname):
    schema_filename = filepath('AltDatabase/' + species + '/' + platform +
                               '/' + DBname + '_schema.sql')
    export_data = export.ExportFile(schema_filename)

    ### We will need to augment the database with protein feature annotations
    export_data.write(schema_text)
    export_data.close()
Example #20
def returnANOVAFiltered(original_data, matrix_pvalues):
    import export
    eo = export.ExportFile(filename[:-4] + '-ANOVA.txt')
    eo.write(original_data['header'])
    for key in matrix_pvalues:
        eo.write(original_data[key])
        last_line = original_data[key]
    eo.close()
Example #21
def findSpeciesInUniProtFiles(force):
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs
    
    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        taxid=species_annot_db[species_full].TaxID()
        tax_db[taxid]=species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes its species-to-file associations or new species are supported by Ensembl
        import export; import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file,'uniprot_temp/','')        
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status
            
    species_uniprot_db={}; altanalyze_species_uniprot_db={}
    dir=read_directory('/uniprot_temp')
    for filename in dir:    
        fn=filepath('uniprot_temp/'+filename)
        for line in open(fn,'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                taxid = string.split(data,'=')[1][:-1]
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                species = data[5:]
                species = string.split(species,' ')[:2]
                species_full = string.join(species,' ')
            elif data[0] == '/':
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename
                ss = string.split(species_full,' ')
                if len(ss)==2: ### Species name is in the format Homo sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz'))
                    except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')]
                taxid = ''; species_full = ''
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export         
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values)>1:
            found = 'no'
            for (taxid,url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]: found='yes'; print 'ambiguity resolved:',species_full; break
                if found == 'yes': break
        else: (taxid,url) = values[0]
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
Example #22
def exportVersionData(version,version_date,dir,force='application-path'):
    new_file = dir+'version.txt'
    new_file_default = filepath(new_file,force=force) ### can use user directory local or application local
    try:
        data = export.ExportFile(new_file_default)
    except Exception:
        data = export.ExportFile(new_file) ### fall back to the relative path if the resolved path is not writable
    data.write(str(version)+'\t'+str(version_date)+'\n'); data.close()
Example #23
def exportSelectedIDs(input_IDs):
    expressionFile = outputDir+'/networks/IDList.txt'
    data = export.ExportFile(expressionFile)
    data.write('UID\tSystemCode\n')
    for id in input_IDs:
        if 'HMDB' in id:
            id = hmdb_symbol_db[id]
        data.write(id+'\tEn\n')
    data.close()
    return expressionFile
Example #24
def extractFeatures(species,countinp):
    import export
    ExonsPresent=False
    lastgene = None
    lastend = None
    genes_detected={}
    count=0
    first_last_exons = {} ### Make a fake junction comprised of the first and last exon
    feature_file = None ### defined up front so the return below is valid when countinp lacks 'counts.'
    if 'counts.' in countinp:
        feature_file = string.replace(countinp,'counts.','features.')
        fe = export.ExportFile(feature_file)
        firstLine = True
        for line in open(countinp,'rU').xreadlines():
            if firstLine: firstLine=False
            else:
                feature_info = string.split(line,'\t')[0]
                fe.write(feature_info+'\n')
                junction_annotation = string.split(feature_info,'=')[0]
                if '-' in junction_annotation:
                    geneid = string.split(junction_annotation,':')[0]
                    genes_detected[geneid]=[]
                if ExonsPresent == False:
                    exon = string.split(feature_info,'=')[0]
                    if '-' not in exon:
                        ExonsPresent = True

        ### Add exon-info if necessary
        exons_file = unique.filepath('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt')
        firstLine = True
        for line in open(exons_file,'rU').xreadlines():
            if firstLine: firstLine=False
            else:
                line = line.rstrip('\n')
                t = string.split(line,'\t')
                gene,exon,chr,strand,start,end = t[:6]
                if gene!=lastgene:
                    if len(genes_detected)==0 or gene in genes_detected: ### restrict to detected genes
                        first_last_exons[gene] = [(chr,start)]
                    if len(genes_detected)==0 or lastgene in genes_detected: ### restrict to detected genes
                        try: first_last_exons[lastgene].append(lastend)
                        except Exception:
                            pass ### occurs for the first gene
                if ExonsPresent == False:
                    fe.write(gene+':'+exon+'='+chr+':'+start+'-'+end+'\n')
                lastgene = gene; lastend = end
        if len(genes_detected)==0 or lastgene in genes_detected:
            first_last_exons[lastgene].append(lastend)

        ### Add a fake junction for the whole gene
        for gene in first_last_exons:
            (chr,start),end = first_last_exons[gene]
            fe.write(gene+':E1.1-E100.1'+'='+chr+':'+start+'-'+end+'\n')
        fe.close()
    return feature_file
Example #25
def exportVersionData(version,version_date,dir):
    ### Used by the module UI
    program_type,database_dir = unique.whatProgramIsThis(); parent_dir = ''
    if program_type == 'AltAnalyze': parent_dir = 'AltDatabase/goelite/'
    elif 'OBO' in dir or 'Config' in dir: parent_dir = ''
    else: parent_dir = database_dir
    dir = parent_dir+dir    
    global current_version; current_version = version
    global current_version_date; current_version_date = version_date
    new_file = dir+'version.txt'
    data = export.ExportFile(new_file)
    data.write(str(version)+'\t'+str(version_date)+'\n'); data.close()
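
The resulting version.txt is a single tab-delimited line; an illustrative call (arguments hypothetical):

exportVersionData('65', '3/14/2014', 'Config/')  ### writes version.txt under the resolved directory: 65<TAB>3/14/2014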
Example #26
def transposeMatrix(input_file):
    arrays = []
    import export
    eo = export.ExportFile(input_file[:-4] + '-transposed.txt')
    for line in open(input_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        values = string.split(data, '\t')
        arrays.append(values)
    t_arrays = zip(*arrays)
    for t in t_arrays:
        eo.write(string.join(t, '\t') + '\n')
    eo.close()
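
The transpose itself is the zip(*arrays) idiom: the i-th value of every row becomes the i-th output row. Note that zip truncates to the shortest row, so a ragged input silently loses trailing columns. A quick demonstration:

arrays = [['UID', 's1', 's2'], ['gene1', '1.0', '2.0']]
print zip(*arrays)  ### [('UID', 'gene1'), ('s1', '1.0'), ('s2', '2.0')]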
Example #27
def exportPathwayData(original_pathway_z_score_data, gene_file, headers,
                      resource_name, pathway_type):
    if resource_name == 'GeneOntology':
        resource_name = 'GO'  ### Makes the output filename compatible with GenMAPP-CS plugin filenames
    if resource_name == 'WikiPathways':
        resource_name = 'local'  ### Makes the output filename compatible with GenMAPP-CS plugin filenames
    new_file = mappfinder_output_dir + '/' + gene_file[:-4] + '-' + resource_name + '.txt'

    global sort_results
    data = export.ExportFile(new_file)
    data.write(headers)
    sort_results = []
    #print "Results for",len(original_pathway_z_score_data),"pathways exported to",new_file
    for pathway in original_pathway_z_score_data:
        zsd = original_pathway_z_score_data[pathway]
        try:
            results = [
                zsd.Changed(),
                zsd.Measured(),
                zsd.InPathway(),
                zsd.PercentChanged(),
                zsd.PercentPresent(),
                zsd.ZScore(),
                zsd.PermuteP(),
                zsd.AdjP()
            ]
        except AttributeError:
            print pathway, len(permuted_z_scores[pathway])
            kill ### undefined name; deliberately raises a NameError to halt execution here
        try:  ###This is unnecessary, unless using the non-nested GO associations (which can have out of sync GOIDs)
            if pathway_type == 'Ontology':
                s = ontology_annotations[pathway]
                annotations = [
                    s.OntologyID(),
                    s.OntologyTerm(),
                    s.OntologyType()
                ]
                results = annotations + results
            else:
                results = [pathway] + results
            results = string.join(results, '\t') + '\n'
            sort_results.append(
                [float(zsd.ZScore()), -1 / float(zsd.Measured()), results])
        except KeyError:
            null = []
    sort_results.sort()
    sort_results.reverse()
    for values in sort_results:
        results = values[2]
        data.write(results)
    data.close()
Example #28
def exportSymbolRelationships(pathway_to_symbol,selected_species,pathway_type,type):    
    if selected_species != None: ### Restrict to selected species only
        current_species_dirs=selected_species
    else:
        current_species_dirs = unique.read_directory('/'+database_dir)
    
    for species in current_species_dirs:
        if '.' not in species:
            ens_dir = database_dir+'/'+species+'/gene-'+type+'/Ensembl-'+pathway_type+'.txt'
            ens_data = export.ExportFile(ens_dir)
            if 'mapp' in type: ens_data.write('GeneID\tSystem\tGeneSet\n')
            else: ens_data.write('GeneID\tGeneSet\n')
            try: ens_to_entrez = gene_associations.getGeneToUid(species,('hide','Ensembl-EntrezGene'))
            except Exception: ens_to_entrez ={}
            if len(ens_to_entrez)>0:
                entrez_dir = database_dir+'/'+species+'/gene-'+type+'/EntrezGene-'+pathway_type+'.txt'
                entrez_data = export.ExportFile(entrez_dir)
                if 'mapp' in type: entrez_data.write('GeneID\tSystem\tGeneSet\n')
                else: entrez_data.write('GeneID\tGeneSet\n')
            #print 'Exporting '+pathway_type+' databases for:',species
            try: gene_to_source_id = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
            except Exception: gene_to_source_id={}
            source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
            source_to_gene = lowerSymbolDB(source_to_gene)
            for pathway in pathway_to_symbol:
                for symbol in pathway_to_symbol[pathway]:
                    try:
                        genes = source_to_gene[symbol]
                        for gene in genes:
                            if 'mapp' in type: ens_data.write(gene+'\tEn\t'+pathway+'\n')
                            else: ens_data.write(gene+'\t'+pathway+'\n')
                            if gene in ens_to_entrez:
                                for entrez in ens_to_entrez[gene]:
                                    if 'mapp' in type: entrez_data.write(entrez+'\tL\t'+pathway+'\n')
                                    else: entrez_data.write(entrez+'\t'+pathway+'\n')
                    except Exception: null=[]
            ens_data.close()
            try: entrez_data.close()
            except Exception: null=[]
Example #29
def exportOntologyRelationships(nested_ontology_gene,gene_to_source_id,mod,source_type,ontology_type):
    program_type,database_dir = unique.whatProgramIsThis()
    if ontology_type == 'GeneOntology': ontology_type = 'GO'
    new_file = database_dir+'/'+species_code+'/nested/'+mod+'_to_Nested-'+ontology_type+'.txt'
    data = export.ExportFile(new_file)
    title = [mod,'ontology_id']; title_str = string.join(title,'\t')
    data.write(title_str+'\n')
    for ontology_id in nested_ontology_gene:
        for gene in nested_ontology_gene[ontology_id]:
            output_list = [gene,ontology_id]
            output_str = string.join(output_list,'\t')
            data.write(output_str+'\n')
    data.close()
    print new_file, 'saved to disk'
Example #30
def filterPSIValues(filename, percentCutoff=0.75, filterStatus=True):
    """ Filter the PSI file to only include events in which >75% of the samples have PSI values """

    firstRow = True
    header = True
    rows = 0
    filtered = 0
    ### Filtered Export file
    new_file = filename[:-4] + '-' + str(int(100 * percentCutoff)) + 'p.txt'

    if filterStatus:
        ea = export.ExportFile(new_file)

    for line in open(filename, 'rU').xreadlines():
        data = line.rstrip()
        t = string.split(data, '\t')
        if header:
            header = False
            eventindex = t.index(
                'EventAnnotation'
            )  ### This field is the last annotation column before sample PSI values
            t = [t[1]] + t[eventindex + 1:]
            header_length = len(t) - 1
            minimum_values_present = int(
                float((header_length) - 1.0) * percentCutoff)
            not_detected = header_length - minimum_values_present
            new_line = line
            if filterStatus:
                ea.write(new_line)
        else:
            if filterStatus:  ### If sufficient PSI detected samples, write to new file
                t = [t[1]] + t[eventindex + 1:]
                missing_values_at_the_end = (header_length + 1) - len(t)
                missing = missing_values_at_the_end + t.count('')
                if missing < not_detected:
                    new_line = line
                    ea.write(new_line)
                    filtered += 1
        rows += 1

    if filterStatus:
        ea.close()
        return new_file, header_length
    else:
        ### Just return the number of samples in the file
        return header_length
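
A typical call, with an illustrative path; when filterStatus is True the function returns the filtered file path plus the sample count, otherwise only the sample count:

new_file, n_samples = filterPSIValues('/data/Hs_RNASeq_top_alt_junctions-PSI.txt')
### writes /data/Hs_RNASeq_top_alt_junctions-PSI-75p.txt, keeping events with PSI detected in roughly >=75% of samples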