Example No. 1
def importBioMarkerGeneAssociations():
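    """ Import marker-to-gene associations from the tab-delimited BuildDBs/BioMarkers
    files and return a dictionary of marker -> gene symbols """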
    try:
        biomarker_files = unique.read_directory('BuildDBs/BioMarkers/')
    except Exception:
        biomarker_files = unique.read_directory('/BuildDBs/BioMarkers/')
    marker_symbol_db={}
    for file in biomarker_files:
        if '.txt' in file:
            fn = filepath('BuildDBs/BioMarkers/'+file)
            x=0 ### Reset for each file so every header row is parsed
            for line in open(fn,'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data,'\t')
                if x==0:
                    ### Header row: locate the marker-in and Symbol columns
                    x = 1; y=0
                    for i in t:
                        if 'marker-in' in i: mi = y
                        if 'Symbol' in i: sy = y
                        y+=1
                else:
                    ensembl = t[0]; symbol = string.lower(t[sy]); marker = t[mi]
                    markers = string.split(marker,'|')
                    for marker in markers:
                        try: marker_symbol_db[marker].append(symbol)
                        except Exception: marker_symbol_db[marker]=[symbol]
    marker_symbol_db = gene_associations.eliminate_redundant_dict_values(marker_symbol_db)
    return marker_symbol_db
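
### A minimal usage sketch (hypothetical; assumes populated tab-delimited files with
### 'Symbol' and 'marker-in' columns under BuildDBs/BioMarkers/):
# marker_db = importBioMarkerGeneAssociations()
# for marker in marker_db:
#     print marker, string.join(marker_db[marker], ', ')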
Example No. 2
def remoteIndexing(species, fl):
    """ Begin building strand gff and index files for SashimiPlot based on the AltAnalyze database
    exon, junction and gene annotations """

    global gff_export_obj
    try:
        ### When fl is a dataset information object
        countsFileDir = fl.CountsFile()  ### Counts file containing exon and junction positions
        root_dir = fl.RootDir()  ### Root folder location
    except Exception:
        ### A proper object may not be supplied with this information. Use the root directory alone to infer these
        root_dir = fl
        search_dir = root_dir + '/ExpressionInput'
        files = unique.read_directory(search_dir)  ### all files in ExpressionInput
        for file in files:
            if 'counts.' in file and 'steady-state.txt' not in file:
                countsFileDir = search_dir + '/' + file  ### counts file with exon positions

    PSIFileDir = root_dir + '/AltResults/AlternativeOutput/' + species + '_RNASeq_top_alt_junctions-PSI.txt'
    OutputDir = findParentDir(PSIFileDir)
    output = OutputDir + "events_sashimi.gff"
    gff_export_obj = open(output, 'w')

    ### Sometimes only junctions are in the count file so create a new file with detected junctions and all exons
    ### This information and the associated features are extracted from the counts file
    featuresEvaluated = extractFeatures(species, countsFileDir)

    ### Compile and export the coordinates to gff format and index these coordinates for fast retrieval by MISO
    Indexing(featuresEvaluated, PSIFileDir, output)
Example No. 3
def importWikiPathways(selected_species,force):
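    """ Convert WikiPathways GPML files to flat gene-association files and
    integrate them for the selected species """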
    if selected_species == None:
        selected_species = unique.read_directory('/'+database_dir)
    importSpeciesData()
    getSourceData()
    all_species = 'no'
    if force == 'yes':
        try:
            gene_associations.convertAllGPML(selected_species,all_species) ### Downloads GPMLs and builds flat files
            status = 'built'
        except IOError:
            print 'Unable to connect to http://www.wikipathways.org'
            status = 'failed'
    else:
        status = 'built' ### GPML flat files were built during a prior run
    if status == 'built':
        import BuildAffymetrixAssociations

        for species_code in selected_species:
            species_name = species_names[species_code]
            if status == 'built':          
                relationship_types = ['native','mapped']
                for relationship_type in relationship_types:
                    #print 'Processing',relationship_type,'relationships'
                    index=0
                    integrate_affy_associations = 'no'
                    incorporate_previous = 'yes'
                    process_affygo = 'no'
                    counts = BuildAffymetrixAssociations.importWikipathways(source_types,incorporate_previous,process_affygo,species_name,species_code,integrate_affy_associations,relationship_type,'over-write previous')
                    index+=1
    print 'Finished integrating updated WikiPathways'
Example No. 4
def importCircularRNAEvents(folder, circ_p):
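    """ Import circRNA differential-expression results from circRNA. text files
    and retain events with p-values below the circ_p cutoff """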
    dataset_events = {}
    files = unique.read_directory(folder)
    for file in files:
        if 'circRNA.' in file and '.txt' in file:
            events = []
            dataset = file[:-4]
            fn = unique.filepath(folder + '/' + file)
            firstRow = True
            for line in open(fn, 'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data, '\t')
                if firstRow:
                    index = 0
                    """ Standard Fields from MultiPath-PSI """
                    for i in t:
                        if 'PValue' == i:
                            pv = index
                        if 'logFC' == i:
                            lf = index
                        index += 1
                    firstRow = False
                else:
                    id = t[0]
                    pval = float(t[pv])
                    logFC = float(t[lf])
                    ci = circInformation(id, pval, logFC)
                    if pval < circ_p:
                        events.append(ci)
            dataset_events[dataset] = events
    return dataset_events
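
### A minimal usage sketch (hypothetical; folder path and cutoff are illustrative):
# dataset_events = importCircularRNAEvents('/data/study/circRNA-results', 0.05)
# for dataset in dataset_events:
#     print dataset, len(dataset_events[dataset]), 'significant circRNA events'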
Example No. 5
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    ### Prevent folder names from being included (keep only .txt files)
    dir_list2 = []
    for file in dir_list:
        if '.txt' in file: dir_list2.append(file)
    return dir_list2
Example No. 6
def combineDropSeq(input_dir):
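    """ Sum gene-level counts across every tab-delimited file in input_dir and
    export the combined matrix (samples are assumed to share the same order) """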
    import unique
    files = unique.read_directory(input_dir)
    combinedGeneExpression = {}
    for input_file in files:
        header = True
        if '.txt' in input_file:
            for line in open(input_dir + '/' + input_file, 'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data, '\t')
                if header:
                    header_row = line
                    samples = t[1:]
                    header = False
                else:
                    values = map(float, t[1:])
                    gene = t[0]
                    if gene in combinedGeneExpression:
                        prior_values = combinedGeneExpression[gene]
                        count_sum_array = [x + y for x, y in zip(prior_values, values)]
                    else:
                        count_sum_array = values
                    combinedGeneExpression[gene] = count_sum_array

    input_file = input_dir + '/test.txt'
    export_object = open(input_file, 'w')
    export_object.write(string.join(['UID'] + samples, '\t') + '\n')
    for gene in combinedGeneExpression:
        values = string.join(map(str, [gene] + combinedGeneExpression[gene]),
                             '\t')
        export_object.write(values + '\n')
    export_object.close()
Example No. 7
def getFiles(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = []
    ### Return all directory entries (files and folders alike)
    for entry in dir_list:
        dir_list2.append(entry)
    return dir_list2
Example No. 8
def readDirText(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = []
    ###Code to prevent folder names from being included
    for entry in dir_list:
        if entry[-4:] == ".txt": dir_list2.append(entry)
    return dir_list2
Example No. 9
def importReciprocalJunctions(inputpsi,PSIJunctions):
    ### Also include other predicted splicing events
    alt_dir = string.split(inputpsi,'AlternativeOutput')[0]+'AlternativeOutput'
    files = unique.read_directory(alt_dir)
    added=0
    already_added=0
    for file in files:
        if 'ASPIRE-exon-inclusion-results' in file or 'linearregres-exon-inclusion-results' in file:
            alt_exon_path = alt_dir+'/'+file
            header=True
            for line in open(alt_exon_path,'rU').xreadlines():
                line = line.rstrip(os.linesep)
                if header: header=False
                else:
                    t=string.split(line,'\t')
                    inclusion_junction = t[8]
                    exclusion_junction = t[10]
                    pair = inclusion_junction+' '+exclusion_junction
                    pair = string.replace(pair,':','__')
                    if pair in PSIJunctions:
                        already_added+=1
                    else:
                        PSIJunctions.append(pair)
                        added+=1
    return PSIJunctions
Example No. 10
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = []
    for entry in dir_list:
        if entry[-4:] == ".txt" or entry[-4:] == ".csv":
            dir_list2.append(entry)
    return dir_list2
Example No. 11
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = []
    for entry in dir_list:
        if entry[-4:] == ".txt" or entry[-4:] == ".all" or entry[
                -5:] == ".data" or entry[-3:] == ".fa":
            dir_list2.append(entry)
    return dir_list2
Example No. 12
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    ### TODO: prevent folder names from being included (the filter below is disabled)
    dir_list2 = []
    for entry in dir_list:
        #if entry[-4:] == ".txt" or entry[-4:] == ".all" or entry[-5:] == ".data" or entry[-3:] == ".fa":
        dir_list2.append(entry)
    return dir_list2
Example No. 13
def getFiles(sub_dir,directories=True):
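    """ Return folder names (directories=True) or file names (directories=False),
    using the presence of '.' in the entry name as the file heuristic """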
    dir_list = unique.read_directory(sub_dir); dir_list2 = []
    for entry in dir_list:
        if directories:
            if '.' not in entry: dir_list2.append(entry)
        else:
            if '.' in entry: dir_list2.append(entry)
    return dir_list2
Example No. 14
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    ### Prevent folder names from being included (keep only .txt, .sif, and .tab files)
    dir_list2 = []
    for file in dir_list:
        lf = string.lower(file)
        if '.txt' in lf or '.sif' in lf or '.tab' in lf: dir_list2.append(file)
    return dir_list2
Example No. 15
def read_directory(sub_dir):
    dir_list = unique.read_directory(sub_dir)
    dir_list2 = []
    ###Code to prevent folder names from being included
    for entry in dir_list:
        if entry[-4:] == ".txt" or entry[
                -4:] == ".csv" or ".ontology" in entry or '.obo' in entry:
            dir_list2.append(entry)
    return dir_list2
Example No. 16
def getValidExpFile(altanalyze_rawexp_dir):
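    """ Return the first raw expression file (exp.*) that is not a steady-state
    or feature file """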
    import unique
    dir_files = unique.read_directory(altanalyze_rawexp_dir)
    valid_file = ''
    for file in dir_files:
        if 'exp.' in file and 'state.txt' not in file and 'feature' not in file:
            valid_file = altanalyze_rawexp_dir + '/' + file
            break
    return valid_file
Example No. 17
def matrixImport(filename):
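    """ Import a tab-delimited expression matrix, grouping sample values either by
    'group:sample' headers or by a groups. file found under ExpressionInput """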
    matrix={}
    original_data={}
    headerRow=True
    for line in open(filename,'rU').xreadlines():
        original_line = line
        data = line.rstrip()
        values = string.split(data,'\t')
        if headerRow:
            group_db={}
            groups=[]
            if ':' in data:
                group_sample_list = map(lambda x: string.split(x,':'),values[1:])
                index=1
                for (g,s) in group_sample_list:
                    try: group_db[g].append(index)
                    except Exception: group_db[g] = [index]
                    index+=1
                    if g not in groups: groups.append(g)
            else:
                import ExpressionBuilder
                search_dir = string.split(filename,'AltResults')[0]+'ExpressionInput'
                files = unique.read_directory(search_dir)
                for file in files:
                    if 'groups.' in file:
                        sample_group_db = ExpressionBuilder.simplerGroupImport(search_dir+'/'+file)
                index=0
                for s in values[1:]:
                    g = sample_group_db[s]
                    try: group_db[g].append(index)
                    except Exception: group_db[g] = [index]
                    index+=1
                    if g not in groups: groups.append(g)
            headerRow = False
            grouped_values=[]
            original_data['header'] = original_line
        else:
            key = values[0]
            grouped_floats=[]
            float_values = []
            for g in groups: ### string values
                gvalues_list=[]
                for i in group_db[g]:
                    if values[i] != '0':
                        try: gvalues_list.append(float(values[i]))
                        except Exception: pass
                    else:
                        try: gvalues_list.append('') ### These are missing values
                        except Exception: pass
                grouped_floats.append(gvalues_list)
            matrix[key] = grouped_floats
            if '\n' not in original_line:
                original_line+='\n'
            original_data[key] = original_line
            last_line = line
    return matrix,original_data
Example No. 18
def deleteNestedOntologyFiles(ontology_type):
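    """ Delete the nested ontology files referencing the given ontology type from
    each species database directory """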
    program_type,database_dir = unique.whatProgramIsThis()
    current_species_dirs = unique.read_directory('/'+database_dir)
    for species_code in current_species_dirs:
        c = GrabFiles(); c.setdirectory('/'+database_dir+'/'+species_code+'/nested')
        if ontology_type == 'GeneOntology': ontology_type = 'GO'
        file_dirs = c.searchdirectory('-'+ontology_type) ### list all nested files referencing the Ontology type
        for file in file_dirs:
            try: os.remove(filepath(database_dir+'/'+species_code+'/nested/'+file))
            except Exception: null=[]
Example No. 19
def read_directory(sub_dir):
    try:
        dir_list = unique.read_directory(sub_dir)
    except Exception:
        dir_list = []  ### Directory does not exist
    dir_list2 = []
    ###Code to prevent folder names from being included
    for entry in dir_list:
        if entry[-4:] == ".txt" or entry[-4:] == ".csv":
            dir_list2.append(entry)
    return dir_list2
Example No. 20
def downloadDomainAssociations(selected_species):
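    """ Download Ensembl-Domain association files for each species and return
    (species, file path) tuples for the successful downloads """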
    paths=[]
    if selected_species != None: ### Restrict to selected species only
        current_species_dirs=selected_species
    else:
        current_species_dirs = unique.read_directory('/'+database_dir)
    for species in current_species_dirs:
        url = 'http://www.genmapp.org/go_elite/Databases/ExternalSystems/Domains/'+species+'_Ensembl-Domain.gz'
        fln,status = update.downloadSuppressPrintOuts(url,'BuildDBs/Domains/','txt')
        if 'Internet' not in status:
            paths.append((species,fln))
    return paths
Example No. 21
def transferGOSlimGeneAssociations(selected_species):
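    """ Copy the Ensembl GOSlim gene associations into each species gene-go
    directory and translate them to EntrezGene """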
    if selected_species != None: ### Restrict to selected species only
        current_species_dirs=selected_species
    else:
        current_species_dirs = unique.read_directory('/'+database_dir)
    for species_code in current_species_dirs:
        try:
            ens_go_file_dir = filepath(database_dir+'/'+species_code+'/gene-go/Ensembl-GOSlim.txt')
            goslim_ens_file = filepath(database_dir+'/'+species_code+'/uid-gene/Ensembl-goslim_goa.txt')
            export.copyFile(goslim_ens_file,ens_go_file_dir)
            translateToEntrezGene(species_code,ens_go_file_dir)
        except Exception: null=[]
Example No. 22
def considerOnlyMammalian(selected_species):
    supported_mammals = ['Am','Bt', 'Cf', 'Ch', 'Cj', 'Cp', 'Do', 'Ec', 'Ee', 'Et', 'Fc', 'Gg', 'Go', 'Hs',
                         'La', 'Ma', 'Md', 'Me', 'Mi', 'Ml', 'Mm', 'Oa', 'Oc','Og', 'Op', 'Pc', 'Pp',
                         'Pt', 'Pv', 'Rn', 'Sa', 'Ss', 'St', 'Tb', 'Tn', 'Tr', 'Ts', 'Tt', 'Vp']
    filtered_species=[]
    if selected_species == None:
        selected_species = unique.read_directory('/'+database_dir)
        
    for i in selected_species:
        if i in supported_mammals:
            filtered_species.append(i)
    return filtered_species
Example No. 23
def importDiseaseOntologyGeneAssocations():
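    """ Import Disease Ontology gene associations and return a dictionary of
    DO ID -> gene symbols """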
    disease_ontology_files = unique.read_directory('/BuildDBs/Disease')
    symbol_to_DO={}
    for file in disease_ontology_files:
        if '_do' in file:
            fn = filepath('BuildDBs/Disease/'+file)
            for line in open(fn,'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data,'\t')
                if len(t)>1:
                    symbol=string.lower(t[2]); doid = t[4]
                    try: symbol_to_DO[doid].append(symbol)
                    except Exception: symbol_to_DO[doid]=[symbol]
    return symbol_to_DO
Example No. 24
def searchDirectory(directory, var, secondary=None):
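    """ Return the path of the first file in directory whose name contains var
    (and secondary, if supplied) """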
    directory = unique.filepath(directory)

    files = unique.read_directory(directory)
    for file in files:
        if var in file:
            if secondary == None:
                return directory + '/' + file
            elif secondary in file:
                return directory + '/' + file

    ### Fallback if no file matched: return the path of the last file examined
    return directory + '/' + file
Example No. 25
def importMiRAssociations(selected_species,force):
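    """ Download (when force == 'yes') and import microRNA-gene associations for
    the selected or locally supported species """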
    supported_databases = unique.read_directory('/'+database_dir)
    if selected_species != None: ### Restrict by selected species
        supported_databases=selected_species

    missing_miR_associations=[]
    found_miR_associations=[]
    for species in supported_databases:
        if force == 'yes':
            try:
                fn = downloadMiRDatabases(species)
                found_miR_associations.append((species,fn))
            except Exception:
                missing_miR_associations.append(species)
                
    for (species,fn) in found_miR_associations:
        importMiRGeneAssociations(species,fn)
Example No. 26
def remoteSashimiPlot(species, fl, bamdir, genelis):
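    """ Generate SashimiPlots for the genes in genelis using the counts and PSI
    results under root_dir, renaming each output PDF by gene symbol """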
    global inputpsi
    global outputdir
    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        root_dir = fl
        search_dir = root_dir + '/ExpressionInput'
        files = unique.read_directory(search_dir)
        for file in files:
            if 'counts.' in file and 'steady-state.txt' not in file:
                countinp = search_dir + '/' + file

    inputpsi = root_dir + '/AltResults/AlternativeOutput/' + species + '_RNASeq_top_alt_junctions-PSI.txt'
    #outputdir=findParentDir(inputpsi)+"sashimiplots"
    #outputdir = root_dir + '/ExonPlots'
    outputdir = root_dir + '/SashimiPlots'
    try:
        os.mkdir(unique.filepath(outputdir))
    except Exception:
        pass
    #print bamdir
    #print countinp
    #print inputpsi
    #print genelis
    Sashimiplottting(bamdir, countinp, inputpsi, genelis)

    gene_label, gene_sym = genelist(inputpsi)
    for filename in os.listdir(outputdir):
        if '.pdf' in filename:
            newname = string.split(filename, ':')
            if newname[0] in gene_sym:
                new_filename = str(filename)
                if ':' in filename:
                    new_filename = string.split(filename, ':')[1]
                elif '\\' in filename:
                    new_filename = string.split(filename, '\\')[1]
                elif '/' in filename:
                    new_filename = string.split(filename, '/')[1]
                nnname = gene_sym[newname[0]] + '-SashimiPlot_' + new_filename
                os.rename(os.path.join(outputdir, filename),
                          os.path.join(outputdir, nnname))
            else:
                continue
Example No. 27
def remoteIndexing(species,fl):
    global export_in
    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        root_dir = fl
        search_dir = root_dir+'/ExpressionInput'
        files = unique.read_directory(search_dir)
        for file in files:
            if 'counts.' in file and 'steady-state.txt' not in file:
                countinp = search_dir+'/'+file
		    
    inputpsi = root_dir+'/AltResults/AlternativeOutput/'+species+'_RNASeq_top_alt_junctions-PSI.txt'
    outputdir=findParentDir(inputpsi)
    output=outputdir+"events_trial.gff"
    export_in=open(output,'w')
    Indexing(countinp,inputpsi,output)
Example No. 28
def exportIndexes(input_dir):
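    """ Build missing .bai index files for all BAM files in input_dir """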
    import unique
    bam_dirs = unique.read_directory(input_dir)
    print 'Building BAM index files',
    for file in bam_dirs:
        if string.lower(file[-4:]) == '.bam':
            bam_dir = input_dir+'/'+file
            bamf = pysam.AlignmentFile(bam_dir, "rb" )
            ### Is there an indexed .bai for the BAM? Check.
            try:
                for entry in bamf.fetch():
                    codes = map(lambda x: x[0],entry.cigar)
                    break
            except Exception:
                ### Make BAM Index
                print '.',
                bam_dir = str(bam_dir)
                #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
                pysam.index(bam_dir)
Example No. 29
def remoteIndexing(species,fl):
    global export_in
    try:
        countinp = fl.CountsFile()
        root_dir = fl.RootDir()
    except Exception:
        root_dir = fl
        search_dir = root_dir+'/ExpressionInput'
        files = unique.read_directory(search_dir)
        for file in files:
            if 'counts.' in file and 'steady-state.txt' not in file:
                countinp = search_dir+'/'+file
		    
    inputpsi = root_dir+'/AltResults/AlternativeOutput/'+species+'_RNASeq_top_alt_junctions-PSI.txt'
    outputdir=findParentDir(inputpsi)
    output=outputdir+"events_sashimi.gff"
    export_in=open(output,'w')
    
    ### Sometimes only junctions are in the count file so create a new file with detected junctions and all exons
    featuresEvaluated = extractFeatures(species,countinp)

    Indexing(featuresEvaluated,inputpsi,output)
Example No. 30
def importSplicingEvents(folder):
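    """ Import alternative splicing events from PSI. results files and return a
    dictionary of dataset -> EventInformation objects """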
    dataset_events = {}
    files = unique.read_directory(folder)
    for file in files:
        if 'PSI.' in file and '.txt' in file:
            events = []
            dataset = file[:-4]
            fn = unique.filepath(folder + '/' + file)
            firstRow = True
            for line in open(fn, 'rU').xreadlines():
                data = cleanUpLine(line)
                t = string.split(data, '\t')
                if firstRow:
                    index = 0
                    """ Standard Fields from MultiPath-PSI """
                    for i in t:
                        if 'Event-Direction' == i:
                            ed = index
                        if 'ClusterID' == i:
                            ci = index
                        if 'AltExons' == i:
                            ae = index
                        if 'EventAnnotation' == i:
                            ea = index
                        if 'Coordinates' == i:
                            co = index
                        index += 1
                    firstRow = False
                else:
                    id = t[0]
                    event_direction = t[ed]
                    clusterID = t[ci]
                    altExons = t[ae]
                    coordinates = t[co]
                    ei = EventInformation(id, event_direction, clusterID,
                                          altExons, coordinates)
                    events.append(ei)
            dataset_events[dataset] = events
    return dataset_events