Exemplo n.º 1
0
def exportMetaProbesets(array_type,species):
    import AltAnalyze; reload(AltAnalyze)
    import export
    probeset_types = ['core','extended','full']
    if array_type == 'junction': probeset_types = ['all']
    for probeset_type in probeset_types:
        exon_db,null = AltAnalyze.importSplicingAnnotations(array_type,species,probeset_type,'yes','')
        gene_db={}; null=[]
        for probeset in exon_db:
            ### At this point, exon_db is filtered by the probeset_type (e.g., core)
            ensembl_gene_id = exon_db[probeset].GeneID()
            try: gene_db[ensembl_gene_id].append(probeset)
            except Exception: gene_db[ensembl_gene_id] = [probeset]
            
        exon_db=[]; uid=0
        output_dir = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_'+array_type+'_'+probeset_type+'.mps'
        #output_cv_dir = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Conversion_'+array_type+'_'+probeset_type+'.txt'
        #data_conversion = export.ExportFile(output_cv_dir)
        data = export.ExportFile(output_dir)
        data.write('probeset_id\ttranscript_cluster_id\tprobeset_list\tprobe_count\n')
        print "Exporting",len(gene_db),"to",output_dir
        for ensembl_gene_id in gene_db:
            probeset_strlist = string.join(gene_db[ensembl_gene_id],' '); uid+=1
            line = string.join([str(uid),str(uid),probeset_strlist,str(len(gene_db[ensembl_gene_id])*4)],'\t')+'\n'
            data.write(line)
            #conversion_line = string.join([str(uid),ensembl_gene_id],'\t')+'\n'; data_conversion.write(conversion_line)
        data.close(); #data_conversion.close()
Exemplo n.º 2
0
def findSpeciesInUniProtFiles(force):
    """Write Config/uniprot-species-file.txt: species name, TaxID and UniProt file URL.

    Every file in the uniprot_temp directory is scanned line by line. 'OS' lines
    supply the species name (first two space-delimited words), 'OX' lines supply
    the TaxID (and, when that TaxID is in the GO-Elite config, the config's
    species name), and a line starting with '/' ends the record. Only two-word
    species names are kept.

    force -- when 'yes', the '_sprot_' files listed in the UniProt
             taxonomic_divisions FTP directory are downloaded into uniprot_temp/
             before parsing.
    """
    import AltAnalyze
    ### Get species annotations from the GO-Elite config
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()
    tax_db = {}
    for species_full in species_annot_db:
        tax_db[species_annot_db[species_full].TaxID()] = species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes its species to file associations or new species are supported by Ensembl
        import update
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions','_sprot_')
        for ftp_file in all_swissprot:
            gz_filepath, status = update.download(ftp_file,'uniprot_temp/','')
            if status == 'not-removed':
                try: os.remove(gz_filepath)
                except OSError: pass ### Best-effort cleanup (a failure was silently ignored before as well)

    url_prefix = 'ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'
    species_uniprot_db = {}
    for filename in read_directory('/uniprot_temp'):
        fn = filepath('uniprot_temp/'+filename)
        file_url = url_prefix+filename+'.gz'
        ### Start every file with an empty record so nothing leaks in from the
        ### tax_db loop above or from a previous file lacking a final terminator
        taxid = ''; species_full = ''
        uniprot_file = open(fn,'r')
        try:
            for line in uniprot_file:
                data = cleanUpLine(line)
                line_code = data[0:2]
                if line_code == 'OX':
                    taxid = data.split('=')[1][:-1]
                    if taxid in tax_db:
                        species_full = tax_db[taxid]
                elif line_code == 'OS':
                    species_full = ' '.join(data[5:].split(' ')[:2])
                elif data[0:1] == '/': ### Slice (not index) so an empty line cannot raise IndexError
                    ### Species name must be in the two-word format, e.g. Homo sapiens
                    if len(species_full.split(' ')) == 2:
                        species_uniprot_db.setdefault(species_full,[]).append((taxid,file_url))
                    taxid = ''; species_full = ''
        finally:
            uniprot_file.close()

    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        ### Default to the last candidate: that is what is written when no
        ### candidate can be confirmed against the GO-Elite config
        taxid, url = values[-1]
        if len(values) > 1:
            for candidate_taxid, candidate_url in values:
                if tax_db.get(candidate_taxid) == species_full:
                    taxid, url = candidate_taxid, candidate_url
                    print('ambiguity resolved: %s' % species_full)
                    break
        up.write('\t'.join([species_full,taxid,url])+'\n')
    up.close()
Exemplo n.º 3
0
def findSpeciesInUniProtFiles(force):
    """Write Config/uniprot-species-file.txt: species name, TaxID and UniProt file URL.

    Every file in the uniprot_temp directory is scanned line by line. 'OS' lines
    supply the species name (first two space-delimited words), 'OX' lines supply
    the TaxID (and, when that TaxID is in the GO-Elite config, the config's
    species name), and a line starting with '/' ends the record. Only two-word
    species names are kept.

    force -- when 'yes', the '_sprot_' files listed in the UniProt
             taxonomic_divisions FTP directory are downloaded into uniprot_temp/
             before parsing.
    """
    import AltAnalyze
    ### Get species annotations from the GO-Elite config
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()
    tax_db = {}
    for species_full in species_annot_db:
        tax_db[species_annot_db[species_full].TaxID()] = species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes its species to file associations or new species are supported by Ensembl
        import update
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions','_sprot_')
        for ftp_file in all_swissprot:
            gz_filepath, status = update.download(ftp_file,'uniprot_temp/','')
            if status == 'not-removed':
                try: os.remove(gz_filepath)
                except OSError: pass ### Best-effort cleanup (a failure was silently ignored before as well)

    url_prefix = 'ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'
    species_uniprot_db = {}
    for filename in read_directory('/uniprot_temp'):
        fn = filepath('uniprot_temp/'+filename)
        file_url = url_prefix+filename+'.gz'
        ### Start every file with an empty record so nothing leaks in from the
        ### tax_db loop above or from a previous file lacking a final terminator
        taxid = ''; species_full = ''
        uniprot_file = open(fn,'r')
        try:
            for line in uniprot_file:
                data = cleanUpLine(line)
                line_code = data[0:2]
                if line_code == 'OX':
                    taxid = data.split('=')[1][:-1]
                    if taxid in tax_db:
                        species_full = tax_db[taxid]
                elif line_code == 'OS':
                    species_full = ' '.join(data[5:].split(' ')[:2])
                elif data[0:1] == '/': ### Slice (not index) so an empty line cannot raise IndexError
                    ### Species name must be in the two-word format, e.g. Homo sapiens
                    if len(species_full.split(' ')) == 2:
                        species_uniprot_db.setdefault(species_full,[]).append((taxid,file_url))
                    taxid = ''; species_full = ''
        finally:
            uniprot_file.close()

    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        ### Default to the last candidate: that is what is written when no
        ### candidate can be confirmed against the GO-Elite config
        taxid, url = values[-1]
        if len(values) > 1:
            for candidate_taxid, candidate_url in values:
                if tax_db.get(candidate_taxid) == species_full:
                    taxid, url = candidate_taxid, candidate_url
                    print('ambiguity resolved: %s' % species_full)
                    break
        up.write('\t'.join([species_full,taxid,url])+'\n')
    up.close()
Exemplo n.º 4
0
def buildAltMouseExonAnnotations(species,array_type,force,genomic_build):
    """Code required to:
    1) Extract out Affymetrix provided exon sequence (probeset sequence extracted from "probeset_sequence_reversed.txt", derived
       directly from the Affymetrix AltMouse probe annotation file), from the "SEQUENCE-transcript-dbase.txt" (built using
       dump-chip1 .gff sequence and AltMerge-Peptide Informatics script "sequence_analysis_AltMouse_refseq.py").
    2) Once exported, grab full length exon sequences using exon/intron coordinates matches to full-length gene sequences with 2kb
       flanking sequence to efficiently predict microRNA binding site exclusion (reAnnotateCriticalExonSequences) and later for
       coordinate mapping to get exons aligning with UCSC annotated splicing annotations and exons. This sequence data replaced
       the previous file (don't need to re-run this - see rederive_exonseq == 'yes' below for reference).
    3) Match the updated exon sequences to the most recent genomic coordinates and build the exact equivalent of the exon array
       Mm_Ensembl_probeset.txt database (same structure and ExonArrayEnsemblRules.py code). This involves running EnsemblImport.
    This code should be run before the exon array location build code since the "Mm_Ensembl_probeset.txt" is created and then re-
    written as "Mm_AltMouse_Ensembl_probeset.txt".
    """
    
    from build_scripts import JunctionArray
    from build_scripts import JunctionArrayEnsemblRules    
    rederive_exonseq = 'no'
    ### Only needs to be run once, to export exon sequence for AltMouse array the original (1 and 2 above)
    if rederive_exonseq == 'yes':
        import AltAnalyze
        from import_scripts import ExonAnnotate_module
        from build_scripts import ExonAnalyze_module
        agglomerate_inclusion_probesets = 'no'; onlyAnalyzeJunctions='no'
        probeset_annotations_file = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
        verifyFile(probeset_annotations_file,array_type) ### Will force download if missing
        exon_db={}; filtered_arrayids={};filter_status='no'
        constituitive_probeset_db,exon_db,genes_being_analyzed = AltAnalyze.importSplicingAnnotationDatabase(probeset_annotations_file,array_type,filtered_arrayids,filter_status)
        alt_junction_db,critical_exon_db,exon_dbase,exon_inclusion_db,exon_db = ExonAnnotate_module.identifyPutativeSpliceEvents(exon_db,constituitive_probeset_db,{},agglomerate_inclusion_probesets,onlyAnalyzeJunctions)
        ExonAnnotate_module.exportJunctionComparisons(alt_junction_db,critical_exon_db,exon_dbase)
        print "Finished exporting junctions used in AltMouse array comparisons."

        ExonAnalyze_module.exportAltMouseExonSequence()
        JunctionArray.reAnnotateCriticalExonSequences(species,array_type)

    ### Get UCSC associations (download databases if necessary)
    mRNA_Type = 'mrna'; run_from_scratch = 'yes'
    export_all_associations = 'no' ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

    reannotate_exon_seq = 'yes'
    print 'genomic_build', genomic_build
    if genomic_build == 'new':
        ### Need to run with every new genomic build (match up new coordinates
        print "Begining to derive exon sequence from new genomic build"
        JunctionArray.identifyCriticalExonLocations(species,array_type)
        reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(species,array_type,reannotate_exon_seq,force)
    
    ### Download files required during AltAnalyze analysis but not during the database build process
    filename = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
    verifyFile(filename,array_type) ### Will force download if missing
    filename = "AltDatabase/"+species+'/'+ array_type+'/'+array_type+"_annotations.txt"
    verifyFile(filename,array_type) ### Will force download if missing
Exemplo n.º 5
0
def buildAltMouseExonAnnotations(species,array_type,force,genomic_build):
    """Code required to:
    1) Extract out Affymetrix provided exon sequence (probeset sequence extracted from "probeset_sequence_reversed.txt", derived
       directly from the Affymetrix AltMouse probe annotation file), from the "SEQUENCE-transcript-dbase.txt" (built using
       dump-chip1 .gff sequence and AltMerge-Peptide Informatics script "sequence_analysis_AltMouse_refseq.py").
    2) Once exported, grab full length exon sequences using exon/intron coordinates matches to full-length gene sequences with 2kb
       flanking sequence to efficiently predict microRNA binding site exclusion (reAnnotateCriticalExonSequences) and later for
       coordinate mapping to get exons aligning with UCSC annotated splicing annotations and exons. This sequence data replaced
       the previous file (don't need to re-run this - see rederive_exonseq == 'yes' below for reference).
    3) Match the updated exon sequences to the most recent genomic coordinates and build the exact equivalent of the exon array
       Mm_Ensembl_probeset.txt database (same structure and ExonArrayEnsemblRules.py code). This involves running EnsemblImport.
    This code should be run before the exon array location build code since the "Mm_Ensembl_probeset.txt" is created and then re-
    written as "Mm_AltMouse_Ensembl_probeset.txt".
    """
    
    import JunctionArray
    import JunctionArrayEnsemblRules    
    rederive_exonseq = 'no'
    ### Only needs to be run once, to export exon sequence for AltMouse array the original (1 and 2 above)
    if rederive_exonseq == 'yes':
        import AltAnalyze
        import ExonAnnotate_module
        import ExonAnalyze_module
        agglomerate_inclusion_probesets = 'no'; onlyAnalyzeJunctions='no'
        probeset_annotations_file = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
        verifyFile(probeset_annotations_file,array_type) ### Will force download if missing
        exon_db={}; filtered_arrayids={};filter_status='no'
        constituitive_probeset_db,exon_db,genes_being_analyzed = AltAnalyze.importSplicingAnnotationDatabase(probeset_annotations_file,array_type,filtered_arrayids,filter_status)
        alt_junction_db,critical_exon_db,exon_dbase,exon_inclusion_db,exon_db = ExonAnnotate_module.identifyPutativeSpliceEvents(exon_db,constituitive_probeset_db,{},agglomerate_inclusion_probesets,onlyAnalyzeJunctions)
        ExonAnnotate_module.exportJunctionComparisons(alt_junction_db,critical_exon_db,exon_dbase)
        print "Finished exporting junctions used in AltMouse array comparisons."

        ExonAnalyze_module.exportAltMouseExonSequence()
        JunctionArray.reAnnotateCriticalExonSequences(species,array_type)

    ### Get UCSC associations (download databases if necessary)
    mRNA_Type = 'mrna'; run_from_scratch = 'yes'
    export_all_associations = 'no' ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

    reannotate_exon_seq = 'yes'
    print 'genomic_build', genomic_build
    if genomic_build == 'new':
        ### Need to run with every new genomic build (match up new coordinates
        print "Begining to derive exon sequence from new genomic build"
        JunctionArray.identifyCriticalExonLocations(species,array_type)
        reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(species,array_type,reannotate_exon_seq,force)
    
    ### Download files required during AltAnalyze analysis but not during the database build process
    filename = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
    verifyFile(filename,array_type) ### Will force download if missing
    filename = "AltDatabase/"+species+'/'+ array_type+'/'+array_type+"_annotations.txt"
    verifyFile(filename,array_type) ### Will force download if missing
Exemplo n.º 6
0
def getUniProtURLsForAllSupportedSpecies():
    """Add the UniProt file URL of every supported species to the default file locations.

    Reads Config/uniprot-species-file.txt (tab-delimited: species name, TaxID,
    URL) and matches its entries to the species of the GO-Elite config, first
    by full species name and then, for species not matched by name, by TaxID.
    The species codes are grouped by URL; one UI.FileLocationData('ftp', url,
    'code1|code2|...') entry per URL is appended under the 'UniProt' key and
    the defaults are exported again through UI.exportDefaultFileLocations.
    """
    ### Import all UniProt supported species and URLs
    species_uniprot_db = {}
    fn = filepath('Config/uniprot-species-file.txt')
    species_file = open(fn, 'r')
    try:
        for line in species_file:
            data = cleanUpLine(line)
            if not data:
                continue  ### Skip blank lines instead of failing on the unpack below
            species_full, taxid, url = data.split('\t')
            if 'Homo sapiens' not in species_full:  ### There's a separate file for us humans (so egotistical!!!)
                species_uniprot_db[species_full] = taxid, url
    finally:
        species_file.close()

    import AltAnalyze
    ### Get species annotations from the GO-Elite config
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()

    ### Export all urls for currently supported species
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    ### Pass 1: species whose full name appears in the UniProt species file
    location_db = {}
    species_added = set()
    for species_full in species_annot_db:
        if species_full in species_uniprot_db:
            taxid, url = species_uniprot_db[species_full]
            species_code = species_annot_db[species_full].SpeciesCode()
            location_db.setdefault(url, []).append(species_code)
            species_added.add(species_full)

    ### Pass 2: remaining species, matched by TaxID. The URL may not have been
    ### registered in pass 1, so the list is created on demand (a plain
    ### location_db[url].append raised KeyError in that case)
    for species_full in species_annot_db:
        if species_full in species_added:
            continue
        taxid = species_annot_db[species_full].TaxID()
        species_code = species_annot_db[species_full].SpeciesCode()
        for species_name in species_uniprot_db:
            tax, url = species_uniprot_db[species_name]
            if tax == taxid:
                location_db.setdefault(url, []).append(species_code)
                print(species_code)

    for url in location_db:
        species = '|'.join(location_db[url])
        fl = UI.FileLocationData('ftp', url, species)
        try:
            file_location_defaults['UniProt'].append(fl)
        except KeyError:
            file_location_defaults['UniProt'] = [fl]
    UI.exportDefaultFileLocations(file_location_defaults)
Exemplo n.º 7
0
def getUniProtURLsForAllSupportedSpecies():
    """Add the UniProt file URL of every supported species to the default file locations.

    Reads Config/uniprot-species-file.txt (tab-delimited: species name, TaxID,
    URL) and matches its entries to the species of the GO-Elite config, first
    by full species name and then, for species not matched by name, by TaxID.
    The species codes are grouped by URL; one UI.FileLocationData('ftp', url,
    'code1|code2|...') entry per URL is appended under the 'UniProt' key and
    the defaults are exported again through UI.exportDefaultFileLocations.
    """
    ### Import all UniProt supported species and URLs
    species_uniprot_db={}
    fn=filepath('Config/uniprot-species-file.txt')
    species_file = open(fn,'r')
    try:
        for line in species_file:
            data = cleanUpLine(line)
            if not data: continue ### Skip blank lines instead of failing on the unpack below
            species_full,taxid,url = data.split('\t')
            if 'Homo sapiens' not in species_full: ### There's a separate file for us humans (so egotistical!!!)
                species_uniprot_db[species_full] = taxid,url
    finally:
        species_file.close()

    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo()

    ### Export all urls for currently supported species
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    ### Pass 1: species whose full name appears in the UniProt species file
    location_db={}; species_added=set()
    for species_full in species_annot_db:
        if species_full in species_uniprot_db:
            taxid,url = species_uniprot_db[species_full]
            species_code = species_annot_db[species_full].SpeciesCode()
            location_db.setdefault(url,[]).append(species_code)
            species_added.add(species_full)

    ### Pass 2: remaining species, matched by TaxID. The URL may not have been
    ### registered in pass 1, so the list is created on demand (a plain
    ### location_db[url].append raised KeyError in that case)
    for species_full in species_annot_db:
        if species_full in species_added: continue
        taxid = species_annot_db[species_full].TaxID()
        species_code = species_annot_db[species_full].SpeciesCode()
        for species_name in species_uniprot_db:
            tax,url = species_uniprot_db[species_name]
            if tax == taxid:
                location_db.setdefault(url,[]).append(species_code)
                print(species_code)

    for url in location_db:
        species = '|'.join(location_db[url])
        fl = UI.FileLocationData('ftp', url, species)
        try: file_location_defaults['UniProt'].append(fl)
        except KeyError: file_location_defaults['UniProt'] = [fl]
    UI.exportDefaultFileLocations(file_location_defaults)
Exemplo n.º 8
0
def importSplicingAnnotations(species,array_type,avg_all_for_ss):
    """Import the splicing annotations for this array type through AltAnalyze.

    'exon' and 'gene' arrays request the 'full' probeset set, every other array
    type requests 'all'. The module-level root_dir is passed along as the last
    argument. Returns the (exon_db, constitutive_probeset_db) pair.
    """
    probeset_type = 'all'
    if array_type in ('exon','gene'): probeset_type = 'full'
    annotations = AltAnalyze.importSplicingAnnotations(array_type,species,probeset_type,avg_all_for_ss,root_dir)
    exon_annotations,constitutive_annotations = annotations
    return exon_annotations,constitutive_annotations
Exemplo n.º 9
0
def remoteRun(fl,Species,Array_type,expression_threshold,filter_method_type,p_val,express_data_format,altanalyze_file_list,avg_all_for_ss):
  start_time = time.time()
  global p; global filter_method; global exp_data_format; global array_type; global species; global root_dir; global original_exp_threshold
  global normalization_method; global exon_exp_threshold; global gene_rpkm_threshold; global junction_exp_threshold
  global exon_rpkm_threshold; global gene_exp_threshold
  
  original_exp_threshold = expression_threshold
  aspire_output_list=[]; aspire_output_gene_list=[]
  filter_method = filter_method_type
  altanalyze_files = altanalyze_file_list
  p = p_val; species = Species; array_type = Array_type
  exp_data_format = express_data_format

  ### Define global variables from the object fl
  try: normalization_method = fl.FeatureNormalization()
  except Exception: normalization_method = 'NA'
  try: exon_exp_threshold = fl.ExonExpThreshold()
  except Exception: exon_exp_threshold = 0
  try: gene_rpkm_threshold = fl.RPKMThreshold()
  except Exception: gene_rpkm_threshold = 0
  root_dir = fl.RootDir()
  try: junction_exp_threshold = fl.JunctionExpThreshold()
  except Exception: junction_exp_threshold = 0
  try: exon_rpkm_threshold = fl.ExonRPKMThreshold()
  except Exception: exon_rpkm_threshold = 0
  try: gene_exp_threshold = fl.GeneExpThreshold()
  except Exception: gene_exp_threshold = 0
    
  if 'exon' in array_type: array_type = 'exon' ###In AnalayzeExpressionDataset module, this is named 'exon-array'
  
  global log_expression_threshold; global nonlog_exp_threshold; nonlog_exp_threshold = expression_threshold
  try: log_expression_threshold = math.log(expression_threshold,2)
  except Exception: log_expression_threshold = 0 ###Occurs if expression_threshold == 0
  
  import_dir = root_dir+'AltExpression/pre-filtered/expression/'; import_dir_dabg = root_dir+'AltExpression/pre-filtered/dabg/'
  try: dir_list = read_directory(import_dir)  #send a sub_directory to a function to identify all files in a directory
  except Exception: dir_list=[]
  try: dir_list2 = read_directory(import_dir_dabg)
  except Exception: dir_list2=[]

  if len(altanalyze_files) == 0: altanalyze_files = dir_list  ###if no filenames input

  if array_type == 'RNASeq':
      altmerge_db = root_dir+'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_junctions.txt'
  elif array_type != 'AltMouse': altmerge_db = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'
  else: altmerge_db = "AltDatabase/"+species+"/"+array_type+"/MASTER-probeset-transcript.txt"
  ###Import probe-level associations
  if array_type != 'AltMouse':
      exon_db,altmerge_constituitive = importSplicingAnnotations(species,array_type,avg_all_for_ss)
  else:
      exon_db,altmerge_constituitive = import_altmerge(altmerge_db,array_type) ### Prior to version 2.0, this function was distinct from that in AltAnalyze(), so replaced it for consistency
            
  global altanalzye_input; altanalyze_output=[]
  if len(dir_list)>0:
      for altanalzye_input in dir_list:    #loop through each file in the directory to output results
        if altanalzye_input in altanalyze_files:
            if altanalzye_input in dir_list2: analyze_dabg = 'yes'
            else: analyze_dabg = 'no'
            ind_start_time = time.time()
            array_db = import_dir + "/"+ altanalzye_input
            dabg_db = import_dir_dabg + "/"+ altanalzye_input
            #array_db = array_db[1:] #not sure why, but the '\' needs to be there while reading initally but not while accessing the file late
            #dabg_db = dabg_db[1:]
            dataset_name = altanalzye_input[0:-4] + '-'
            print "Begining to filter",dataset_name[0:-1]
            #print "Array type is:",array_type
            #print "Species is:", species
            #print "Expression format is:",exp_data_format
            #print "DABG p-value cut-off is:",p
            #print "Filter method is:",filter_method
            #print "Log2 expression cut-off is:",log_expression_threshold
            ###Import expression data and stats
            try:
                output_file = expr_analysis(array_db,dabg_db,altmerge_constituitive,exon_db,analyze_dabg)    #filter the expression data based on fold and p-value OR expression threshold
                altanalyze_output.append(output_file)
            except KeyError: print "Impropper array type (",dataset_name[0:-1],") for",array_type,species,'. Skipping array.'
            ind_end_time = time.time(); time_diff = int(ind_end_time-ind_start_time)
            
            #print dataset_name,"filtering finished in %d seconds" % time_diff
      end_time = time.time(); time_diff = int(end_time-start_time)
      #print "Filtering complete for all files in %d seconds" % time_diff

      AltAnalyze.clearObjectsFromMemory(exon_db)
      exon_db={}; altmerge_constituitive={}; constitutive_probeset_db={}
  else: print "No expression files to filter found..."
  return altanalyze_output
Exemplo n.º 10
0
def importSplicingAnnotations(species, array_type, avg_all_for_ss):
    """Import the splicing annotations for this array type through AltAnalyze.

    'exon' and 'gene' arrays request the 'full' probeset set, every other array
    type requests 'all'. The module-level root_dir is passed along as the last
    argument. Returns the (exon_db, constitutive_probeset_db) pair.
    """
    probeset_type = 'all'
    if array_type in ('exon', 'gene'):
        probeset_type = 'full'
    annotations = AltAnalyze.importSplicingAnnotations(
        array_type, species, probeset_type, avg_all_for_ss, root_dir)
    exon_annotations, constitutive_annotations = annotations
    return exon_annotations, constitutive_annotations
Exemplo n.º 11
0
def remoteRun(fl, Species, Array_type, expression_threshold,
              filter_method_type, p_val, express_data_format,
              altanalyze_file_list, avg_all_for_ss):
    start_time = time.time()
    global p
    global filter_method
    global exp_data_format
    global array_type
    global species
    global root_dir
    global original_exp_threshold
    global normalization_method
    global exon_exp_threshold
    global gene_rpkm_threshold
    global junction_exp_threshold
    global exon_rpkm_threshold
    global gene_exp_threshold

    original_exp_threshold = expression_threshold
    aspire_output_list = []
    aspire_output_gene_list = []
    filter_method = filter_method_type
    altanalyze_files = altanalyze_file_list
    p = p_val
    species = Species
    array_type = Array_type
    exp_data_format = express_data_format

    ### Define global variables from the object fl
    try:
        normalization_method = fl.FeatureNormalization()
    except Exception:
        normalization_method = 'NA'
    try:
        exon_exp_threshold = fl.ExonExpThreshold()
    except Exception:
        exon_exp_threshold = 0
    try:
        gene_rpkm_threshold = fl.RPKMThreshold()
    except Exception:
        gene_rpkm_threshold = 0
    root_dir = fl.RootDir()
    try:
        junction_exp_threshold = fl.JunctionExpThreshold()
    except Exception:
        junction_exp_threshold = 0
    try:
        exon_rpkm_threshold = fl.ExonRPKMThreshold()
    except Exception:
        exon_rpkm_threshold = 0
    try:
        gene_exp_threshold = fl.GeneExpThreshold()
    except Exception:
        gene_exp_threshold = 0

    if 'exon' in array_type:
        array_type = 'exon'  ###In AnalayzeExpressionDataset module, this is named 'exon-array'

    global log_expression_threshold
    global nonlog_exp_threshold
    nonlog_exp_threshold = expression_threshold
    try:
        log_expression_threshold = math.log(expression_threshold, 2)
    except Exception:
        log_expression_threshold = 0  ###Occurs if expression_threshold == 0

    import_dir = root_dir + 'AltExpression/pre-filtered/expression/'
    import_dir_dabg = root_dir + 'AltExpression/pre-filtered/dabg/'
    try:
        dir_list = read_directory(
            import_dir
        )  #send a sub_directory to a function to identify all files in a directory
    except Exception:
        dir_list = []
    try:
        dir_list2 = read_directory(import_dir_dabg)
    except Exception:
        dir_list2 = []

    if len(altanalyze_files) == 0:
        altanalyze_files = dir_list  ###if no filenames input

    if array_type == 'RNASeq':
        altmerge_db = root_dir + 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_junctions.txt'
    elif array_type != 'AltMouse':
        altmerge_db = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt'
    else:
        altmerge_db = "AltDatabase/" + species + "/" + array_type + "/MASTER-probeset-transcript.txt"
    ###Import probe-level associations
    if array_type != 'AltMouse':
        exon_db, altmerge_constituitive = importSplicingAnnotations(
            species, array_type, avg_all_for_ss)
    else:
        exon_db, altmerge_constituitive = import_altmerge(
            altmerge_db, array_type
        )  ### Prior to version 2.0, this function was distinct from that in AltAnalyze(), so replaced it for consistency

    global altanalzye_input
    altanalyze_output = []
    if len(dir_list) > 0:
        for altanalzye_input in dir_list:  #loop through each file in the directory to output results
            if altanalzye_input in altanalyze_files:
                if altanalzye_input in dir_list2: analyze_dabg = 'yes'
                else: analyze_dabg = 'no'
                ind_start_time = time.time()
                array_db = import_dir + "/" + altanalzye_input
                dabg_db = import_dir_dabg + "/" + altanalzye_input
                #array_db = array_db[1:] #not sure why, but the '\' needs to be there while reading initally but not while accessing the file late
                #dabg_db = dabg_db[1:]
                dataset_name = altanalzye_input[0:-4] + '-'
                print "Begining to filter", dataset_name[0:-1]
                #print "Array type is:",array_type
                #print "Species is:", species
                #print "Expression format is:",exp_data_format
                #print "DABG p-value cut-off is:",p
                #print "Filter method is:",filter_method
                #print "Log2 expression cut-off is:",log_expression_threshold
                ###Import expression data and stats
                try:
                    output_file = expr_analysis(
                        array_db, dabg_db, altmerge_constituitive, exon_db,
                        analyze_dabg
                    )  #filter the expression data based on fold and p-value OR expression threshold
                    altanalyze_output.append(output_file)
                except KeyError:
                    print "Impropper array type (", dataset_name[
                        0:
                        -1], ") for", array_type, species, '. Skipping array.'
                ind_end_time = time.time()
                time_diff = int(ind_end_time - ind_start_time)

                #print dataset_name,"filtering finished in %d seconds" % time_diff
        end_time = time.time()
        time_diff = int(end_time - start_time)
        #print "Filtering complete for all files in %d seconds" % time_diff

        AltAnalyze.clearObjectsFromMemory(exon_db)
        exon_db = {}
        altmerge_constituitive = {}
        constitutive_probeset_db = {}
    else:
        print "No expression files to filter found..."
    return altanalyze_output
def TargetScanImport(parse_sequences,force):
    """The TargetScan data is currently extracted from a cross-species conserved family file. This file only contains
    gene symbol, microRNA name and 3'UTR seed locations."""
    if species == 'Mm': tax = '10090'; prefix = 'mmu-'
    elif species == 'Hs': tax = '9606'; prefix = 'hsa-'
    elif species == 'Rn': tax = '10116'; prefix = 'rno-'
    else: prefix = 'hsa-'

    import AltAnalyze
    ###Get taxid annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        if species==species_annot_db[species_full].SpeciesCode():
            tax = species_annot_db[species_full].TaxID()
            
    global l
    
    ### See if the files are already there
    verifyTSG, target_scan_target_file = verifyExternalDownload('TargetScanGenes')
    verifyTSS, target_scan_sequence_file = verifyExternalDownload('TargetScanSequences')

    if verifyTSG == 'no' or verifyTSS == 'no': ### used to be - if force == 'yes'
        if parse_sequences == 'no':
            ### Then download the latest annotations and sequences
            target_scan_target_file = downloadFile('TargetScanGenes')
            target_scan_sequence_file = downloadFile('TargetScanSequences')

    ### Cross-species TargetScan file with UTR seqeunces for all genes with reported targets in the conserved family file
    ### Although this file includes valid sequence data that appears to match up to the target file, the target file
    ### appears to only list the seed seqeunce location (UTR start and stop) and not the full binding sequence and thus
    ### is not ammenable to probe set alignment.
    print 'parsing', target_scan_sequence_file
    fn=filepath(target_scan_sequence_file); x=0; target_scan_gene_utr_seq={}
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1
        else:
            symbol = string.upper(t[2]); tax_id = t[3]; utr_seq = t[4]
            if tax_id == tax:
                utr_seq_no_gaps = string.replace(utr_seq,'-','')
                utr_seq_no_gaps = string.replace(utr_seq_no_gaps,'U','T')
                if symbol in symbol_ensembl_current and len(utr_seq_no_gaps)>0:
                    target_scan_gene_utr_seq[symbol] = utr_seq_no_gaps
    print 'UTR sequence for',len(target_scan_gene_utr_seq),'TargetScan genes stored in memory.'
        
    mir_sequences = []; count=0
    print 'parsing', target_scan_target_file
    #verifyFile(target_scan_target_file,species) ### Makes sure file is local and if not downloads.
    fn=filepath(target_scan_target_file); x=0; k=[]; l=[]
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0:
            x=1
            data = string.lower(data)
            t = string.split(data,'\t')
            i=0
            for value in t:
                if 'mir' in value: m = i
                elif 'gene id' in value: g = i
                elif 'gene symbol' in value: s = i
                elif 'transcript' in value: r = i
                elif 'species id' in value: txi = i
                elif 'utr start' in value: us = i
                elif 'utr end' in value: ue = i
                i+=1
        else:
            mir = t[m]; geneid = t[g]; gene_symbol = string.upper(t[s]); taxid = t[txi]; utr_start = int(t[us]); utr_end  = int(t[ue])
            ### Old format
            #mir = t[0]; gene_symbol = string.upper(t[1]); taxid = t[2]; utr_start = t[3]; utr_end = t[4]
            if '/' in mir:
                mir_list=[]
                mirs = string.split(mir,'/')
                for mirid in mirs[1:]:
                    mirid = 'miR-'+mirid
                    mir_list.append(mirid)
                mir_list.append(mirs[0])
            else: mir_list = [mir]

            if taxid == tax: ###human
                #target_scan_gene_utr_seq[symbol] = utr_seq_no_gaps
                if gene_symbol in symbol_ensembl_current: ensembl_geneids = symbol_ensembl_current[gene_symbol]; proceed = 'yes'; k.append(gene_symbol)
                else: proceed = 'no'; l.append(gene_symbol)
                if gene_symbol in target_scan_gene_utr_seq:
                    ### TargetScan provides the core, while processed miRs are typically 22nt - seems to approximate other databases better
                    adj_start = utr_start-15
                    if adj_start < 0: adj_start=0
                    mir_sequences = target_scan_gene_utr_seq[gene_symbol][adj_start:utr_end+1]
                    #if string.lower(gene_symbol) == 'tns3' and mir == 'miR-182': print mir,gene_symbol,taxid,utr_start,utr_end,mir_sequences
                else: mir_sequences=[]
                ###Already multiple geneids associated with each symbol so don't need to worry about renundancy
                if proceed == 'yes':
                    for ensembl_geneid in ensembl_geneids:
                        for mir in mir_list:
                            #if ensembl_geneid == 'ENSG00000137815' and mir == 'miR-214': print mir,gene_symbol,taxid,utr_start,utr_end,mir_sequences,target_scan_gene_utr_seq[gene_symbol];sys.exit()
                            if parse_sequences == 'yes':
                                if (prefix+mir,ensembl_geneid) in combined_results:
                                    combined_results[(prefix+mir,ensembl_geneid)].append(mir_sequences); count+=1
                            else:
                                #if ensembl_geneid == 'ENSMUSG00000029467': print mir
                                y = MicroRNATargetData(ensembl_geneid,gene_symbol,mir_sequences,prefix+mir,'TargetScan')
                                count+=1
                                try: microRNA_target_db[prefix+mir].append(y)
                                except KeyError: microRNA_target_db[prefix+mir] = [y]
    k = unique.unique(k); l = unique.unique(l)
    print 'ensembls-found:',len(k),', not found:',len(l)
    print l[:10]
    print count, 'miRNA-target relationships added for TargetScan'
def pictarImport(parse_sequences,type,added):
    """Annotations originally from the file: ng1536-S3.xls, posted as supplementary data at:
    http://www.nature.com/ng/journal/v37/n5/suppinfo/ng1536_S1.html. The file being parsed here has been pre-matched to Ensembl IDs
    using the ExonModule of LinkEST, for human."""
    mir_sequences=[]
    if species == 'Mm': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-target-annotated.txt'; tax = '10090'
    else: filename = 'AltDatabase/miRBS/'+'Mm'+'/'+'pictar-target-annotated.txt'; tax = '10116'
        
    #if species == 'Hs': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-conserved-targets-2005.txt'; tax = '9606'
    if type == 'pre-computed':
        if species == 'Hs': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-conserved-targets-2005.txt'; tax = '9606'
    else:
        if species == 'Hs': filename = 'AltDatabase/miRBS/'+'Mm'+'/'+'pictar-target-annotated.txt'; tax = '9606'

    import AltAnalyze
    ###Get taxid annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        if species==species_annot_db[species_full].SpeciesCode():
            tax = species_annot_db[species_full].TaxID()
            
    print 'parsing', filename; count=0
    print 'len(symbol_ensembl)', len(symbol_ensembl)
    verifyFile(filename,species) ### Makes sure file is local and if not downloads.
    fn=filepath(filename); x=1
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1
        else:
            if species == 'Hs':
                if type == 'pre-computed':
                    ensembl_geneid, mir, mir_sequences = t; ensembl_geneids = [ensembl_geneid]
                else:
                    symbol=string.upper(t[2]);mir=t[6];mir_sequences=t[11]
                    if symbol in symbol_ensembl and len(symbol)>0: ensembl_geneids=symbol_ensembl[symbol]
                    else: ensembl_geneids=['']                    
            elif species == 'Mm':
                mm_symbol=string.upper(t[3]);mir=t[6];mir_sequences=t[11]; mir = string.replace(mir,'hsa','mmu')
                if mm_symbol in symbol_ensembl and len(mm_symbol)>0: ensembl_geneids=symbol_ensembl[mm_symbol]
                else: ensembl_geneids=['']
            elif species == 'Rn':
                mm_symbol=string.upper(t[3]);mir=t[6];mir_sequences=t[11]; mir = string.replace(mir,'hsa','rno')
                if mm_symbol in symbol_ensembl and len(mm_symbol)>0: ensembl_geneids=symbol_ensembl[mm_symbol]
                else: ensembl_geneids=['']
            else:
                mm_symbol=string.upper(t[3]);mir=t[6];mir_sequences=t[11]
                if mm_symbol in symbol_ensembl and len(mm_symbol)>0: ensembl_geneids=symbol_ensembl[mm_symbol]
                else: ensembl_geneids=['']                
            for ensembl_geneid in ensembl_geneids:
                if len(ensembl_geneid)>1 and (ensembl_geneid,mir) not in added:
                    if parse_sequences == 'yes':
                        if (mir,ensembl_geneid) in combined_results:
                            combined_results[(mir,ensembl_geneid)].append(string.upper(mir_sequences)); count+=1
                    else:
                        #if count < 800 and '-125b' in mir: print ensembl_geneid, mir, mm_symbol; count+=1
                        #elif count>799: kill
                        y = MicroRNATargetData(ensembl_geneid,'',mir,mir_sequences,'pictar'); count+=1
                        try: microRNA_target_db[mir].append(y)
                        except KeyError: microRNA_target_db[mir] = [y]
                        added[(ensembl_geneid,mir)]=[]
                    
    print count, 'miRNA-target relationships added for PicTar'
    return added