def json_to_glutton(families) :
    tmp = {}
    bad_family_count = 0
    bad_gene_count = 0

    for famid in families :
        fam = []
        bad = False

        for geneid in families[famid] :
            genename,geneseq = families[famid][geneid]

            # if a sequence is unavailable, keep adding the remaining genes
            # anyway so the size of the whole bad family can be reported later
            if geneseq == 'Sequenceunavailable' :
                bad = True

            fam.append( Gene(genename, geneseq, id=geneid) )

        if not bad :
            tmp[famid] = GeneFamily(fam, id=famid)
        else :
            bad_family_count += 1
            bad_gene_count += len(fam)

    if bad_family_count > 0 :
        get_log().error("%d bad gene families (containing %d genes)" % (bad_family_count, bad_gene_count))

    return tmp
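# A minimal usage sketch for json_to_glutton (the input shape is inferred from
# the loop above; the ids and sequences are invented for illustration):
#
#   >>> fams = { 'fam1' : { 'ENSG01' : ('geneA', 'ATGGCC'),
#   ...                     'ENSG02' : ('geneB', 'ATGTTT') },
#   ...          'fam2' : { 'ENSG03' : ('geneC', 'Sequenceunavailable') } }
#   >>> db = json_to_glutton(fams)   # fam2 is counted as bad and logged
#   >>> sorted(db.keys())
#   ['fam1']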
def makedb(db_fname, nucleotide=False) :
    c = ["makeblastdb", "-in", db_fname, "-dbtype", "nucl" if nucleotide else "prot"]

    try :
        get_log().debug(" ".join(c))
        subprocess.check_output(c, stderr=subprocess.STDOUT, close_fds=True)

    except subprocess.CalledProcessError, cpe :
        get_log().fatal("%s returncode=%d\n%s" % (c[0], cpe.returncode, cpe.output))
        exit(1)
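# Example invocation (assumes the NCBI BLAST+ 'makeblastdb' binary is on the
# PATH; 'reference.fasta' is a hypothetical FASTA file):
#
#   >>> makedb('reference.fasta', nucleotide=True)
#   # runs: makeblastdb -in reference.fasta -dbtype nucl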
def download_database_biomart(species, release, database_name='ensembl', nucleotide=True) :
    mart_name, schema_name, mart_release, table_name, species_desc = get_all_species(get_marts(database_name), database_name)[species]

    if mart_release != release :
        get_log().fatal("requested release (%d) does not match current biomart release (%d)" % (release, mart_release))
        exit(1)

    seq = get_sequences(species, database_name, schema_name, table_name, nucleotide)
    homo = get_homology_info(species, database_name, schema_name, table_name, nucleotide)

    return group_into_families(seq, homo)
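# Hypothetical usage (the species and release values are illustrative; the
# result is whatever group_into_families() builds from sequences + paralogs):
#
#   >>> families = download_database_biomart('dipodomys_ordii', 70)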
def main() :
    args = handle_args(argv[1:])

    generic_options(args)

    start_time = time.time()

    get_log().info("this is glutton version %s" % str(glutton.__version__))

    ret = commands[argv[1]](args)

    get_log().info("%s took %s" % (argv[1], duration_str(time.time() - start_time)))

    return ret
def list_command(args) :
    log = get_log()
    e = EnsemblDownloader()

    suppression_defaults = {
            'ensembl'  : 70, # errors with 69 and older (missing columns)
            'metazoa'  : 17,
            'protists' : 17,
            'plants'   : 17,
            'bacteria' : 17,
            'fungi'    : 17,
        }

    if not args.suppress :
        args.suppress = suppression_defaults[args.database]

    log.info("listing species in %s database" % args.database)
    log.info("suppressing releases prior to %d" % args.suppress)

    try :
        pretty_print_table(
            ('Species', 'Release'),
            e.get_all_species(db=args.database, suppress=args.suppress))

    except EnsemblDownloadError, ede :
        log.fatal(ede.message)
        exit(1)
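# The expected output is a two-column listing, roughly of the form below
# (exact formatting depends on pretty_print_table; rows are illustrative):
#
#   Species             Release
#   dipodomys_ordii     70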
def get_sequences(species, database_name, schema_name, table_name, nucleotide) :
    global TIMEOUT

    log = get_log()
    log.debug("getting sequences...")

    payload_sequences = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="%s" formatter="FASTA" header="0" uniqueRows="0" count="" datasetConfigVersion="0.7">
    <Dataset name="%s" interface="default">
        <Attribute name="ensembl_gene_id" />
        <Attribute name="%s" />
    </Dataset>
</Query>"""

    # e.g. 'dipodomys_ordii': ('dordii_gene_ensembl', 'Dipodomys ordii genes (dipOrd1)')
    seq_type = 'coding' if nucleotide else 'peptide'

    query = payload_sequences % (database_name + '_mart' if database_name != 'ensembl' else 'default', table_name, seq_type)
    params = urllib.urlencode({ 'query' : query })

    log.debug(query)

    sequences = {}

    try :
        f = urllib2.urlopen(get_URL(database_name), params)

    except urllib2.URLError, ue :
        log.fatal("biomart sequence query error (%s)" % str(ue))
        exit(1)
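# For reference, a rendered sequence payload for the main 'ensembl' mart looks
# like this (table name taken from the example comment above, nucleotide=False):
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <!DOCTYPE Query>
#   <Query virtualSchemaName="default" formatter="FASTA" header="0" uniqueRows="0" count="" datasetConfigVersion="0.7">
#       <Dataset name="dordii_gene_ensembl" interface="default">
#           <Attribute name="ensembl_gene_id" />
#           <Attribute name="peptide" />
#       </Dataset>
#   </Query>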
def __init__(self, top_level_directory, reference_fname, min_length, min_hitidentity,
             min_hitlength, max_evalue, batch_size, min_alignidentity, min_alignoverlap) :
    self.directory = join(top_level_directory, 'alignments')

    self.min_length        = min_length        # glutton
    self.min_hitidentity   = min_hitidentity   # blast
    self.min_hitlength     = min_hitlength     # blast
    self.max_evalue        = max_evalue        # blast
    self.min_alignidentity = min_alignidentity # pagan
    self.min_alignoverlap  = min_alignoverlap  # pagan

    check_dir(self.directory, create=True)

    self.search = All_vs_all_search(batch_size)
    self.cleanup_files = []
    self.q = None

    self.lock = threading.Lock()
    self.complete_jobs = 0
    self.total_jobs = 0

    self.log = get_log()

    self.param = GluttonParameters(top_level_directory)
    self.db = GluttonDB(reference_fname)

    self.param.set_reference(self.db)
    self.resume = self.param.able_to_resume()

    self.info = GluttonInformation(self.directory, self.param, self.db, resume=self.resume)

    self.param.set_full_checksum()
def __init__(self, batch_size=100):
    self.nucleotide      = False
    self.min_hitidentity = None
    self.min_hitlength   = None
    self.max_evalue      = None
    self.batch_size      = batch_size

    self.log = get_log()
    self.cleanup_files = []
    self.gene_assignments = {}

    self.lock = threading.Lock()
    self.q = None
    self.total_jobs = 0
    self.complete_jobs = 0
def get_all_species_pycogent(db, suppress) :
    log = get_log()

    from cogent.db.ensembl import Species

    log.warning("pycogent cannot differentiate between ensembl projects, listing everything...")

    # annoying, but there seems to be no other way to programmatically get
    # a list of supported species
    return [ (i.split()[-1], 'unknown') for i in str(Species).split('\n')[3:-1] ]
def __init__(self, qtimeout=1, maxsize=0):
    if maxsize == 0 :
        maxsize = num_threads() * 2

    self.log = get_log()
    self.q = Queue.Queue(maxsize)
    self.workers = self._init_workers(num_threads())
    self.q_timeout = qtimeout
    self.running = False
    self.no_more_jobs = False
    self.jobs_completed = 0
    self.jobs_counter = itertools.count(start=1)

    self.start()
def __init__(self, fname=None) :
    self.fname = fname
    self.compression = ZIP_DEFLATED
    self.metadata = None
    self.data = None       # dict of famid -> GeneFamily obj (list of Genes)
    self.seq2famid = None  # dict of geneid -> famid
    self.dirty = False

    self.lock = threading.Lock()
    self.complete_jobs = 0
    self.total_jobs = 0

    self.log = get_log()

    if self.fname :
        self._read()

        if not self.is_complete() :
            self.log.warn("%s is not complete!" % self.fname)
def __init__(self, project_dir, create=False) :
    self.directory = project_dir
    self.create = create

    check_dir(self.directory, create=self.create)

    self.log = get_log()

    self.params = self.load(self.parameter_filename)

    if not self.params :
        self.params = {
                'db_species'      : None,
                'db_release'      : None,
                'db_filename'     : None,
                'db_checksum'     : None,
                'full_checksum'   : None,
                'sample_checksum' : None,
                'samples'         : {}
            }
def __init__(self, alignments_dir, parameters, db, resume=True) :
    self.directory = alignments_dir
    self.params = parameters
    self.db = db

    check_dir(self.directory)

    self.log = get_log()
    self.lock = threading.RLock() # a single function requires this to be an RLock rather than a plain Lock

    # the alignment procedure can take a long time, so everything needs to be
    # restartable; in addition, if we restart we need to be sure that the
    # parameters used are the same, i.e. the same reference database etc.
    self.contig_query_map = {}        # file id -> contig id -> query id (the file id is provided by the user, called a 'label')
    self.query_gene_map = {}          # query id -> (gene id, +/-) or None
    self.genefamily_filename_map = {} # gene family id -> filename

    if resume :
        self.read_progress_files()
def get_all_species(marts, database_name) :
    global TIMEOUT

    query = get_URL(database_name) + '?type=datasets&mart=%s'
    species = {}
    log = get_log()

    for k in marts :
        if not marts[k] :
            continue

        mart_name,schema_name,release = marts[k]

        try :
            f = urllib2.urlopen(query % mart_name, timeout=TIMEOUT)

        except urllib2.URLError, ue :
            log.fatal("biomart dataset listing timed out")
            exit(1)

        for line in f :
            line = line.strip()

            if not line :
                continue

            data = line.split('\t')

            db_name = data[1]
            description = data[2]

            if ' genes ' in description :
                species_name = description.split(' genes ')[0].lower().replace(' ', '_')
            else :
                species_name = db_name.split('_')[0]

            species[species_name] = (mart_name, schema_name, release, db_name, description)

        f.close()

    return species
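# Each entry maps a normalised species name to the mart metadata used by the
# query functions, e.g. (mart/schema names and release are illustrative):
#
#   species['dipodomys_ordii'] == ('ENSEMBL_MART_ENSEMBL', 'default', 70,
#                                  'dordii_gene_ensembl',
#                                  'Dipodomys ordii genes (dipOrd1)')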
def build_command(args) :
    log = get_log()
    gdb = GluttonDB()

    def _cleanup(sig, frame) : # renamed from 'signal' to avoid shadowing the signal module
        print >> stderr, "Killed by user, cleaning up..."
        gdb.stop()
        os._exit(0)

    signal.signal(signal.SIGINT, _cleanup)

    try :
        gdb.build(args.output,
                  args.species,
                  args.release,
                  args.database,
                  True, # not args.protein
                  args.download)

    except GluttonDBBuildError, nmgfe :
        log.fatal(nmgfe.message)
        exit(1)
def __init__(self, top_level_directory, reference_fname, assembler_name, protein_identity,
             alignment_length, min_gene_coverage, do_not_trim=False, testmode='none') :
    self.alignments_dir = join(top_level_directory, 'alignments')
    self.output_dir = join(top_level_directory, 'postprocessing')

    self.protein_identity = protein_identity
    self.alignment_length = alignment_length
    self.min_gene_coverage = min_gene_coverage
    self.trim = not do_not_trim
    self.testmode = testmode

    self.scaffold_dir = join(self.output_dir, 'scaffolds')
    self.genefamily_msa_dir = join(self.output_dir, 'genefamily_msa')
    self.gene_msa_dir = join(self.output_dir, 'gene_msa')

    check_dir(self.output_dir, create=True)
    check_dir(self.scaffold_dir, create=True)
    check_dir(self.genefamily_msa_dir, create=True)
    check_dir(self.gene_msa_dir, create=True)

    self.log = get_log()
    self.param = GluttonParameters(top_level_directory)
    self.db = GluttonDB(reference_fname)
    self.info = GluttonInformation(self.alignments_dir, self.param, self.db)

    # check that the alignments were performed with the same reference
    if not self.param.same_reference(self.db) :
        self.log.error("current reference is %s, alignments were performed using %s" % (reference_fname, self.db.filename))
        exit(1) # perhaps slightly overambitious to exit, a warning might suffice

    pending,failures = self.info.num_alignments_not_done()

    if pending != 0 :
        self.log.warn("%d alignments were not run!" % pending)

    self.assembler = AssemblerOutput(assembler_name)

    # e.g. query39806_orf1
    self.orfname_regex = re.compile("^(query\d+)\_orf(\d)$")
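# A quick doctest-style check of the ORF-name regex, using the example name
# from the comment above:
#
#   >>> m = re.match("^(query\d+)\_orf(\d)$", "query39806_orf1")
#   >>> m.groups()
#   ('query39806', '1')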
def get_marts(database_name) :
    global TIMEOUT

    if database_name :
        marts = { database_name : None }
    else :
        marts = {
            'ensembl'  : None,
            'metazoa'  : None,
            'plants'   : None,
            'protists' : None,
            'fungi'    : None,
            'bacteria' : None
        }

    log = get_log()
    query = get_URL(database_name) + '?type=registry'

    try :
        dom = parse(urllib2.urlopen(query, timeout=TIMEOUT))

    except urllib2.URLError, urle :
        log.fatal("biomart registry query timed out")
        exit(1)
def get_homology_info(species, database_name, schema_name, table_name, nucleotide) :
    global TIMEOUT

    log = get_log()
    log.debug("getting homology...")

    payload_homology = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="%s" formatter="TSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.7">
    <Dataset name="%s" interface="default">
        <Attribute name="ensembl_gene_id" />
        <Attribute name="%s" />
    </Dataset>
</Query>"""

    # is this always correct?
    short_species_name = table_name.split('_')[0]

    # this does not seem like a great idea, but until it breaks it will do...
    if '_eg_' in table_name :
        paralog_name = short_species_name + '_eg_paralog_gene'
    else :
        paralog_name = short_species_name + '_paralog_ensembl_gene'

    query2 = payload_homology % (database_name + '_mart' if database_name != 'ensembl' else 'default', table_name, paralog_name)
    params2 = urllib.urlencode({ 'query' : query2 })

    log.debug(query2)

    # make the api call
    try :
        f = urllib2.urlopen(get_URL(database_name), params2)

    except urllib2.URLError, ue :
        log.fatal("biomart homology query error (%s)" % str(ue))
        exit(1)
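# Illustration of the paralog attribute-name heuristic above (table names are
# invented to show both branches):
#
#   'dmelanogaster_eg_gene'  -> 'dmelanogaster_eg_paralog_gene'
#   'hsapiens_gene_ensembl'  -> 'hsapiens_paralog_ensembl_gene'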
def __init__(self) :
    self.log = get_log()
def __init__(self) :
    self.log = get_log()
    self.method = get_ensembl_download_method()

    self.log.info("ensembl download method is set to: %s" % self.method)
def __init__(self, callback) :
    self.state = Job.QUEUED
    self.log = get_log()
    self.callback = callback
def __init__(self, location=None) :
    self.binary_location = get_binary_path(self.name) if not location else location
    self.log = get_log()
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False) :
    log = get_log()

    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database

    if cogent.version_info != (1,5,3) :
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)

    release, db_name, db_details = get_missing_info(species, release, database_name)

    account = HostAccount(
                db_details['hostname'],
                db_details['username'],
                db_details['password'],
                port=db_details['port'])

    if Species.getSpeciesName(species) == 'None' : # this is not an error, it returns the string "None"
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)

    # DON'T TRY THIS AT HOME!
    #
    # pycogent searches for compara databases, but unfortunately finds more
    # than one; in this situation it just connects to the first, which is
    # always compara_bacteria. so one solution is to dig through the compara
    # object's internals to provide a connection to the correct database ...
    # obviously not the best solution, but at 6 lines of code definitely the
    # shortest ;-P
    #
    if db_name not in ('ensembl', 'bacteria') :
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")

        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        new_db_name = EnsemblDbName(compara.ComparaDb.db_name.Name.replace('bacteria', db_name))
        compara.ComparaDb._db = DbConnection(account=account, db_name=new_db_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)
    # end of DON'T TRY THIS AT HOME!

    genes = set()
    families = []

    stderr.write("\r[downloading %s] got %d sequences " % ("CDS" if nucleotide else "protein", len(genes)))

    for gene in genome.getGenesMatching(BioType='protein_coding') :
        stableid = gene.StableId

        # ignore genes that have already been seen as members of other gene families
        if stableid in genes :
            continue

        genes.add(stableid)

        paralogs = compara.getRelatedGenes(StableId=stableid, Relationship='within_species_paralog')

        current = []

        if paralogs is None :
            stderr.write("\r[downloading %s] got %d sequences " % ("CDS" if nucleotide else "protein", len(genes)))
            current.append((stableid, str(gene.CanonicalTranscript.Cds) if nucleotide else str(gene.CanonicalTranscript.ProteinSeq)))
        else :
            for paralog in paralogs.Members :
                paralogid = paralog.StableId
                genes.add(paralogid)

                stderr.write("\r[downloading %s] got %d sequences " % ("CDS" if nucleotide else "protein", len(genes)))

                try :
                    current.append((paralogid, str(paralog.CanonicalTranscript.Cds) if nucleotide else str(paralog.CanonicalTranscript.ProteinSeq)))

                except AttributeError :
                    log.fatal("pycogent did not find a canonical transcript for %s" % paralogid)
                    exit(1)

        families.append(current)

    stderr.write("\r[downloading %s] got %d sequences\n" % ("CDS" if nucleotide else "protein", len(genes)))

    return families
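# The return value is a list of gene families, each family being a list of
# (stable_id, sequence) tuples, e.g. (ids and sequences invented):
#
#   [ [ ('ENSG00000000001', 'ATGGCC...'), ('ENSG00000000002', 'ATGTTT...') ],
#     [ ('ENSG00000000003', 'ATGAAA...') ] ]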
def __init__(self):
    super(Pagan, self).__init__()

    self.log = get_log()
    self.protein_alignment_fname = None
    self.nucleotide_alignment_fname = None