def load(datadir, create_session=None):
    '''
    num_entries = load(datadir, create_session={backend.create_session})

    Load LOCATE database file information into local relational database.

    Parameters
    ----------
    datadir : str
        Path to directory containing database files.
    create_session : callable, optional
        Callable object which returns an sqlalchemy session

    Returns
    -------
    num_entries : int
        Number of entries loaded into the local database

    References
    ----------
    To download database files:
    http://locate.imb.uq.edu.au/downloads.shtml
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    # Load the mouse and human LOCATE files; each call returns the number of
    # entries it added.
    loaded = _loadfile(path.join(datadir, _mouse), 'Mus musculus', session)
    # FIX: the organism name had been mangled to 'H**o sapiens'; restored the
    # proper binomial name 'Homo sapiens'.
    loaded += _loadfile(path.join(datadir, _human), 'Homo sapiens', session)
    return loaded
def load(datadir, create_session=None, organism_set=None):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session}, organism_set={'Mus musculus', 'Homo sapiens'})

    Load uniprot into database.

    Parameters
    ----------
    datadir : str
        Directory containing the XML Uniprot file
    create_session : callable, optional
        a callable object that returns an sqlalchemy session
    organism_set : set of str, optional
        If not None, only organisms in this set will be loaded.
        Defaults to {'Mus musculus', 'Homo sapiens'}

    Returns
    -------
    nr_loaded : int
        Nr. of entries loaded. This double counts entries that are parsed
        both from SwissProt and from the ID mapping.
    '''
    from waldo.backend import call_create_session
    # FIX: the default used to be a mutable `set` default argument, which is
    # created once and shared across all calls (a classic Python pitfall);
    # build a fresh set per call instead. The species name had also been
    # mangled to 'H**o sapiens' — restored to 'Homo sapiens'.
    if organism_set is None:
        organism_set = set([u'Mus musculus', u'Homo sapiens'])
    session = call_create_session(create_session)
    loaded = _load_uniprot_sprot(datadir, session, organism_set)
    loaded += _load_idmapping(datadir, session, organism_set)
    loaded += _load_sec_ac(datadir, session)
    return loaded
def load(datadir, create_session=None):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session})

    Load the gene_association.mgi, MRK_ENSEMBL.rpt and MRK_Reference.rpt
    files from MGI into the local database.

    Parameters
    ----------
    datadir : str
        base directory for data; when None, the module default is used.
    create_session : callable, optional
        a callable object that returns an sqlalchemy session

    Returns
    -------
    loaded : int
        Nr of annotation entries loaded (the ENSEMBL marker and PubMed
        files are loaded for their side effects only).

    References
    ----------
    For the file formats see:
    ftp://ftp.informatics.jax.org/pub/reports/index.html
    http://www.geneontology.org/GO.format.gaf-1_0.shtml
    http://wiki.geneontology.org/index.php/GAF_2.0
    '''
    from waldo.backend import call_create_session
    if datadir is None:
        datadir = _datadir
    session = call_create_session(create_session)
    annotation_file = path.join(datadir, 'gene_association.mgi')
    loaded = _load_gene_annotation(annotation_file, session)
    _load_mrk_ensembl(path.join(datadir, 'MRK_ENSEMBL.rpt'), session)
    _load_pubmed_ids(path.join(datadir, 'MRK_Reference.rpt'), session)
    return loaded
def clear(create_session=None):
    '''
    clear()

    Removes all NOG related information from the database.
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    # Bulk-delete every NOG entry, then persist the removal.
    entries = session.query(models.NogEntry)
    entries.delete()
    session.commit()
def clear(create_session=None):
    '''
    clear()

    Removes all Sequence related information from the database.
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    # Bulk-delete every stored ENSEMBL sequence, then persist the removal.
    sequences = session.query(models.EnsemblSequence)
    sequences.delete()
    session.commit()
def load(datadir, create_session=None):
    '''
    num_entries = load(datadir={data/}, create_session={backend.create_session})

    Load the data from a subcellular location annotations file into the
    local relational database.

    Parameters
    ----------
    datadir : str
        Base directory containing the annotations file
    create_session : callable, optional
        Callable object which returns an sqlalchemy session
        (default: waldo.backend.create_session)

    Returns
    -------
    num_entries : int
        Number of entries loaded into the local database

    References
    ----------
    (none)
    '''
    import zipfile
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    # FIX: the zip archive and the inner file object were previously never
    # closed (resource leak); close them explicitly.
    zf = zipfile.ZipFile(path.join(datadir, _annot))
    try:
        inputf = zf.open(zf.filelist[0])
        try:
            csvreader = csv.reader(inputf, delimiter=',', quotechar='"')
            # Discard the header row up front instead of special-casing the
            # first loop iteration. FIX: with an empty file the old code
            # returned -1 (count - 1 with count == 0); now returns 0.
            next(csvreader, None)
            count = 0
            for row in csvreader:
                count += 1
                (gene, gene_name, main_loc, other_loc, expression_type,
                 reliability, main_loc_go, other_loc_go) = row
                locations = main_loc.split(";")
                if other_loc != "":
                    locations += other_loc.split(";")
                for name in locations:
                    session.add(models.Location(name, gene))
                session.add(models.Entry(gene))
            session.commit()
            return count
        finally:
            inputf.close()
    finally:
        zf.close()
def clear(create_session=None):
    '''
    clear()

    Removes all GO slim related information from the database.
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    # Bulk-delete every slim table, then persist the removal in one commit.
    for table in (SlimSet, SlimTerm, SlimMapping):
        session.query(table).delete()
    session.commit()
def clear(create_session=None):
    '''
    clear()

    Removes all GO related information from the database.
    '''
    from waldo.backend import call_create_session
    from . import models
    session = call_create_session(create_session)
    # Bulk-delete both GO tables, then persist the removal in one commit.
    for table in (models.Term, models.TermRelationship):
        session.query(table).delete()
    session.commit()
def clear(create_session=None):
    '''
    clear()

    Removes all LOCATE related information from the database.
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    # Bulk-delete every LOCATE table (same order as before: dependent tables
    # first, the main entry table last), then persist in one commit.
    for table in (models.Isoform,
                  models.Image,
                  models.LocatePrediction,
                  models.Literature,
                  models.LocateAnnotation,
                  models.ExternalReference,
                  models.LocateEntry):
        session.query(table).delete()
    session.commit()
def load(datadir, create_session=None, mouse_only=True):
    """
    nr_loaded = load(datadir, create_session={backend.create_session}, mouse_only=True)

    Load refseq <-> ENSEMBL id translations into the database.

    Parameters
    ----------
    datadir : str
        Directory containing the gene2ensembl.gz file
    create_session : callable, optional
        a callable that returns an sqlalchemy session
    mouse_only : bool, optional
        whether to only load mouse data

        Currently, only ``mouse_only=True`` is implemented!

    Returns
    -------
    nr_loaded : int
        Nr. of entries loaded
    """
    from waldo.backend import call_create_session
    # FIX: validate the unsupported option before opening the file, so a bad
    # call does not leak an open file handle.
    if not mouse_only:
        raise NotImplementedError("waldo.refseq.load: Cannot load non-mouse entries")
    filename = path.join(datadir, _inputfilename)
    session = call_create_session(create_session)
    # FIX: renamed `input` (shadowed the builtin) and close the file when done.
    infile = _gzip_open(filename)
    try:
        infile.readline()  # discard the header line
        nr_loaded = 0
        for line in infile:
            # FIX: corrected the misspelled `emsembl_trans` field name.
            (tax_id, gene_id, ensembl_gene, rna_accession, ensembl_trans,
             protein_accession, ensembl_peptide) = line.strip().split("\t")
            # Only mouse peptides (ENSMUSP...) are loaded.
            if ensembl_peptide.find("ENSMUSP") == -1:
                continue
            # Strip the version suffix (e.g. "NP_000001.2" -> "NP_000001").
            protein_accession, version = protein_accession.split(".")
            # Record the translation in both directions for both id kinds.
            session.add(Translation("ensembl:peptide_id", ensembl_peptide, "refseq:accession", protein_accession))
            session.add(Translation("refseq:accession", protein_accession, "ensembl:peptide_id", ensembl_peptide))
            session.add(Translation("ensembl:gene_id", ensembl_gene, "refseq:accession", protein_accession))
            session.add(Translation("refseq:accession", protein_accession, "ensembl:gene_id", ensembl_gene))
            session.commit()
            nr_loaded += 1
    finally:
        infile.close()
    return nr_loaded
def load(datadir, create_session=None, species=('Mus Musculus', 'Homo Sapiens')):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session},
                     species=('Mus Musculus', 'Homo Sapiens'))

    Load NOG entries file into database.

    Parameters
    ----------
    datadir : str
        Directory containing the maNOG.mapping.txt.gz file
    create_session : callable, optional
        a callable object that returns an sqlalchemy session
    species : sequence of str, optional
        species to load. NOTE(review): the default had been mangled to
        'H**o Sapiens'; restored to 'Homo Sapiens' — confirm this matches
        what `_accept_species` expects.

    Returns
    -------
    nr_loaded : integer
        Nr. of entries loaded
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    if datadir is None:
        datadir = _datadir
    nr_loaded = 0
    filename = path.join(datadir, _inputfilename)
    # FIX: the input file was previously never closed (resource leak).
    inputfile = _gzip_open(filename)
    try:
        inputfile.readline()  # discard the header line
        for line in inputfile:
            prot_name, start, end, group, description = \
                line.strip().split('\t')
            # The protein name is prefixed with a taxon id ("taxid.name").
            _, prot_name = prot_name.split('.')
            # The group field looks like "maNOGnnnnn"; keep the numeric id.
            group = int(group[len('maNOG'):])
            for sp in species:
                if _accept_species(sp, prot_name):
                    session.add(models.NogEntry(prot_name, group))
                    session.commit()
                    nr_loaded += 1
                    break
    finally:
        inputfile.close()
    return nr_loaded
def clear(create_session=None):
    '''
    clear(create_session={backend.create_session})

    Removes all Uniprot related information.

    Parameters
    ----------
    create_session : callable, optional
        callable which returns a session
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    # Bulk-delete every Uniprot table (same order as before: dependent tables
    # first, the main entry table last), then persist in one commit.
    for table in (models.Accession,
                  models.GoAnnotation,
                  models.Reference,
                  models.Comment,
                  models.Organism,
                  models.UniprotEntry):
        session.query(table).delete()
    session.commit()
def load(datadir, create_session=None):
    '''
    nr_entries = load(datadir, create_session={backend.create_session})

    Load Gene Ontology OBO file into database.

    Parameters
    ----------
    datadir : str
        Directory containing GO files
    create_session : callable, optional
        a callable object that returns an sqlalchemy session

    Returns
    -------
    nr_entries : int
        Nr of (non-obsolete) terms loaded
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    filename = path.join(datadir, _inputfilename)
    # NOTE(review): the existence check is for filename + '.gz', but
    # _gzip_open is called with the bare filename — presumably it appends
    # the suffix itself; confirm.
    if not path.exists(filename) and path.exists(filename + '.gz'):
        obo_file = _gzip_open(filename)
    else:
        obo_file = open(filename)
    try:
        loaded = 0
        for term in _parse_terms(obo_file):
            if term['is_obsolete']:
                continue
            term_id = term['id'][0]
            session.add(
                Term(id=term_id,
                     name=term['name'][0],
                     namespace=term['namespace'][0]))
            for rel in ('is_a', 'part_of'):
                for target in term[rel]:
                    # BUGFIX: relationships were previously created with a
                    # stale `id` variable that was always None; use the
                    # current term's id as the relationship source.
                    session.add(TermRelationship(term_id, target, rel))
            loaded += 1
            # This check is ugly, but commit() is rather slow.
            # The speed up of batching 512 terms per commit is worth it:
            if (loaded % 512) == 0:
                session.commit()
        session.commit()
    finally:
        obo_file.close()
    return loaded
def load(datadir, create_session=None):
    '''
    nr_loaded = load(datadir, create_session={backend.create_session})

    Load ENSEMBL FASTA file into database.

    Parameters
    ----------
    datadir : str
        Directory containing the Mus_musculus.*.pep.all.fa.gz FASTA file
    create_session : callable, optional
        a callable object that returns an sqlalchemy session

    Returns
    -------
    nr_loaded : integer
        Nr. of entries loaded
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    # Pick up the (single expected) ENSEMBL mouse peptide FASTA file.
    filename = glob.glob(path.join(datadir, 'Mus_musculus.*.pep.all.fa.gz'))[0]
    nr_loaded = 0
    for record in fasta.read(filename):
        tokens = record.header.split()
        peptide = tokens[0]
        gene = tokens[3]
        # The fourth header token carries the gene id as "gene:<id>".
        assert gene.startswith('gene:'), 'waldo.sequences.load'
        gene = gene[len('gene:'):]
        session.add(
            Translation(
                'ensembl:gene_id', gene,
                'ensembl:peptide_id', peptide))
        session.add(models.EnsemblSequence(peptide, record.sequence))
        session.commit()
        nr_loaded += 1
    return nr_loaded
def load(datadir, create_session=None):
    '''
    nr_mappings, nr_terms = load(datadir, create_session={backend.create_session})

    Load MGI GO SLIM file.

    Parameters
    ----------
    datadir : str
        Directory containing GO files
    create_session : callable, optional
        a callable object that returns an sqlalchemy session

    Returns
    -------
    nr_mappings : int
        Nr of GO -> slim mappings loaded
    nr_terms : int
        Nr of distinct slim terms seen

    (FIX: the docstring previously documented a single `nr_entries` return
    value, but the function returns this 2-tuple.)
    '''
    from waldo.backend import call_create_session
    session = call_create_session(create_session)
    filename = path.join(datadir, _inputfilename)
    aspects = {}
    slimset = SlimSet("mgi")
    session.add(slimset)
    loaded = 0
    # FIX: the input file was previously never closed (resource leak).
    infile = open(filename)
    try:
        infile.readline()  # discard the header line
        for line in infile:
            go_id, _, slim_id, _ = line.strip().split('\t')
            if slim_id not in aspects:
                term = SlimTerm(slim_id, "mgi")
                session.add(term)
                # Commit immediately so the database assigns term.id before
                # it is referenced by the mapping below.
                session.commit()
                aspects[slim_id] = term
            else:
                term = aspects[slim_id]
            mapping = SlimMapping(go_id, term.id)
            session.add(mapping)
            session.commit()
            loaded += 1
    finally:
        infile.close()
    return loaded, len(aspects)