def addStudy(session, study_id):
    """Load one study and its trees from the local phylesystem into the DB.

    Fetches the nexson for ``study_id``, creates a ``Study`` record with its
    curators, creates one ``Tree`` record per tree (linked to the
    ``Taxonomy`` rows of its mapped tips), stores the study-level JSON
    (minus the trees) on the study, and commits the session.

    :param session: database session; this function uses its ``add``,
        ``query`` and ``commit`` methods
    :param study_id: identifier of the study to load
    """
    # get latest version of nexson
    print("adding study {s}".format(s=study_id))
    phy = PhylesystemAPI(get_from="local")
    studyobj = phy.get_study(study_id)["data"]
    nexml = get_nexml_el(studyobj)
    year = nexml.get("^ot:studyYear")
    proposedTrees = nexml.get("^ot:candidateTreeForSynthesis")
    if proposedTrees is None:
        proposedTrees = []

    # create a new Study object
    new_study = Study(id=study_id, year=year)
    session.add(new_study)

    # get curator(s), noting that ot:curatorName might be missing,
    # a string or a list
    c = nexml.get("^ot:curatorName")
    print(u" ot:curatorName:  {c}".format(c=c))
    if c is None:
        # a study without curators must not crash the loop below
        curator_list = []
    elif isinstance(c, basestring):
        curator_list = [c]
    else:
        curator_list = c
    for curator in curator_list:
        test_c = session.query(Curator).filter(
            Curator.name == curator
        ).first()
        if test_c:
            print("curator {c} already exists".format(c=curator))
            new_study.curators.append(test_c)
        else:
            print("curator {c} does no exist".format(c=curator))
            new_study.curators.append(Curator(name=curator))

    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get("^ot:ottId")
        if ottID is not None:
            mapped_otus[oid] = ottID

    # The treebase id is a study-level property, so work it out once
    # instead of once per tree.
    tb_id = None
    datadeposit = nexml.get("^ot:dataDeposit")
    if datadeposit:
        url = datadeposit.get("@href")
        # the href may be absent or empty; re.match would raise on None
        if url:
            matchobj = re.match(u".+TB2:(.+)$", url)
            if matchobj:
                tb_id = matchobj.group(1)

    # iterate over trees and insert tree data
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        print(u" tree : {t}".format(t=tree_id))
        new_tree = Tree(
            tree_id=tree_id,
            study_id=study_id,
            proposed=tree_id in proposedTrees,
            data=json.dumps(tree),
        )

        # ott ids already looked up for this tree; several tips can map to
        # the same ott id and each taxon should be linked only once
        checked_ott_ids = set()
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get("@otu")
            # no @otu property on internal nodes
            if oid is None:
                continue
            ntips += 1
            ottID = mapped_otus.get(oid)
            if ottID is None or ottID in checked_ott_ids:
                continue
            checked_ott_ids.add(ottID)
            # check that this exists in the taxonomy
            # (it might not, if the ID has been deprecated)
            taxon = session.query(Taxonomy).filter(
                Taxonomy.id == ottID
            ).first()
            if taxon:
                new_tree.otus.append(taxon)
        new_tree.ntips = ntips

        # update with treebase id, if exists
        if tb_id is not None:
            new_tree.treebase_id = tb_id
        session.add(new_tree)

    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info); the key may be absent
    nexml.pop("treesById", None)
    new_study.data = json.dumps(nexml)
    session.commit()
def add_study(study_id):
    """Add one study and its trees from phylesystem to the database session.

    Fetches the nexson for ``study_id``, adds a ``Study`` record (with
    treebase id and curators when present) and one ``Tree`` record per tree
    to ``DBSession``. Each tree is linked to the ``Taxonomy`` rows in the
    lineages of its mapped tips. The study-level JSON (minus the trees) and
    the tree count are stored on the study.

    This function does not commit.
    NOTE(review): presumably the caller or a transaction manager commits
    ``DBSession`` -- confirm.

    :param study_id: identifier of the study to add
    :raises HTTPNotFound: if the study cannot be fetched from phylesystem
    """
    _LOG.debug('adding study {s}'.format(s=study_id))

    # get latest version of nexson
    # location of repo (test vs dev) dependent on peyotl config
    phy = create_phylesystem_obj()
    try:
        studyobj = phy.get_study(study_id)['data']
    except Exception:
        # not a bare except: KeyboardInterrupt/SystemExit must propagate
        _LOG.debug(
            'did not find study {s} in phylesystem'.format(s=study_id)
        )
        raise HTTPNotFound(
            "Study {s} not found in phylesystem".format(s=study_id)
        )
    nexml = get_nexml_el(studyobj)
    proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
    if proposedTrees is None:
        proposedTrees = []

    # create a new Study object
    new_study = Study(id=study_id)
    DBSession.add(new_study)

    # update with treebase id, if exists
    datadeposit = nexml.get('^ot:dataDeposit')
    if datadeposit:
        url = datadeposit.get('@href')
        if url:
            matchobj = re.match(u'.+TB2:(.+)$', url)
            if matchobj:
                new_study.treebase_id = matchobj.group(1)

    # get curator(s), noting that ot:curatorName might be missing,
    # a string or a list
    c = nexml.get('^ot:curatorName')
    if c is None:
        # a study without curators must not crash the loop below
        curator_list = []
    elif isinstance(c, basestring):
        curator_list = [c]
    else:
        curator_list = c
    for curator in curator_list:
        test_c = DBSession.query(Curator).filter(
            Curator.name == curator
        ).first()
        if test_c:
            _LOG.debug("curator {c} already exists".format(c=curator))
            new_study.curators.append(test_c)
        else:
            _LOG.debug("curator {c} does not yet exist".format(c=curator))
            new_study.curators.append(Curator(name=curator))

    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get('^ot:ottId')
        if ottID is not None:
            mapped_otus[oid] = ottID

    # ott id -> lineage, shared by all trees of the study so that each id
    # is checked against the taxonomy and expanded only once
    lineage_cache = {}

    # iterate over trees and insert tree data
    ntrees = 0
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        _LOG.debug(' tree : {t}'.format(t=tree_id))
        ntrees += 1
        new_tree = Tree(
            tree_id=tree_id,
            study_id=study_id,
            proposed=tree_id in proposedTrees,
            data=json.dumps(tree),
        )

        # get otus
        ottIDs = set()  # ott ids for this tree
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            # no @otu property on internal nodes
            if oid is None:
                continue
            ntips += 1
            ottID = mapped_otus.get(oid)
            if ottID is None:
                continue
            if ottID not in lineage_cache:
                # check that this exists in the taxonomy
                # (it might not, if the ID has been deprecated)
                taxon = DBSession.query(Taxonomy).filter(
                    Taxonomy.id == ottID
                ).first()
                if taxon:
                    lineage = get_lineage(ottID)
                    _LOG.debug(' lineage of {m} = {l}'.format(
                        m=ottID, l=lineage))
                    lineage_cache[ottID] = list(lineage)
                else:
                    lineage_cache[ottID] = []
            ottIDs.update(lineage_cache[ottID])
        new_tree.ntips = ntips

        for t in ottIDs:
            taxon = DBSession.query(Taxonomy).filter(
                Taxonomy.id == t
            ).first()
            new_tree.otus.append(taxon)

        # add the tree
        DBSession.add(new_tree)

    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info); the key may be absent
    nexml.pop('treesById', None)
    new_study.data = json.dumps(nexml)
    new_study.ntrees = ntrees
# For every study, build o_dict: otu id -> [original label, taxon name
# found on a tree node (or None), otu label], then visit the otus whose
# label differs from the name found on the tree.
phy = Phylesystem()
out = codecs.getwriter('utf-8')(sys.stdout)
for study_id, n in phy.iter_study_objs():
    otu_dict = gen_otu_dict(n)
    o_dict = {}
    for oid, o in otu_dict.items():
        try:
            lab = o[label_prop_name]
            orig = o[orig_prop_name]
            o_dict[oid] = [orig, None, lab]
        except KeyError:
            # otus lacking either property are skipped
            pass
    del otu_dict
    # iter_trees yields (trees_group_id, tree_id, tree) and iter_node
    # yields (node_id, node); unpack them so `node` is the node dict
    for trees_group_id, tree_id, tree in iter_trees(n):
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            if oid is not None:
                ott = node.get(tax_prop_name)
                if ott is not None:
                    try:
                        o_dict[oid][1] = ott
                    except KeyError:
                        e = 'study {f} node {n} refers to otu {o} which is not found.\n'
                        m = e.format(f=study_id, n=node.get('@id'), o=oid)
                        sys.stderr.write(m)
    for oid, v in o_dict.items():
        t = v[1]
        l = v[2]
        if l and (t != l):
            orig = v[0]
# For every study, build o_dict: otu id -> [original label, taxon name
# found on a tree node (or None), otu label], then visit the otus whose
# label differs from the name found on the tree.
out = codecs.getwriter('utf-8')(sys.stdout)
for study_id, n in phy.iter_study_objs():
    print(study_id)
    otu_dict = gen_otu_dict(n)
    o_dict = {}
    for oid, o in otu_dict.items():
        try:
            lab = o[label_prop_name]
            orig = o[orig_prop_name]
            o_dict[oid] = [orig, None, lab]
        except KeyError:
            # otus lacking either property are skipped
            pass
    del otu_dict
    for trees_group_id, tree_id, tree in iter_trees(n):
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            if oid is not None:
                ott = node.get(tax_prop_name)
                if ott is not None:
                    try:
                        o_dict[oid][1] = ott
                    except KeyError:
                        # the node points at an otu that was skipped above
                        # or does not exist
                        e = 'study {f} node {n} refers to otu {o} which is not found.\n'
                        m = e.format(f=study_id, n=node.get('@id'), o=oid)
                        sys.stderr.write(m)
    for oid, v in o_dict.items():
        t = v[1]
        l = v[2]
        if l and (t != l):
            orig = v[0]
def load_nexsons(connection,cursor,phy,config_obj,nstudies=None):
    """Load studies and trees from phylesystem into the study/tree tables.

    For each study yielded by ``phy.iter_study_objs()``: inserts the study
    row, sets its treebase id when one is found, inserts its curators,
    inserts one row per tree, then stores the study JSON (minus the trees)
    and the tree count. The property names seen on studies and trees are
    passed to ``load_properties`` at the end.

    :param connection: database connection; ``commit`` and ``rollback``
        are called on it
    :param cursor: database cursor used to execute the statements
    :param phy: object whose ``iter_study_objs()`` yields
        ``(study_id, studyobj)`` pairs
    :param config_obj: configuration object; table names are read from its
        ``database_tables`` section
    :param nstudies: optional maximum number of studies to load
    """
    # table names, SQL strings and the regex do not change per study
    STUDYTABLE = config_obj.get('database_tables','studytable')
    TREETABLE = config_obj.get('database_tables','treetable')
    insert_study_sql = ("INSERT INTO {tablename} (id) "
        "VALUES (%s);"
        .format(tablename=STUDYTABLE)
        )
    update_treebase_sql = ("UPDATE {tablename} "
        "SET treebase_id=%s "
        "WHERE id=%s;"
        .format(tablename=STUDYTABLE)
        )
    insert_tree_sql = ("INSERT INTO {tablename} "
        "(tree_id,study_id,ntips,proposed,data) "
        "VALUES (%s,%s,%s,%s,%s);"
        .format(tablename=TREETABLE)
        )
    update_study_sql = ("UPDATE {tablename} "
        "SET data=%s,ntrees=%s "
        "WHERE id=%s;"
        .format(tablename=STUDYTABLE)
        )
    treebase_pattern = re.compile(u'.+TB2:(.+)$')

    counter = 0
    study_properties = set()
    tree_properties = set()
    for study_id, studyobj in phy.iter_study_objs():
        nexml = get_nexml_el(studyobj)
        study_properties.update(nexml.keys())

        proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
        if proposedTrees is None:
            proposedTrees = []

        # must insert study before trees
        cursor.execute(insert_study_sql,(study_id,))
        connection.commit()

        # update with treebase id, if exists
        datadeposit = nexml.get('^ot:dataDeposit')
        if datadeposit:
            url = datadeposit.get('@href')
            # the href may be absent or empty; re.match would raise on None
            if url:
                matchobj = treebase_pattern.match(url)
                if matchobj:
                    tb_id = matchobj.group(1)
                    cursor.execute(update_treebase_sql,(tb_id,study_id))
                    connection.commit()

        # get curator(s), noting that ot:curatorName might be missing,
        # a string or a list
        c = nexml.get('^ot:curatorName')
        if c is None:
            curators = []
        elif isinstance(c,basestring):
            curators = [c]
        else:
            # remove duplicates
            curators = list(set(c))
        insert_curators(connection,cursor,config_obj,study_id,curators)

        # iterate over trees and insert tree data
        # note that OTU data done separately as COPY
        # due to size of table (see script <scriptname>)
        ntrees = 0
        try:
            for trees_group_id, tree_id, tree in iter_trees(studyobj):
                ntrees += 1
                tree_properties.update(tree.keys())
                proposedForSynth = tree_id in proposedTrees
                treejson = json.dumps(tree)
                # tips are the nodes carrying an @otu property
                # (internal nodes have none)
                ntips = sum(
                    1 for node_id, node in iter_node(tree)
                    if node.get('@otu') is not None
                )
                data = (tree_id,study_id,ntips,proposedForSynth,treejson)
                cursor.execute(insert_tree_sql,data)
                connection.commit()
        except psy.Error as e:
            print(e.pgerror)
            # NOTE(review): assumes psy is psycopg2 -- after a failed
            # statement the transaction stays aborted until rolled back,
            # which would make the study UPDATE below fail as well
            connection.rollback()

        # now that we have added the tree info, update the study record
        # with the json data (minus the tree info) and ntrees;
        # the key may be absent
        nexml.pop('treesById', None)
        studyjson = json.dumps(nexml)
        cursor.execute(update_study_sql,(studyjson,ntrees,study_id))
        connection.commit()

        counter += 1
        if counter%500 == 0:
            print("loaded {n} studies".format(n=counter))
        if nstudies and counter >= nstudies:
            print("finished inserting {n} studies".format(n=nstudies))
            break

    # load the tree and study properties
    PROPERTYTABLE = config_obj.get('database_tables','propertytable')
    load_properties(
        connection,
        cursor,
        PROPERTYTABLE,
        study_properties,
        tree_properties)