def test_unique_kegg_names():
    """Print how often values of the ``tax`` column repeat in the KEGG list.

    Fetches the tab-separated organism list through ``Kegg.get_org_list()``,
    tallies the ``tax`` field of every row, then prints the highest
    repetition count followed by, for every count from that maximum down to
    2, how many distinct values occur exactly that many times.

    Nothing is asserted or returned; the report goes to standard output.
    """
    # Imported locally, as in the original, so the function does not depend
    # on a module-level import of Counter.
    from collections import Counter

    kegg_list = Kegg.get_org_list()
    rows = csv.DictReader(kegg_list.split('\n'), delimiter='\t',
                          fieldnames=('code', 'org', 'tax'))
    tax_counts = Counter(row['tax'] for row in rows)

    # An empty listing has no maximum; report 0 instead of letting max()
    # raise ValueError on an empty sequence.
    max_val = max(tax_counts.values()) if tax_counts else 0
    # Single-argument print(...) produces the same output whether print is a
    # statement or a function.
    print("Max val: %s" % max_val)

    # Tally the tallies once rather than rescanning every count per level.
    values_per_count = Counter(tax_counts.values())
    for occurrences in range(max_val, 1, -1):
        print("# %s  count: %s" % (occurrences, values_per_count[occurrences]))
def initdb():
    """Create the database and seed it.

    After ``db.create_all()``, the KEGG organism list is read row by row.
    For each row whose ``org`` matches an organism present in
    ``BiomodelMongo`` and whose ``tax`` has not been inserted yet, one
    ``Organism`` and one ``Biomodel`` are saved. A ``User`` record is saved
    at the end.
    """
    db.create_all()

    logging.info('Getting kegg organism list...')
    kegg_list = Kegg.get_org_list()

    # Restrict the query to two fields with only(); per the original note,
    # loading whole documents ends in a memory error.
    names_by_organism = {
        model.organism: model.name
        for model in BiomodelMongo.objects.only('organism', 'name').all()
    }
    mongo_orgs = set(names_by_organism)

    # Names already inserted, so the same name is never inserted twice.
    kegg_names = set()

    logging.info('Insertion begins...')
    reader = csv.DictReader(kegg_list.split('\n'), delimiter='\t',
                            fieldnames=('code', 'org', 'tax'))
    for record in reader:
        code = record['code']
        org = record['org']
        tax = record['tax']

        # Skip organisms absent from Mongo and names that were already used.
        if org not in mongo_orgs or tax in kegg_names:
            continue

        Organism(code, org, tax).save()
        Biomodel(name=tax, kegg_org=org).save()
        # Remember the name only once both records have been saved.
        kegg_names.add(tax)

    # Insert the user.
    User(username='******', email='*****@*****.**', password='******').save()

    logging.info('Done !')