from functools import reduce


def buildContentSummary(categories, categoryData, database):
    # Builds the content summary for a database: for each classified
    # category, gather the unique documents its keywords matched and
    # hand them to the crawler.
    iters = 2 if len(categories) > 1 else 1
    keywords = [TAXONOMY.get(cat) for cat in categories[:iters]]
    for i in range(iters):
        # Flatten the keyword lists of this category and those after it.
        keys = reduce(list.__add__, keywords[i:])
        urls = getUniqueDocs(keys, categoryData)
        logger("Building the content summary for " + categories[i] +
               ". Total docs to fetch: " + str(len(urls)), highlight=True)
        crawler.getContentSummary(database, categories[i], urls, categoryData)
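# A minimal sketch of the keyword flattening above, assuming TAXONOMY maps
# each category name to its list of probe keywords (all entries below are
# hypothetical; only the structure matters).
def _demo_flatten_keywords():
    from functools import reduce
    TAXONOMY = {"Root": ["Computers", "Health", "Sports"],
                "Health": ["Fitness", "Diseases"]}
    keywords = [TAXONOMY.get(cat) for cat in ["Root", "Health"]]
    # The "Root" summary (i=0) is probed with its own keywords plus those
    # of every category after it; "Health" (i=1) only with its own.
    return reduce(list.__add__, keywords[0:])
    # -> ['Computers', 'Health', 'Sports', 'Fitness', 'Diseases']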
def _sorted_homologenes(self, homologenes):
    '''Sort a list of homologenes [(taxid, geneid), ...] based on the
    species order defined in species_li.
    '''
    # Map each tax_id to its rank in the preferred species order.
    d = {}
    for i, species in enumerate(list(TAXONOMY.keys())):
        d[TAXONOMY[species]["tax_id"]] = i
    # Taxids missing from TAXONOMY fall back to the taxid itself as the
    # sort key, so they sort after all known species.
    gene_li = [(d.get(taxid, taxid), taxid, geneid)
               for taxid, geneid in homologenes]
    return [g[1:] for g in sorted(gene_li)]
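# A standalone sketch of the sort above, assuming TAXONOMY is an ordered
# mapping of species name -> {"tax_id": ...}; the taxids and gene ids
# below are illustrative only.
def _demo_sorted_homologenes():
    TAXONOMY = {"human": {"tax_id": 9606}, "mouse": {"tax_id": 10090}}
    rank = {info["tax_id"]: i for i, info in enumerate(TAXONOMY.values())}
    homologenes = [(10090, 17869), (7227, 31619), (9606, 4609)]
    # Known taxids sort by species rank; 7227 is unknown here, so it uses
    # the taxid itself and lands last.
    ordered = sorted((rank.get(t, t), t, g) for t, g in homologenes)
    return [x[1:] for x in ordered]
    # -> [(9606, 4609), (10090, 17869), (7227, 31619)]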
def classifyDb(database, Tc=100, Ts=0.6):
    # Classifies a database based on the values of the coverage
    # threshold (Tc) and the specificity threshold (Ts).
    categories, categoryData = ["Root"], {}
    # New categories are appended while iterating, so the loop walks the
    # taxonomy top-down starting from "Root".
    for cat in categories:
        logger("Analyzing " + cat + " category")
        filename = cat.lower() + ".txt"
        keywords = TAXONOMY.get(cat)
        if keywords:
            queryUrlMap = buildQueryUrlMap(database, filename)
            categoryData.update(queryUrlMap)
            # Total number of matching documents per keyword.
            keywordCount = {k: sum(q["count"] for q in queryUrlMap[k].values())
                            for k in keywords}
            N = float(sum(keywordCount.values()))
            for k, v in keywordCount.items():
                logger("Coverage for {0}: {1}, Specificity: {2}"
                       .format(k, v, v / N))
                if v >= Tc and v / N >= Ts:
                    logger(">>>>>> Adding " + k + " to category <<<<<<")
                    categories.append(k)
    return (categories, categoryData)
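# A minimal sketch of the decision rule above with hypothetical counts:
# a keyword becomes a new category only if it matched at least Tc
# documents (coverage) and accounts for at least a Ts fraction of all
# matches under its parent (specificity).
def _demo_classification_rule(Tc=100, Ts=0.6):
    keywordCount = {"Health": 180, "Sports": 40, "Computers": 20}
    N = float(sum(keywordCount.values()))  # 240.0
    return [k for k, v in keywordCount.items() if v >= Tc and v / N >= Ts]
    # -> ['Health']  (180 >= 100 and 180 / 240 = 0.75 >= 0.6)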
def __init__(self, data_folder):
    # If species_li is None, include all species.
    self.set_species_li(list(TAXONOMY.keys()))
    self.data_folder = data_folder
    self.datafile = os.path.join(self.data_folder, self.DATAFILE)
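# A minimal sketch of the class context assumed by __init__ above:
# DATAFILE is a class attribute naming the data dump and set_species_li
# stores the preferred species ordering later used by _sorted_homologenes
# (both hypothetical beyond what these snippets show).
import os

class _ParserContextSketch:
    DATAFILE = "homologene.data"  # hypothetical filename

    def set_species_li(self, species_li):
        # Remember the preferred species order for sorting.
        self.species_li = species_li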