def condense_ids(df, tax_dict, ranks, max_group_size,
                 threshold_assignments=False):
    """
    Add a condensed id and an assignment hash to every row of ``df``.

    The unique ASSIGNMENT_TAX_ID values found in ``df`` are handed to
    ``sequtils.condense_ids``, whose result is used as a mapping from
    tax_id to condensed id.  A single hash is then computed over a
    frozenset of ids and written to every row:

    * ``threshold_assignments`` true  -> hash of the set of tax_ids
    * ``threshold_assignments`` false -> hash of the set of condensed ids

    (The original notes tie this choice to the
    --split-condensed-assignments command line switch.)

    Later, an assignment name is chosen per hash from either the set of
    condensed ids or the set of assignment tax_ids.  The motivation for
    grouping on a hash rather than on the assignment text is that the
    text can contain extra annotations (such as a 100% id star) that are
    independent of which assignment group a qseqid belongs to.

    NOTE(review): the hash is only a per-group identifier if ``df`` holds
    the rows of a single qseqid -- confirm against the caller.

    Args:
        df: DataFrame containing an ASSIGNMENT_TAX_ID column.
        tax_dict: taxonomy lookup forwarded to ``sequtils.condense_ids``.
        ranks: forwarded to ``sequtils.condense_ids`` as ``ranks``.
        max_group_size: forwarded to ``sequtils.condense_ids`` as
            ``max_size``.
        threshold_assignments: selects which id set is hashed (see above).

    Returns:
        ``df`` joined on ASSIGNMENT_TAX_ID with the new ``condensed_id``
        and ``assignment_hash`` columns.
    """
    tax_ids = df[ASSIGNMENT_TAX_ID].unique()
    condensed = sequtils.condense_ids(
        tax_ids, tax_dict, ranks=ranks, max_size=max_group_size)

    # Materialise the (tax_id, condensed_id) pairs: on Python 3 ``items()``
    # is a view object, which older pandas releases refuse to accept as
    # DataFrame input.
    condensed = pd.DataFrame(
        list(condensed.items()),
        columns=[ASSIGNMENT_TAX_ID, 'condensed_id'])
    condensed = condensed.set_index(ASSIGNMENT_TAX_ID)

    if threshold_assignments:
        hashed_ids = condensed.index.unique()
    else:
        hashed_ids = condensed['condensed_id'].unique()

    # frozenset makes the identifier independent of row order.
    condensed['assignment_hash'] = hash(frozenset(hashed_ids))

    return df.join(condensed, on=ASSIGNMENT_TAX_ID)
def condense_ids(
        df, tax_dict, ranks, max_group_size, threshold_assignments=False):
    """
    Create the mapping from tax_id to condensed id and join it onto ``df``.

    Also creates the assignment hash, computed over a frozenset of either
    the assignment tax_ids (``threshold_assignments`` true) or the
    condensed ids (``threshold_assignments`` false).  The original notes
    say this is decided by the --split-condensed-assignments switch.

    Hashing the set of ids yields one identifier that is stored on every
    row.  Later this hash is used to assign an assignment name based on
    either the set of condensed ids or the set of assignment tax_ids.
    The motivation for grouping on a hash rather than on the assignment
    text is that the text can contain extra annotations, such as a 100%
    id star, that are independent of which assignment group a qseqid
    belongs to.

    NOTE(review): presumably called once per qseqid group, so that the
    hash identifies that qseqid's set of ids -- verify against the caller.

    Args:
        df: DataFrame with an ASSIGNMENT_TAX_ID column.
        tax_dict: taxonomy lookup passed through to
            ``sequtils.condense_ids``.
        ranks: passed through as the ``ranks`` keyword.
        max_group_size: passed through as the ``max_size`` keyword.
        threshold_assignments: when true hash the tax_ids, otherwise
            hash the condensed ids.

    Returns:
        The result of joining ``df`` with the ``condensed_id`` and
        ``assignment_hash`` columns on ASSIGNMENT_TAX_ID.
    """
    id_map = sequtils.condense_ids(
        df[ASSIGNMENT_TAX_ID].unique(),
        tax_dict,
        ranks=ranks,
        max_size=max_group_size)

    # ``list(...)`` keeps this working on Python 3, where ``items()``
    # returns a view that older pandas versions cannot turn into a frame.
    condensed = pd.DataFrame(
        list(id_map.items()),
        columns=[ASSIGNMENT_TAX_ID, 'condensed_id'])
    condensed = condensed.set_index(ASSIGNMENT_TAX_ID)

    if threshold_assignments:
        assignment_hash = hash(frozenset(condensed.index.unique()))
    else:
        assignment_hash = hash(
            frozenset(condensed['condensed_id'].unique()))

    # The same scalar hash is broadcast to every row of the mapping.
    condensed['assignment_hash'] = assignment_hash

    return df.join(condensed, on=ASSIGNMENT_TAX_ID)
def test03(self):
    """
    Test condensing with max_size = 0.

    Each entry of ``self.assignments`` is condensed with
    ``sequtils.condense_ids(..., max_size=0)`` and the list of results is
    compared against the pickled reference stored in
    ``<thisdatadir>/<name of this test>/assignments.pkl.bz2``.
    """
    taxonomy = self.taxonomy
    thisdatadir = self.thisdatadir

    # The fixture sub-directory is named after the running test method.
    this_test = sys._getframe().f_code.co_name
    ref_path = path.join(thisdatadir, this_test, 'assignments.pkl.bz2')

    # Use a context manager so the compressed fixture is closed even if
    # unpickling raises.
    with BZ2File(ref_path) as ref_file:
        condensed_assignments_ref = cPickle.load(ref_file)

    # Build a real list (not a lazy ``map`` object) so the equality check
    # below behaves the same on Python 2 and Python 3.
    condensed_assignments = [
        sequtils.condense_ids(assignment, taxonomy, max_size=0)
        for assignment in self.assignments]

    # ``assertEqual`` is the supported spelling; ``assertEquals`` is a
    # deprecated alias.
    self.assertEqual(condensed_assignments, condensed_assignments_ref)