def reduce_alignment(alignment):
    """
    Function reduces a given alignment.

    Notes
    -----
    Reduction here means that the output alignment consists only of those
    parts which have not been marked to be ignored by the user (parts in
    brackets). It requires that all data is properly coded. If reduction
    fails, this will throw a warning, and all brackets are simply removed in
    the output alignment.
    """
    # check for bracket indices in all columns
    cols = misc.transpose(alignment)

    ignore_indices = []
    ignore = False
    for i, col in enumerate(cols):
        reduced_col = sorted(set(col))

        if '(' in reduced_col:
            if len(reduced_col) == 1:
                # column consists solely of opening brackets: start ignoring
                ignore_indices += [i]
                ignore = True
            else:
                # brackets do not line up across all rows: do not ignore
                ignore = False
        elif ')' in reduced_col:
            if len(reduced_col) == 1:
                # column consists solely of closing brackets: stop ignoring
                ignore_indices += [i]
                ignore = False
            else:
                # reduction failed; warn (as promised in the docstring) and
                # fall back to simply stripping brackets below
                # (previously the docstring promised a warning but none was
                # ever emitted)
                log.warning(
                    'Could not reduce the alignment, since brackets were not '
                    'properly coded. Brackets are simply removed.')
                ignore_indices = []
        elif ignore:
            # inside an ignored bracket span
            ignore_indices += [i]

    # drop the ignored columns (if any were collected)
    if ignore_indices:
        new_cols = [col for i, col in enumerate(cols)
                    if i not in ignore_indices]
    else:
        new_cols = cols

    # replace any leftover bracket characters by gaps
    new_alm = misc.transpose(new_cols)
    for i, alm in enumerate(new_alm):
        for j, char in enumerate(alm):
            if char in '()':
                new_alm[i][j] = '-'

    return new_alm
def reduce_alignment(alignment):
    """
    Function reduces a given alignment.

    Notes
    -----
    Reduction here means that the output alignment consists only of those
    parts which have not been marked to be ignored by the user (parts in
    brackets). It requires that all data is properly coded. If reduction
    fails, this will throw a warning, and all brackets are simply removed in
    the output alignment.
    """
    # work column-wise to locate bracketed spans
    columns = misc.transpose(alignment)

    skip = []
    inside = False
    for idx, column in enumerate(columns):
        uniq = set(column)

        if '(' in uniq:
            if uniq == {'('}:
                # a pure opening-bracket column opens an ignored span
                skip.append(idx)
                inside = True
            else:
                inside = False
        elif ')' in uniq:
            if uniq == {')'}:
                # a pure closing-bracket column closes the span
                skip.append(idx)
                inside = False
            else:
                # inconsistent coding: give up on reduction
                skip = []
        elif inside:
            skip.append(idx)

    # keep only the columns outside ignored spans
    kept = [col for idx, col in enumerate(columns)
            if idx not in skip] if skip else columns

    # any remaining bracket tokens become plain gaps
    reduced = misc.transpose(kept)
    for row in reduced:
        for pos, token in enumerate(row):
            if token in '()':
                row[pos] = '-'

    return reduced
def c_scores(self):
    """
    Calculate the c-scores.

    Returns
    -------
    scores : namedtuple
        A ``Scores`` named tuple with fields ``cp`` (shared columns over
        test columns), ``cr`` (shared columns over gold columns), ``c_``
        (combined ratio), and ``cf`` (harmonic mean of ``cp`` and ``cr``).
    """
    almsGold = misc.transpose(self.gold.alm_matrix)
    almsTest = misc.transpose(self.test.alm_matrix)
    # columns occurring in both the gold and the test alignment
    commons = len([i for i in almsGold if i in almsTest])

    # guard *all* ratios against empty alignments for consistency --
    # previously only cf was protected, so an empty gold or test matrix
    # raised ZeroDivisionError; it now yields 0.0 throughout
    try:
        cp = commons / len(almsTest)
    except ZeroDivisionError:
        cp = 0.0
    try:
        cr = commons / len(almsGold)
    except ZeroDivisionError:
        cr = 0.0
    try:
        c_ = 2 * commons / (len(almsTest) + len(almsGold))
    except ZeroDivisionError:
        c_ = 0.0
    try:
        cf = 2 * cp * cr / (cp + cr)
    except ZeroDivisionError:
        cf = 0.0

    return namedtuple('Scores', 'cp cr c_ cf')(cp, cr, c_, cf)
def c_scores(self):
    """
    Calculate the c-scores.
    """
    gold_cols = misc.transpose(self.gold.alm_matrix)
    test_cols = misc.transpose(self.test.alm_matrix)

    # number of gold columns that also occur in the test alignment
    shared = sum(1 for col in gold_cols if col in test_cols)

    precision = shared / len(test_cols)
    recall = shared / len(gold_cols)
    combined = 2 * shared / (len(test_cols) + len(gold_cols))

    # harmonic mean; both ratios are non-negative, so the divisor is zero
    # exactly when both are zero
    if precision + recall:
        fscore = 2 * precision * recall / (precision + recall)
    else:
        fscore = 0.0

    return namedtuple('Scores', 'cp cr c_ cf')(
        precision, recall, combined, fscore)
def wl2multistate(wordlist, ref, missing):
    """
    Function converts a wordlist to multistate format (compatible with PAUP).

    Parameters
    ----------
    wordlist :
        The wordlist to be converted.
    ref : str
        The column storing the cognate-set assignments.
    missing : str
        The symbol used for taxa without any entry for a concept.

    Raises
    ------
    ValueError
        If a concept has more distinct cognate states than there are
        available single-character codes.
    """
    # convert the data to a multistate matrix
    # get etymological dictionary
    wordlist.get_etymdict(ref=ref)

    # define chars, we only have a limited set, unfortunately
    chars = ascii_letters + digits

    # iterate over all cognate sets and assign the chars
    matrix = []
    for concept in wordlist.concepts:
        taxon_to_cognate_set = wordlist.get_dict(concept=concept, entry=ref)

        # collect all distinct states for this concept
        distinct_states = set()
        for taxon in wordlist.taxa:
            distinct_states.update(taxon_to_cognate_set.get(taxon, [0]))

        if len(distinct_states) > len(chars):  # pragma: no cover
            # Fail loudly: zip below truncates to len(chars) items, so a
            # mere warning used to be followed by a confusing KeyError
            raise ValueError('more distinct states than available characters!')

        # map each state to a single character code
        char_map = dict(zip(sorted(distinct_states), chars))
        char_map['-'] = '-'

        line = []
        for taxon in wordlist.taxa:
            states = set(taxon_to_cognate_set.get(taxon, ['-']))
            # exclude the case len(taxon_to_cognate_set[taxon]) == 0
            if len(states) == 1:
                line.append(char_map[states.pop()])
            elif not states:
                line.append(missing)
            else:
                # polymorphic cell: list all codes in brackets
                line.append('({0})'.format(
                    "".join([char_map[x] for x in sorted(states)])))
        matrix.append(line)

    return misc.transpose(matrix)
def wl2multistate(wordlist, ref, missing):
    """
    Function converts a wordlist to multistate format (compatible with PAUP).
    """
    # make sure the etymological dictionary has been computed
    wordlist.get_etymdict(ref=ref)

    # the available single-character state codes (a limited set)
    symbols = ascii_letters + digits

    matrix = []
    for concept in wordlist.concepts:
        cogsets = wordlist.get_dict(concept=concept, entry=ref)

        # gather every state occurring for this concept across all taxa
        states = set()
        for taxon in wordlist.taxa:
            states.update(cogsets.get(taxon, [0]))

        if len(states) > len(symbols):  # pragma: no cover
            # FIXME: This shouldn't just be a warning, because we
            # will get a KeyError
            # down below, since zip just returns a list of length len(chars)!
            log.warning('more distinct states than available characters!')

        # assign one character per state, deterministic via sorting
        coding = dict(zip(sorted(states), symbols))
        coding['-'] = '-'

        row = []
        for taxon in wordlist.taxa:
            cell = set(cogsets.get(taxon, ['-']))
            if len(cell) == 1:
                row.append(coding[cell.pop()])
            elif not cell:
                # no entry at all for this taxon
                row.append(missing)
            else:
                # polymorphic cell: bracketed list of codes
                row.append('({0})'.format(
                    "".join(coding[state] for state in sorted(cell))))
        matrix.append(row)

    return misc.transpose(matrix)
def normalize_alignment(alignment):
    """
    Function normalizes an alignment.

    Normalization here means that columns consisting only of gaps will be
    deleted, and all sequences will be stretched to equal length by adding
    additional gap characters in the end of smaller sequences.
    """
    # work on a deep-enough copy so the input stays untouched
    clone = [[token for token in row] for row in alignment]

    # if every row holds a single token, treat it as a space-separated string
    lengths = [len(row) for row in clone]
    if lengths.count(1) == len(lengths):
        for idx, row in enumerate(clone):
            clone[idx] = row[0].split(' ')
            lengths[idx] = len(clone[idx])

    # pad shorter rows with trailing gaps up to the maximal length
    if len(set(lengths)) > 1:
        longest = max(lengths)
        for idx, row in enumerate(clone):
            padded = row + ['-' for _ in range(longest)]
            clone[idx] = padded[:longest]

    # locate columns that consist of gaps only
    gap_only = [idx for idx, col in enumerate(misc.transpose(clone))
                if set(col) == set('-')]

    # delete them right-to-left so remaining indices stay valid
    for idx in gap_only[::-1]:
        for row in clone:
            del row[idx]

    if alignment == clone:
        return alignment

    message = 'Modified the alignment:\n'
    for idx in range(len(alignment)):
        message += '[!] ' + ' '.join(alignment[idx]) + '->'
        message += ' '.join(clone[idx]) + '\n'
    log.debug(message)
    return clone
def normalize_alignment(alignment):
    """
    Function normalizes an alignment.

    Normalization here means that columns consisting only of gaps will be
    deleted, and all sequences will be stretched to equal length by adding
    additional gap characters in the end of smaller sequences.
    """
    # clone the alignment
    alm_clone = [[x for x in y] for y in alignment]

    # first check for alms of different length
    alm_lens = [len(alm) for alm in alm_clone]
    if alm_lens.count(1) == len(alm_lens):
        # every row is a single token: treat it as a space-separated string
        for i, alm in enumerate(alm_clone):
            alm_clone[i] = alm[0].split(' ')
            alm_lens[i] = len(alm_clone[i])

    if len(set(alm_lens)) > 1:
        # stretch shorter sequences by appending gap characters
        max_len = max(alm_lens)
        for i, alm in enumerate(alm_clone):
            new_alm = alm + ['-' for x in range(max_len)]
            alm_clone[i] = new_alm[:max_len]

    # then check for alms consisting only of gaps
    cols = misc.transpose(alm_clone)
    idxs = []
    for i, col in enumerate(cols):
        if set(col) == set('-'):
            idxs += [i]
    # delete gap-only columns from right to left so indices remain valid
    for idx in idxs[::-1]:
        for i, alm in enumerate(alm_clone):
            del alm_clone[i][idx]

    if alignment != alm_clone:
        lgtxt = 'Modified the alignment:\n'
        for i in range(len(alignment)):
            lgtxt += '[!] ' + ' '.join(alignment[i]) + '->'
            lgtxt += ' '.join(alm_clone[i]) + '\n'
        # BUG FIX: was a bare `debug(lgtxt)` (NameError at runtime); use the
        # module logger, consistent with the rest of the file
        log.debug(lgtxt)
        return alm_clone
    else:
        return alignment