def action(args):
    """Score every pair of sequences in ``args.fasta`` and write the result.

    Two output modes are supported:

    * When both ``args.split_info`` and ``args.matrix_out`` are set, the
      sequences are mapped to groups using the split-info CSV and a
      group-by-group matrix of median scores is written as CSV to
      ``args.matrix_out``.  ``args.out`` is not touched in this mode.
    * Otherwise one ``query,target,score`` row per pair is written as CSV to
      ``args.out``.

    Attributes read from ``args``:
        fasta: input handed to ``fastalite``.
        distance: when truthy each score ``i`` is replaced by ``1 - i``.
        split_info: open CSV with at least the columns ``seqname``,
            ``tax_id``, ``tax_name`` and the columns named by
            ``primary_group`` / ``secondary_group``.
        matrix_out: writable file object for the matrix.
        primary_group: column of the split-info file used as the group id.
        secondary_group: optional fallback column used when the primary
            column is empty for a sequence.
        out: writable file object for the flat pair listing.

    Raises:
        KeyError: if a sequence name is missing from the split-info file or
            a group id has no matching ``tax_id`` row.
    """
    seqs = fastalite(args.fasta)
    pairs = list(all_pairwise(seqs))

    if args.distance:
        pairs = [(q, t, 1 - i) for q, t, i in pairs]

    if not (args.split_info and args.matrix_out):
        writer = csv.writer(args.out)
        # NOTE: the column label stays 'identity' even when args.distance
        # converted the scores; kept so the output header is unchanged.
        writer.writerow(['query', 'target', 'identity'])
        writer.writerows(pairs)
        return

    primary, secondary = args.primary_group, args.secondary_group
    split_info = list(csv.DictReader(args.split_info))
    # rows with an empty seqname are skipped here but still indexed by tax_id
    info = {r['seqname']: r for r in split_info if r['seqname']}
    tax = {r['tax_id']: r for r in split_info}

    # Add the mirrored (target, query, score) tuples so every group shows up
    # as a row.  The mirror must be a fully built list: extending ``pairs``
    # from a lazy iterator over ``pairs`` itself (Python 3 ``map``) would
    # never terminate.
    pairs += [(t, q, score) for q, t, score in pairs]

    def group(seqname):
        """Return the group id for ``seqname`` (primary, else secondary)."""
        i = info[seqname]
        return i[primary] or i[secondary] if secondary else i[primary]

    grouped = ((group(left), group(right), score)
               for left, right, score in pairs)

    # sort and group rows
    rows = list(groupbyl(grouped, key=itemgetter(0)))

    matrix_out = csv.writer(args.matrix_out)

    # This is the tax_id order used for the columns.  It is iterated once
    # for the header and once per row, so it has to be a list and not a
    # one-shot iterator.
    tax_ids = [row_id for row_id, _ in rows]

    # species names are the first row
    matrix_out.writerow([''] + [tax[t]['tax_name'] for t in tax_ids])

    for row_id, columns in rows:
        # sort and group columns
        columns = dict(groupbyl(columns, key=itemgetter(1)))

        # each row starts with its species name
        row = [tax[row_id]['tax_name']]

        for t in tax_ids:
            if t not in columns:
                # No pair links these two groups, which means only one
                # sequence represents the group; the median distance is 0.
                med = 0
            else:
                med = median([score for _, _, score in columns[t]])
                # round up to two decimal places
                med = math.ceil(med * 100) / 100

            row.append(med)

        matrix_out.writerow(row)
def test02(self):
    """all_pairwise on two.fasta yields n*(n-1)/2 pairs covering every id.

    Also checks that names_from_pairs gives back the sequence ids in the
    same order as they were read from the file.
    """
    with open(self.data('two.fasta')) as handle:
        records = list(sequtils.fastalite(handle))
        scored = list(sequtils.all_pairwise(records))

    n_records = len(records)
    expected_pair_count = (n_records * (n_records - 1)) / 2
    self.assertEqual(len(scored), expected_pair_count)

    record_ids = [rec.id for rec in records]
    recovered_ids = list(sequtils.names_from_pairs(scored))
    self.assertEqual(record_ids, recovered_ids)