def _cluster_record_key(record, key):
    """Return the identifier of ``record`` selected by ``key``.

    ``'gi'`` reads ``record.annotations['gi']``, ``'description'`` reads
    ``record.description``; ``'accession'`` and any unrecognised value both
    fall back to ``record.id``.
    """
    if key == 'gi':
        return record.annotations['gi']
    if key == 'description':
        return record.description
    return record.id


def cluster(
        records,
        threshold=0.95,
        unknown='N',
        key='gi',
        aln_program='mafft',
        aln_executable='mafft',
        aln_options='--auto --reorder --adjustdirection',
        seeds=None,
        seed_coverage=0.5,
        query_coverage=0.9,
        free_gaps=True):
    """Greedily cluster sequence records around seed records.

    Records are sorted by sequence length, longest first. Each seed is
    aligned pairwise against every record that has not been claimed by a
    cluster yet; a record joins the seed's cluster when the identity score
    and both coverage values reach their thresholds.

    Args:
        records: Sized collection of records exposing ``seq``, ``id``,
            ``description`` and ``annotations``.
        threshold: Minimum identity score for a record to join a cluster.
        unknown: Letter(s) treated as unknown residues when scoring
            identity. A string is read as a set of single letters. ``None``
            falls back to ``'N'``, the value that used to be hard-coded.
        key: Which record field identifies a record in the result:
            ``'accession'`` (``id``), ``'gi'`` (``annotations['gi']``) or
            ``'description'``. Any other value uses ``id``.
        aln_program: Alignment program name passed to ``align``.
        aln_executable: Alignment executable passed to ``align``.
        aln_options: Command line options passed to ``align``.
        seeds: Optional records to use as cluster seeds. When empty or
            ``None`` every not-yet-clustered record becomes a seed in turn.
        seed_coverage: Minimum coverage required for the seed sequence.
        query_coverage: Minimum coverage required for the compared record.
        free_gaps: Forwarded to ``identity`` as ``free_gaps``.

    Returns:
        dict mapping each seed identifier to a list of
        ``[direction, identifier, score]`` entries. A seed that was not
        already claimed lists itself first as ``['+', seed_id, '1.0']``.
        The extra key ``'unclustered'`` holds ``['.', identifier, '0.0']``
        for every record that ended up in no cluster.
    """
    from krpy import krother
    from krpy import krcl

    results_dict = dict()
    # Only membership and size are ever asked of this collection, so a set
    # replaces the linear scans a list would need inside the nested loops.
    consumed_ids = set()

    record_count = len(records)

    records = sorted(records, key=lambda x: len(x.seq), reverse=True)

    # Keys do not change while clustering; compute them once instead of once
    # per seed.
    record_ids = [_cluster_record_key(rec, key) for rec in records]

    if seeds:
        seed_records = seeds
        seed_ids = [_cluster_record_key(rec, key) for rec in seeds]
    else:
        seed_records = records
        seed_ids = record_ids

    unknown_letters = set('N' if unknown is None else unknown)

    for a_rec, a_id in zip(seed_records, seed_ids):

        # Without explicit seeds, a record already placed in a cluster must
        # not start a cluster of its own.
        if not seeds and a_id in consumed_ids:
            continue

        results_dict[a_id] = list()
        if a_id not in consumed_ids:
            results_dict[a_id].append(['+', a_id, '1.0'])
            consumed_ids.add(a_id)

        for i, (b_rec, b_id) in enumerate(zip(records, record_ids)):

            krcl.print_progress(
                current=len(consumed_ids),
                total=record_count,
                length=0,
                prefix=krother.timestamp() + ' ',
                postfix=' records clustered. Checking ' + str(i) + '/' +
                str(record_count) + '.',
                show_bar=False)

            if a_id == b_id or b_id in consumed_ids:
                continue

            aln = align(
                records=[a_rec, b_rec],
                program=aln_program,
                options=aln_options,
                program_executable=aln_executable)

            # NOTE(review): coverage is read by position (0 = seed,
            # 1 = query) while the default options include '--reorder';
            # confirm pairwise_coverage returns values in input order.
            pw_cov = pairwise_coverage(pairwise_alignment=aln)
            a_cov = pw_cov[0]
            b_cov = pw_cov[1]

            # An '_R_' id prefix marks a reverse-complemented sequence.
            # This will only work with MAFFT!
            if any(a.id.startswith('_R_') for a in aln):
                direction = '-'
            else:
                direction = '+'

            score = identity(
                alignment=aln,
                unknown_letters=unknown_letters,
                free_unknowns=True,
                free_gaps=free_gaps,
                free_end_gaps=True)

            if ((score >= threshold) and
                    (a_cov >= seed_coverage) and
                    (b_cov >= query_coverage)):
                results_dict[a_id].append([direction, b_id, score])
                consumed_ids.add(b_id)

            # Reached only for pairs that were actually aligned.
            krcl.clear_line()

    # Report unclustered ids
    results_dict['unclustered'] = [
        ['.', rec_id, '0.0']
        for rec_id in record_ids
        if rec_id not in consumed_ids]

    return results_dict
authority_file = args.authority_file if args.log_dir: log_dir = args.log_dir records = krbioio.read_sequence_file(input_file, 'gb', ret_type='list') ps = os.path.sep tax_log_handle = krseqsearch.__tax_log_open(log_dir, ps) tax_log_html_handle = krseqsearch.__tax_log_html_open(log_dir, ps) ######### krcl.hide_cursor() for i, record in enumerate(records): krcl.print_progress(i, len(records), 50, '') name = krseqsearch.check_organism_name( record, ncbi_names_table, synonymy_table, authority_file, hacks, hacks_data_location, unresolvable_taxonomy_list, keeplist_taxonomy_list, taxa_mappings_list, tax_log_handle, tax_log_html_handle) # tn = name[0]