예제 #1
0
def _cluster_record_key(record, key):
    """Return the identifier of *record* selected by *key*.

    'gi' reads ``record.annotations['gi']`` and 'description' reads
    ``record.description``. 'accession' and any unrecognised value both
    fall back to ``record.id``.
    """
    if key == 'gi':
        return record.annotations['gi']
    if key == 'description':
        return record.description
    return record.id


def cluster(
    records,
    threshold=0.95,
    unknown='N',
    key='gi',
    aln_program='mafft',
    aln_executable='mafft',
    aln_options='--auto --reorder --adjustdirection',
    seeds=None,
    seed_coverage=0.5,
    query_coverage=0.9,
    free_gaps=True):
    """Greedily cluster sequence records by pairwise alignment identity.

    Records are sorted by sequence length, longest first. Each seed (the
    records themselves when *seeds* is empty) is aligned pairwise against
    every record that has not been placed in a cluster yet; a record joins
    the seed's cluster when identity and both coverages reach their limits.

    Args:
        records: Sized collection of records exposing ``seq``, ``id``,
            ``description`` and ``annotations`` (presumably Biopython
            SeqRecord objects; verify against callers).
        threshold: Minimum identity score for a record to join a cluster.
        unknown: Letter(s) treated as unknown when scoring identity. A
            string contributes each of its characters; a falsy value
            yields an empty set.
        key: Identifier scheme: 'gi', 'description' or 'accession'. Any
            other value behaves like 'accession'.
        aln_program: Passed to ``align`` as ``program``.
        aln_executable: Passed to ``align`` as ``program_executable``.
        aln_options: Passed to ``align`` as ``options``.
        seeds: Optional records to use as cluster centres. When given,
            every seed gets an entry in the result, even if it was already
            consumed by an earlier seed.
        seed_coverage: Minimum value of the first element returned by
            ``pairwise_coverage`` (the seed side of the pair).
        query_coverage: Minimum value of the second element returned by
            ``pairwise_coverage`` (the compared record's side).
        free_gaps: Forwarded to ``identity`` as ``free_gaps``.

    Returns:
        dict mapping each seed identifier to a list of
        ``[direction, identifier, score]`` entries, plus an 'unclustered'
        entry listing ``['.', identifier, '0.0']`` for every record that
        joined no cluster. The seed's own entry uses the string '1.0';
        member scores are whatever ``identity`` returns.
    """
    from krpy import krother
    from krpy import krcl

    results_dict = dict()
    # A set keeps the "already clustered?" test O(1); only membership and
    # the running count are ever needed, never insertion order.
    consumed_ids = set()

    record_count = len(records)

    # Previously the identity call ignored `unknown` and always used 'N'.
    unknown_letters = set(unknown) if unknown else set()

    records = sorted(records, key=lambda x: len(x.seq), reverse=True)
    records_seeds = seeds if seeds else records

    for a_rec in records_seeds:

        a_id = _cluster_record_key(a_rec, key)

        # Without explicit seeds, a record that already belongs to a
        # cluster must not start a new one.
        if not seeds and a_id in consumed_ids:
            continue

        results_dict[a_id] = list()
        if a_id not in consumed_ids:
            results_dict[a_id].append(['+', a_id, '1.0'])
            consumed_ids.add(a_id)

        for i, b_rec in enumerate(records):

            krcl.print_progress(
                current=len(consumed_ids), total=record_count, length=0,
                prefix=krother.timestamp() + ' ',
                postfix=' records clustered. Checking ' + str(i) + '/' + str(record_count) + '.',
                show_bar=False)

            b_id = _cluster_record_key(b_rec, key)

            if a_id == b_id:
                continue

            if b_id in consumed_ids:
                continue

            aln = align(
                records=[a_rec, b_rec],
                program=aln_program,
                options=aln_options,
                program_executable=aln_executable)

            pw_cov = pairwise_coverage(pairwise_alignment=aln)
            a_cov = pw_cov[0]
            b_cov = pw_cov[1]

            # This will only work with MAFFT: a '_R_' id prefix marks a
            # sequence that was reverse-complemented during alignment.
            direction = '+'
            if any(a.id.startswith('_R_') for a in aln):
                direction = '-'

            score = identity(
                alignment=aln,
                unknown_letters=unknown_letters,
                free_unknowns=True,
                free_gaps=free_gaps,
                free_end_gaps=True)

            if (score >= threshold) and (a_cov >= seed_coverage) and (b_cov >= query_coverage):
                results_dict[a_id].append([direction, b_id, score])
                consumed_ids.add(b_id)

            krcl.clear_line()

    # Report unclustered ids
    results_dict['unclustered'] = list()
    for rec in records:
        rec_id = _cluster_record_key(rec, key)
        if rec_id not in consumed_ids:
            results_dict['unclustered'].append(['.', rec_id, '0.0'])

    return results_dict
        authority_file = args.authority_file
    if args.log_dir:
        log_dir = args.log_dir

    records = krbioio.read_sequence_file(input_file, 'gb', ret_type='list')

    ps = os.path.sep
    tax_log_handle = krseqsearch.__tax_log_open(log_dir, ps)
    tax_log_html_handle = krseqsearch.__tax_log_html_open(log_dir, ps)

    #########
    krcl.hide_cursor()

    for i, record in enumerate(records):

        krcl.print_progress(i, len(records), 50, '')

        name = krseqsearch.check_organism_name(
            record,
            ncbi_names_table,
            synonymy_table,
            authority_file,
            hacks,
            hacks_data_location,
            unresolvable_taxonomy_list,
            keeplist_taxonomy_list,
            taxa_mappings_list,
            tax_log_handle,
            tax_log_html_handle)

        # tn = name[0]