示例#1
0
def main():
    """Run a progressive alignment from the command line and print the result.

    Expects exactly three command-line arguments after the program name:
    a FASTA file name, a score-matrix file name and a method name. The method
    name is lower-cased and must be a key of ``METHODS``.

    The aligned sequences are printed to stdout as ``>name`` / sequence pairs.
    On a usage error the problem is logged, ``usage()`` is printed and the
    process exits with status 1, so that callers (shells, pipelines) can
    detect the failure.
    """
    if len(sys.argv) != 4:
        logging.error('Wrong count of arguments')
        print(usage())
        # A bare sys.exit() reports success (status 0); signal the error.
        sys.exit(1)

    _, fasta_filename, score_matrix_filename, method = sys.argv
    method = method.lower()

    if method not in METHODS:
        logging.error('Wrong method for cluster')
        print(usage())
        sys.exit(1)

    logging.info('Load fasta file')
    sequences = read_fasta(fasta_filename)
    logging.info('Finish load fasta file')

    logging.info('Load score matrix')
    score_matrix = read_score_matrix(score_matrix_filename)
    logging.info('Finish load score matrix')

    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logging.info('Start progressive alignment with method %s', method)
    result_sequences = METHODS[method](sequences, score_matrix)
    logging.info('Finish progressive alignment')

    for sequence in result_sequences:
        print('>' + sequence.name)
        print(sequence.seq)

    logging.info('Done.')
def main():
    """Print, per ontology class, how many distinct baits were collected.

    Command line: ``bait_frequency.py file.fsl genes.fa``.

    Every gene name from the FASTA file is looked up in the mapping returned
    by ``read_fsl``. Genes without an entry are printed with a count of
    ``'0'``. For the others, the second element of each result is added to
    the set of every class that ``ontology_common.get_class`` yields for the
    result's first element. Two summaries follow: classes whose first name
    contains ``'resistance gene'``, then all remaining classes.
    """
    if len(sys.argv) != 3:
        print("Usage: bait_frequency.py file.fsl genes.fa")
        sys.exit(-1)

    fsl_file, fasta_file = sys.argv[1], sys.argv[2]
    target_to_id = read_fsl(fsl_file)
    genes, _ = readers.read_fasta(fasta_file, shorten=True, max_length_shorten=False)
    terms = ontology_common.parse_obo('new_combined.obo')

    baits_for_class = collections.defaultdict(set)
    for name in genes.keys():
        if name not in target_to_id:
            print(name, '0')
            continue
        for result in target_to_id[name]:
            for cl in ontology_common.get_class(result[0], terms):
                baits_for_class[cl].add(result[1])

    # First summary: classes labelled as resistance genes.
    print()
    gene_class_total = 0
    for class_id, baits in baits_for_class.items():
        label = terms[class_id]['name'][0]
        if 'resistance gene' in label:
            print(label, len(baits))
            gene_class_total += len(baits)
    print("Total counts for gene class ", gene_class_total)

    # Second summary: every class that is not labelled as a resistance gene.
    print()
    mechanism_total = 0
    for class_id, baits in baits_for_class.items():
        label = terms[class_id]['name'][0]
        if 'resistance gene' not in label:
            print(label, len(baits))
            mechanism_total += len(baits)
    print("Total counts for mechanism ", mechanism_total)
def main():
    """Report sequences that match none of the consensus (bait) sequences.

    Reads ``../combined.fasta`` and ``../bait.fasta`` through
    ``readers.read_fasta``. Each entry of the first file is tested with
    ``match`` against the values of the second until one matches. Names with
    no match are printed as ``Not found: <name>``, followed by the number of
    such names.
    """
    # For a quick run, point these at ../test.combined.fasta and
    # ../test.clstr.fasta instead.
    amr = readers.read_fasta(file='../combined.fasta')
    consensus = readers.read_fasta(file='../bait.fasta')

    not_found = 0
    for name, gene in amr.items():
        # any() stops at the first matching cluster, like an explicit break.
        if any(match(gene, cluster) for cluster in consensus.values()):
            continue
        not_found += 1
        print('Not found: %s' % name)

    print(not_found)
示例#4
0
def main():
    """Count and list sequences with no matching consensus sequence.

    Loads ``../combined.fasta`` and ``../bait.fasta`` via
    ``readers.read_fasta``, then checks every entry of the former against the
    values of the latter using ``match``. Prints ``Not found: <name>`` for
    each entry without a match and finally the total of unmatched entries.
    """
    # Smaller fixtures for experiments: ../test.combined.fasta and
    # ../test.clstr.fasta.
    amr = readers.read_fasta(file='../combined.fasta')
    consensus = readers.read_fasta(file='../bait.fasta')

    missing = 0
    for seq_name, sequence in amr.items():
        for candidate in consensus.values():
            if match(sequence, candidate):
                break
        else:
            # The inner loop finished without a break: nothing matched.
            missing += 1
            print('Not found: %s' % seq_name)

    print(missing)
def main():
    """Print every bait name that is absent from the FSL results.

    Command line: ``bait_topmatch.py file.fsl baits.fasta``.

    The first argument is passed to ``read_fsl``. The second is read with
    ``readers.read_fasta``, and each of its keys that is not contained in the
    ``read_fsl`` result is printed on its own line.
    """
    if len(sys.argv) != 3:
        print("Usage: bait_topmatch.py file.fsl baits.fasta")
        sys.exit(-1)

    _, fsl_file, bait_fasta = sys.argv
    matched_baits = read_fsl(fsl_file)
    baits = readers.read_fasta(file=bait_fasta)

    for bait_name in baits.keys():
        if bait_name in matched_baits:
            continue
        print(bait_name)
def main():
    """List the baits that have no entry in the FSL results.

    Command line: ``bait_topmatch.py file.fsl baits.fasta``.

    Prints, one per line, each key of the FASTA mapping (loaded with
    ``readers.read_fasta``) for which a membership test against the value
    returned by ``read_fsl`` fails.
    """
    if len(sys.argv) != 3:
        print("Usage: bait_topmatch.py file.fsl baits.fasta")
        sys.exit(-1)

    matched_baits = read_fsl(sys.argv[1])
    baits = readers.read_fasta(file=sys.argv[2])

    # Lazily filter the bait names so they are printed in iteration order.
    unmatched = (bait for bait in baits.keys() if bait not in matched_baits)
    for bait in unmatched:
        print(bait)
def main():
    """Print true/false positive counts for a scan against labelled genes.

    Command line: ``functional_test.py [-protein] results.scan seq.fasta``.

    Gene names from the FASTA file that contain ``'True'`` are counted as
    expected positives, names containing ``'False'`` as expected negatives.
    Each key of the scan's target mapping is then classified by its prefix
    (``'True'`` or ``'False'``).

    With ``-protein`` every FASTA name is split on ``'>'`` into a protein
    name and a DNA name; scan keys are translated through that mapping and
    each DNA name is counted at most once.

    Exits with status -1 after printing the usage line when the arguments do
    not fit the selected mode. A ratio whose denominator is zero is printed
    as ``nan`` instead of raising ZeroDivisionError.
    """
    usage = "Usage: functional_test.py [-protein] results.scan seq.fasta"
    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        print(usage)
        sys.exit(-1)
    protein = sys.argv[1] == '-protein'
    if protein and argc != 4:
        # The flag shifts both file names right by one, so two files must
        # still follow it; otherwise sys.argv[3] would raise IndexError.
        print(usage)
        sys.exit(-1)

    if protein:
        scan_file, fasta_file = sys.argv[2], sys.argv[3]
    else:
        scan_file, fasta_file = sys.argv[1], sys.argv[2]
    # Maps protein name -> DNA name for genes; only filled in protein mode.
    protein_map = {}

    genes = readers.read_fasta(fasta_file)
    positive_count = 0
    negative_count = 0

    for gene in genes.keys():
        if 'True' in gene:
            positive_count += 1
        elif 'False' in gene:
            negative_count += 1
        if protein:
            names = gene.split('>')
            protein_map[names[0].strip()] = names[1].strip()

    _, target_to_id = readers.read_scan_results(0, scan_file, protein=protein)

    false_positive = 0
    true_positive = 0
    # We don't want to double count if we have seen the same gene.
    already_seen_protein = set()
    for key in target_to_id.keys():
        if protein:
            key = protein_map[key]
            if key in already_seen_protein:
                continue
            already_seen_protein.add(key)
        if key.startswith('False'):
            false_positive += 1
        elif key.startswith('True'):
            true_positive += 1

    def ratio(count, total):
        # An empty category has no meaningful rate; report nan, don't crash.
        return float(count) / total if total else float('nan')

    print("True Positive: %d/%d(%f); False Positive: %d/%d(%f)" %
          (true_positive, positive_count,
           ratio(true_positive, positive_count), false_positive,
           negative_count, ratio(false_positive, negative_count)))
def main():
    """Summarise the number of distinct baits gathered per ontology class.

    Command line: ``bait_frequency.py file.fsl genes.fa``.

    Gene names read from the FASTA file are looked up in the ``read_fsl``
    mapping. A gene with no entry is printed followed by ``'0'``. Otherwise
    the second element of each of its results is recorded under every class
    returned by ``ontology_common.get_class`` for the first element. The
    report lists classes whose first name contains ``'resistance gene'`` and
    their total, then the remaining classes and their total.
    """
    if len(sys.argv) != 3:
        print("Usage: bait_frequency.py file.fsl genes.fa")
        sys.exit(-1)

    target_to_id = read_fsl(sys.argv[1])
    genes, _ = readers.read_fasta(sys.argv[2],
                                  shorten=True,
                                  max_length_shorten=False)
    terms = ontology_common.parse_obo('new_combined.obo')
    baits_for_class = collections.defaultdict(set)

    def report(wants_gene_class):
        # Print "<class name> <bait count>" for each class on the requested
        # side of the 'resistance gene' split and return the summed counts.
        subtotal = 0
        for class_id, baits in baits_for_class.items():
            label = terms[class_id]['name'][0]
            if ('resistance gene' in label) == wants_gene_class:
                print(label, len(baits))
                subtotal += len(baits)
        return subtotal

    for gene_name in genes.keys():
        if gene_name in target_to_id:
            for result in target_to_id[gene_name]:
                for cl in ontology_common.get_class(result[0], terms):
                    baits_for_class[cl].add(result[1])
        else:
            print(gene_name, '0')

    print()
    gene_class_total = report(True)
    print("Total counts for gene class ", gene_class_total)
    print()
    mechanism_total = report(False)
    print("Total counts for mechanism ", mechanism_total)
    output = ''.join([gen_random_sequence(prefix_len),
                      seq,
                      gen_random_sequence(suffix_len)])
    return "> %d:%d?%s?%s\n%s" %(prefix_len, len(seq) + prefix_len, target, name, output)


def unit_test():
    """Check that gen_test's header offsets locate the embedded pattern.

    Builds a record for the pattern ``'AAAAAA'``, parses the ``start:end``
    offsets from the first line (the text between the leading character and
    the first ``'?'``) and slices the second line with them. Prints
    ``Success`` when the slice equals the pattern, otherwise prints the
    slice that was found.
    """
    pattern = 'AAAAAA'
    record = gen_test(pattern, 'adr001', 'test').split('\n')
    # Header layout after the first character: offsets?target?name
    offset, target, name = record[0][1:].split('?', 2)
    start, end = (int(part) for part in offset.split(':'))
    embedded = record[1][start:end]
    print("Success" if embedded == pattern else embedded)

#unit_test()
# Write one FASTA file per group under ../test/, holding a generated test
# record (see gen_test) for every sequence name in that group.
id_to_name = readers.read_grouping()
name_to_seq = readers.read_fasta()
for group_id, members in id_to_name.items():
    with open('../test/%s.fa' % group_id, 'w+') as group_file:
        group_file.writelines(
            gen_test(name_to_seq[member], group_id, member) + '\n'
            for member in members)

def main():
    """Score scan/FSL hits against the antibiotic encoded in each gene name.

    Command line:
    ``functional_compare.py [-fsl] [-protein] results.scan seq.fasta``
    (at most one of the two flags, as the first argument).

    For every gene name in the FASTA file:

    * if the name has no entry in the target mapping, it is a false negative;
    * otherwise the second ``'_'``-separated field of the gene name selects
      an entry of ``antibiotic_code``, and the gene's results are visited in
      descending order of their second element. The first result whose drug
      lineages share a term with the lineage of the expected antibiotic
      (ignoring ``ARO:1000001``, ``ARO:1000003`` and ``Unknown``) makes the
      gene a true positive;
    * a gene with an entry but no such result is printed and counted as a
      false positive.

    With ``-protein`` each FASTA name is split on ``'>'`` into a lookup name
    and a gene name, and every gene name is evaluated only once.

    Exits with status -1 after printing the usage line when the arguments do
    not fit the selected mode.
    """
    usage = "Usage: functional_compare.py [-fsl] [-protein] results.scan seq.fasta"
    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        print(usage)
        sys.exit(-1)
    fsl = sys.argv[1] == '-fsl'
    protein = sys.argv[1] == '-protein'
    if (fsl or protein) and argc != 4:
        # A mode flag shifts both file names right by one; without two files
        # after it the sys.argv[3] lookup below would raise IndexError.
        print(usage)
        sys.exit(-1)

    if fsl:
        fsl_file = sys.argv[2]
        fasta_file = sys.argv[3]
    elif protein:
        scan_file = sys.argv[2]
        fasta_file = sys.argv[3]
    else:
        scan_file = sys.argv[1]
        fasta_file = sys.argv[2]

    genes = readers.read_fasta(fasta_file)
    if fsl:
        target_to_id = read_fsl(fsl_file)
    else:
        _, target_to_id = readers.read_scan_results(0,
                                                    scan_file,
                                                    protein=protein)
    if protein:
        readers.change_RF_to_ARO(target_to_id)
    # We don't want to double count if we have seen the same gene.
    already_seen_protein = set()

    terms = ontology_common.parse_obo('new_combined.obo')
    ignored_terms = ['ARO:1000001', 'ARO:1000003', 'Unknown']
    false_positive = 0
    true_positive = 0
    false_negative = 0
    for gene in genes.keys():
        if protein:
            names = gene.split('>')
            gene = names[1].strip()
            name = names[0].strip()
            if gene in already_seen_protein:
                continue
            already_seen_protein.add(gene)
        else:
            name = gene

        if name not in target_to_id:
            false_negative += 1
            continue

        antibiotic = gene.split('_')[1]
        functional_antibiotic = antibiotic_code[antibiotic]
        results = target_to_id[name]
        results.sort(key=lambda l: l[1], reverse=True)

        # Reset the per-gene state up front: with an empty result list these
        # names were previously unbound (NameError) or still held the values
        # of the preceding gene.
        identified = False
        hit_id = classes = drugs = None
        for result in results:
            hit_id = result[0]
            # remove formatting used by hmm
            if 's' in hit_id:
                hit_id = hit_id.replace('ARO', 'ARO:')
                hit_id = hit_id.split('s')[0]
            if ';' in hit_id:
                # resfams can have a list of ids associated with a gene
                classes = [
                    terms[p]['name'] for i in hit_id.split(';')
                    for p in ontology_common.get_class(i, terms)
                ]
                drugs = set()
                for i in hit_id.split(';'):
                    drugs |= ontology_common.get_resistance(
                        ontology_common.get_lineage(i, terms), terms)
            else:
                classes = [
                    terms[p]['name']
                    for p in ontology_common.get_class(hit_id, terms)
                ]
                drugs = ontology_common.get_resistance(
                    ontology_common.get_lineage(hit_id, terms), terms)

            identified = False
            for drug in drugs:
                for d in ontology_common.get_lineage(drug, terms):
                    for fd in ontology_common.get_lineage(
                            functional_antibiotic[1], terms):
                        if d == fd and d not in ignored_terms:
                            identified = True

            if identified:
                true_positive += 1
                break

        if not identified:
            # Report the last result that was examined (None if there was none).
            print(gene, functional_antibiotic, hit_id, classes, drugs)
            false_positive += 1

    print('False negative: %d; False Positive:%d; True Positive:%d' %
          (false_negative, false_positive, true_positive))
import readers
import sys


# Script: split a FASTA file into one FASTA file per cluster id.
# Command-line arguments (read positionally from sys.argv, no validation):
#   sys.argv[1]  passed to readers.read_fasta (with shorten=True)
#   sys.argv[2]  passed to readers.read_cluster
#   sys.argv[3]  passed on to readers.create_fasta_file_for_each_id
#                (presumably the output location - confirm in readers)

# read input files
# With shorten=True, read_fasta is unpacked into two values here: the
# name -> sequence mapping and a name map that is forwarded below.
name_to_sequence, name_map = readers.read_fasta(sys.argv[1], shorten=True)
id_to_name = readers.read_cluster(sys.argv[2])

# create fasta files for each id
readers.create_fasta_file_for_each_id(name_to_sequence, id_to_name, sys.argv[3], name_map)

def main():
    """Compare predicted drug resistance with the antibiotic in each gene name.

    Command line:
    ``functional_compare.py [-fsl] [-protein] results.scan seq.fasta``
    (at most one of the two flags, as the first argument).

    Each gene name from the FASTA file falls into one of three counters:

    * false negative - the name has no entry in the target mapping;
    * true positive - one of its results (visited in descending order of
      their second element) resolves to drugs whose lineage shares a term
      with the lineage of the expected antibiotic, which is taken from
      ``antibiotic_code`` using the second ``'_'``-separated field of the
      gene name;
    * false positive - it has an entry but no such result; these genes are
      also printed together with the last result examined.

    With ``-protein`` each FASTA name is split on ``'>'`` into a lookup name
    and a gene name, and every gene name is evaluated only once.

    Exits with status -1 after printing the usage line when the arguments do
    not fit the selected mode.
    """
    usage = "Usage: functional_compare.py [-fsl] [-protein] results.scan seq.fasta"
    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        print(usage)
        sys.exit(-1)
    fsl = sys.argv[1] == '-fsl'
    protein = sys.argv[1] == '-protein'
    if (fsl or protein) and argc != 4:
        # With a mode flag, two file names must still follow it; otherwise
        # reading sys.argv[3] below would raise IndexError.
        print(usage)
        sys.exit(-1)

    if fsl:
        fsl_file, fasta_file = sys.argv[2], sys.argv[3]
    elif protein:
        scan_file, fasta_file = sys.argv[2], sys.argv[3]
    else:
        scan_file, fasta_file = sys.argv[1], sys.argv[2]

    genes = readers.read_fasta(fasta_file)
    if fsl:
        target_to_id = read_fsl(fsl_file)
    else:
        _, target_to_id = readers.read_scan_results(0, scan_file, protein=protein)
    if protein:
        readers.change_RF_to_ARO(target_to_id)
    # We don't want to double count if we have seen the same gene.
    already_seen_protein = set()

    terms = ontology_common.parse_obo('new_combined.obo')
    false_positive = 0
    true_positive = 0
    false_negative = 0
    for gene in genes.keys():
        if protein:
            names = gene.split('>')
            gene = names[1].strip()
            name = names[0].strip()
            if gene in already_seen_protein:
                continue
            already_seen_protein.add(gene)
        else:
            name = gene

        if name not in target_to_id:
            false_negative += 1
            continue

        antibiotic = gene.split('_')[1]
        functional_antibiotic = antibiotic_code[antibiotic]
        results = target_to_id[name]
        results.sort(key=lambda l: l[1], reverse=True)

        # Per-gene state is initialised here so that a gene with an empty
        # result list neither raises NameError nor inherits the outcome of
        # the previous gene.
        identified = False
        hit_id = classes = drugs = None
        for result in results:
            hit_id = result[0]
            # remove formatting used by hmm
            if 's' in hit_id:
                hit_id = hit_id.replace('ARO', 'ARO:')
                hit_id = hit_id.split('s')[0]
            if ';' in hit_id:
                # resfams can have a list of ids associated with a gene
                part_ids = hit_id.split(';')
                classes = [terms[p]['name'] for i in part_ids
                           for p in ontology_common.get_class(i, terms)]
                drugs = set()
                for i in part_ids:
                    drugs |= ontology_common.get_resistance(
                        ontology_common.get_lineage(i, terms), terms)
            else:
                classes = [terms[p]['name']
                           for p in ontology_common.get_class(hit_id, terms)]
                drugs = ontology_common.get_resistance(
                    ontology_common.get_lineage(hit_id, terms), terms)

            identified = _matches_expected_drug(drugs, functional_antibiotic[1], terms)
            if identified:
                true_positive += 1
                break

        if not identified:
            print(gene, functional_antibiotic, hit_id, classes, drugs)
            false_positive += 1

    print('False negative: %d; False Positive:%d; True Positive:%d' % (false_negative, false_positive, true_positive))


def _matches_expected_drug(drugs, expected_drug, terms):
    """Return True if a drug's lineage shares a term with expected_drug's lineage.

    The identifiers ``ARO:1000001``, ``ARO:1000003`` and ``Unknown`` never
    count as a shared term.
    """
    ignored_terms = ('ARO:1000001', 'ARO:1000003', 'Unknown')
    for drug in drugs:
        for term in ontology_common.get_lineage(drug, terms):
            if term in ignored_terms:
                continue
            for expected_term in ontology_common.get_lineage(expected_drug, terms):
                if term == expected_term:
                    return True
    return False
import sys
import readers
import pairwise_alignment
import pgma
import neighbor_joining

import os

# Command-line driver: read the sequences and the score matrix, align the
# sequences with the requested tree-building method and derive the name of
# the output file from the input FASTA file name.
if len(sys.argv) < 4:
    print("Usage: python progressive_alignment.py <seqs.fasta> <score_matrix> <upgma|wpgma|nj>")
    # sys.exit with a non-zero status: exit() is a helper installed by the
    # site module, and a bare call would report success to the caller.
    sys.exit(1)

fasta_filename = sys.argv[1]
matrix_filename = sys.argv[2]
tree_type = sys.argv[3]
print("Read fasta...")
names, seqs = readers.read_fasta(fasta_filename)
print("Read score matrix...")
score_matrix = readers.read_matrix(matrix_filename)
print("Align sequences...")
if tree_type == "wpgma":
    names, seqs = pgma.pgma(names, seqs, score_matrix, 'w')
elif tree_type == "upgma":
    names, seqs = pgma.pgma(names, seqs, score_matrix, 'u')
elif tree_type == "nj":
    names, seqs = neighbor_joining.neigbor_joining(names, seqs, score_matrix)
else:
    print("Error: Unknown option. Choose upgma, wpgma or nj.")
    sys.exit(1)

# Strip only the final extension. Splitting on the first '.' truncated any
# path that contains an earlier dot (for example '../seqs.fasta' became
# '_aligned.fasta').
out_filename = os.path.splitext(fasta_filename)[0] + "_aligned.fasta"

    with open(out_filename, 'w') as out:
示例#14
0
import readers
import sys

# Script: split a FASTA file into one FASTA file per group id.
# Command-line arguments (read positionally from sys.argv, no validation):
#   sys.argv[1]  passed to readers.read_fasta
#   sys.argv[2]  passed to readers.read_grouping
#   sys.argv[3]  passed on to readers.create_fasta_file_for_each_id
#                (presumably the output location - confirm in readers)

# read input files
name_to_sequence = readers.read_fasta(sys.argv[1])
id_to_name = readers.read_grouping(sys.argv[2])

# create fasta files for each id
readers.create_fasta_file_for_each_id(name_to_sequence, id_to_name,
                                      sys.argv[3])