def main():
    """Run a progressive alignment from the command line and print the result.

    Expects exactly three arguments after the program name: the FASTA file,
    the score-matrix file and the clustering method (case-insensitive; must
    be a key of ``METHODS``).  The aligned sequences are written to stdout
    as ``>name`` / sequence line pairs; progress goes through ``logging``.

    Exits with status 1 (after logging the problem and printing the usage
    text) when the argument count or the method name is invalid, so that
    calling shells and pipelines can detect the failure.
    """
    if len(sys.argv) != 4:
        logging.error('Wrong count of arguments')
        print(usage())
        sys.exit(1)

    _, fasta_filename, score_matrix_filename, method = sys.argv
    method = method.lower()
    if method not in METHODS:
        logging.error('Wrong method for cluster')
        print(usage())
        sys.exit(1)

    logging.info('Load fasta file')
    sequences = read_fasta(fasta_filename)
    logging.info('Finish load fasta file')

    logging.info('Load score matrix')
    score_matrix = read_score_matrix(score_matrix_filename)
    logging.info('Finish load score matrix')

    # Lazy %-formatting: the message is only built if INFO is enabled.
    logging.info('Start progressive alignment with method %s', method)
    result_sequences = METHODS[method](sequences, score_matrix)
    logging.info('Finish progressive alignment')

    for sequence in result_sequences:
        print('>' + sequence.name)
        print(sequence.seq)
    logging.info('Done.')
def main():
    """Report, per ontology class, how many distinct baits hit it.

    Usage: ``bait_frequency.py file.fsl genes.fa``.

    For every gene name in the FASTA file that appears in the mapping
    returned by ``read_fsl``, each hit's first field is expanded to its
    classes via ``ontology_common.get_class`` and the hit's second field is
    added to that class's set.  Genes without any hit are printed with a
    count of ``0``.  Two summaries follow: classes whose name contains
    ``'resistance gene'``, then all remaining classes, each with a total.
    """
    if len(sys.argv) != 3:
        print("Usage: bait_frequency.py file.fsl genes.fa")
        sys.exit(-1)

    target_to_id = read_fsl(sys.argv[1])
    genes, _name_map = readers.read_fasta(sys.argv[2],
                                          shorten=True,
                                          max_length_shorten=False)
    terms = ontology_common.parse_obo('new_combined.obo')

    baits_for_class = collections.defaultdict(set)
    for gene_name in genes.keys():
        if gene_name not in target_to_id:
            print(gene_name, '0')
            continue
        for hit in target_to_id[gene_name]:
            for class_id in ontology_common.get_class(hit[0], terms):
                baits_for_class[class_id].add(hit[1])

    # Same report twice: first the "resistance gene" classes, then the rest.
    summaries = (
        (True, "Total counts for gene class "),
        (False, "Total counts for mechanism "),
    )
    for wants_gene_class, footer in summaries:
        print()
        running_total = 0
        for class_id, baits in baits_for_class.items():
            title = terms[class_id]['name'][0]
            if ('resistance gene' in title) == wants_gene_class:
                print(title, len(baits))
                running_total += len(baits)
        print(footer, running_total)
def main():
    """List every combined sequence that matches no bait sequence.

    Reads ``../combined.fasta`` and ``../bait.fasta`` through
    ``readers.read_fasta``, tests each combined sequence against the bait
    sequences with ``match`` (stopping at the first success), prints
    ``Not found: <name>`` for each sequence without a match and finally
    prints how many such sequences there were.
    """
    queries = readers.read_fasta(file='../combined.fasta')
    baits = readers.read_fasta(file='../bait.fasta')

    missing = 0
    for label, sequence in queries.items():
        # any() stops at the first matching bait, like the original break.
        if any(match(sequence, candidate) for candidate in baits.values()):
            continue
        missing += 1
        print('Not found: %s' % label)
    print(missing)
def main():
    """Print each bait name that is absent from the parsed .fsl results.

    Usage: ``bait_topmatch.py file.fsl baits.fasta``.

    The .fsl file is parsed with ``read_fsl`` and the baits are loaded with
    ``readers.read_fasta``; every bait key not contained in the .fsl result
    is written to stdout, one per line.
    """
    argv = sys.argv
    if len(argv) != 3:
        print("Usage: bait_topmatch.py file.fsl baits.fasta")
        sys.exit(-1)

    seen_in_fsl = read_fsl(argv[1])
    all_baits = readers.read_fasta(file=argv[2])

    for bait_name in all_baits.keys():
        if bait_name in seen_in_fsl:
            continue
        print(bait_name)
def main():
    """Summarise true/false positive rates of a scan against a labelled FASTA.

    Usage: ``functional_test.py [-protein] results.scan seq.fasta``.

    FASTA entry names containing ``'True'`` are counted as expected
    positives and names containing ``'False'`` as expected negatives.  Each
    scanned target whose name starts with ``'True'`` / ``'False'`` is then
    tallied as a true / false positive and one summary line is printed.

    With ``-protein`` every FASTA name is split on ``'>'`` into a protein
    name and a DNA name; scanned targets are translated to the DNA name
    before being tallied and each DNA name is counted only once.

    Exits with status -1 on a malformed command line.  A rate whose
    denominator is zero (no expected positives or negatives in the FASTA
    file) is reported as ``nan`` instead of raising ZeroDivisionError.
    """
    usage_text = "Usage: functional_test.py [-protein] results.scan seq.fasta"
    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        print(usage_text)
        sys.exit(-1)

    protein = sys.argv[1] == '-protein'
    # The flag occupies one slot, so both file names must still follow it;
    # without this check sys.argv[3] below raises IndexError.
    if protein and argc != 4:
        print(usage_text)
        sys.exit(-1)

    if protein:
        scan_file, fasta_file = sys.argv[2], sys.argv[3]
        # mapping of protein name to dna name for genes
        protein_map = {}
    else:
        scan_file, fasta_file = sys.argv[1], sys.argv[2]

    genes = readers.read_fasta(fasta_file)
    positive_count = 0
    negative_count = 0
    for gene in genes.keys():
        if 'True' in gene:
            positive_count += 1
        elif 'False' in gene:
            negative_count += 1
        if protein:
            names = gene.split('>')
            protein_map[names[0].strip()] = names[1].strip()

    _, target_to_id = readers.read_scan_results(0, scan_file, protein=protein)

    false_positive = 0
    true_positive = 0
    # We don't want to double count if we have seen the same gene
    already_seen_protein = set()
    for key in target_to_id.keys():
        if protein:
            key = protein_map[key]
            if key in already_seen_protein:
                continue
            already_seen_protein.add(key)
        if key.startswith('False'):
            false_positive += 1
        elif key.startswith('True'):
            true_positive += 1

    # An empty class makes its rate undefined; report nan rather than crash
    # after all the work above has been done.
    undefined = float('nan')
    true_rate = float(true_positive) / positive_count if positive_count else undefined
    false_rate = float(false_positive) / negative_count if negative_count else undefined

    print("True Positive: %d/%d(%f); False Positive: %d/%d(%f)" %
          (true_positive, positive_count, true_rate,
           false_positive, negative_count, false_rate))
output = ''.join([gen_random_sequence(prefix_len), seq, gen_random_sequence(suffix_len)]) return "> %d:%d?%s?%s\n%s" %(prefix_len, len(seq) + prefix_len, target, name, output) def unit_test(): pattern = 'AAAAAA' value = gen_test(pattern, 'adr001', 'test') lines = value.split('\n') offset, target, name = lines[0][1:].split('?', 2) start, end = offset.split(':') start = int(start) end = int(end) if lines[1][start:end] == pattern: print("Success") else: print(lines[1][start:end]) #unit_test() id_to_name = readers.read_grouping() name_to_seq = readers.read_fasta() for id, names in id_to_name.items(): with open('../test/%s.fa' % id, 'w+') as fasta: for name in names: seq = name_to_seq[name] value = gen_test(seq, id, name) fasta.write(value + '\n')
def main():
    """Compare scan (or .fsl) hits with the antibiotic encoded in gene names.

    Usage: ``functional_compare.py [-fsl] [-protein] results.scan seq.fasta``.

    For every gene in the FASTA file:

    * not present in the hit mapping -> counted as a false negative;
    * present, and for some hit (tried in descending order of the hit's
      second field) the lineage of one of its resistance drugs shares a
      term with the lineage of the gene's functional antibiotic (ignoring
      the generic terms ``ARO:1000001``, ``ARO:1000003`` and ``Unknown``)
      -> counted as a true positive;
    * present, but no hit qualifies -> details are printed and the gene is
      counted as a false positive.

    The functional antibiotic is looked up in ``antibiotic_code`` using the
    second ``_``-separated field of the gene name.  With ``-protein`` the
    FASTA name is split on ``'>'`` into a protein name (used for the hit
    lookup) and a DNA name (used for everything else, counted once).

    Exits with status -1 on a malformed command line.
    """
    usage_text = "Usage: functional_compare.py [-fsl] [-protein] results.scan seq.fasta"
    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        print(usage_text)
        sys.exit(-1)

    fsl = sys.argv[1] == '-fsl'
    protein = sys.argv[1] == '-protein'
    # A flag occupies one slot, so both file names must still follow it;
    # without this check sys.argv[3] below raises IndexError.
    if (fsl or protein) and argc != 4:
        print(usage_text)
        sys.exit(-1)

    if fsl:
        fsl_file, fasta_file = sys.argv[2], sys.argv[3]
    elif protein:
        scan_file, fasta_file = sys.argv[2], sys.argv[3]
    else:
        scan_file, fasta_file = sys.argv[1], sys.argv[2]

    genes = readers.read_fasta(fasta_file)
    if fsl:
        target_to_id = read_fsl(fsl_file)
    else:
        _, target_to_id = readers.read_scan_results(0, scan_file, protein=protein)
    if protein:
        readers.change_RF_to_ARO(target_to_id)
    # We don't want to double count if we have seen the same gene
    already_seen_protein = set()

    terms = ontology_common.parse_obo('new_combined.obo')
    # Terms too generic to count as agreement between two lineages.
    ignored_terms = ('ARO:1000001', 'ARO:1000003', 'Unknown')

    false_positive = 0
    true_positive = 0
    false_negative = 0
    for gene in genes.keys():
        if protein:
            names = gene.split('>')
            gene = names[1].strip()
            name = names[0].strip()
            if gene in already_seen_protein:
                continue
            already_seen_protein.add(gene)
        else:
            name = gene

        if name not in target_to_id:
            false_negative += 1
            continue

        antibiotic = gene.split('_')[1]
        functional_antibiotic = antibiotic_code[antibiotic]

        # Reset per gene: with an empty hit list the loop below never runs,
        # and these must not be unbound or left over from the previous gene.
        identified = False
        hit_id = None
        classes = []
        drugs = set()

        ranked_hits = sorted(target_to_id[name], key=lambda hit: hit[1], reverse=True)
        for result in ranked_hits:
            hit_id = result[0]
            # remove formatting used by hmm
            if 's' in hit_id:
                hit_id = hit_id.replace('ARO', 'ARO:')
                hit_id = hit_id.split('s')[0]

            if ';' in hit_id:
                # resfams can have a list of ids associated with a gene
                parts = hit_id.split(';')
                classes = [
                    terms[p]['name']
                    for i in parts
                    for p in ontology_common.get_class(i, terms)
                ]
                drugs = set()
                for i in parts:
                    drugs |= ontology_common.get_resistance(
                        ontology_common.get_lineage(i, terms), terms)
            else:
                classes = [
                    terms[p]['name']
                    for p in ontology_common.get_class(hit_id, terms)
                ]
                drugs = ontology_common.get_resistance(
                    ontology_common.get_lineage(hit_id, terms), terms)

            # True as soon as any drug lineage term equals a (non-generic)
            # term in the functional antibiotic's lineage.
            identified = any(
                d == fd and d not in ignored_terms
                for drug in drugs
                for d in ontology_common.get_lineage(drug, terms)
                for fd in ontology_common.get_lineage(functional_antibiotic[1], terms)
            )
            if identified:
                true_positive += 1
                break

        if not identified:
            print(gene, functional_antibiotic, hit_id, classes, drugs)
            false_positive += 1

    print('False negative: %d; False Positive:%d; True Positive:%d' %
          (false_negative, false_positive, true_positive))
import readers
import sys

# Script: split a FASTA file into one FASTA file per cluster id.
# Arguments: <seqs.fasta> <cluster_file> <output>; the third argument is
# passed through unchanged to readers.create_fasta_file_for_each_id.

# Fail with a usage message instead of an IndexError traceback when
# arguments are missing (extra arguments are still ignored, as before).
if len(sys.argv) < 4:
    sys.exit("Usage: %s <seqs.fasta> <cluster_file> <output>" % sys.argv[0])

# read input files
name_to_sequence, name_map = readers.read_fasta(sys.argv[1], shorten=True)
id_to_name = readers.read_cluster(sys.argv[2])

# create fasta files for each id
readers.create_fasta_file_for_each_id(name_to_sequence, id_to_name, sys.argv[3], name_map)
def main():
    """Compare scan (or .fsl) hits with the antibiotic encoded in gene names.

    Usage: ``functional_compare.py [-fsl] [-protein] results.scan seq.fasta``.

    For every gene in the FASTA file:

    * not present in the hit mapping -> counted as a false negative;
    * present, and for some hit (tried in descending order of the hit's
      second field) the lineage of one of its resistance drugs shares a
      term with the lineage of the gene's functional antibiotic (ignoring
      the generic terms ``ARO:1000001``, ``ARO:1000003`` and ``Unknown``)
      -> counted as a true positive;
    * present, but no hit qualifies -> details are printed and the gene is
      counted as a false positive.

    The functional antibiotic is looked up in ``antibiotic_code`` using the
    second ``_``-separated field of the gene name.  With ``-protein`` the
    FASTA name is split on ``'>'`` into a protein name (used for the hit
    lookup) and a DNA name (used for everything else, counted once).

    Exits with status -1 on a malformed command line.
    """
    usage_text = "Usage: functional_compare.py [-fsl] [-protein] results.scan seq.fasta"
    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        print(usage_text)
        sys.exit(-1)

    fsl = sys.argv[1] == '-fsl'
    protein = sys.argv[1] == '-protein'
    # A flag occupies one slot, so both file names must still follow it;
    # without this check sys.argv[3] below raises IndexError.
    if (fsl or protein) and argc != 4:
        print(usage_text)
        sys.exit(-1)

    if fsl:
        fsl_file, fasta_file = sys.argv[2], sys.argv[3]
    elif protein:
        scan_file, fasta_file = sys.argv[2], sys.argv[3]
    else:
        scan_file, fasta_file = sys.argv[1], sys.argv[2]

    genes = readers.read_fasta(fasta_file)
    if fsl:
        target_to_id = read_fsl(fsl_file)
    else:
        _, target_to_id = readers.read_scan_results(0, scan_file, protein=protein)
    if protein:
        readers.change_RF_to_ARO(target_to_id)
    # We don't want to double count if we have seen the same gene
    already_seen_protein = set()

    terms = ontology_common.parse_obo('new_combined.obo')
    # Terms too generic to count as agreement between two lineages.
    ignored_terms = ('ARO:1000001', 'ARO:1000003', 'Unknown')

    false_positive = 0
    true_positive = 0
    false_negative = 0
    for gene in genes.keys():
        if protein:
            names = gene.split('>')
            gene = names[1].strip()
            name = names[0].strip()
            if gene in already_seen_protein:
                continue
            already_seen_protein.add(gene)
        else:
            name = gene

        if name not in target_to_id:
            false_negative += 1
            continue

        antibiotic = gene.split('_')[1]
        functional_antibiotic = antibiotic_code[antibiotic]

        # Reset per gene: with an empty hit list the loop below never runs,
        # and these must not be unbound or left over from the previous gene.
        identified = False
        hit_id = None
        classes = []
        drugs = set()

        ranked_hits = sorted(target_to_id[name], key=lambda hit: hit[1], reverse=True)
        for result in ranked_hits:
            hit_id = result[0]
            # remove formatting used by hmm
            if 's' in hit_id:
                hit_id = hit_id.replace('ARO', 'ARO:')
                hit_id = hit_id.split('s')[0]

            if ';' in hit_id:
                # resfams can have a list of ids associated with a gene
                parts = hit_id.split(';')
                classes = [
                    terms[p]['name']
                    for i in parts
                    for p in ontology_common.get_class(i, terms)
                ]
                drugs = set()
                for i in parts:
                    drugs |= ontology_common.get_resistance(
                        ontology_common.get_lineage(i, terms), terms)
            else:
                classes = [
                    terms[p]['name']
                    for p in ontology_common.get_class(hit_id, terms)
                ]
                drugs = ontology_common.get_resistance(
                    ontology_common.get_lineage(hit_id, terms), terms)

            # True as soon as any drug lineage term equals a (non-generic)
            # term in the functional antibiotic's lineage.
            identified = any(
                d == fd and d not in ignored_terms
                for drug in drugs
                for d in ontology_common.get_lineage(drug, terms)
                for fd in ontology_common.get_lineage(functional_antibiotic[1], terms)
            )
            if identified:
                true_positive += 1
                break

        if not identified:
            print(gene, functional_antibiotic, hit_id, classes, drugs)
            false_positive += 1

    print('False negative: %d; False Positive:%d; True Positive:%d' %
          (false_negative, false_positive, true_positive))
import sys import readers import pairwise_alignment import pgma import neighbor_joining if len(sys.argv) < 4: print("Usage: python progressive_alignment.py <seqs.fasta> <score_matrix> <upgma|wpgma|nj>") exit() else: fasta_filename = sys.argv[1] matrix_filename = sys.argv[2] tree_type = sys.argv[3] print("Read fasta...") names, seqs = readers.read_fasta(fasta_filename) print("Read score matrix...") score_matrix = readers.read_matrix(matrix_filename) print("Align sequences...") if tree_type == "wpgma": names, seqs = pgma.pgma(names, seqs, score_matrix, 'w') elif tree_type == "upgma": names, seqs = pgma.pgma(names, seqs, score_matrix, 'u') elif tree_type == "nj": names, seqs = neighbor_joining.neigbor_joining(names, seqs, score_matrix) else: print("Error: Unkonwn option. Choose upgma, wpgma or nj.") exit() out_filename = fasta_filename.split('.')[0] + "_aligned.fasta" with open(out_filename, 'w') as out:
import readers
import sys

# Script: split a FASTA file into one FASTA file per group id.
# Arguments: <seqs.fasta> <grouping_file> <output>; the third argument is
# passed through unchanged to readers.create_fasta_file_for_each_id.

# Fail with a usage message instead of an IndexError traceback when
# arguments are missing (extra arguments are still ignored, as before).
if len(sys.argv) < 4:
    sys.exit("Usage: %s <seqs.fasta> <grouping_file> <output>" % sys.argv[0])

# read input files
name_to_sequence = readers.read_fasta(sys.argv[1])
id_to_name = readers.read_grouping(sys.argv[2])

# create fasta files for each id
readers.create_fasta_file_for_each_id(name_to_sequence, id_to_name, sys.argv[3])