def read_strains(*files, comment_char="#"):
    """Collect the distinct strain names listed in one or more text files.

    Each line contributes at most one strain name. Everything from the first
    occurrence of ``comment_char`` to the end of the line is discarded, so
    both full-line and inline comments are allowed. Lines that are empty
    once the comment and surrounding whitespace are removed are ignored.
    For example, the following is a valid strain names file::

        # this is a comment at the top of the file
        strain1  # exclude strain1 because it isn't sequenced properly
        strain2
          # this is an empty line that will be ignored.

    Parameters
    ----------
    files : one or more str
        one or more names of text files with one strain name per line
    comment_char : str
        marker that starts a comment; defaults to ``"#"``

    Returns
    -------
    set :
        strain names from the given input files
    """
    strains = set()
    for input_file in files:
        with open_file(input_file, 'r') as ifile:
            # Keep only the text in front of the first comment marker.
            candidates = (
                line.split(comment_char, 1)[0].strip()
                for line in ifile
            )
            # Drop lines that held nothing but whitespace or a comment.
            strains.update(name for name in candidates if name)
    return strains
def to_mutations(aln_file, ref, aa=False):
    """Summarise how every aligned sequence differs from a reference.

    Parameters
    ----------
    aln_file : str
        path of the FASTA alignment, opened for reading with ``open_file``
    ref : str
        reference sequence; converted once with ``to_numpy_array``
    aa : bool
        forwarded unchanged to ``summarise_differences``
        (presumably selects amino-acid handling -- confirm in that helper)

    Returns
    -------
    dict :
        maps each FASTA record name to the value returned by
        ``summarise_differences`` for that record. If a name occurs more
        than once, the last record wins.
    """
    # The reference is converted a single time and reused for every record.
    ref_array = to_numpy_array(ref)
    with open_file(aln_file, 'r') as fh:
        return {
            name: summarise_differences(ref_array, to_numpy_array(seq), aa)
            for name, seq in SimpleFastaParser(fh)
        }
def test_write_sequences_by_external_handle(self, tmpdir, sequences):
    """Write sequences one by one through a caller-owned handle.

    The sum of the counts returned by ``write_sequences`` must equal the
    number of FASTA header lines (lines starting with ``>``) found in the
    resulting file.
    """
    output_filename = Path(tmpdir) / Path("new_sequences.fasta")

    # The handle is opened here and passed in for every individual write.
    with open_file(output_filename, "w") as out_handle:
        total_sequences_written = sum(
            write_sequences(sequence, out_handle) for sequence in sequences
        )

    # Re-read the file with the builtin ``open`` and count record headers.
    with open(output_filename, "r") as in_handle:
        header_count = sum(1 for line in in_handle if line.startswith(">"))

    assert total_sequences_written == header_count
def to_mutations(aln_file, ref, aa=False):
    """Build a comma-separated mutation string for every aligned sequence.

    Parameters
    ----------
    aln_file : str
        path of the FASTA alignment, opened for reading with ``open_file``
    ref : str
        reference sequence handed to ``get_differences``
    aa : bool
        when true the ambiguous character passed to ``get_differences`` is
        ``'X'``, otherwise ``'N'``

    Returns
    -------
    dict :
        maps each FASTA record name to a string in which every 3-tuple
        yielded by ``get_differences`` is rendered by concatenating its
        three items, and the renderings are joined with commas

    Notes
    -----
    Prints ``sequence <index>`` to stdout at every 1000th record (never for
    index 0) as a progress indicator.
    """
    ambiguous = 'X' if aa else 'N'
    res = {}
    with open_file(aln_file, 'r') as fh:
        for si, (name, seq) in enumerate(SimpleFastaParser(fh)):
            if si and si % 1000 == 0:
                print(f"sequence {si}")
            # Presumably (reference state, position, observed state);
            # confirm against get_differences.
            differences = get_differences(ref, seq, ambiguous)
            res[name] = ",".join(f"{a}{p}{d}" for a, p, d in differences)
    return res
def to_mutations(aln_file, ref, aa=False):
    """Build a comma-separated mutation string for every aligned sequence.

    Parameters
    ----------
    aln_file : str
        path of the FASTA alignment, opened in text mode with ``open_file``
    ref : str
        reference sequence; passed to ``get_differences`` both as the
        original string and as an ``int8`` array of its UTF-8 bytes
    aa : bool
        when true the ambiguous character passed to ``get_differences`` is
        ``'X'``, otherwise ``'N'``

    Returns
    -------
    dict :
        maps each FASTA record name to a string in which every 3-tuple
        yielded by ``get_differences`` is rendered by concatenating its
        three items, and the renderings are joined with commas

    Notes
    -----
    Prints ``sequence <index>`` to stdout at every 1000th record (never for
    index 0) as a progress indicator.
    """
    res = {}
    ambiguous = 'X' if aa else 'N'
    # Encode the reference once. ``frombuffer`` returns a read-only view of
    # the bytes object, so ``copy()`` is used to obtain an independent,
    # writable array. (The former stray ``seq =`` alias of this array was
    # dead: the loop variable rebinds ``seq`` before any read.)
    ref_int = np.frombuffer(ref.encode('utf-8'), dtype=np.int8).copy()
    with open_file(aln_file, 'rt') as fh:
        for si, (name, seq) in enumerate(SimpleFastaParser(fh)):
            if si % 1000 == 0 and si:
                print(f"sequence {si}")
            res[name] = ",".join([
                f"{a}{p}{d}"
                for a, p, d in get_differences(ref, ref_int, seq, ambiguous)
            ])
    return res
# https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.extractfile sequence_files.append(sequence_file) tar_handles.append(tar_handle) except FileNotFoundError as error: print(f"ERROR: {error}", file=sys.stderr) sys.exit(1) else: sequence_files.append(sequence_filename) # Replace whitespace and everything following pipes with nothing. pattern = "( )|(\|.*)" if args.strip_prefixes: prefixes = "|".join(args.strip_prefixes) pattern = f"^({prefixes})|{pattern}" with open_file(args.output, "w") as output_handle: # In order to prefer the latter files, we have to reverse the order of # the files. sequences = read_sequences(*reversed(sequence_files)) renamed_sequences = rename_sequences(sequences, pattern) deduplicated_sequences = drop_duplicate_sequences( renamed_sequences, args.error_on_duplicate_strains ) try: for sequence in deduplicated_sequences: write_sequences(sequence, output_handle) except DuplicateSequenceError as error: print( f"ERROR: The following strains have duplicate sequences: {error}",
nargs='+', type=int, help="list of sites to mask") parser.add_argument("--output", required=True, help="FASTA file of output alignment") args = parser.parse_args() begin_length = 0 if args.mask_from_beginning: begin_length = args.mask_from_beginning end_length = 0 if args.mask_from_end: end_length = args.mask_from_end with open_file(args.output, 'w') as outfile: for record in read_sequences(args.alignment): seq = str(record.seq) if args.mask_terminal_gaps: seq = mask_terminal_gaps(seq) start = "N" * begin_length middle = seq[begin_length:-end_length] end = "N" * end_length seq_list = list(start + middle + end) if args.mask_sites: for site in args.mask_sites: seq_list[site - 1] = "N" record.seq = Seq("".join(seq_list)) write_sequences(record, outfile)
def analyze_divergence(sequences, metadata, reference, mask_5p=0, mask_3p=0):
    """Compute per-sequence divergence diagnostics against a reference.

    Every record of the FASTA file ``sequences`` is converted with
    ``sequence_to_int_array`` and compared position by position with the
    converted ``reference``.

    Parameters
    ----------
    sequences : str
        path of the FASTA alignment, opened with ``open_file``
    metadata :
        accepted for interface compatibility; not used by this function
    reference :
        reference sequence, converted with ``sequence_to_int_array``
    mask_5p : int
        number of leading positions overwritten with the fill value
    mask_3p : int
        number of trailing positions overwritten with the fill value

    Returns
    -------
    collections.defaultdict :
        maps each FASTA header to a dict with the keys

        * ``snps`` -- indices where the sequence differs from the reference
          and is neither the fill value nor a gap
        * ``gaps`` -- ``(start, end)`` pairs taken from the rising and
          falling edges of the gap indicator
        * ``gap_sum`` -- number of gap positions left after terminal gaps
          and masked ends have been filled
        * ``no_data`` -- number of fill-value positions minus ``mask_3p``
          and ``mask_5p``
        * ``clusters`` -- ``(begin, end, snp_count)`` triples for windows
          dense in mismatches
    """
    fill_value = 110  # ord('n'); marks masked / no-data positions
    gap_value = 45    # ord('-'); marks alignment gaps
    ws = 50           # width of the sliding window used to find clusters
    cluster_cut_off = 10  # minimum mismatches per window to flag a cluster
    known_true_clusters = [(28880, 28883)]

    int_ref = sequence_to_int_array(reference, fill_gaps=False)
    ref_length = int_ref.shape[0]
    half_window = ws // 2

    # Weight array that zeroes out mismatches inside known genuine clusters
    # so that they never contribute to cluster detection.
    known_true_cluster_array = np.ones_like(int_ref, dtype=int)
    for begin, end in known_true_clusters:
        known_true_cluster_array[begin:end] = 0

    window = np.ones(ws)
    diagnostics = defaultdict(dict)

    with open_file(sequences) as fasta:
        for header, raw_seq in SimpleFastaParser(fasta):
            # Terminal gap lengths must be measured on the raw string,
            # before it is converted to integers.
            left_gaps = len(raw_seq) - len(raw_seq.lstrip('-'))
            right_gaps = len(raw_seq) - len(raw_seq.rstrip('-'))

            seq = sequence_to_int_array(
                raw_seq, fill_value=fill_value, fill_gaps=False
            )

            # Mask both ends to avoid exclusion for problems at sites that
            # will be masked anyway. The guards matter: ``seq[-0:]`` would
            # address the whole array.
            if mask_5p:
                seq[:mask_5p] = fill_value
            if mask_3p:
                seq[-mask_3p:] = fill_value

            # Terminal gaps are treated as missing data, not as deletions.
            if left_gaps:
                seq[:left_gaps] = fill_value
            if right_gaps:
                seq[-right_gaps:] = fill_value

            # Mismatches that are neither no-data nor gap positions.
            snps = (int_ref != seq) & (seq != fill_value) & (seq != gap_value)
            # No-data positions.
            filled = seq == fill_value

            # Gap indicator as integers so that np.diff exposes run edges:
            # +1 at the index just before a run starts, -1 at its last index.
            gaps = np.array(seq == gap_value, dtype=int)
            gap_edges = np.diff(gaps)
            gap_start = np.where(gap_edges == 1)[0]
            gap_end = np.where(gap_edges == -1)[0]

            # Running count of mismatches per window; windows reaching the
            # cut-off are flagged as belonging to a cluster.
            windowed_snps = np.convolve(
                snps * known_true_cluster_array, window, mode='same'
            )
            clusters = (windowed_snps >= cluster_cut_off).astype(int)
            cluster_edges = np.diff(clusters)

            # Cluster boundaries, widened by half a window on both sides and
            # clamped to the extent of the reference. A cluster touching
            # either end of the array has no edge there, so that boundary is
            # added explicitly.
            cluster_start = [0] if clusters[0] else []
            cluster_start += [
                max(0, x - half_window)
                for x in np.where(cluster_edges == 1)[0]
            ]
            cluster_end = [
                min(ref_length, x + half_window)
                for x in np.where(cluster_edges == -1)[0]
            ]
            if clusters[-1]:
                cluster_end.append(ref_length)

            diagnostics[header] = {
                'snps': list(np.where(snps)[0]),
                'gaps': list(zip(gap_start, gap_end)),
                'gap_sum': np.sum(gaps),
                'no_data': np.sum(filled) - mask_3p - mask_5p,
                'clusters': [
                    (b, e, np.sum(snps[b:e]))
                    for b, e in zip(cluster_start, cluster_end)
                ],
            }

    return diagnostics
args = parser.parse_args() # load entire alignment and the alignment of focal sequences (upper case -- probably not necessary) ref = SeqIO.read(args.reference, 'genbank').seq metadata, _ = read_metadata(args.metadata) diagnostics = analyze_divergence(args.alignment, metadata, ref, mask_5p=args.mask_from_beginning, mask_3p=args.mask_from_end) snp_cutoff = 25 no_data_cutoff = 3000 flagged_sequences = [] # output diagnostics for each sequence, ordered by divergence with open_file(args.output_diagnostics, 'w') as diag: diag.write('\t'.join([ 'strain', 'divergence', 'excess divergence', '#Ns', '#gaps', 'clusters', 'gaps', 'all_snps', 'gap_list' ]) + '\n') for s, d in sorted(diagnostics.items(), key=lambda x: len(x[1]['snps']), reverse=True): expected_div = expected_divergence( metadata[s]['date']) if s in metadata else np.nan diag.write('\t'.join( map(str, [ s, len(d['snps']), round(len(d['snps']) - expected_div, 2), d['no_data'], d['gap_sum'],
if column not in combined_data[strain]: combined_data[strain][column] = EMPTY for idx in range(1, len(metadata)): for strain, row in metadata[idx]['data'].items(): if strain not in combined_data: combined_data[strain] = {c:EMPTY for c in combined_columns} for column in combined_columns: if column in row: existing_value = combined_data[strain][column] new_value = row[column] # overwrite _ANY_ existing value if the overwriting value is non empty (and different)! if new_value != EMPTY and new_value != existing_value: if existing_value != EMPTY: print(f"[{strain}::{column}] Overwriting {combined_data[strain][column]} with {new_value}") combined_data[strain][column] = new_value # one-hot encoding for origin # note that we use "yes" / "no" here as Booleans are problematic for `augur filter` for metadata_entry in metadata: origin = metadata_entry['origin'] for strain in combined_data: combined_data[strain][origin] = "yes" if strain in metadata_entry['strains'] else "no" print(f"Combined metadata: {len(combined_data.keys())} strains x {len(combined_columns)} columns") with open_file(args.output, 'w') as fh: tsv_writer = csv.writer(fh, delimiter='\t') tsv_writer.writerow(combined_columns) for row in combined_data.values(): tsv_writer.writerow([row[column] for column in combined_columns])
# https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.extractfile sequence_files.append(sequence_file) tar_handles.append(tar_handle) except FileNotFoundError as error: print(f"ERROR: {error}", file=sys.stderr) sys.exit(1) else: sequence_files.append(sequence_filename) # Replace whitespace and everything following pipes with nothing. pattern = "( )|(\|.*)" if args.strip_prefixes: prefixes = "|".join(args.strip_prefixes) pattern = f"^({prefixes})|{pattern}" with open_file(args.output, "w", threads=1) as output_handle: # In order to prefer the latter files, we have to reverse the order of # the files. sequences = read_sequences(*reversed(sequence_files)) renamed_sequences = rename_sequences(sequences, pattern) deduplicated_sequences = drop_duplicate_sequences( renamed_sequences, args.error_on_duplicate_strains) try: for sequence in deduplicated_sequences: write_sequences(sequence, output_handle) except DuplicateSequenceError as error: print( f"ERROR: The following strains have duplicate sequences: {error}", file=sys.stderr) sys.exit(1)