Example #1
def read_strains(*files, comment_char="#"):
    """Reads strain names from one or more plain text files and returns the
    set of distinct strains.

    Strain names can be commented with full-line or inline comments. For
    example, the following is a valid strain names file:

        # this is a comment at the top of the file
        strain1  # exclude strain1 because it isn't sequenced properly
        strain2
          # a comment-only line like this is treated as empty and ignored.

    Parameters
    ----------
    files : one or more str
        one or more names of text files with one strain name per line
    comment_char : str
        character that marks the start of a full-line or inline comment

    Returns
    -------
    set :
        strain names from the given input files

    """
    strains = set()
    for input_file in files:
        with open_file(input_file, 'r') as ifile:
            for line in ifile:
                # Allow comments anywhere in a given line.
                strain_name = line.split(comment_char)[0].strip()
                if len(strain_name) > 0:
                    strains.add(strain_name)

    return strains
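
A minimal usage sketch (the file names are hypothetical; open_file is assumed to behave like augur.io.open_file and to open plain or compressed files transparently):

# Hypothetical usage: combine strain names from two exclusion lists,
# one plain text and one gzip-compressed.
excluded = read_strains("exclude1.txt", "exclude2.txt.gz")
print(f"{len(excluded)} distinct strains excluded")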
Example #2
def to_mutations(aln_file, ref, aa=False):
    # Map each sequence name in the alignment to a summary of its
    # differences from the reference (amino-acid mode when aa is True).
    res = {}
    ref_array = to_numpy_array(ref)
    with open_file(aln_file, 'r') as fh:
        for name, seq in SimpleFastaParser(fh):
            res[name] = summarise_differences(ref_array, to_numpy_array(seq),
                                              aa)
    return res
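
to_numpy_array and summarise_differences are not shown in this snippet. A minimal sketch of to_numpy_array, assuming it mirrors the np.frombuffer idiom that appears in Example #5 (the name and behaviour are assumptions, not the confirmed implementation):

import numpy as np

# Hypothetical sketch: view a sequence string as an array of int8
# character codes so mismatches against the reference can be found with
# a vectorised elementwise comparison instead of a Python loop.
def to_numpy_array(seq):
    return np.frombuffer(seq.encode('utf-8'), dtype=np.int8)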
Example #3
    def test_write_sequences_by_external_handle(self, tmpdir, sequences):
        output_filename = Path(tmpdir) / Path("new_sequences.fasta")

        with open_file(output_filename, "w") as handle:
            total_sequences_written = 0
            for sequence in sequences:
                sequences_written = write_sequences(sequence, handle)
                total_sequences_written += sequences_written

        with open(output_filename, "r") as handle:
            assert total_sequences_written == len(
                [line for line in handle if line.startswith(">")])
Example #4
def to_mutations(aln_file, ref, aa=False):
    res = {}
    ambiguous = 'X' if aa else 'N'

    with open_file(aln_file, 'r') as fh:
        for si, (name, seq) in enumerate(SimpleFastaParser(fh)):
            if si % 1000 == 0 and si:
                print(f"sequence {si}")
            res[name] = ",".join([
                f"{a}{p}{d}"
                for a, p, d in get_differences(ref, seq, ambiguous)
            ])

    return res
Example #5
def to_mutations(aln_file, ref, aa=False):
    res = {}
    ambiguous = 'X' if aa else 'N'
    # Encode the reference as int8 character codes once, up front.
    ref_int = np.frombuffer(ref.encode('utf-8'), dtype=np.int8).copy()

    with open_file(aln_file, 'rt') as fh:
        for si, (name, seq) in enumerate(SimpleFastaParser(fh)):
            if si % 1000 == 0 and si:
                print(f"sequence {si}")
            res[name] = ",".join([
                f"{a}{p}{d}"
                for a, p, d in get_differences(ref, ref_int, seq, ambiguous)
            ])

    return res
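
get_differences is likewise not shown. A hypothetical sketch consistent with the four-argument call above: the reference is encoded as int8 once (ref_int), so each sequence needs only one encode plus a vectorised comparison, and each difference is yielded as (reference base, 1-based position, derived base):

import numpy as np

# Hypothetical sketch of get_differences; the real implementation may differ.
def get_differences(ref, ref_int, seq, ambiguous):
    seq_int = np.frombuffer(seq.encode('utf-8'), dtype=np.int8)
    for pos in np.where(ref_int != seq_int)[0]:
        derived = seq[pos]
        # Skip ambiguous characters ('N' for nucleotides, 'X' for amino acids).
        if derived != ambiguous:
            yield ref[pos], pos + 1, derived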
Example #6
                # https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.extractfile
                sequence_files.append(sequence_file)
                tar_handles.append(tar_handle)
            except FileNotFoundError as error:
                print(f"ERROR: {error}", file=sys.stderr)
                sys.exit(1)
        else:
            sequence_files.append(sequence_filename)

    # Remove literal spaces and everything from the first pipe onward.
    pattern = r"( )|(\|.*)"
    if args.strip_prefixes:
        prefixes = "|".join(args.strip_prefixes)
        pattern = f"^({prefixes})|{pattern}"

    with open_file(args.output, "w") as output_handle:
        # Reverse the order of the files so that sequences from later files
        # take precedence.
        sequences = read_sequences(*reversed(sequence_files))
        renamed_sequences = rename_sequences(sequences, pattern)
        deduplicated_sequences = drop_duplicate_sequences(
            renamed_sequences,
            args.error_on_duplicate_strains
        )

        try:
            for sequence in deduplicated_sequences:
                write_sequences(sequence, output_handle)
        except DuplicateSequenceError as error:
            print(
                f"ERROR: The following strains have duplicate sequences: {error}",
                file=sys.stderr)
            sys.exit(1)
Example #7
                        nargs='+',
                        type=int,
                        help="list of sites to mask")
    parser.add_argument("--output",
                        required=True,
                        help="FASTA file of output alignment")
    args = parser.parse_args()

    begin_length = 0
    if args.mask_from_beginning:
        begin_length = args.mask_from_beginning
    end_length = 0
    if args.mask_from_end:
        end_length = args.mask_from_end

    with open_file(args.output, 'w') as outfile:
        for record in read_sequences(args.alignment):
            seq = str(record.seq)
            if args.mask_terminal_gaps:
                seq = mask_terminal_gaps(seq)

            start = "N" * begin_length
            middle = seq[begin_length:-end_length]
            end = "N" * end_length
            seq_list = list(start + middle + end)
            if args.mask_sites:
                for site in args.mask_sites:
                    seq_list[site - 1] = "N"
            record.seq = Seq("".join(seq_list))
            write_sequences(record, outfile)
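
mask_terminal_gaps is referenced but not defined in this snippet. A minimal sketch under the assumption that it replaces leading and trailing alignment gaps with N while leaving internal gaps untouched:

# Hypothetical sketch of mask_terminal_gaps.
def mask_terminal_gaps(seq, gap_char="-", mask_char="N"):
    left = len(seq) - len(seq.lstrip(gap_char))
    right = len(seq) - len(seq.rstrip(gap_char))
    return mask_char * left + seq[left:len(seq) - right] + mask_char * right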
Example #8
def analyze_divergence(sequences, metadata, reference, mask_5p=0, mask_3p=0):
    int_ref = sequence_to_int_array(reference, fill_gaps=False)
    diagnostics = defaultdict(dict)
    fill_value = 110  # ord('n'): code used for missing data
    gap_value = 45    # ord('-'): code for an alignment gap
    ws = 50           # window size for mutation-cluster detection
    known_true_clusters = [(28880, 28883)]  # genuine cluster, excluded from flagging
    known_true_cluster_array = np.ones_like(int_ref, dtype=int)
    for b, e in known_true_clusters:
        known_true_cluster_array[b:e] = 0

    cluster_cut_off = 10  # minimum number of SNPs in a window to flag a cluster
    with open_file(sequences) as fasta:
        for h, s in SimpleFastaParser(fasta):
            left_gaps = len(s) - len(s.lstrip('-'))
            right_gaps = len(s) - len(s.rstrip('-'))
            s = sequence_to_int_array(s,
                                      fill_value=fill_value,
                                      fill_gaps=False)
            # mask from both ends to avoid exclusion for problems at sites that will be masked anyway
            if mask_5p:
                s[:mask_5p] = fill_value
            if mask_3p:
                s[-mask_3p:] = fill_value

            # fill terminal gaps -- those will be filled anyway
            if left_gaps:
                s[:left_gaps] = fill_value
            if right_gaps:
                s[-right_gaps:] = fill_value

            # determine non-gap non-N mismatches
            snps = (int_ref != s) & (s != fill_value) & (s != gap_value)
            # determine N positions
            filled = s == fill_value
            # determine gap positions (cast to int so np.diff can detect starts and ends)
            gaps = np.array(s == gap_value, dtype=int)
            gap_start = np.where(np.diff(gaps) == 1)[0]
            gap_end = np.where(np.diff(gaps) == -1)[0]

            # determine mutation clusters by convolving with an array of ones => running window sum of SNPs
            clusters = np.array(np.convolve(snps * known_true_cluster_array,
                                            np.ones(ws),
                                            mode='same') >= cluster_cut_off,
                                dtype=int)
            # determine start and end of clusters. extend by half window size on both ends.
            cluster_start = [0] if clusters[0] else []
            cluster_start.extend([
                max(0, x - ws // 2)
                for x in np.where(np.diff(clusters) == 1)[0]
            ])
            cluster_end = [
                min(int_ref.shape[0], x + ws // 2)
                for x in np.where(np.diff(clusters) == -1)[0]
            ]
            if clusters[-1]:
                cluster_end.append(int_ref.shape[0])

            diagnostics[h] = {
                'snps': list(np.where(snps)[0]),
                'gaps': list(zip(gap_start, gap_end)),
                'gap_sum': np.sum(gaps),
                'no_data': np.sum(filled) - mask_3p - mask_5p,
                'clusters': [(b, e, np.sum(snps[b:e]))
                             for b, e in zip(cluster_start, cluster_end)],
            }

    return diagnostics
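
sequence_to_int_array is used twice above but not shown. A sketch consistent with its fill_value and fill_gaps arguments (the defaults and the lower-casing are assumptions): view the lower-cased sequence as int8 character codes and optionally replace gaps with the fill value:

import numpy as np

# Hypothetical sketch of sequence_to_int_array; 110 is ord('n'), 45 is ord('-').
def sequence_to_int_array(s, fill_value=110, fill_gaps=True):
    seq = np.frombuffer(str(s).lower().encode('utf-8'), dtype=np.int8).copy()
    if fill_gaps:
        seq[seq == 45] = fill_value  # replace '-' with the fill value
    return seq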
Example #9
    args = parser.parse_args()

    # load the reference sequence and the sample metadata
    ref = SeqIO.read(args.reference, 'genbank').seq
    metadata, _ = read_metadata(args.metadata)

    diagnostics = analyze_divergence(args.alignment,
                                     metadata,
                                     ref,
                                     mask_5p=args.mask_from_beginning,
                                     mask_3p=args.mask_from_end)
    snp_cutoff = 25
    no_data_cutoff = 3000
    flagged_sequences = []
    # output diagnostics for each sequence, ordered by divergence
    with open_file(args.output_diagnostics, 'w') as diag:
        diag.write('\t'.join([
            'strain', 'divergence', 'excess divergence', '#Ns', '#gaps',
            'clusters', 'gaps', 'all_snps', 'gap_list'
        ]) + '\n')
        for s, d in sorted(diagnostics.items(),
                           key=lambda x: len(x[1]['snps']),
                           reverse=True):
            expected_div = (expected_divergence(metadata[s]['date'])
                            if s in metadata else np.nan)
            diag.write('\t'.join(map(str, [
                s,
                len(d['snps']),
                round(len(d['snps']) - expected_div, 2),
                d['no_data'],
                d['gap_sum'],
Example #10
            if column not in combined_data[strain]:
                combined_data[strain][column] = EMPTY

    for idx in range(1, len(metadata)):
        for strain, row in metadata[idx]['data'].items():
            if strain not in combined_data:
                combined_data[strain] = {c:EMPTY for c in combined_columns}
            for column in combined_columns:
                if column in row:
                    existing_value = combined_data[strain][column]
                    new_value = row[column]
                    # overwrite any existing value if the new value is non-empty (and different)
                    if new_value != EMPTY and new_value != existing_value:
                        if existing_value != EMPTY:
                            print(f"[{strain}::{column}] Overwriting {combined_data[strain][column]} with {new_value}")
                        combined_data[strain][column] = new_value

    # one-hot encoding for origin
    # note that we use "yes" / "no" here as Booleans are problematic for `augur filter`
    for metadata_entry in metadata:
        origin = metadata_entry['origin']
        for strain in combined_data:
            combined_data[strain][origin] = "yes" if strain in metadata_entry['strains'] else "no"

    print(f"Combined metadata: {len(combined_data.keys())} strains x {len(combined_columns)} columns")

    with open_file(args.output, 'w') as fh:
        tsv_writer = csv.writer(fh, delimiter='\t')
        tsv_writer.writerow(combined_columns)
        for row in combined_data.values():
            tsv_writer.writerow([row[column] for column in combined_columns])
Example #11
                # https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.extractfile
                sequence_files.append(sequence_file)
                tar_handles.append(tar_handle)
            except FileNotFoundError as error:
                print(f"ERROR: {error}", file=sys.stderr)
                sys.exit(1)
        else:
            sequence_files.append(sequence_filename)

    # Remove literal spaces and everything from the first pipe onward.
    pattern = r"( )|(\|.*)"
    if args.strip_prefixes:
        prefixes = "|".join(args.strip_prefixes)
        pattern = f"^({prefixes})|{pattern}"

    with open_file(args.output, "w", threads=1) as output_handle:
        # Reverse the order of the files so that sequences from later files
        # take precedence.
        sequences = read_sequences(*reversed(sequence_files))
        renamed_sequences = rename_sequences(sequences, pattern)
        deduplicated_sequences = drop_duplicate_sequences(
            renamed_sequences, args.error_on_duplicate_strains)

        try:
            for sequence in deduplicated_sequences:
                write_sequences(sequence, output_handle)
        except DuplicateSequenceError as error:
            print(
                f"ERROR: The following strains have duplicate sequences: {error}",
                file=sys.stderr)
            sys.exit(1)