Example #1
 def test_read_sequences_from_multiple_files_with_different_compression(
         self, fasta_filename, gzip_fasta_filename, lzma_fasta_filename):
     sequences = read_sequences(fasta_filename,
                                gzip_fasta_filename,
                                lzma_fasta_filename,
                                format="fasta")
     assert len(list(sequences)) == 9
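The fixtures for this test are not shown; the assertion of nine records implies three per file. A minimal sketch of how such compressed fixtures could be produced with the standard library (file names assumed), relying on read_sequences reading compressed paths directly as the test itself demonstrates:

import gzip
import lzma

records = ">SEQ_1\nATGC\n>SEQ_2\nATGC\n>SEQ_3\nATGC\n"

# Write the same three records as plain, gzip-, and xz-compressed FASTA.
with open("sequences.fasta", "w") as handle:
    handle.write(records)

with gzip.open("sequences.fasta.gz", "wt") as handle:
    handle.write(records)

with lzma.open("sequences.fasta.xz", "wt") as handle:
    handle.write(records)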
Example #2
 def test_read_sequences_from_multiple_files_or_buffers(
         self, fasta_filename, additional_fasta_filename):
     with open(fasta_filename) as fasta_handle:
         sequences = read_sequences(fasta_handle,
                                    additional_fasta_filename,
                                    format="fasta")
         assert len(list(sequences)) == 6
Example #3
            except FileNotFoundError as error:
                print(f"ERROR: {error}", file=sys.stderr)
                sys.exit(1)
        else:
            sequence_files.append(sequence_filename)

    # Remove spaces and everything from the first pipe (|) onward.
    pattern = r"( )|(\|.*)"
    if args.strip_prefixes:
        prefixes = "|".join(args.strip_prefixes)
        pattern = f"^({prefixes})|{pattern}"

    with open_file(args.output, "w") as output_handle:
        # In order to prefer the latter files, we have to reverse the order of
        # the files.
        sequences = read_sequences(*reversed(sequence_files))
        renamed_sequences = rename_sequences(sequences, pattern)
        deduplicated_sequences = drop_duplicate_sequences(
            renamed_sequences,
            args.error_on_duplicate_strains
        )

        try:
            for sequence in deduplicated_sequences:
                write_sequences(sequence, output_handle)
        except DuplicateSequenceError as error:
            print(
                f"ERROR: The following strains have duplicate sequences: {error}",
                file=sys.stderr
            )
            sys.exit(1)
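The helpers rename_sequences, drop_duplicate_sequences, and DuplicateSequenceError come from earlier in this script and are not shown. A minimal sketch of how such generators could look, assuming the error is raised only once iteration completes so the try/except around the write loop can catch it (the real definitions may differ):

import re

class DuplicateSequenceError(Exception):
    pass

def rename_sequences(sequences, pattern):
    # Strip the unwanted pattern from each record id as records stream past.
    for sequence in sequences:
        sequence.id = re.sub(pattern, "", sequence.id)
        yield sequence

def drop_duplicate_sequences(sequences, error_on_duplicates=False):
    # Keep the first record seen for each id; remember the rest.
    seen = set()
    duplicates = set()
    for sequence in sequences:
        if sequence.id in seen:
            duplicates.add(sequence.id)
            continue
        seen.add(sequence.id)
        yield sequence
    # Raising here surfaces the error to whoever is consuming the generator.
    if duplicates and error_on_duplicates:
        raise DuplicateSequenceError(", ".join(sorted(duplicates)))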
Example #4
                        nargs='+',
                        type=int,
                        help="list of sites to mask")
    parser.add_argument("--output",
                        required=True,
                        help="FASTA file of output alignment")
    args = parser.parse_args()

    begin_length = 0
    if args.mask_from_beginning:
        begin_length = args.mask_from_beginning
    end_length = 0
    if args.mask_from_end:
        end_length = args.mask_from_end

    with open_file(args.output, 'w') as outfile:
        for record in read_sequences(args.alignment):
            seq = str(record.seq)
            if args.mask_terminal_gaps:
                seq = mask_terminal_gaps(seq)

            start = "N" * begin_length
            # Guard against end_length == 0: seq[begin_length:-0] is empty.
            middle = seq[begin_length:len(seq) - end_length]
            end = "N" * end_length
            seq_list = list(start + middle + end)
            if args.mask_sites:
                for site in args.mask_sites:
                    seq_list[site - 1] = "N"
            record.seq = Seq("".join(seq_list))
            write_sequences(record, outfile)
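The mask_terminal_gaps helper called above is not part of this excerpt. One plausible definition, hedged since the original is not shown, replaces leading and trailing gap characters with N so ragged alignment ends are masked like the terminal regions:

def mask_terminal_gaps(sequence, gap_chars="-"):
    # An all-gap sequence becomes all Ns.
    stripped = sequence.strip(gap_chars)
    if not stripped:
        return "N" * len(sequence)
    # Count gaps at each end, then rebuild the string with N in their place.
    leading = len(sequence) - len(sequence.lstrip(gap_chars))
    trailing = len(sequence) - len(sequence.rstrip(gap_chars))
    return "N" * leading + stripped + "N" * trailing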
Example #5
    parser.add_argument('--output',
                        type=str,
                        metavar="FASTA",
                        required=True,
                        help="output FASTA")
    args = parser.parse_args()

    sequence_hash_by_name = {}
    duplicate_strains = set()

    counter = 0
    with open(args.output, "w") as output_handle:
        # Stream sequences from all input files into a single output file,
        # skipping duplicate records (same strain and sequence) and noting
        # mismatched sequences for the same strain name.
        for record in read_sequences(*args.input):
            counter += 1
            if counter % 10000 == 0:
                print(f"Processed {counter} records")

            # Hash each sequence and check whether another sequence with the
            # same name already exists and if the hash is different.
            sequence_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
            if record.name in sequence_hash_by_name:
                # If the hashes differ (multiple entries with the same
                # strain name but different sequences), we keep the first
                # sequence and add the strain to a list of duplicates to
                # report at the end.
                if sequence_hash_by_name.get(record.name) != sequence_hash:
                    duplicate_strains.add(record.name)
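A tiny illustration of the hashing comparison above: identical sequences hash identically, so a repeated strain name only counts as a conflict when its digest differs.

import hashlib

h1 = hashlib.sha256("ATGC".encode("utf-8")).hexdigest()
h2 = hashlib.sha256("ATGC".encode("utf-8")).hexdigest()
h3 = hashlib.sha256("ATGG".encode("utf-8")).hexdigest()
assert h1 == h2  # same sequence, same digest
assert h1 != h3  # different sequence, different digest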
Example #6
import argparse
from augur.io import open_file, read_sequences, write_sequences
import re

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--sequences",
                        nargs="+",
                        required=True,
                        help="sequences to be sanitized")
    parser.add_argument(
        "--strip-prefixes",
        nargs="+",
        help="prefixes to strip from strain names in the sequences")
    parser.add_argument("--output", required=True, help="sanitized sequences")

    args = parser.parse_args()

    if args.strip_prefixes:
        prefixes = "|".join(args.strip_prefixes)
        pattern = f"^({prefixes})"
    else:
        pattern = ""

    with open_file(args.output, "w") as output_handle:
        for sequence in read_sequences(*args.sequences):
            sequence.id = re.sub(pattern, "", sequence.id)
            write_sequences(sequence, output_handle)
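A quick illustration of the pattern this script builds; the prefixes below are made-up examples, not values from the original:

import re

pattern = "^(hCoV-19/|SARS-CoV-2/)"
assert re.sub(pattern, "", "hCoV-19/Wuhan/Hu-1/2019") == "Wuhan/Hu-1/2019"
# Names without a listed prefix pass through unchanged.
assert re.sub(pattern, "", "Wuhan/Hu-1/2019") == "Wuhan/Hu-1/2019"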
Example #7
        type=int,
        default=10000,
        help=
        "number of samples in the global alignment to process at once. Reduce this number to reduce memory usage at the cost of increased run-time."
    )
    parser.add_argument("--output",
                        type=str,
                        required=True,
                        help="FASTA file of output alignment")
    args = parser.parse_args()

    # load entire alignment and the alignment of focal sequences (upper case -- probably not necessary)
    ref = sequence_to_int_array(SeqIO.read(args.reference, 'fasta').seq)
    alignment_length = len(ref)

    focal_seqs = read_sequences(args.focal_alignment)
    focal_seqs_dict = calculate_snp_matrix(focal_seqs,
                                           consensus=ref,
                                           ignore_seqs=args.ignore_seqs)

    if focal_seqs_dict is None:
        print(
            f"ERROR: There are no valid sequences in the focal alignment, '{args.focal_alignment}', to compare against the full alignment.",
            "Check your subsampling settings for the focal alignment or consider disabling proximity-based subsampling.",
            file=sys.stderr)
        sys.exit(1)

    seqs = read_sequences(args.alignment)

    # export priorities
    fh_out = open(args.output, 'w')
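The sequence_to_int_array helper used above is defined elsewhere in the script; one plausible implementation, assumed rather than taken from the original, maps each base to its byte value so alignments can be compared as numpy arrays:

import numpy as np

def sequence_to_int_array(sequence):
    # Lower-case the sequence and view its UTF-8 bytes as small integers;
    # copy so the resulting array is writable.
    return np.frombuffer(str(sequence).lower().encode("utf-8"), dtype=np.int8).copy()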
Example #8
import argparse
from augur.io import open_file, read_sequences, write_sequences
import re

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--sequences",
                        nargs="+",
                        required=True,
                        help="sequences to be sanitized")
    parser.add_argument(
        "--strip-prefixes",
        nargs="+",
        help="prefixes to strip from strain names in the sequences")
    parser.add_argument("--output", required=True, help="sanitized sequences")

    args = parser.parse_args()

    if args.strip_prefixes:
        prefixes = "|".join(args.strip_prefixes)
        pattern = f"^({prefixes})"
    else:
        pattern = ""

    with open_file(args.output, "w") as output_handle:
        # In order to prefer the latter files, we have to reverse the order of
        # the files.
        for sequence in read_sequences(*reversed(args.sequences)):
            sequence.id = re.sub(pattern, "", sequence.id)
            write_sequences(sequence, output_handle)
Example #9
 def test_read_sequences_from_multiple_files(self, fasta_filename,
                                             additional_fasta_filename):
     sequences = read_sequences(fasta_filename,
                                additional_fasta_filename,
                                format="fasta")
     assert len(list(sequences)) == 6
Example #10
 def test_read_sequences_from_single_file(self, fasta_filename):
     sequences = read_sequences(fasta_filename, format="fasta")
     assert len(list(sequences)) == 3
Example #11
 def test_read_single_genbank_record_from_a_path(self, genbank_reference):
     reference = next(
         read_sequences(Path(genbank_reference), format="genbank"))
     assert reference.id == "KX369547.1"
Example #12
 def test_read_single_fasta_record(self, fasta_filename):
     record = next(read_sequences(fasta_filename, format="fasta"))
     assert record.id == "SEQ_1"
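The fasta_filename fixture shared by these tests is not included in this excerpt; a minimal sketch of what it could look like (assumed, not the suite's actual definition) writes the three records that the counts and the SEQ_1 assertion above expect:

import pytest

@pytest.fixture
def fasta_filename(tmp_path):
    # Three records on disk, ids SEQ_1 through SEQ_3.
    path = tmp_path / "sequences.fasta"
    path.write_text(">SEQ_1\nATGC\n>SEQ_2\nATGC\n>SEQ_3\nATGC\n")
    return str(path)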
Example #13
    parser.add_argument('--input', type=str, nargs="+", metavar="FASTA", required=True, help="input FASTAs")
    parser.add_argument('--warn-about-duplicates', action="store_true", help="warn the user about duplicate strains instead of exiting with an error. The output will include the first occurrence of a duplicate sequence.")
    parser.add_argument('--output', type=str, metavar="FASTA", required=True, help="output FASTA")
    args = parser.parse_args()

    sequence_hash_by_name = {}
    duplicate_strains = set()

    counter = 0
    with open_file(args.output, "w") as output_handle:
        # Stream sequences from all input files into a single output file,
        # skipping duplicate records (same strain and sequence) and noting
        # mismatched sequences for the same strain name.  In order to
        # prefer the latter files, we have to reverse the order of the
        # files.
        for record in read_sequences(*reversed(args.input)):
            counter += 1
            if counter % 10000 == 0:
                print(f"Processed {counter} records")

            # Hash each sequence and check whether another sequence with the
            # same name already exists and if the hash is different.
            sequence_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
            if record.name in sequence_hash_by_name:
                # If the hashes differ (multiple entries with the same
                # strain name but different sequences), we keep the first
                # sequence and add the strain to a list of duplicates to
                # report at the end.
                if sequence_hash_by_name.get(record.name) != sequence_hash:
                    duplicate_strains.add(record.name)
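This example, like the similar one above, is truncated at the same point. A plausible continuation, hedged since the original script is not shown in full, skips repeated strain names, writes first occurrences, and honors --warn-about-duplicates at the end:

                # Whether or not the sequences conflict, skip records whose
                # strain name has already been written.
                continue

            sequence_hash_by_name[record.name] = sequence_hash
            write_sequences(record, output_handle)

    if duplicate_strains:
        print(f"WARNING: Duplicate strains found: {', '.join(sorted(duplicate_strains))}", file=sys.stderr)
        if not args.warn_about_duplicates:
            sys.exit(1)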