Example #1
    parser.add_argument(
        "--error-on-duplicate-strains",
        action="store_true",
        help="exit with an error if any duplicate strains are found. By default, duplicates are resolved by preferring the most recent accession id or the last record."
    )
    parser.add_argument("--output", required=True, help="sanitized metadata")

    args = parser.parse_args()

    # If the input is a tarball, try to find a metadata file inside the archive.
    metadata_file = args.metadata
    tar_handle = None
    if ".tar" in Path(args.metadata).suffixes:
        try:
            metadata_file, tar_handle = extract_tar_file_contents(
                args.metadata, "metadata")
        except FileNotFoundError as error:
            print(f"ERROR: {error}", file=sys.stderr)
            sys.exit(1)

    # Read metadata with pandas because Augur's read_metadata utility does not
    # support metadata without a "strain" or "name" field.
    metadata = pd.read_csv(metadata_file,
                           sep=None,
                           engine="python",
                           skipinitialspace=True,
                           dtype={
                               "strain": "string",
                               "name": "string",
                           }).fillna("")
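
All of the examples rely on an extract_tar_file_contents helper that is not shown. Below is a minimal sketch of the variant used in examples #1 and #2, assuming it returns the first archive member whose name contains the requested keyword as a binary stream, together with the open TarFile handle that the caller must keep open while reading and close afterwards. (Example #3 evidently expects a different variant that extracts to a temporary directory and returns its path along with the extracted file's path.)

import tarfile


def extract_tar_file_contents(filename, internal_file):
    """Return a binary stream for the first member of the given tar
    archive whose name contains `internal_file`, plus the open TarFile
    handle. The handle must stay open while the stream is being read.

    This is a sketch of an assumed helper, not the project's actual
    implementation.
    """
    tar = tarfile.open(filename)

    for member in tar.getmembers():
        if internal_file in member.name:
            extracted_file = tar.extractfile(member)
            if extracted_file is not None:
                return extracted_file, tar

    tar.close()
    raise FileNotFoundError(
        f"Could not find a {internal_file} file in '{filename}'"
    )

Returning the open handle lets the caller stream a large file out of the archive without first extracting it to disk.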
Example #2
    parser.add_argument("--sequences", nargs="+", required=True, help="sequences to be sanitized")
    parser.add_argument("--strip-prefixes", nargs="+", help="prefixes to strip from strain names in the sequences")
    parser.add_argument("--error-on-duplicate-strains", action="store_true", help="exit with an error when the same strain is detected multiple times with different sequences. By default, use the first occurrence of each duplicated sequence.")
    parser.add_argument("--output", required=True, help="sanitized sequences")

    args = parser.parse_args()

    sequence_files = []
    tar_handles = []
    for sequence_filename in args.sequences:
        # If the input is a tarball, try to find a sequence file inside the
        # archive.
        if ".tar" in Path(sequence_filename).suffixes:
            try:
                sequence_file, tar_handle = extract_tar_file_contents(
                    sequence_filename,
                    "sequences"
                )

                # The extracted tar file is an io.BufferedReader which provides
                # a binary stream. BioPython's SeqIO reader expects a text
                # stream. We decode each line of the BufferedReader in a
                # generator such that decoding happens on the fly per line
                # downstream. For more details see:
                # https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.extractfile
                sequence_files.append(sequence_file)
                tar_handles.append(tar_handle)
            except FileNotFoundError as error:
                print(f"ERROR: {error}", file=sys.stderr)
                sys.exit(1)
        else:
            sequence_files.append(sequence_filename)
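
The comment above describes decoding the binary stream lazily so that BioPython only ever sees text. Below is a minimal sketch of a hypothetical downstream consumer, parse_sequence_files; io.TextIOWrapper gives the same lazy, per-line decoding as the generator the comment describes while presenting a genuine text stream to SeqIO.parse:

import io

from Bio import SeqIO


def parse_sequence_files(sequence_files):
    """Yield SeqRecords from a mix of filesystem paths and the binary
    streams produced by tarfile.extractfile."""
    for sequence_file in sequence_files:
        if isinstance(sequence_file, str):
            # A plain FASTA path on disk; SeqIO opens it by name.
            yield from SeqIO.parse(sequence_file, "fasta")
        else:
            # Decode the binary stream on the fly as SeqIO reads each
            # line, so the archive member is never fully loaded into
            # memory.
            text_stream = io.TextIOWrapper(sequence_file, encoding="utf-8")
            yield from SeqIO.parse(text_stream, "fasta")

The tar_handles collected above must stay open until iteration finishes, after which the caller can close them.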
Example #3
    parser.add_argument("--output", required=True, help="sanitized metadata")

    args = parser.parse_args()

    # Get user-defined metadata id columns to look for.
    metadata_id_columns = args.metadata_id_columns

    # Get user-defined database id columns to use for deduplication.
    database_id_columns = args.database_id_columns

    # If the input is a tarball, try to find a metadata file inside the archive.
    metadata_file = args.metadata
    metadata_is_temporary = False
    if ".tar" in Path(args.metadata).suffixes:
        try:
            temporary_dir, metadata_file = extract_tar_file_contents(
                args.metadata, "metadata")
            metadata_is_temporary = True
        except FileNotFoundError as error:
            print(f"ERROR: {error}", file=sys.stderr)
            sys.exit(1)

    # In the first pass through the metadata, map strain names to database ids.
    # We will use this mapping to deduplicate records in the second pass.
    # Additionally, this pass checks for missing id columns and the presence of
    # any duplicate records, in case the user has requested an error on
    # duplicates.
    try:
        database_ids_by_strain = get_database_ids_by_strain(
            metadata_file,
            metadata_id_columns,
            database_id_columns,
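
The excerpt ends mid-call, but the surrounding comments pin down what get_database_ids_by_strain must do: one streaming pass that maps each strain name to the database ids seen for it, erroring on missing id columns. A minimal sketch under those assumptions follows; the column-resolution rule (picking the first requested id column present in the header) is a guess, not the project's documented behavior.

from collections import defaultdict

import pandas as pd


def get_database_ids_by_strain(metadata_file, metadata_id_columns,
                               database_id_columns, chunk_size=100_000):
    """First pass through the metadata: map each strain name to the set
    of database id tuples observed for it."""
    database_ids_by_strain = defaultdict(set)

    chunks = pd.read_csv(metadata_file, sep=None, engine="python",
                         dtype="string", chunksize=chunk_size)
    for chunk in chunks:
        chunk = chunk.fillna("")

        # Assumed rule: use the first requested id column that exists
        # in the header as the strain name column.
        strain_column = next(
            (column for column in metadata_id_columns if column in chunk.columns),
            None,
        )
        if strain_column is None:
            raise KeyError(
                f"None of the possible strain id columns "
                f"({metadata_id_columns}) were found in the metadata."
            )

        for record in chunk.to_dict(orient="records"):
            database_id = tuple(
                record.get(column, "") for column in database_id_columns
            )
            database_ids_by_strain[record[strain_column]].add(database_id)

    return database_ids_by_strain

A second pass over the metadata can then keep, for each strain, only the record whose database id matches the preferred entry in this mapping, deduplicating the remainder.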