# Command-line interface (continued): duplicate handling and output path.
parser.add_argument(
    "--error-on-duplicate-strains",
    action="store_true",
    help="exit with an error if any duplicate strains are found. By default, duplicates are resolved by preferring most recent accession id or the last record."
)
parser.add_argument("--output", required=True, help="sanitized metadata")

args = parser.parse_args()

# If the input is a tarball, try to find a metadata file inside the archive.
metadata_file = args.metadata
tar_handle = None
input_is_tarball = ".tar" in Path(args.metadata).suffixes
if input_is_tarball:
    try:
        metadata_file, tar_handle = extract_tar_file_contents(
            args.metadata, "metadata")
    except FileNotFoundError as error:
        print(f"ERROR: {error}", file=sys.stderr)
        sys.exit(1)

# Read metadata with pandas because Augur's read_metadata utility does not
# support metadata without a "strain" or "name" field.
# sep=None with the python engine sniffs the delimiter (handles TSV or CSV).
id_column_dtypes = {
    "strain": "string",
    "name": "string",
}
metadata = pd.read_csv(
    metadata_file,
    sep=None,
    engine="python",
    skipinitialspace=True,
    dtype=id_column_dtypes,
)
# Normalize missing values to empty strings.
metadata = metadata.fillna("")
# Command-line interface (continued): inputs, prefix stripping, duplicate
# handling, and output path.
parser.add_argument("--sequences", nargs="+", required=True, help="sequences to be sanitized")
parser.add_argument("--strip-prefixes", nargs="+", help="prefixes to strip from strain names in the sequences")
parser.add_argument("--error-on-duplicate-strains", action="store_true", help="exit with an error when the same strain is detected multiple times with different sequences. By default, use the first occurrence of each duplicated sequence.")
parser.add_argument("--output", required=True, help="sanitized sequences")

args = parser.parse_args()

sequence_files = []
tar_handles = []
for sequence_filename in args.sequences:
    # Plain sequence files pass straight through.
    if ".tar" not in Path(sequence_filename).suffixes:
        sequence_files.append(sequence_filename)
        continue

    # The input is a tarball; try to find a sequence file inside the archive.
    try:
        sequence_file, tar_handle = extract_tar_file_contents(
            sequence_filename, "sequences"
        )
    except FileNotFoundError as error:
        print(f"ERROR: {error}", file=sys.stderr)
        sys.exit(1)

    # The extracted tar file is an io.BufferedReader which provides a binary
    # stream. BioPython's SeqIO reader expects a text stream. We decode each
    # line of the BufferedReader in a generator such that decoding happens on
    # the fly per line downstream. For more details see:
    # https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.extractfile
    sequence_files.append(sequence_file)
    tar_handles.append(tar_handle)
parser.add_argument("--output", required=True, help="sanitized metadata")
args = parser.parse_args()

# Get user-defined metadata id columns to look for.
metadata_id_columns = args.metadata_id_columns

# Get user-defined database id columns to use for deduplication.
database_id_columns = args.database_id_columns

# If the input is a tarball, try to find a metadata file inside the archive.
# extract_tar_file_contents here returns a temporary directory plus the path
# to the extracted metadata file; metadata_is_temporary marks the extracted
# copy so it can be treated differently from a user-supplied file
# (presumably cleaned up later — not visible in this chunk).
metadata_file = args.metadata
metadata_is_temporary = False
if ".tar" in Path(args.metadata).suffixes:
    try:
        temporary_dir, metadata_file = extract_tar_file_contents(
            args.metadata, "metadata")
        metadata_is_temporary = True
    except FileNotFoundError as error:
        # No metadata file was found inside the archive; report and abort.
        print(f"ERROR: {error}", file=sys.stderr)
        sys.exit(1)

# In the first pass through the metadata, map strain names to database ids.
# We will use this mapping to deduplicate records in the second pass.
# Additionally, this pass checks for missing id columns and the presence of
# any duplicate records, in case the user has requested an error on
# duplicates.
try:
    database_ids_by_strain = get_database_ids_by_strain(
        metadata_file, metadata_id_columns, database_id_columns,