Exemplo n.º 1
0
#!/usr/bin/python

from famli.famli_helpers import parse_alignment

# Parse the example DIAMOND alignment shipped with the test suite
with open("/usr/famli/tests/example.diamond.aln", "rt") as handle:
    n_reads, output = parse_alignment(handle)

# Sum the per-reference read counts reported in the output records
n_dedup = sum(record["nreads"] for record in output)

# Both totals are reported in the assertion message on failure
assert n_reads == 360, n_reads
assert n_dedup == 359, n_dedup

# Three references survived filtering
assert len(output) == 3

print("PASSED TESTS")
Exemplo n.º 2
0
    def filter(self):
        """Filter a set of alignments with FAMLI.

        Command-line options are parsed from ``sys.argv[2:]``. The input
        alignment (opened with gzip when its name ends in ``.gz``) is passed
        to ``parse_alignment``; the resulting summary is written as JSON when
        ``--output`` is given, and ``filter_alignment`` is called when
        ``--output-aln`` is given.

        Exits through ``parser.error`` (usage message, exit status 2) when
        ``--input`` is missing or does not point to an existing path.
        """
        parser = argparse.ArgumentParser(
            description="""Filter a set of existing alignments in tabular
                format with FAMLI""")

        # The input is mandatory: every later step reads from it
        parser.add_argument("--input",
                            type=str,
                            required=True,
                            help="Location for input alignment file.")
        parser.add_argument("--output",
                            type=str,
                            help="Location for output JSON file.")
        parser.add_argument("--output-aln",
                            default=None,
                            type=str,
                            help="Location for output alignment file.")
        parser.add_argument("--threads",
                            type=int,
                            help="""Number of processors to use.""",
                            default=4)
        parser.add_argument("--logfile",
                            type=str,
                            help="""(Optional) Write log to this file.""")
        parser.add_argument("--batchsize",
                            type=int,
                            help="""Number of reads to process at a time.""")
        parser.add_argument("--qseqid-ix",
                            default=0,
                            type=int,
                            help="""Alignment column for query sequence ID.
                            (0-indexed column ix)""")
        parser.add_argument("--sseqid-ix",
                            default=1,
                            type=int,
                            help="""Alignment column for subject sequence ID.
                            (0-indexed column ix)""")
        parser.add_argument(
            "--sstart-ix",
            default=8,
            type=int,
            help="""Alignment column for subject start position.
                            (0-indexed column ix, 1-indexed start position)""")
        parser.add_argument("--send-ix",
                            default=9,
                            type=int,
                            help="""Alignment column for subject end position.
                            (0-indexed column ix, 1-indexed end position)""")
        parser.add_argument("--bitscore-ix",
                            default=11,
                            type=int,
                            help="""Alignment column for alignment bitscore.
                            (0-indexed column ix)""")
        parser.add_argument("--slen-ix",
                            default=13,
                            type=int,
                            help="""Alignment column for subject length.
                            (0-indexed column ix)""")
        parser.add_argument("--sd-mean-cutoff",
                            default=1.0,
                            type=float,
                            help="""Threshold for filtering max SD / MEAN""")
        parser.add_argument("--strim-5",
                            default=18,
                            type=int,
                            help="""Amount to trim from 5' end of subject""")
        parser.add_argument("--strim-3",
                            default=18,
                            type=int,
                            help="""Amount to trim from 3' end of subject""")

        # sys.argv[0] is the program and sys.argv[1] the sub-command name
        args = parser.parse_args(sys.argv[2:])

        start_time = time.time()

        # Validate explicitly: an assert would be stripped under `python -O`
        if not os.path.exists(args.input):
            parser.error("Input file does not exist: {}".format(args.input))

        # Set up logging
        logFormatter = logging.Formatter(
            '%(asctime)s %(levelname)-8s [FAMLI parse] %(message)s')
        rootLogger = logging.getLogger()
        rootLogger.setLevel(logging.INFO)

        if args.logfile:
            # Write to file
            fileHandler = logging.FileHandler(args.logfile)
            fileHandler.setFormatter(logFormatter)
            rootLogger.addHandler(fileHandler)

        # Write to the console (StreamHandler defaults to sys.stderr)
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logFormatter)
        rootLogger.addHandler(consoleHandler)

        # Pick the opener from the file extension; the context manager
        # guarantees the handle is closed even if parsing raises
        opener = gzip.open if args.input.endswith(".gz") else open
        with opener(args.input, "rt") as f:
            aligned_reads, output, alignments = parse_alignment(
                f,
                QSEQID_i=args.qseqid_ix,
                SSEQID_i=args.sseqid_ix,
                SSTART_i=args.sstart_ix,
                SEND_i=args.send_ix,
                BITSCORE_i=args.bitscore_ix,
                SLEN_i=args.slen_ix,
                SD_MEAN_CUTOFF=args.sd_mean_cutoff,
                STRIM_5=args.strim_5,
                STRIM_3=args.strim_3,
                threads=args.threads,
                batchsize=args.batchsize,
            )

        # Write the summary as JSON, if requested
        if args.output:
            with open(args.output, "wt") as fo:
                json.dump(output, fo, indent=4)

        # Write out the alignment file, if requested
        if args.output_aln is not None:
            filter_alignment(args.input,
                             args.output_aln,
                             alignments,
                             QSEQID_i=args.qseqid_ix,
                             SSEQID_i=args.sseqid_ix)

        elapsed = round(time.time() - start_time, 2)
        logging.info("Time elapsed: {:,}".format(elapsed))
Exemplo n.º 3
0
    def align(self):
        """Align a set of reads with DIAMOND and run FAMLI.

        Command-line options are parsed from ``sys.argv[2:]``. A uniquely
        named working folder is created inside ``--temp-folder``; the
        reference database and input reads are fetched into it, the reads are
        aligned and parsed with ``parse_alignment``, and the results (together
        with the run log and parameters) are handed to ``return_results``.
        The working folder is removed at the end.

        When ``--delete-all-files-in-temp-folder`` is set, every pre-existing
        entry in ``--temp-folder`` is deleted first; the working folder
        created for this run is kept.
        """
        parser = argparse.ArgumentParser(
            description=
            """Align a set of reads with DIAMOND, filter alignments with FAMLI,
                and return the results""")

        parser.add_argument(
            "--input",
            type=str,
            required=True,
            help="""Location for input file(s). Combine multiple files with +.
                                    (Supported: sra://, s3://, or ftp://).""")
        parser.add_argument("--sample-name",
                            type=str,
                            required=True,
                            help="""Name of sample, sets output filename.""")
        parser.add_argument("--ref-db",
                            type=str,
                            required=True,
                            help="""Folder containing reference database.
                                    (Supported: s3://, ftp://, or local path).
                                    """)
        parser.add_argument("--output-folder",
                            type=str,
                            required=True,
                            help="""Folder to place results.
                                    (Supported: s3://, or local path).""")
        parser.add_argument("--min-score",
                            type=float,
                            default=20,
                            help="Minimum alignment score to report.")
        parser.add_argument("--blocks",
                            type=int,
                            default=5,
                            help="""Number of blocks used when aligning.
                                  Value relates to the amount of memory used.
                                  Roughly 6Gb RAM used by DIAMOND per block.
                                  """)
        parser.add_argument("--query-gencode",
                            type=int,
                            default=11,
                            help="Genetic code used to translate nucleotides.")
        parser.add_argument("--threads",
                            type=int,
                            default=16,
                            help="Number of threads to use aligning.")
        parser.add_argument("--min-qual",
                            type=int,
                            default=None,
                            help="Trim reads to a minimum Q score.")
        parser.add_argument("--temp-folder",
                            type=str,
                            default='/share',
                            help="Folder used for temporary files.")
        parser.add_argument("--batchsize",
                            type=int,
                            help="""Number of reads to process at a time.""")
        parser.add_argument(
            "--delete-all-files-in-temp-folder",
            action="store_true",
            help=
            """If flag is set, DELETE ALL OF THE FILES IN THE TEMP FOLDER before starting."""
        )

        # sys.argv[0] is the program and sys.argv[1] the sub-command name
        args = parser.parse_args(sys.argv[2:])

        # Make sure that there are no commas or whitespaces in the input
        input_str = args.input
        assert ' ' not in input_str, input_str
        assert ',' not in input_str, input_str

        # Make a temporary folder for all files to be placed in
        temp_folder = os.path.join(args.temp_folder, str(uuid.uuid4())[:8])
        assert os.path.exists(temp_folder) is False
        os.mkdir(temp_folder)

        # Set up logging
        log_fp = os.path.join(temp_folder, "log.txt")
        logFormatter = logging.Formatter(
            '%(asctime)s %(levelname)-8s [FAMLI] %(message)s')
        rootLogger = logging.getLogger()
        rootLogger.setLevel(logging.INFO)

        # Write to file
        fileHandler = logging.FileHandler(log_fp)
        fileHandler.setFormatter(logFormatter)
        rootLogger.addHandler(fileHandler)
        # Also write to the console (StreamHandler defaults to sys.stderr)
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logFormatter)
        rootLogger.addHandler(consoleHandler)

        # Delete the files in scratch, if specified
        if args.delete_all_files_in_temp_folder:
            logging.info("Deleting all files in temp folder " +
                         args.temp_folder)
            for fp in os.listdir(args.temp_folder):
                fp = os.path.join(args.temp_folder, fp)
                # The working folder for this run (and the log file inside
                # it) was created above and must survive the purge
                if fp == temp_folder:
                    continue
                logging.info("Deleting " + fp)
                # rmtree only handles real directories; plain files and
                # symlinks have to be unlinked instead
                if os.path.isdir(fp) and not os.path.islink(fp):
                    shutil.rmtree(fp)
                else:
                    os.remove(fp)
            logging.info("Done deleting files in temporary folder")

        # Check to see if DIAMOND is available
        logging.info("Checking for a working copy of DIAMOND")
        run_cmds(["diamond", "--version"])

        # NOTE(review): the bare `except:` clauses below are kept as-is;
        # presumably exit_and_clean_up() removes the folder and terminates
        # the process (later code relies on the guarded names existing) --
        # confirm before narrowing them.

        # Get the reference database
        try:
            db_fp = get_reference_database(args.ref_db, temp_folder)
        except:
            exit_and_clean_up(temp_folder)

        # Set up the NCBI fastq-dump cache folder within the temp folder
        set_up_sra_cache_folder(temp_folder)

        logging.info("Reference database: " + db_fp)

        # Align the input data and calculate the overall abundance

        # Keep track of the time elapsed to process this sample
        start_time = time.time()

        logging.info("Processing input argument: " + input_str)

        # Multiple input reads may be separated with a '+'
        input_str = input_str.split("+")
        # Make sure that they are all unique arguments
        assert len(input_str) == len(set(input_str)), "Duplicate arguments"
        # Make sure that the filenames are also all unique
        assert len(input_str) == len(set([s.split('/')[-1] for s in input_str
                                          ])), "Duplicate filenames"

        # Capture each command in a try statement
        # Get the input reads
        read_fps = []
        for s in input_str:
            logging.info("Fetching {}".format(s))
            try:
                read_fps.append(
                    get_reads_from_url(s, temp_folder, min_qual=args.min_qual))
            except:
                exit_and_clean_up(temp_folder)

        # Combine the files into a single FASTQ
        read_fp = os.path.join(temp_folder, "input.fastq")
        combine_fastqs(read_fps, read_fp)

        # Run the alignment
        try:
            align_fp = align_reads(
                read_fp,  # FASTQ file path
                db_fp,  # Local path to DB
                temp_folder,  # Folder for results
                query_gencode=args.query_gencode,
                threads=args.threads,
                min_score=args.min_score,
                blocks=args.blocks,
            )
        except:
            exit_and_clean_up(temp_folder)

        # Process the alignments, reassigning multi-mapped reads
        try:
            with open(align_fp, "rt") as align_handle:
                aligned_reads, abund, alignments = parse_alignment(
                    align_handle,
                    batchsize=args.batchsize,
                )
        except:
            exit_and_clean_up(temp_folder)

        # Calculate the number of deduplicated reads
        deduplicated_reads = sum([d["nreads"] for d in abund])

        # Name the output file based on the input file
        # Ultimately adding ".json.gz" to the input file name
        if args.sample_name is not None:
            output_prefix = args.sample_name
        else:
            output_prefix = input_str[0].split("/")[-1]
        logging.info(
            "Using sample name {} for output prefix".format(output_prefix))

        # Count the total number of reads
        logging.info("Counting the total number of reads")
        n_reads = count_fastq_reads(read_fp)
        logging.info("Reads in input file: {:,}".format(n_reads))

        # Read in the logs (context manager so the handle is not leaked)
        logging.info("Reading in the logs")
        with open(log_fp, 'rt') as log_handle:
            logs = log_handle.readlines()

        # Wrap up all of the results into a single JSON
        # and write it to the output folder
        output = {
            "input_path": "+".join(input_str),
            "input": output_prefix,
            "sample": args.sample_name,
            "output_folder": args.output_folder,
            "logs": logs,
            "ref_db": db_fp,
            "ref_db_url": args.ref_db,
            "results": abund,
            "total_reads": n_reads,
            "aligned_reads": aligned_reads,
            "deduplicated_reads": deduplicated_reads,
            "time_elapsed": time.time() - start_time,
            "params": {
                "batchsize": args.batchsize,
                "min_score": args.min_score,
                "blocks": args.blocks,
                "query_gencode": args.query_gencode,
                "threads": args.threads,
                "min_qual": args.min_qual
            }
        }
        return_results(output, output_prefix, args.output_folder, temp_folder)

        # Delete any files that were created for this sample
        logging.info("Removing temporary folder: " + temp_folder)
        shutil.rmtree(temp_folder)

        # Stop logging
        logging.info("Done")
        logging.shutdown()
Exemplo n.º 4
0
#!/usr/bin/python

from famli.famli_helpers import parse_alignment

# Parse the example DIAMOND alignment shipped with the test suite
with open("/usr/famli/tests/example.diamond.aln", "rt") as handle:
    n_reads, output, alignments = parse_alignment(handle)

# Check the type of each of the three returned values
assert isinstance(n_reads, int)
assert isinstance(output, list)
assert isinstance(alignments, list)

# Sum the per-reference read counts reported in the output records
n_dedup = sum(record["nreads"] for record in output)

# Both totals are reported in the assertion message on failure
assert n_reads == 360, n_reads
assert n_dedup == 360, n_dedup

# Three references survived filtering
assert len(output) == 3

print("PASSED TESTS")