def preprocess_reads(args):
    """Preprocess fastq files by removing UMIs from reads and appending
    them to the read names.

    Reads paths from ``args`` first, falling back to the [PATHS] section of
    the config file named by ``args.config`` when present. Exits via
    handle_arg's error message when a required path is missing.
    """
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
        # Check whether every path listed in the config's PATHS section exists.
        config_validation(conf_paths=dict(config.items('PATHS')))
    else:
        config = None

    prepfile = handle_arg(args.prepfile,
                          config['PATHS']['prep_file'] if config else None,
                          'ERR: No prepfile provided in args or config.')
    output_path = handle_arg(
        args.output_path,
        config['PATHS']['output_path'] if config else None,
        'ERR: No output path provided in args or config.')
    # Check whether directories/files named on the command line exist.
    arg_exists(sys.argv)

    reheader_fastqs(r1_file=args.read1,
                    r2_file=args.read2,
                    r3_file=args.read3,
                    output_path=output_path,
                    prepname=args.prepname,
                    prepfile=prepfile)
def collapse(args):
    """Base collapses from given BAM and umi family file.

    Parses a region string of the form ``chr1:1200000-1250000``, resolves the
    BAM/output/umi-file paths from ``args`` or the config, optionally loads a
    pickled UMI family table, and writes consensus output for the region.

    Raises:
        ValueError: if the region string is malformed.
    """
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
        # Check whether every path listed in the config's PATHS section exists.
        config_validation(conf_paths=dict(config.items('PATHS')))
    else:
        config = None

    region = args.region
    if any(item not in region for item in ["chr", ":", "-"]):
        # raise aborts on its own; the old `sys.exit(1)` after it was unreachable.
        raise ValueError(
            'ERR: Incorrect region string (should look like chr1:1200000-1250000).')
    contig = region.split(":")[0]
    region_start = int(region.split(":")[1].split("-")[0])
    region_end = int(region.split(":")[1].split("-")[1])

    bam_file = handle_arg(args.bam_file,
                          config['PATHS']['bam_file'] if config else None,
                          'ERR: No BAM file provided in args or config.')
    output_path = handle_arg(
        args.output_path,
        config['PATHS']['output_path'] if config else None,
        'ERR: No output path provided in args or config.')
    # Check whether directories/files named on the command line exist.
    arg_exists(sys.argv)

    # Default to None so umi_file is always bound; the original raised
    # NameError when neither args.umi_file nor a config was supplied.
    umi_file = None
    if args.umi_file:
        umi_file = args.umi_file
    elif config:
        umi_file = (config['PATHS']['umi_file']
                    if 'umi_file' in config['PATHS'] else None)

    if umi_file:
        try:
            # NOTE: unpickling is only safe on trusted, self-produced .umis files.
            with open(umi_file, "rb") as handle:
                umi_table = pickle.load(handle)
        except IOError:
            print("ERR: Unable to load .umis file.", file=sys.stderr)
            sys.exit(1)
    else:
        umi_table = None

    print(timestamp() + "Generating consensus...")
    generate_consensus_output(contig=contig,
                              region_start=region_start,
                              region_end=region_end,
                              bam_file=bam_file,
                              umi_table=umi_table,
                              output_path=output_path,
                              config=config)
    print(timestamp() +
          "Consensus generated. Consensus file written to {}.".format(output_path))
def preprocess_reads(args):
    """Preprocesses fastq files by removing UMIs from reads and appending
    them to the read names.
    """
    config = None
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)

    def _config_path(key):
        # Fall back to the config's [PATHS] section only when a config exists.
        return config['PATHS'][key] if config else None

    prepfile = handle_arg(args.prepfile, _config_path('prep_file'),
                          'ERR: No prepfile provided in args or config.')
    output_path = handle_arg(args.output_path, _config_path('output_path'),
                             'ERR: No output path provided in args or config.')

    reheader_fastqs(r1_file=args.read1,
                    r2_file=args.read2,
                    r3_file=args.read3,
                    output_path=output_path,
                    prepname=args.prepname,
                    prepfile=prepfile)
def group_umis(args):
    """Groups and error-corrects UMIs into families.

    Parses a ``chr:start-end`` region string, builds an error-corrected UMI
    family table for that region, and pickles it to
    ``<output_path>/<region>.umis``.

    Raises:
        ValueError: if the region string is malformed.
    """
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
    else:
        config = None

    region = args.region
    # NOTE(review): this check omits "chr", unlike the sibling commands —
    # confirm whether non-"chr" contig names are intentionally accepted here.
    if any(item not in region for item in [":", "-"]):
        # raise aborts on its own; the old `sys.exit(1)` after it was unreachable.
        raise ValueError(
            'ERR: Incorrect region string (should look like chr1:1200000-1250000).')
    contig = region.split(":")[0]
    region_start = int(region.split(":")[1].split("-")[0])
    region_end = int(region.split(":")[1].split("-")[1])

    bam_file = handle_arg(args.bam_file,
                          config['PATHS']['bam_file'] if config else None,
                          'ERR: No BAM file provided in args or config.')
    output_path = handle_arg(
        args.output_path,
        config['PATHS']['output_path'] if config else None,
        'ERR: No output path provided in args or config.')

    print(timestamp() + "Grouping UMIs...")
    # Generate an error-corrected list of UMI families.
    umi_families = get_umi_families(contig=contig,
                                    region_start=region_start,
                                    region_end=region_end,
                                    bam_file=bam_file,
                                    config=config)

    umi_file = "{}/{}.umis".format(output_path, region)
    # Use a context manager so the pickle handle is always closed.
    with open(umi_file, "wb") as handle:
        pickle.dump(umi_families, handle)
    print(timestamp() +
          "UMI grouping complete. Output written to {}.".format(output_path))
def call_variants(args):
    """Generates VCF files from given cons file.

    Handles both merged cons files (region encoded as ``start_end``) and
    plain region strings (``chr1:1200000-1250000``).

    Raises:
        ValueError: if an unmerged region string is malformed.
    """
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
        # Check whether every path listed in the config's PATHS section exists.
        config_validation(conf_paths=dict(config.items('PATHS')))
    else:
        config = None

    cons_file = args.cons_file
    # NOTE(review): f_sizes is parsed but never passed to get_vcf_output —
    # confirm whether it should be forwarded (the sibling implementation does).
    f_sizes = args.f_sizes.split(',')
    output_path = handle_arg(
        args.output_path,
        config['PATHS']['output_path'] if config else None,
        'No output path provided in args or config.')
    region = args.region
    # Check whether directories/files named on the command line exist.
    arg_exists(sys.argv)

    cons_is_merged = check_consfile(cons_file)
    if cons_is_merged:
        # Merged cons files encode the region as "start_end"; these remain
        # strings while the other branch yields ints — presumably
        # get_vcf_output accepts both; TODO confirm.
        region_start = region.split("_")[0]
        region_end = region.split("_")[1]
    else:
        if any(x not in region for x in ["chr", ":", "-"]):
            # raise aborts on its own; the old `sys.exit(1)` after it was unreachable.
            raise ValueError(
                'Incorrect region string (should look like chr1:1200000-1250000).')
        contig = region.split(":")[0]
        region_start = int(region.split(":")[1].split("-")[0])
        region_end = int(region.split(":")[1].split("-")[1])

    print(timestamp() + "Generating VCFs...")
    # NOTE(review): run_id is not defined in this function — it must come from
    # module scope; verify it is assigned before call_variants runs.
    get_vcf_output(cons_file=cons_file,
                   region_start=region_start,
                   region_end=region_end,
                   output_path=output_path,
                   config=config,
                   run_id=run_id)
    print(timestamp() +
          "VCFs generated. VCF files written to {}.".format(output_path))
def call_variants(args):
    """Generates VCF files from given cons file.

    Parses a ``chr:start-end`` region string and writes VCF output for the
    requested family sizes.

    Raises:
        ValueError: if the region string is malformed.
    """
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
    else:
        config = None

    cons_file = args.cons_file
    f_sizes = args.f_sizes.split(',')

    region = args.region
    if any(x not in region for x in [":", "-"]):
        # raise aborts on its own; the old `sys.exit(1)` after it was unreachable.
        raise ValueError(
            'Incorrect region string (should look like chr1:1200000-1250000).')
    contig = region.split(":")[0]
    region_start = int(region.split(":")[1].split("-")[0])
    region_end = int(region.split(":")[1].split("-")[1])

    output_path = handle_arg(
        args.output_path,
        config['PATHS']['output_path'] if config else None,
        'No output path provided in args or config.')

    print(timestamp() + "Generating VCFs...")
    generate_vcf_output(cons_file=cons_file,
                        f_sizes=f_sizes,
                        contig=contig,
                        region_start=region_start,
                        region_end=region_end,
                        output_path=output_path,
                        config=config)
    print(timestamp() +
          "VCFs generated. VCF files written to {}.".format(output_path))
parser.add_argument('-o', '--output_path', help='Path to write output files to.')
parser.add_argument('-c', '--config', help='Path to your config file.')
parser.add_argument('-t', '--tally', help='Path to your tally (output of UMI_count.py).')
args = parser.parse_args()

if args.config:
    config = configparser.ConfigParser()
    # Bug fix: the original read `config_file`, an undefined name; the guard
    # above tests args.config, so that is the path to read.
    config.read(args.config)
else:
    config = None

region = args.region
if any(x not in region for x in ["chr", ":", "-"]):
    # raise aborts on its own; the old `sys.exit(1)` after it was unreachable.
    raise ValueError('Incorrect region string (should look like chr1:1200000-1250000).')
contig = region.split(":")[0]
region_start = int(region.split(":")[1].split("-")[0])
region_end = int(region.split(":")[1].split("-")[1])

bam_file = handle_arg(args.bam_file,
                      config['PATHS']['bam_file'] if config else None,
                      'No BAM file provided in args or config.')
output_path = handle_arg(args.output_path,
                         config['PATHS']['output_path'] if config else None,
                         'No output path provided in args or config.')
# NOTE(review): the tally fallback is only offered when a config exists, even
# though it is built from output_path/region — confirm this gating is intended.
tally_file = handle_arg(args.tally,
                        output_path + '/' + region + '.tally' if config else None,
                        'No tally file provided.')

## Output
generate_consensus_output(contig, region_start, region_end, bam_file,
                          tally_file, output_path, config)
def group_umis(args):
    """Groups and error-corrects UMIs into families.

    Builds an error-corrected UMI family table for a ``chr:start-end``
    region, writes per-region UMI statistics to
    ``<output_path>/datafile_<region>.csv``, and pickles the family table
    to ``<output_path>/<region>.umis``.

    Raises:
        ValueError: if the region string is malformed.
    """
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
        # Check whether every path listed in the config's PATHS section exists.
        config_validation(conf_paths=dict(config.items('PATHS')))
    else:
        config = None

    region = args.region
    if any(item not in region for item in ["chr", ":", "-"]):
        # raise aborts on its own; the old `sys.exit(1)` after it was unreachable.
        raise ValueError(
            'ERR: Incorrect region string (should look like chr1:1200000-1250000).')
    contig = region.split(":")[0]
    region_start = int(region.split(":")[1].split("-")[0])
    region_end = int(region.split(":")[1].split("-")[1])

    bam_file = handle_arg(args.bam_file,
                          config['PATHS']['bam_file'] if config else None,
                          'ERR: No BAM file provided in args or config.')
    output_path = handle_arg(
        args.output_path,
        config['PATHS']['output_path'] if config else None,
        'ERR: No output path provided in args or config.')
    # Check whether directories/files named on the command line exist.
    arg_exists(sys.argv)

    print(timestamp() + "Grouping UMIs...")
    ## Generate an error-corrected list of UMI families
    umi_families, umi_groups = get_umi_families(contig=contig,
                                                region_start=region_start,
                                                region_end=region_end,
                                                bam_file=bam_file,
                                                config=config)

    (total_parent_umi_count, total_child_umi_count,
     num_of_children, freq_of_parent_umis) = umi_datafile(umi_groups)

    filename = "{}/datafile_{}.csv".format(output_path, region)
    headers = ['CHR', 'START', 'END', 'PTU', 'CTU', 'CHILD_NUMS',
               'FREQ_PARENTS']
    csv.register_dialect('myDialect', delimiter='\t', quoting=csv.QUOTE_NONE)
    csvrow = {
        'CHR': contig,
        'START': str(region_start),
        'END': str(region_end),
        'PTU': str(total_parent_umi_count),
        'CTU': str(total_child_umi_count),
        'CHILD_NUMS': num_of_children,
        'FREQ_PARENTS': freq_of_parent_umis
    }
    # Context manager guarantees the CSV handle is flushed and closed; the
    # original left the file open. (An unused parallel `info` list was removed.)
    with open(filename, "w") as datafile:
        writer = csv.DictWriter(datafile, dialect='myDialect',
                                fieldnames=headers)
        writer.writeheader()
        writer.writerow(csvrow)

    umi_file = "{}/{}.umis".format(output_path, region)
    # Use a context manager so the pickle handle is always closed.
    with open(umi_file, "wb") as handle:
        pickle.dump(umi_families, handle)
    print(timestamp() +
          "UMI grouping complete. Output written to {}.".format(output_path))