def test_iteration(self):
    p1 = Pipe(name='p1')
    p1.pipe_cache = ['ok']
    p2 = Pipe(name='p2')
    p3 = Pipe(name='p3')
    pline = Pipeline(pipes=[p1, p2, p3])
    self.assertEqual(pline.next(), 'ok')
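# test_iteration assumes Pipeline.next() yields the first cached item
# available across its pipes, in order. A minimal sketch consistent with the
# test (an assumption, not the project's actual class):
class Pipeline:
    def __init__(self, pipes):
        self.pipes = pipes

    def next(self):
        # Drain the first non-empty pipe; None once everything is exhausted.
        for pipe in self.pipes:
            if not pipe.is_empty():
                return pipe.output()
        return None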
class Executor:
    def __init__(self, cf):
        unet = Modified3DUNet(1, 1)
        resnet = resnet34()
        # device = torch.device('cpu')
        unet.load_state_dict(torch.load(cf.pathToSegmentator))
        resnet.load_state_dict(torch.load(cf.pathToClassifier))
        unet.eval()
        resnet.eval()
        self.segmentator = Estimator(
            unet,
            save_folder='./experiments/unet_full_pipe_eval/',
            cuda_device=0,
            optimizer=Adam,
            loss_fn=dice_loss)
        self.classify = Estimator(
            resnet,
            save_folder='./experiments/res_full_pipe_eval/',
            cuda_device=1,
            optimizer=Adam,
            loss_fn=torch.nn.CrossEntropyLoss())
        self.pipe = Pipe(cf, self.classify, self.segmentator)
        try:
            shutil.rmtree(cf.save_path)
        except Exception as e:
            print(str(e))
        try:
            os.makedirs(cf.save_path)
            print("Directory created")
        except Exception as e:
            print(str(e))

    def unpack(self, pathToArchive, pathToConverted, numWorkers=3):
        prep = Preprocess(pathToArchive, pathToConverted, numWorkers)
        prep.start()
        self.pipe.add_dataset(pathToConverted)

    def start(self):
        return self.pipe.start_inference()
class PipeTestCase(TestCase):
    def setUp(self):
        self.p1 = Pipe()

    def test_empty(self):
        self.assertEqual(self.p1.is_empty(), True)
        self.p1.pipe_cache = [None]
        self.assertEqual(self.p1.is_empty(), True)

    def test_output(self):
        self.p1.pipe_cache = [1, 2, 3]
        self.assertEqual(self.p1.output(), 1)
        self.assertEqual(self.p1.output(), 2)
        self.assertEqual(self.p1.output(), 3)
        self.assertEqual(self.p1.output(), None)

    def test_execute(self):
        self.assertEqual(self.p1.execute(1), 1)
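# PipeTestCase pins down the base Pipe contract: is_empty() treats a cache
# holding only Nones as empty, output() pops items first-in first-out and
# returns None once exhausted, and execute() is an identity pass-through.
# A minimal sketch satisfying the tests (an assumption, not the project's
# actual class):
class Pipe:
    def __init__(self, name=None):
        self.name = name
        self.pipe_cache = []

    def is_empty(self):
        # A cache containing only None still counts as empty.
        return not any(item is not None for item in self.pipe_cache)

    def output(self):
        return self.pipe_cache.pop(0) if self.pipe_cache else None

    def execute(self, item):
        return item  # identity in the base class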
def build_ferpa():
    pipeline = Pipeline()
    pipeline.add_pipe('name', Pipe(Lookup(read_names()), Drop()))
    pipeline.add_pipe('zip', Pipe(ZipCodeClassifier, ZipCodeFilter()))
    pipeline.add_pipe('address', Pipe(AddressClassifier(), AddressFilter()))
    pipeline.add_pipe('date', Pipe(DateClassifier(), DateFilter()))
    pipeline.add_pipe('phone_number', Pipe(PhoneNumberClassifier, Drop()))
    pipeline.add_pipe('email', Pipe(EmailClassifier, Drop()))
    pipeline.add_pipe('ssn', Pipe(SSNClassifier, Drop()))
    return pipeline
from asynmsg.msgengine import MessageEngine
from callback import PipeCallback
from constants import TRADE_ORDER_HANDLER_FOR_CREATE_ORDER
from handler import TradeOrderHandlerForCreateOrder
from pipeline import Pipe

if __name__ == '__main__':
    pipe_callback = PipeCallback()
    MessageEngine.instance().register(
        TRADE_ORDER_HANDLER_FOR_CREATE_ORDER,
        TradeOrderHandlerForCreateOrder(pipe_callback))
    MessageEngine.instance().send(TRADE_ORDER_HANDLER_FOR_CREATE_ORDER, amount=101)
    result = pipe_callback.get_result(10)
    print(result)

    # Execute serially:
    # 1. Create the order
    # 2. Prepay
    pipe = Pipe()
    pipe.add_jobs([''])
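# The script above blocks on pipe_callback.get_result(10), which suggests a
# callback that waits up to ten seconds for the handler to deliver a result.
# A hypothetical sketch of that contract; set_result is an assumed name, not
# confirmed by these snippets.
import threading

class PipeCallback:
    def __init__(self):
        self._event = threading.Event()
        self._result = None

    def set_result(self, result):  # assumed to be invoked by the handler
        self._result = result
        self._event.set()

    def get_result(self, timeout):
        # Block until a result arrives or the timeout (seconds) expires.
        self._event.wait(timeout)
        return self._result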
def call_variants():
    """Cell free pipeline2 variant calling."""
    parser = argparse.ArgumentParser()
    parser.add_argument('input_bam', help="Path of the input bam file.")
    parser.add_argument("-r", "--reference",
                        help="Path to reference genome or containing directory.",
                        required=True)
    parser.add_argument("-C", "--callers",
                        help="Variant callers to use. Valid values are varscan, "
                             "vardict and mutect2. Defaults to 'varscan,vardict'.",
                        default="varscan,vardict")
    parser.add_argument("-n", "--name",
                        help="Sample name used to name output files. "
                             "Will be guessed from input bam if not provided.",
                        default="")
    parser.add_argument("-p", "--panel",
                        help="Path to covermi panel which must contain targets "
                             "bedfile. Required for annotation.",
                        default="")
    parser.add_argument("-v", "--vep",
                        help="Path to vep cache. Required for annotation.",
                        default="")
    parser.add_argument("-f", "--min-vaf",
                        help="Minimum variant allele frequency for a variant to be called.",
                        type=float, default=0)
    parser.add_argument("-a", "--min-alt-reads",
                        help="Minimum number of alt reads for a variant to be called.",
                        type=int, default=2)
    parser.add_argument("-o", "--output", help="Path to write output files to.",
                        default=".")
    parser.add_argument("-t", "--threads",
                        help="Number of threads to use, defaults to all "
                             "available threads if not specified.",
                        type=int, default=None)
    args = parser.parse_args()

    threads = args.threads or run(["getconf", "_NPROCESSORS_ONLN"]).stdout.strip()

    if not args.name:
        fn = os.path.basename(args.input_bam)
        args.name = fn[:-4] if fn.endswith(".bam") else fn

    args.callers = args.callers.lower().replace(",", " ").split()
    for caller in args.callers:
        if caller not in ("varscan", "vardict", "mutect2"):
            sys.exit(f"{caller} is not a recognised variant caller")

    args.reference = os.path.abspath(args.reference)
    args.input_bam = os.path.abspath(args.input_bam)
    if args.panel:
        args.panel = os.path.abspath(args.panel)
    if args.vep:
        args.vep = os.path.abspath(args.vep)
    os.chdir(args.output)

    args.reference = (glob.glob(f"{args.reference}/*.fna") +
                      glob.glob(f"{args.reference}/*.fa") +
                      glob.glob(f"{args.reference}/*.fasta") +
                      [args.reference])[0]

    pipe = Pipe()

    targets_bedfile = glob.glob(f"{args.panel}/*.bed") if args.panel else []
    targets_bedfile = targets_bedfile[0] if len(targets_bedfile) == 1 else ""
    if "vardict" in args.callers and not targets_bedfile:
        sys.exit("No targets bedfile found (required by vardict)")
    if "mutect2" in args.callers and not os.path.exists(f"{args.input_bam}.bai"):
        sys.exit(f"No index found for {args.input_bam} (required by mutect2)")

    ###########################################################################
    ### VARSCAN ###
    ###########################################################################
    if "varscan" in args.callers:
        mpileup = f"{args.name}.mpileup"
        pipe(["samtools", "mpileup",
              "-o", mpileup,
              "-f", args.reference,
              "-A",  # count anomalous read pairs
              "-B",  # disable BAQ computation
              "-q", "10",  # minimum mapping quality
              "-d", "10000000",  # maximum depth
              args.input_bam])

        pvalue_vcf = f"{args.name}.pvalue.vcf"
        with open(pvalue_vcf, "wb") as f_out:
            pipe(["varscan", "mpileup2cns", mpileup,
                  "--variants",
                  "--output-vcf", "1",
                  "--min-coverage", "1",
                  "--min-var-freq", args.min_vaf,
                  "--min-avg-qual", "20",
                  "--min-reads2", args.min_alt_reads,
                  "--p-value", "0.05",
                  "--strand-filter", "1"], stdout=f_out)
        os.unlink(mpileup)

        vcf = (f"{args.name}.varscan.unfiltered.vcf" if targets_bedfile
               else f"{args.name}.varscan.vcf")
        pipe(["postprocess_varscan_vcf", pvalue_vcf, "--output", vcf])
        os.unlink(pvalue_vcf)

        if targets_bedfile:
            unfiltered_vcf = vcf
            vcf = f"{args.name}.varscan.vcf"
            pipe(["filter_vcf", unfiltered_vcf, "--output", vcf,
                  "--bed", targets_bedfile])
            os.unlink(unfiltered_vcf)

        if args.vep and args.panel:
            pipe(["annotate_panel", "--vep", args.vep,
                  "--output", f"{args.name}.varscan.annotation.tsv",
                  "--reference", args.reference,
                  "--threads", threads,
                  "--panel", args.panel,
                  vcf])

    ###########################################################################
    ### VARDICT ###
    ###########################################################################
    if "vardict" in args.callers:
        vardict_table = f"{args.name}.vardict.tsv"
        with open(vardict_table, "wb") as f_out:
            pipe(["vardictjava",
                  "-K",  # include Ns in depth calculation
                  "-deldupvar",  # variants are only called if start position is inside the region of interest
                  "-G", args.reference,
                  "-N", args.name,
                  "-b", args.input_bam,
                  "-Q", "10",
                  "-f", args.min_vaf,
                  "-r", args.min_alt_reads,
                  "-th", threads,
                  "-u",  # count mate pair overlap only once
                  "-fisher",  # perform work of teststrandbias.R
                  targets_bedfile], stdout=f_out)

        unfiltered_vcf = f"{args.name}.vardict.unfiltered.vcf"
        with open(vardict_table, "rb") as f_in:
            with open(unfiltered_vcf, "wb") as f_out:
                pipe(["var2vcf_valid.pl",
                      "-A",  # output all variants at the same position
                      "-f", args.min_vaf,
                      "-N", args.name], stdin=f_in, stdout=f_out)
        os.unlink(vardict_table)

        vcf = f"{args.name}.vardict.vcf"
        # Although vardict takes the targets bedfile as an argument, it does
        # call occasional variants just outside it.
        pipe(["filter_vcf", unfiltered_vcf, "--output", vcf,
              "--bed", targets_bedfile])
        os.unlink(unfiltered_vcf)

        if args.vep and args.panel:
            pipe(["annotate_panel", "--vep", args.vep,
                  "--output", f"{args.name}.vardict.annotation.tsv",
                  "--reference", args.reference,
                  "--threads", threads,
                  "--panel", args.panel,
                  vcf])

    ###########################################################################
    ### MUTECT2 ###
    ###########################################################################
    if "mutect2" in args.callers:
        unmutectfiltered_vcf = f"{args.name}.unmutectfiltered.mutect2.vcf"
        pipe(["gatk", "Mutect2",
              "-R", args.reference,
              "-I", args.input_bam,
              "-O", unmutectfiltered_vcf,
              "--create-output-variant-index", "false",
              "--max-reads-per-alignment-start", "0",
              "--disable-read-filter", "NotDuplicateReadFilter",
              "--disable-read-filter", "GoodCigarReadFilter"])

        multiallelic_vcf = f"{args.name}.multiallelic.mutect2.vcf"
        pipe(["gatk", "FilterMutectCalls",
              "-R", args.reference,
              "-V", unmutectfiltered_vcf,
              "-O", multiallelic_vcf,
              "--filtering-stats", "false",
              "--create-output-variant-index", "false"])
        os.unlink(unmutectfiltered_vcf)
        os.unlink(f"{unmutectfiltered_vcf}.stats")

        vcf = (f"{args.name}.mutect2.unfiltered.vcf" if targets_bedfile
               else f"{args.name}.mutect2.vcf")
        pipe(["postprocess_mutect2_vcf", "--output", vcf,
              "--min-alt-reads", args.min_alt_reads,
              "--min-vaf", args.min_vaf,
              multiallelic_vcf])
        os.unlink(multiallelic_vcf)

        if targets_bedfile:
            unfiltered_vcf = vcf
            vcf = f"{args.name}.mutect2.vcf"
            pipe(["filter_vcf", unfiltered_vcf, "--output", vcf,
                  "--bed", targets_bedfile])
            os.unlink(unfiltered_vcf)

        if args.vep and args.panel:
            pipe(["annotate_panel", "--vep", args.vep,
                  "--output", f"{args.name}.mutect2.annotation.tsv",
                  "--reference", args.reference,
                  "--threads", threads,
                  "--panel", args.panel,
                  vcf])

    print(pipe.durations, file=sys.stderr, flush=True)
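# call_variants and the pipelines below treat pipe as a callable that runs an
# external command and accumulates per-tool timings in pipe.durations. A
# minimal sketch of that wrapper, assuming it stringifies arguments (the
# command lists above mix in ints and floats) and raises on failure; the real
# implementation may differ.
import subprocess
import sys
import time

class Pipe:
    def __init__(self):
        self.durations = {}

    def __call__(self, command, **kwargs):
        command = [str(token) for token in command]  # allow ints/floats in command lists
        print(" ".join(command), file=sys.stderr, flush=True)
        start = time.time()
        completed = subprocess.run(command, check=True, **kwargs)
        self.durations[command[0]] = (self.durations.get(command[0], 0)
                                      + time.time() - start)
        return completed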
def build_hipaa():
    pipeline = Pipeline()
    pipeline.add_pipe('name', Pipe(Lookup(read_names()), Drop()))
    pipeline.add_pipe('zip', Pipe(ZipCodeClassifier, ZipCodeFilter()))
    pipeline.add_pipe('address', Pipe(AddressClassifier(), AddressFilter()))
    pipeline.add_pipe('date', Pipe(DateClassifier(), DateFilter()))
    pipeline.add_pipe('phone_number', Pipe(PhoneNumberClassifier, Drop()))
    pipeline.add_pipe('email', Pipe(EmailClassifier, Drop()))
    pipeline.add_pipe('url', Pipe(URLClassifier, Drop()))
    pipeline.add_pipe('ssn', Pipe(SSNClassifier, Drop()))
    pipeline.add_pipe('ip_address', Pipe(IPAddressClassifier, Drop()))
    pipeline.add_pipe('mac_address', Pipe(MACAddressClassifier, Drop()))
    pipeline.add_pipe('face', Pipe(FaceClassifier(), Drop()))
    # TODO: This is far too sensitive
    # pipeline.add_pipe('number', Pipe(NumberClassifier, Drop()))
    return pipeline
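# Hedged usage sketch of the de-identification pipelines above; `run` is a
# hypothetical method name for applying the pipeline to text, not confirmed
# by these snippets.
pipeline = build_hipaa()
clean = pipeline.run("Contact Jane Doe at jane@example.com or 555-0100.")
print(clean)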
import pandas as pd

from build_svd_features import *
from pipeline import Pipe
from model import xgboost_model, lightgbm_model
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb

if __name__ == '__main__':
    train_path = 'data/train_test.csv'
    val_path = 'data/train_ensemble.csv'
    test_path = 'data/test_clean.csv'

    firsttime = True
    if firsttime:
        training_pipe = Pipe(train_path, val_path)
        train = training_pipe.make('pickle/train')
        train.to_csv('train_feature.csv', index=False)
        testing_pipe = Pipe(val_path, test_path)
        test = testing_pipe.make('pickle/test')
        test.to_csv('test_features.csv', index=False)
    else:
        train = pd.read_csv('train_feature.csv')
        test = pd.read_csv('test_features.csv')

    featlist = train.columns.tolist()
    featlist.remove('is_listened')
    X = train[featlist].to_numpy()
    y = train['is_listened'].to_numpy()
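    # A possible continuation feeding X and y into the imported xgboost; the
    # split and parameters are illustrative assumptions, not the project's
    # tuned setup.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    params = {"objective": "binary:logistic", "eval_metric": "auc"}  # assumed
    booster = xgb.train(params, dtrain, num_boost_round=200,
                        evals=[(dvalid, "valid")], early_stopping_rounds=20)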
def multiplexing():
    """Cell free pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('input_sam',
                        help="Path of the input sorted undeduplicated sam file.")
    parser.add_argument("-n", "--name",
                        help="Sample name used to name output files. "
                             "Will be guessed from input sam if not provided.",
                        default="")
    parser.add_argument("-u", "--umi",
                        help="UMI type (prism, thruplex_hv or thruplex) or "
                             "empty string if no umis.",
                        default="")
    parser.add_argument("-m", "--min-family-size",
                        help="Minimum family size. Families smaller than this will be filtered.",
                        type=int, default=1)
    parser.add_argument("-l", "--interval",
                        help="Step size to increment downsampling by.",
                        type=int, required=True)
    parser.add_argument("-r", "--reference",
                        help="Path to reference genome or containing directory.",
                        required=True)
    parser.add_argument("-p", "--panel",
                        help="Path to covermi panel which must contain targets bedfile.",
                        required=True)
    parser.add_argument("-o", "--output", help="Path to write output files to.",
                        default=".")
    args = parser.parse_args()

    threads = run(["getconf", "_NPROCESSORS_ONLN"]).stdout.strip()

    if not args.name:
        args.name = args.input_sam.split("/")[-1].split(".")[0]

    args.reference = os.path.abspath(args.reference)
    args.input_sam = os.path.abspath(args.input_sam)
    args.panel = os.path.abspath(args.panel)
    os.chdir(args.output)

    args.reference = (glob.glob(f"{args.reference}/*.fna") +
                      glob.glob(f"{args.reference}/*.fa") +
                      glob.glob(f"{args.reference}/*.fasta") +
                      [args.reference])[0]
    ref_dir = os.path.dirname(args.reference)
    if glob.glob(f"{ref_dir}/*.sa"):
        bwa = "bwa"
    elif glob.glob(f"{ref_dir}/*.0123"):
        bwa = "bwa-mem2"
    else:
        sys.exit("Invalid bwa indexes")

    targets_bedfile = (glob.glob(f"{args.panel}/*.bed") + [None])[0]
    stats = f"{args.name}.stats.json"
    pipe = Pipe()

    output_file = f"{args.name}.multiplexing.tsv"
    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe(["samtools", "sort", "-n",  # sort by name
          "-o", namesorted_sam,
          "-@", threads,
          args.input_sam])

    with open(output_file, "wt") as f_out:
        writer = csv.writer(f_out)
        writer.writerow(["sample", "reads", "mean_depth", "mean_family_size",
                         "singleton_rate", "triplicate_plus_rate",
                         "quadruplicate_plus_rate"])

        # Downsample in increasing steps until fewer reads come back than
        # were requested, ie until the input sam is exhausted.
        requested_reads = 0
        returned_reads = 0
        while returned_reads == requested_reads:
            requested_reads += args.interval

            downsampled_sam = f"{args.name}.downsampled.sam"
            cp = pipe(["downsample_sam", "--output", downsampled_sam,
                       "--number", requested_reads,
                       namesorted_sam], stderr=subprocess.PIPE)
            returned_reads = int(cp.stderr.decode())

            sorted_sam = f"{args.name}.sorted.downsampled.sam"
            pipe(["samtools", "sort", "-o", sorted_sam, "-@", threads,
                  downsampled_sam])
            os.unlink(downsampled_sam)

            deduplicated_fastq = f"{args.name}.deduplicated.fastq"
            pipe(["elduderino", "--output", deduplicated_fastq,
                  "--stats", stats,
                  "--min-family-size", args.min_family_size,
                  "--umi", args.umi,
                  sorted_sam])
            os.unlink(sorted_sam)

            deduplicated_sam = f"{args.name}.deduplicated.sam"
            with open(deduplicated_sam, "wb") as f:
                pipe([bwa, "mem",
                      "-t", threads,
                      "-p",  # interleaved paired end fastq
                      "-C",  # append fastq comment to sam
                      "-Y",  # soft clip non-primary reads
                      args.reference,
                      deduplicated_fastq], stdout=f)
            os.unlink(deduplicated_fastq)

            bam = f"{args.name}.bam"
            pipe(["samtools", "sort", "-o", bam, "-@", threads,
                  deduplicated_sam])
            os.unlink(deduplicated_sam)

            pipe(["covermi_stats", "--panel", args.panel, "--stats", stats, bam])
            os.unlink(bam)

            with open(stats, "rt") as f:
                data = json.load(f)
            os.unlink(stats)

            writer.writerow([args.name,
                             returned_reads,
                             data["coverage"]["mean_depth"],
                             data["mean_family_size"],
                             data["singleton_rate"],
                             data["triplicate_plus_rate"],
                             data["quadruplicate_plus_rate"]])
            f_out.flush()

    os.unlink(namesorted_sam)
    print(pipe.durations, file=sys.stderr, flush=True)
def rossiPipeline():
    """Cell free pipeline."""
    print(f"rossiPipeline {__version__}", file=sys.stderr)

    parser = argparse.ArgumentParser()
    parser.add_argument('input_fastqs', nargs="+",
                        help="Paths of input fastq or fastq.gz files. "
                             "Order is important if paired end reads.")
    parser.add_argument("-r", "--reference",
                        help="Path to reference genome or containing directory.",
                        required=True)
    parser.add_argument("-n", "--name",
                        help="Sample name used to name output files. "
                             "Will be guessed from input fastq if not provided.",
                        default="")
    parser.add_argument("-p", "--panel",
                        help="Path to covermi panel which must contain targets bedfile.",
                        default="")
    parser.add_argument("-u", "--umi",
                        help="UMI type (prism, thruplex_hv or thruplex) or "
                             "empty string if no umis.",
                        default="")
    parser.add_argument("-v", "--vep", help="Path to vep cache.", default="")
    parser.add_argument("-f", "--min-vaf",
                        help="Minimum variant allele frequency for a variant "
                             "to be called when using VarDict.",
                        type=float, default=None)
    parser.add_argument("-a", "--min-alt-reads",
                        help="Minimum number of alt reads for a variant to be called.",
                        type=float, default=2)
    parser.add_argument("-c", "--cnv",
                        help="Whitespace separated list of target names, as "
                             "specified in targets bedfile, over which to "
                             "calculate copy number variation.",
                        default="")
    parser.add_argument("-d", "--sizes",
                        help="Whitespace separated list of reference names "
                             "over which to calculate fragment size distribution.",
                        default="")
    parser.add_argument("-b", "--translocations",
                        help="Call translocations (supplementary reads aligned "
                             "to different chromosomes).",
                        action="store_const", const=True, default=False)
    parser.add_argument("-o", "--output", help="Path to write output files to.",
                        default=".")
    parser.add_argument("-t", "--threads",
                        help="Number of threads to use, defaults to all "
                             "available threads if not specified.",
                        type=int, default=None)
    parser.add_argument("-C", "--callers",
                        help="Variant callers to use. Valid values are varscan, "
                             "vardict and mutect2. Defaults to 'varscan,vardict'.",
                        default="varscan,vardict")
    args = parser.parse_args()

    threads = args.threads or run(["getconf", "_NPROCESSORS_ONLN"]).stdout.strip()

    if not args.name:
        args.name = guess_sample_name(args.input_fastqs)
        if not args.name:
            sys.exit("Ambiguous sample name")
    if " " in args.name:
        args.name = args.name.replace(" ", "_")

    if args.min_vaf is None:
        # fastuniq deduplication gives an effective family size of 1
        args.min_vaf = 0.01

    args.reference = os.path.abspath(args.reference)
    args.input_fastqs = [os.path.abspath(path) for path in args.input_fastqs]
    if args.panel:
        args.panel = os.path.abspath(args.panel)
    if args.vep:
        args.vep = os.path.abspath(args.vep)
    os.chdir(args.output)

    args.reference = (glob.glob(f"{args.reference}/*.fna") +
                      glob.glob(f"{args.reference}/*.fa") +
                      glob.glob(f"{args.reference}/*.fasta") +
                      [args.reference])[0]
    ref_dir = os.path.dirname(args.reference)
    if glob.glob(f"{ref_dir}/*.sa"):
        bwa = "bwa"
    elif glob.glob(f"{ref_dir}/*.0123"):
        bwa = "bwa-mem2"
    else:
        sys.exit("Invalid bwa indexes")

    targets_bedfile = (glob.glob(f"{args.panel}/*.bed") + [None])[0] if args.panel else ""
    stats = f"{args.name}.stats.json"
    pipe = Pipe()

    # FastUniq requires ungzipped fastqs
    ungzipped_fastqs = []
    temp_fastqs = []
    for fastq in args.input_fastqs:
        if fastq.endswith(".gz"):
            run(["gunzip", "-k", fastq])
            fastq = fastq[:-3]
            temp_fastqs.append(fastq)
        ungzipped_fastqs.append(fastq)

    if len(ungzipped_fastqs) > 2:
        with open(f"{args.name}_R1.fastq", "wb") as f_out:
            pipe(["cat"] + ungzipped_fastqs[::2], stdout=f_out)
        with open(f"{args.name}_R2.fastq", "wb") as f_out:
            pipe(["cat"] + ungzipped_fastqs[1::2], stdout=f_out)
        ungzipped_fastqs = [f"{args.name}_R1.fastq", f"{args.name}_R2.fastq"]
        for fastq in temp_fastqs:
            os.unlink(fastq)
        temp_fastqs = list(ungzipped_fastqs)

    fastq_names = f"{args.name}.fastqs.txt"
    with open(fastq_names, "wt") as f_out:
        f_out.write("{}\n{}\n".format(*ungzipped_fastqs))

    deduplicated_fastqs = [f"{args.name}_R1.deduplicated.fastq",
                           f"{args.name}_R2.deduplicated.fastq"]
    pipe(["fastuniq", "-i", fastq_names,
          "-o", deduplicated_fastqs[0],
          "-p", deduplicated_fastqs[1]])
    os.unlink(fastq_names)
    for fastq in temp_fastqs:
        os.unlink(fastq)

    # Remove umis and do some basic fastq qc
    interleaved_fastq = f"{args.name}.interleaved.fastq"
    command = ["udini", "--output", interleaved_fastq,
               "--stats", stats,
               "--umi", args.umi]
    pipe(command + deduplicated_fastqs)
    for fastq in deduplicated_fastqs:
        os.unlink(fastq)

    base_sam = f"{args.name}.base.sam"
    with open(base_sam, "wb") as f_out:
        pipe([bwa, "mem",
              "-t", threads,
              "-p",  # interleaved paired end fastq
              "-C",  # append fastq comment to sam
              "-v", "1",  # output errors only
              args.reference,
              interleaved_fastq], stdout=f_out)
    os.unlink(interleaved_fastq)

    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe(["samtools", "sort", "-n",  # sort by name
          "-o", namesorted_sam,
          "-@", threads,
          base_sam])
    os.unlink(base_sam)

    pipe(["size", "--stats", stats,
          "--rnames", args.sizes,
          "--output", f"{args.name}.sizes.pdf",
          namesorted_sam])

    ontarget_sam = f"{args.name}.ontarget.sam"
    pipe(["ontarget", "--output", ontarget_sam,
          "--bed", targets_bedfile,
          "--stats", stats,
          "--cnv", args.cnv,
          "--threads", threads,
          namesorted_sam])
    os.unlink(namesorted_sam)

    # This is likely not necessary
    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe(["samtools", "sort", "-n",  # sort by name
          "-o", namesorted_sam,
          "-@", threads,
          ontarget_sam])
    os.unlink(ontarget_sam)

    fixed_sam = f"{args.name}.fixed.sam"
    pipe(["samtools", "fixmate", namesorted_sam, fixed_sam])
    os.unlink(namesorted_sam)

    if args.translocations:
        pipe(["breakpoint", "--output", f"{args.name}.translocations.tsv",
              fixed_sam])

    no_read_groups_bam = f"{args.name}.no_read_groups.bam"
    pipe(["samtools", "sort", "-o", no_read_groups_bam, "-@", threads,
          fixed_sam])
    os.unlink(fixed_sam)

    bam = f"{args.name}.bam"
    # This step is only required to satisfy Mutect2 and possibly other gatk tools
    pipe(["gatk", "AddOrReplaceReadGroups",
          f"I={no_read_groups_bam}",
          f"O={bam}",
          "LB=lb", "PL=ILLUMINA", "PU=pu", f"SM={args.name}"])
    os.unlink(no_read_groups_bam)

    pipe(["samtools", "index", bam])

    if args.panel:
        pipe(["covermi_stats", "--panel", args.panel,
              "--output", f"{args.name}.covermi.pdf",
              "--stats", stats,
              bam])

    pipe(["call_variants",
          "--reference", args.reference,
          "--callers", args.callers,
          "--name", args.name,
          "--panel", args.panel,
          "--vep", args.vep,
          "--min-vaf", args.min_vaf,
          "--min-alt-reads", args.min_alt_reads,
          "--output", ".",  # we have already changed into the output directory
          "--threads", threads,
          bam])

    #vaf_plot = f"{args.name}.vaf.pdf"
    pipe(["vcf_stats",
          f"{args.name}.vardict.vcf",  # may need to change this depending on variant caller performance
          "--stats", stats])
          #"--output", vaf_plot])

    print(pipe.durations, file=sys.stderr, flush=True)
import pandas as pd
import lightgbm as lgb

from pipeline import Pipe

if __name__ == '__main__':
    # Input files
    train_path = 'data/archive/train_clean.csv'
    test_path = 'data/archive/test_clean.csv'
    # Intermediate files
    TRAIN_PATH_INTERMEDIATE = 'data/archive/train_intermediate.csv'
    TEST_PATH_INTERMEDIATE = 'data/archive/test_intermediate.csv'

    # Set boolean to use pre-made features or build on the fly
    firsttime = True
    if firsttime:
        training_pipe = Pipe(train_path, train_path)
        train = training_pipe.make('train_intermediate')
        train.to_csv(TRAIN_PATH_INTERMEDIATE, index=False)
        testing_pipe = Pipe(train_path, test_path)
        test = testing_pipe.make('test_intermediate')
        test.to_csv(TEST_PATH_INTERMEDIATE, index=False)
    else:
        train = pd.read_csv(TRAIN_PATH_INTERMEDIATE)
        test = pd.read_csv(TEST_PATH_INTERMEDIATE)

    # Preparing train and test dataset for ensemble layer
    featlist = train.columns.tolist()
    featlist.remove('is_listened')
    X = train[featlist].to_numpy()
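    # A possible continuation training the imported lightgbm on the ensemble
    # features; the parameters are illustrative assumptions, not the
    # project's tuned setup.
    y = train['is_listened'].to_numpy()
    dtrain = lgb.Dataset(X, label=y)
    params = {"objective": "binary", "metric": "auc", "learning_rate": 0.05}  # assumed
    booster = lgb.train(params, dtrain, num_boost_round=200)
    # Assumes the test set shares the same feature columns as train.
    predictions = booster.predict(test[featlist].to_numpy())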
def cfPipeline():
    """Cell free pipeline."""
    print(f"cfPipeline {__version__}", file=sys.stderr)

    parser = argparse.ArgumentParser()
    parser.add_argument('input_fastqs', nargs="+",
                        help="Paths of input fastq or fastq.gz files. "
                             "Order is important if paired end reads.")
    parser.add_argument("-r", "--reference",
                        help="Path to reference genome or containing directory.",
                        required=True)
    parser.add_argument("-n", "--name",
                        help="Sample name used to name output files. "
                             "Will be guessed from input fastq if not provided.",
                        default="")
    parser.add_argument("-p", "--panel",
                        help="Path to covermi panel which must contain targets bedfile.",
                        default="")
    parser.add_argument("-u", "--umi",
                        help="UMI type (prism, thruplex_hv or thruplex) or "
                             "empty string if no umis.",
                        default="")
    parser.add_argument("-v", "--vep", help="Path to vep cache.", default="")
    parser.add_argument("-m", "--min-family-size",
                        help="Minimum family size. Families smaller than this will be filtered.",
                        type=int, default=1)
    parser.add_argument("-f", "--min-vaf",
                        help="Minimum variant allele frequency for a variant "
                             "to be called when using VarDict.",
                        type=float, default=None)
    parser.add_argument("-a", "--min-alt-reads",
                        help="Minimum number of alt reads for a variant to be called.",
                        type=float, default=2)
    parser.add_argument("-c", "--cnv",
                        help="Whitespace separated list of target names, as "
                             "specified in targets bedfile, over which to "
                             "calculate copy number variation.",
                        default="")
    parser.add_argument("-d", "--sizes",
                        help="Whitespace separated list of reference names "
                             "over which to calculate fragment size distribution.",
                        default="")
    parser.add_argument("-b", "--translocations",
                        help="Call translocations (supplementary reads aligned "
                             "to different chromosomes).",
                        action="store_const", const=True, default=False)
    parser.add_argument("-i", "--interleaved",
                        help="Each input fastq contains alternating reads 1 and 2.",
                        action="store_const", const=True, default=False)
    parser.add_argument("-o", "--output", help="Path to write output files to.",
                        default=".")
    parser.add_argument("-t", "--threads",
                        help="Number of threads to use, defaults to all "
                             "available threads if not specified.",
                        type=int, default=None)
    parser.add_argument("-s", "--sam-only",
                        help="Quit after producing initial undeduplicated sam.",
                        action="store_const", const=True, default=False)
    parser.add_argument("-C", "--callers",
                        help="Variant callers to use. Valid values are varscan, "
                             "vardict and mutect2. Defaults to 'varscan,vardict'.",
                        default="varscan,vardict")
    parser.add_argument("-D", "--optical-duplicate-distance",
                        help="Maximum pixel distance between two clusters to "
                             "be considered optical duplicates.",
                        default=None)
    args = parser.parse_args()

    threads = args.threads or run(["getconf", "_NPROCESSORS_ONLN"]).stdout.strip()

    if not args.name:
        args.name = guess_sample_name(args.input_fastqs)
        if not args.name:
            sys.exit("Ambiguous sample name")
    if " " in args.name:
        args.name = args.name.replace(" ", "_")

    if args.min_vaf is None:
        args.min_vaf = 0.01 if args.min_family_size == 1 else 0.001

    args.reference = os.path.abspath(args.reference)
    args.input_fastqs = [os.path.abspath(path) for path in args.input_fastqs]
    if args.panel:
        args.panel = os.path.abspath(args.panel)
    if args.vep:
        args.vep = os.path.abspath(args.vep)
    os.chdir(args.output)

    args.reference = (glob.glob(f"{args.reference}/*.fna") +
                      glob.glob(f"{args.reference}/*.fa") +
                      glob.glob(f"{args.reference}/*.fasta") +
                      [args.reference])[0]
    ref_dir = os.path.dirname(args.reference)
    if glob.glob(f"{ref_dir}/*.sa"):
        bwa = "bwa"
    elif glob.glob(f"{ref_dir}/*.0123"):
        bwa = "bwa-mem2"
    else:
        sys.exit("Invalid bwa indexes")

    targets_bedfile = (glob.glob(f"{args.panel}/*.bed") + [None])[0] if args.panel else ""
    stats = f"{args.name}.stats.json"
    pipe = Pipe()

    # Remove umis and do some basic fastq qc
    interleaved_fastq = f"{args.name}.interleaved.fastq"
    command = ["udini", "--output", interleaved_fastq,
               "--stats", stats,
               "--umi", args.umi]
    if args.interleaved:
        command.append("--interleaved")
    pipe(command + args.input_fastqs)

    base_sam = f"{args.name}.base.sam"
    with open(base_sam, "wb") as f_out:
        pipe([bwa, "mem",
              "-t", threads,
              "-p",  # interleaved paired end fastq
              "-C",  # append fastq comment to sam
              "-v", "1",  # output errors only
              args.reference,
              interleaved_fastq], stdout=f_out)
    os.unlink(interleaved_fastq)

    sorted_sam = f"{args.name}.sorted.sam"
    pipe(["samtools", "sort", "-o", sorted_sam, "-@", threads, base_sam])
    os.unlink(base_sam)

    if args.sam_only:
        return

    deduplicated_fastq = f"{args.name}.deduplicated.fastq"
    if args.optical_duplicate_distance is not None:
        optical_duplicate_distance = ["--optical-duplicate-distance",
                                      args.optical_duplicate_distance]
    else:
        optical_duplicate_distance = []
    pipe(["elduderino", "--output", deduplicated_fastq,
          "--stats", stats,
          "--min-family-size", args.min_family_size,
          "--umi", args.umi] + optical_duplicate_distance + [sorted_sam])
    os.unlink(sorted_sam)

    deduplicated_sam = f"{args.name}.deduplicated.sam"
    with open(deduplicated_sam, "wb") as f_out:
        pipe([bwa, "mem",
              "-t", threads,
              "-p",  # interleaved paired end fastq
              "-C",  # append fastq comment to sam
              "-Y",  # soft clip non-primary reads
              "-v", "1",  # output errors only
              args.reference,
              deduplicated_fastq], stdout=f_out)
    os.unlink(deduplicated_fastq)

    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe(["samtools", "sort", "-n",  # sort by name
          "-o", namesorted_sam,
          "-@", threads,
          deduplicated_sam])
    os.unlink(deduplicated_sam)

    pipe(["size", "--stats", stats,
          "--rnames", args.sizes,
          "--output", f"{args.name}.sizes.pdf",
          namesorted_sam])

    ontarget_sam = f"{args.name}.ontarget.sam"
    pipe(["ontarget", "--output", ontarget_sam,
          "--bed", targets_bedfile,
          "--stats", stats,
          "--cnv", args.cnv,
          "--threads", threads,
          namesorted_sam])
    os.unlink(namesorted_sam)

    untrimmed_sam = f"{args.name}.untrimmed.sam"
    pipe(["samtools", "sort", "-o", untrimmed_sam, "-@", threads, ontarget_sam])
    os.unlink(ontarget_sam)

    trimmed_sam = f"{args.name}.trimmed.sam"
    pipe(["trim", "--output", trimmed_sam, "--reference", args.reference,
          untrimmed_sam])
    os.unlink(untrimmed_sam)

    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe(["samtools", "sort", "-n",  # sort by name
          "-o", namesorted_sam,
          "-@", threads,
          trimmed_sam])
    os.unlink(trimmed_sam)

    fixed_sam = f"{args.name}.fixed.sam"
    pipe(["samtools", "fixmate", namesorted_sam, fixed_sam])
    os.unlink(namesorted_sam)

    if args.translocations:
        pipe(["breakpoint", "--output", f"{args.name}.translocations.tsv",
              fixed_sam])

    no_read_groups_bam = f"{args.name}.no_read_groups.bam"
    pipe(["samtools", "sort", "-o", no_read_groups_bam, "-@", threads,
          fixed_sam])
    os.unlink(fixed_sam)

    bam = f"{args.name}.bam"
    # This step is only required to satisfy Mutect2 and possibly other gatk tools
    pipe(["gatk", "AddOrReplaceReadGroups",
          f"I={no_read_groups_bam}",
          f"O={bam}",
          "LB=lb", "PL=ILLUMINA", "PU=pu", f"SM={args.name}"])
    os.unlink(no_read_groups_bam)

    pipe(["samtools", "index", bam])

    if args.panel:
        pipe(["covermi_stats", "--panel", args.panel,
              "--output", f"{args.name}.covermi.pdf",
              "--stats", stats,
              bam])

    pipe(["call_variants",
          "--reference", args.reference,
          "--callers", args.callers,
          "--name", args.name,
          "--panel", args.panel,
          "--vep", args.vep,
          "--min-vaf", args.min_vaf,
          "--min-alt-reads", args.min_alt_reads,
          "--output", ".",  # we have already changed into the output directory
          "--threads", threads,
          bam])

    #vaf_plot = f"{args.name}.vaf.pdf"
    pipe(["vcf_stats",
          f"{args.name}.vardict.vcf",  # may need to change this depending on variant caller performance
          "--stats", stats])
          #"--output", vaf_plot])

    print(pipe.durations, file=sys.stderr, flush=True)
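# Both cfPipeline and rossiPipeline call a guess_sample_name helper that is
# not shown in these snippets. A plausible sketch only (the real helper may
# differ); an empty return value signals an ambiguous name to the caller.
import os

def guess_sample_name(fastqs):
    names = [os.path.basename(path).split(".")[0] for path in fastqs]
    prefix = os.path.commonprefix(names)
    # Trim read-number and separator residue, eg "sample_R" -> "sample".
    return prefix.rstrip("_-R12")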