Example #1
def test_iteration(self):
    p1 = Pipe(name='p1')
    p1.pipe_cache = ['ok']
    p2 = Pipe(name='p2')
    p3 = Pipe(name='p3')
    pline = Pipeline(pipes=[p1, p2, p3])
    self.assertEqual(pline.next(), 'ok')
Example #2
class Executor:
    def __init__(self, cf):
        unet = Modified3DUNet(1, 1)
        resnet = resnet34()
        # device = torch.device('cpu')

        unet.load_state_dict(torch.load(cf.pathToSegmentator))
        resnet.load_state_dict(torch.load(cf.pathToClassifier))

        unet.eval()
        resnet.eval()

        self.segmentator = Estimator(
            unet,
            save_folder='./experiments/unet_full_pipe_eval/',
            cuda_device=0,
            optimizer=Adam,
            loss_fn=dice_loss)
        self.classify = Estimator(
            resnet,
            save_folder='./experiments/res_full_pipe_eval/',
            cuda_device=1,
            optimizer=Adam,
            loss_fn=torch.nn.CrossEntropyLoss())

        self.pipe = Pipe(cf, self.classify, self.segmentator)

        try:
            shutil.rmtree(cf.save_path)
        except Exception as e:
            print(str(e))

        try:
            os.makedirs(cf.save_path)
            print("Directory created")
        except Exception as e:
            print(str(e))

    def unpack(self, pathToArchive, pathToConverted, numWorkers=3):
        prep = Preprocess(pathToArchive, pathToConverted, numWorkers)
        prep.start()
        self.pipe.add_dataset(pathToConverted)

    def start(self):
        return self.pipe.start_inference()
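
A hypothetical driver for this class might look as follows. Only the attribute names (pathToSegmentator, pathToClassifier, save_path) and the Executor methods come from the snippet above; the Config class, the paths, and the workflow are assumptions.

# Hypothetical usage sketch -- not from the original project.
class Config:
    pathToSegmentator = 'weights/unet.pth'        # assumed path
    pathToClassifier = 'weights/resnet34.pth'     # assumed path
    save_path = './experiments/full_pipe_eval/'   # assumed path

executor = Executor(Config())
executor.unpack('data/scans.zip', 'data/converted/', numWorkers=3)  # preprocess, then register dataset
predictions = executor.start()  # run inference through the Pipe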
Example #3
class PipeTestCase(TestCase):
    def setUp(self):
        self.p1 = Pipe()

    def test_empty(self):
        self.assertEqual(self.p1.is_empty(), True)
        self.p1.pipe_cache = [None]
        self.assertEqual(self.p1.is_empty(), True)

    def test_output(self):
        self.p1.pipe_cache = [1, 2, 3]
        self.assertEqual(self.p1.output(), 1)
        self.assertEqual(self.p1.output(), 2)
        self.assertEqual(self.p1.output(), 3)
        self.assertEqual(self.p1.output(), None)

    def test_execute(self):
        self.assertEqual(self.p1.execute(1), 1)
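
Taken together, the tests in Examples #1 and #3 pin down a small interface for Pipe and Pipeline. The following is a minimal sketch that would satisfy those assertions; it is inferred from the tests, not the project's actual implementation.

class Pipe:
    def __init__(self, name=None):
        self.name = name
        self.pipe_cache = []

    def is_empty(self):
        # A cache holding only None counts as empty (see test_empty)
        return all(item is None for item in self.pipe_cache)

    def output(self):
        # FIFO: hand out cached items one at a time, None when exhausted
        return self.pipe_cache.pop(0) if self.pipe_cache else None

    def execute(self, item):
        # Identity by default; a real pipe would transform the item
        return item


class Pipeline:
    def __init__(self, pipes=()):
        self.pipes = list(pipes)

    def next(self):
        # Drain the first pipe that has output available
        for pipe in self.pipes:
            if not pipe.is_empty():
                return pipe.output()
        return None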
Example #4
def build_ferpa():
    pipeline = Pipeline()
    pipeline.add_pipe('name', Pipe(Lookup(read_names()), Drop()))
    pipeline.add_pipe('zip', Pipe(ZipCodeClassifier, ZipCodeFilter()))
    pipeline.add_pipe('address', Pipe(AddressClassifier(), AddressFilter()))
    pipeline.add_pipe('date', Pipe(DateClassifier(), DateFilter()))
    pipeline.add_pipe('phone_number', Pipe(PhoneNumberClassifier, Drop()))
    pipeline.add_pipe('email', Pipe(EmailClassifier, Drop()))
    pipeline.add_pipe('ssn', Pipe(SSNClassifier, Drop()))
    return pipeline
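
The add_pipe(name, pipe) calls suggest that this Pipeline is a name-keyed, ordered registry of classifier/filter pairs. A hedged sketch of such a container follows; run() and pipe.apply() are assumed names, not the project's API.

class Pipeline:
    def __init__(self):
        self.pipes = {}  # dicts preserve insertion order in Python 3.7+

    def add_pipe(self, name, pipe):
        self.pipes[name] = pipe

    def run(self, text):
        # Apply each classifier/filter pair in registration order
        for pipe in self.pipes.values():
            text = pipe.apply(text)
        return text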
Example #5
File: main.py Project: chaimch/pipe
from asynmsg.msgengine import MessageEngine
from callback import PipeCallback
from constants import TRADE_ORDER_HANDLER_FOR_CREATE_ORDER
from handler import TradeOrderHandlerForCreateOrder
from pipeline import Pipe

if __name__ == '__main__':
    pipe_callback = PipeCallback()
    MessageEngine.instance().register(
        TRADE_ORDER_HANDLER_FOR_CREATE_ORDER,
        TradeOrderHandlerForCreateOrder(pipe_callback))
    MessageEngine.instance().send(TRADE_ORDER_HANDLER_FOR_CREATE_ORDER,
                                  amount=101)
    result = pipe_callback.get_result(10)
    print(result)
    # # Serial execution:
    # # 1. Create the order
    # # 2. Pre-pay
    pipe = Pipe()
    pipe.add_jobs([''])
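
The pipe_callback.get_result(10) call implies a callback that blocks until the registered handler posts a result, or gives up after a timeout in seconds. Here is a minimal sketch of that contract, with internals assumed (only get_result appears in the snippet; set_result is a hypothetical producer-side hook):

import threading

class PipeCallback:
    def __init__(self):
        self._event = threading.Event()
        self._result = None

    def set_result(self, result):
        # Called by the handler thread once the work is done (assumed hook)
        self._result = result
        self._event.set()

    def get_result(self, timeout=None):
        # Block until a result is posted, or return None after timeout seconds
        if self._event.wait(timeout):
            return self._result
        return None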
Example #6
def call_variants():
    """Cell free pipeline2 variant calling.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('input_bam', help="Path of the input bam file.")
    parser.add_argument(
        "-r",
        "--reference",
        help="Path to reference genome or containing directory.",
        required=True)
    parser.add_argument(
        "-C",
        "--callers",
        help=
        "Variant callers to use. Valid values are varscan, vardict and mutect2. Defaults to 'varscan,vardict'.",
        default="varscan,vardict")
    parser.add_argument(
        "-n",
        "--name",
        help=
        "Sample name used to name output files. Will be guessed from input bam if not provided",
        default="")
    parser.add_argument(
        "-p",
        "--panel",
        help=
        "Path to covermi panel which must contain targets bedfile. Required for annotation.",
        default="")
    parser.add_argument("-v",
                        "--vep",
                        help="Path to vep cache. Required for annotation.",
                        default="")
    parser.add_argument(
        "-f",
        "--min-vaf",
        help="Minimum variant allele frequency for a variant to be called.",
        type=float,
        default=0)
    parser.add_argument(
        "-a",
        "--min-alt-reads",
        help="Minimum number of alt reads for a variant to be called.",
        type=int,
        default=2)
    parser.add_argument("-o",
                        "--output",
                        help="Path to write output files to.",
                        default=".")
    parser.add_argument(
        "-t",
        "--threads",
        help=
        "Number of threads to use, defaults to all available threads if not specified.",
        type=int,
        default=None)
    args = parser.parse_args()

    threads = args.threads or run(["getconf", "_NPROCESSORS_ONLN"
                                   ]).stdout.strip()

    if not args.name:
        fn = os.path.basename(args.input_bam)
        args.name = fn[:-4] if fn.endswith(".bam") else fn

    args.callers = args.callers.lower().replace(",", " ").split()
    for caller in args.callers:
        if caller not in ("varscan", "vardict", "mutect2"):
            sys.exit(f"{caller} is not a recognised variant caller")

    args.reference = os.path.abspath(args.reference)
    args.input_bam = os.path.abspath(args.input_bam)
    if args.panel:
        args.panel = os.path.abspath(args.panel)
    if args.vep:
        args.vep = os.path.abspath(args.vep)
    os.chdir(args.output)

    args.reference = (glob.glob(f"{args.reference}/*.fna") +
                      glob.glob(f"{args.reference}/*.fa") +
                      glob.glob(f"{args.reference}/*.fasta") +
                      [args.reference])[0]
    pipe = Pipe()

    targets_bedfile = glob.glob(f"{args.panel}/*.bed") if args.panel else []
    targets_bedfile = targets_bedfile[0] if len(targets_bedfile) == 1 else ""

    if "vardict" in args.callers and not targets_bedfile:
        sys.exit(f"No targets bedfile found (required by vardict)")
    if "mutect2" in args.callers and not os.path.exists(
            f"{args.input_bam}.bai"):
        sys.exit(f"No index found for {args.input_bam} (required by mutect2)")

    ###############################################################################################################
    ### VARSCAN                                                                                                 ###
    ###############################################################################################################
    if "varscan" in args.callers:
        mpileup = f"{args.name}.mpileup"
        pipe([
            "samtools", "mpileup", "-o", mpileup, "-f", args.reference, "-A",
            "-B", "-q", "10", "-d", "10000000", args.input_bam
        ])

        pvalue_vcf = f"{args.name}.pvalue.vcf"
        with open(pvalue_vcf, "wb") as f_out:
            pipe([
                "varscan", "mpileup2cns", mpileup, "--variants",
                "--output-vcf", "1", "--min-coverage", "1", "--min-var-freq",
                args.min_vaf, "--min-avg-qual", "20", "--min-reads2",
                args.min_alt_reads, "--p-value", "0.05", "--strand-filter", "1"
            ],
                 stdout=f_out)
        os.unlink(mpileup)

        vcf = f"{args.name}.varscan.unfiltered.vcf" if targets_bedfile else f"{args.name}.varscan.vcf"
        pipe(["postprocess_varscan_vcf", pvalue_vcf, "--output", vcf])
        os.unlink(pvalue_vcf)

        if targets_bedfile:
            unfiltered_vcf = vcf
            vcf = f"{args.name}.varscan.vcf"
            pipe([
                "filter_vcf", unfiltered_vcf, "--output", vcf, "--bed",
                targets_bedfile
            ])
            os.unlink(unfiltered_vcf)

        if args.vep and args.panel:
            pipe([
                "annotate_panel", "--vep", args.vep, "--output",
                f"{args.name}.varscan.annotation.tsv", "--reference",
                args.reference, "--threads", threads, "--panel", args.panel,
                vcf
            ])

    ###############################################################################################################
    ### VARDICT                                                                                                 ###
    ###############################################################################################################
    if "vardict" in args.callers:
        vardict_table = f"{args.name}.vardict.tsv"
        with open(vardict_table, "wb") as f_out:
            pipe(
                [
                    "vardictjava",
                    "-K",  # include Ns in depth calculation
                    "-deldupvar",  # variants are only called if start position is inside the region interest
                    "-G",
                    args.reference,
                    "-N",
                    args.name,
                    "-b",
                    args.input_bam,
                    "-Q",
                    "10",
                    "-f",
                    args.min_vaf,
                    "-r",
                    args.min_alt_reads,
                    "-th",
                    threads,
                    "-u",  # count mate pair overlap only once
                    "-fisher",  # perform work of teststrandbias.R
                    targets_bedfile
                ],
                stdout=f_out)

        unfiltered_vcf = f"{args.name}.vardict.unfiltered.vcf"
        with open(vardict_table, "rb") as f_in:
            with open(unfiltered_vcf, "wb") as f_out:
                pipe(
                    [
                        "var2vcf_valid.pl",
                        "-A",  # output all variants at same position
                        "-f",
                        args.min_vaf,
                        "-N",
                        args.name
                    ],
                    stdin=f_in,
                    stdout=f_out)
        os.unlink(vardict_table)

        vcf = f"{args.name}.vardict.vcf"
        # Although vardict takes the targets bedfile as an argument, it does call occasional variants just outside it
        pipe([
            "filter_vcf", unfiltered_vcf, "--output", vcf, "--bed",
            targets_bedfile
        ])
        os.unlink(unfiltered_vcf)

        if args.vep and args.panel:
            pipe([
                "annotate_panel", "--vep", args.vep, "--output",
                f"{args.name}.vardict.annotation.tsv", "--reference",
                args.reference, "--threads", threads, "--panel", args.panel,
                vcf
            ])

    ###############################################################################################################
    ### MUTECT2                                                                                                 ###
    ###############################################################################################################
    if "mutect2" in args.callers:
        unmutectfiltered_vcf = f"{args.name}.unmutectfiltered.mutect2.vcf"
        pipe([
            "gatk", "Mutect2", "-R", args.reference, "-I", args.input_bam,
            "-O", unmutectfiltered_vcf, "--create-output-variant-index",
            "false", "--max-reads-per-alignment-start", "0",
            "--disable-read-filter", "NotDuplicateReadFilter",
            "--disable-read-filter", "GoodCigarReadFilter"
        ])

        multiallelic_vcf = f"{args.name}.multiallelic.mutect2.vcf"
        pipe([
            "gatk", "FilterMutectCalls", "-R", args.reference, "-V",
            unmutectfiltered_vcf, "-O", multiallelic_vcf, "--filtering-stats",
            "false", "--create-output-variant-index", "false"
        ])
        os.unlink(unmutectfiltered_vcf)
        os.unlink(f"{unmutectfiltered_vcf}.stats")

        vcf = f"{args.name}.mutect2.unfiltered.vcf" if targets_bedfile else f"{args.name}.mutect2.vcf"
        pipe([
            "postprocess_mutect2_vcf", "--output", vcf, "--min-alt-reads",
            args.min_alt_reads, "--min-vaf", args.min_vaf, multiallelic_vcf
        ])
        os.unlink(multiallelic_vcf)

        if targets_bedfile:
            unfiltered_vcf = vcf
            vcf = f"{args.name}.mutect2.vcf"
            pipe([
                "filter_vcf", unfiltered_vcf, "--output", vcf, "--bed",
                targets_bedfile
            ])
            os.unlink(unfiltered_vcf)

        if args.vep and args.panel:
            pipe([
                "annotate_panel", "--vep", args.vep, "--output",
                f"{args.name}.mutect2.annotation.tsv", "--reference",
                args.reference, "--threads", threads, "--panel", args.panel,
                vcf
            ])

    print(pipe.durations, file=sys.stderr, flush=True)
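
In this and the following pipeline examples, Pipe() is used as a callable: it runs an external command list, forwards keyword arguments such as stdout, stderr and stdin, returns the completed process (cp.stderr is read after one call), and accumulates per-command timings exposed as pipe.durations. Below is a minimal sketch consistent with that usage, with internals assumed:

import subprocess
import time

class Pipe:
    def __init__(self):
        self.durations = []

    def __call__(self, args, **kwargs):
        # Stringify arguments, since ints and floats appear in the arg lists above
        args = [str(arg) for arg in args]
        start = time.time()
        completed = subprocess.run(args, check=True, **kwargs)
        self.durations.append((args[0], time.time() - start))
        return completed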
Example #7
def build_hipaa():
    pipeline = Pipeline()
    pipeline.add_pipe('name', Pipe(Lookup(read_names()), Drop()))
    pipeline.add_pipe('zip', Pipe(ZipCodeClassifier, ZipCodeFilter()))
    pipeline.add_pipe('address', Pipe(AddressClassifier(), AddressFilter()))
    pipeline.add_pipe('date', Pipe(DateClassifier(), DateFilter()))
    pipeline.add_pipe('phone_number', Pipe(PhoneNumberClassifier, Drop()))
    pipeline.add_pipe('email', Pipe(EmailClassifier, Drop()))
    pipeline.add_pipe('url', Pipe(URLClassifier, Drop()))
    pipeline.add_pipe('ssn', Pipe(SSNClassifier, Drop()))
    pipeline.add_pipe('ip_address', Pipe(IPAddressClassifier, Drop()))
    pipeline.add_pipe('mac_address', Pipe(MACAddressClassifier, Drop()))
    pipeline.add_pipe('face', Pipe(FaceClassifier(), Drop()))
    # TODO: This is far too sensitive
    # pipeline.add_pipe('number', Pipe(NumberClassifier, Drop()))
    return pipeline
Example #8
from build_svd_features import *
from pipeline import Pipe
from model import xgboost_model, lightgbm_model
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

if __name__ == '__main__':

    train_path = 'data/train_test.csv'
    val_path = 'data/train_ensemble.csv'
    test_path = 'data/test_clean.csv'

    firsttime = True
    if firsttime:
        training_pipe = Pipe(train_path, val_path)
        train = training_pipe.make('pickle/train')
        train.to_csv('train_feature.csv', index=False)

        testing_pipe = Pipe(val_path, test_path)
        test = testing_pipe.make('pickle/test')
        test.to_csv('test_features.csv', index=False)
    else:
        train = pd.read_csv('train_feature.csv')
        test = pd.read_csv('test_features.csv')

    featlist = train.columns.tolist()
    featlist.remove('is_listened')
    X = train[featlist].values  # .as_matrix() was removed in pandas 1.0
    y = train['is_listened'].values
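
The snippet stops after assembling X and y. A hypothetical continuation using the names it imports might split the data and hand it to the imported model builders; the xgboost_model and lightgbm_model signatures below are guesses, not the project's actual API.

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42)

xgb_model = xgboost_model(X_train, y_train, X_val, y_val)    # assumed signature
lgb_model = lightgbm_model(X_train, y_train, X_val, y_val)   # assumed signature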
Example #9
def multiplexing():
    """Cell free pipeline.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('input_sam',
                        help="Path of the input sorted undeduplicated sam file.")
    parser.add_argument(
        "-n",
        "--name",
        help=
        "Sample name used to name output files. Will be guessed from input sam if not provide.",
        default="")
    parser.add_argument(
        "-u",
        "--umi",
        help=
        "UMI type (prism, thruplex_hv or thruplex) or empty strng if no umis.",
        default="")
    parser.add_argument(
        "-m",
        "--min-family-size",
        help="Minimum family size. Families smaller than this will be filtered",
        type=int,
        default=1)
    parser.add_argument("-l",
                        "--interval",
                        help="Step size to increment downsampling by.",
                        type=int,
                        required=True)
    parser.add_argument(
        "-r",
        "--reference",
        help="Path to reference genome or containing directory.",
        required=True)
    parser.add_argument(
        "-p",
        "--panel",
        help="Path to covermi panel which must contain targets bedfile.",
        required=True)
    parser.add_argument("-o",
                        "--output",
                        help="Path to write output files to.",
                        default=".")
    args = parser.parse_args()

    threads = run(["getconf", "_NPROCESSORS_ONLN"]).stdout.strip()

    if not args.name:
        args.name = args.input_sam.split("/")[-1].split(".")[0]

    args.reference = os.path.abspath(args.reference)
    args.input_sam = os.path.abspath(args.input_sam)
    args.panel = os.path.abspath(args.panel)
    os.chdir(args.output)

    args.reference = (glob.glob(f"{args.reference}/*.fna") +
                      glob.glob(f"{args.reference}/*.fa") +
                      glob.glob(f"{args.reference}/*.fasta") +
                      [args.reference])[0]
    ref_dir = os.path.dirname(args.reference)
    if glob.glob(f"{ref_dir}/*.sa"):
        bwa = "bwa"
    elif glob.glob(f"{ref_dir}/*.0123"):
        bwa = "bwa-mem2"
    else:
        sys.exit("Invalid bwa indexes")
    targets_bedfile = (glob.glob(f"{args.panel}/*.bed") + [None])[0]
    stats = f"{args.name}.stats.json"
    pipe = Pipe()
    output_file = f"{args.name}.multiplexing.tsv"

    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe([
        "samtools", "sort", "-n", "-o", namesorted_sam, "-@", threads,
        args.input_sam
    ])

    with open(output_file, "wt") as f_out:
        writer = csv.writer(f_out)
        writer.writerow([
            "sample", "reads", "mean_depth", "mean_family_size",
            "singleton_rate", "triplicate_plus_rate", "quadruplicate_plus_rate"
        ])

        requested_reads = 0
        returned_reads = 0
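        # Increase the downsampling target by --interval each pass; the loop
        # ends once the downsampler returns fewer reads than requested,
        # i.e. the input sam is exhausted.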
        while returned_reads == requested_reads:
            requested_reads += args.interval

            downsampled_sam = f"{args.name}.downsampled.sam"
            cp = pipe([
                "downsample_sam", "--output", downsampled_sam, "--number",
                requested_reads, namesorted_sam
            ],
                      stderr=subprocess.PIPE)
            returned_reads = int(cp.stderr.decode())

            sorted_sam = f"{args.name}.sorted.downsampled.sam"
            pipe([
                "samtools", "sort", "-o", sorted_sam, "-@", threads,
                downsampled_sam
            ])
            os.unlink(downsampled_sam)

            deduplicated_fastq = f"{args.name}.deduplicated.fastq"
            pipe([
                "elduderino", "--output", deduplicated_fastq, "--stats", stats,
                "--min-family-size", args.min_family_size, "--umi", args.umi,
                sorted_sam
            ])
            os.unlink(sorted_sam)

            deduplicated_sam = f"{args.name}.deduplicated.sam"
            with open(deduplicated_sam, "wb") as f:
                pipe(
                    [
                        bwa,
                        "mem",
                        "-t",
                        threads,
                        "-p",  # interleaved paired end fastq
                        "-C",  # Append fastq comment to sam
                        "-Y",  # Soft clip non-primary reads
                        args.reference,
                        deduplicated_fastq
                    ],
                    stdout=f)
            os.unlink(deduplicated_fastq)

            bam = f"{args.name}.bam"
            pipe([
                "samtools", "sort", "-o", bam, "-@", threads, deduplicated_sam
            ])
            os.unlink(deduplicated_sam)

            pipe([
                "covermi_stats", "--panel", args.panel, "--stats", stats, bam
            ])
            os.unlink(bam)

            with open(stats, "rt") as f:
                data = json.load(f)
            os.unlink(stats)
            writer.writerow([
                args.name, returned_reads, data["coverage"]["mean_depth"],
                data["mean_family_size"], data["singleton_rate"],
                data["triplicate_plus_rate"], data["quadruplicate_plus_rate"]
            ])
            f_out.flush()

    os.unlink(namesorted_sam)
    print(pipe.durations, file=sys.stderr, flush=True)
Example #10
def rossiPipeline():
    """Cell free pipeline.
    """

    print(f"rossiPipeline {__version__}", file=sys.stderr)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_fastqs',
        nargs="+",
        help=
        "Paths of input fastq or fastq.gz files. Order is important if paired end reads."
    )
    parser.add_argument(
        "-r",
        "--reference",
        help="Path to reference genome or containing directory.",
        required=True)
    parser.add_argument(
        "-n",
        "--name",
        help=
        "Sample name used to name output files. Will be guessed from input fastq if not provided",
        default="")
    parser.add_argument(
        "-p",
        "--panel",
        help="Path to covermi panel which must contain targets bedfile.",
        default="")
    parser.add_argument(
        "-u",
        "--umi",
        help=
        "UMI type (prism, thruplex_hv or thruplex) or empty string if no umis.",
        default="")
    parser.add_argument("-v", "--vep", help="Path to vep cache.", default="")
    parser.add_argument(
        "-m",
        "--min-family-size",
        help="Minimum family size. Families smaller than this will be filtered.",
        type=int,
        default=1)
    parser.add_argument(
        "-f",
        "--min-vaf",
        help=
        "Minimum variant allele frequency for a variant to be called when using VarDict.",
        type=float,
        default=None)
    parser.add_argument(
        "-a",
        "--min-alt-reads",
        help="Minimum number of alt reads for a variant to be called.",
        type=float,
        default=2)
    parser.add_argument(
        "-c",
        "--cnv",
        help=
        "Whitespace separated list of target names, as specified in targets bedfile, over which to calculate copy number variation.",
        default="")
    parser.add_argument(
        "-d",
        "--sizes",
        help=
        "Whitespace separated list of reference names over which to calculate fragment size distribution.",
        default="")
    parser.add_argument(
        "-b",
        "--translocations",
        help=
        "Call translocations (supplementary reads aligned to different chromosomes).",
        action="store_const",
        const=True,
        default=False)
    parser.add_argument("-o",
                        "--output",
                        help="Path to write output files to.",
                        default=".")
    parser.add_argument(
        "-t",
        "--threads",
        help=
        "Number of threads to use, defaults to all available threads if not specified.",
        type=int,
        default=None)
    parser.add_argument(
        "-C",
        "--callers",
        help=
        "Variant callers to use. Valid values are varscan, vardict and mutect2. Defaults to 'varscan,vardict'.",
        default="varscan,vardict")
    args = parser.parse_args()

    threads = args.threads or run(["getconf", "_NPROCESSORS_ONLN"
                                   ]).stdout.strip()

    if not args.name:
        args.name = guess_sample_name(args.input_fastqs)
        if not args.name:
            sys.exit("Ambiguous sample name")

    if " " in args.name:
        args.name = args.name.replace(" ", "_")

    if args.min_vaf is None:
        args.min_vaf = 0.01 if args.min_family_size == 1 else 0.001

    args.reference = os.path.abspath(args.reference)
    args.input_fastqs = [os.path.abspath(path) for path in args.input_fastqs]
    if args.panel:
        args.panel = os.path.abspath(args.panel)
    if args.vep:
        args.vep = os.path.abspath(args.vep)
    os.chdir(args.output)

    args.reference = (glob.glob(f"{args.reference}/*.fna") +
                      glob.glob(f"{args.reference}/*.fa") +
                      glob.glob(f"{args.reference}/*.fasta") +
                      [args.reference])[0]
    ref_dir = os.path.dirname(args.reference)
    if glob.glob(f"{ref_dir}/*.sa"):
        bwa = "bwa"
    elif glob.glob(f"{ref_dir}/*.0123"):
        bwa = "bwa-mem2"
    else:
        sys.exit("Invalid bwa indexes")
    targets_bedfile = (glob.glob(f"{args.panel}/*.bed") +
                       [None])[0] if args.panel else ""
    stats = f"{args.name}.stats.json"
    pipe = Pipe()

    # FastUniq requires ungzipped fastqs
    ungzipped_fastqs = []
    temp_fastqs = []
    for fastq in args.input_fastqs:
        if fastq.endswith(".gz"):
            run(["gunzip", "-k", fastq])
            fastq = fastq[:-3]
            temp_fastqs.append(fastq)
        ungzipped_fastqs.append(fastq)

    if len(ungzipped_fastqs) > 2:
        with open(f"{args.name}_R1.fastq", "wb") as f_out:
            pipe(["cat"] + ungzipped_fastqs[::2], stdout=f_out)
        with open(f"{args.name}_R2.fastq", "wb") as f_out:
            pipe(["cat"] + ungzipped_fastqs[1::2], stdout=f_out)
        ungzipped_fastqs = [f"{args.name}_r1.fastq", f"{args.name}_r2.fastq"]
        for fastq in temp_fastqs:
            os.unlink(fastq)
        temp_fastqs = list(ungzipped_fastqs)

    fastq_names = f"{args.name}.fastqs.txt"
    with open(fastq_names, "wt") as f_out:
        f_out.write("{}\n{}\n".format(*ungzipped_fastqs))

    deduplicated_fastqs = [
        f"{args.name}_R1.deduplicated.fastq",
        f"{args.name}_R2.deduplicated.fastq"
    ]
    pipe([
        "fastuniq", "-i", fastq_names, "-o", deduplicated_fastqs[0], "-p",
        deduplicated_fastqs[1]
    ])
    os.unlink(fastq_names)
    for fastq in temp_fastqs:
        os.unlink(fastq)

    # Remove umis and do some basic fastq qc
    interleaved_fastq = f"{args.name}.interleaved.fastq"
    command = [
        "udini", "--output", interleaved_fastq, "--stats", stats, "--umi",
        args.umi
    ]
    pipe(command + deduplicated_fastqs)
    for fastq in deduplicated_fastqs:
        os.unlink(fastq)

    base_sam = f"{args.name}.base.sam"
    with open(base_sam, "wb") as f_out:
        pipe(
            [
                bwa,
                "mem",
                "-t",
                threads,
                "-p",  # interleaved paired end fastq
                "-C",  # Append fastq comment to sam
                "-v",
                "1",  # Output errors only 
                args.reference,
                interleaved_fastq
            ],
            stdout=f_out)
    os.unlink(interleaved_fastq)

    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe([
        "samtools",
        "sort",
        "-n",  # sort by name
        "-o",
        namesorted_sam,
        "-@",
        threads,
        base_sam
    ])
    os.unlink(base_sam)

    pipe([
        "size", "--stats", stats, "--rnames", args.sizes, "--output",
        f"{args.name}.sizes.pdf", namesorted_sam
    ])

    ontarget_sam = f"{args.name}.ontarget.sam"
    pipe([
        "ontarget", "--output", ontarget_sam, "--bed", targets_bedfile,
        "--stats", stats, "--cnv", args.cnv, "--threads", threads,
        namesorted_sam
    ])
    os.unlink(namesorted_sam)

    # This is likely not necessary
    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe([
        "samtools",
        "sort",
        "-n",  # sort by name
        "-o",
        namesorted_sam,
        "-@",
        threads,
        ontarget_sam
    ])
    os.unlink(ontarget_sam)

    fixed_sam = f"{args.name}.fixed.sam"
    pipe(["samtools", "fixmate", namesorted_sam, fixed_sam])
    os.unlink(namesorted_sam)

    if args.translocations:
        pipe([
            "breakpoint", "--output", f"{args.name}.translocations.tsv",
            fixed_sam
        ])

    no_read_groups_bam = f"{args.name}.no_read_groups.bam"
    pipe([
        "samtools", "sort", "-o", no_read_groups_bam, "-@", threads, fixed_sam
    ])
    os.unlink(fixed_sam)

    bam = f"{args.name}.bam"
    # This step is only required to satisfy Mutect2 and possibly other gatk tools
    pipe([
        "gatk", "AddOrReplaceReadGroups", f"I={no_read_groups_bam}",
        f"O={bam}", "LB=lb", "PL=ILLUMINA", "PU=pu", f"SM={args.name}"
    ])
    os.unlink(no_read_groups_bam)

    pipe(["samtools", "index", bam])

    if args.panel:
        pipe([
            "covermi_stats", "--panel", args.panel, "--output",
            f"{args.name}.covermi.pdf", "--stats", stats, bam
        ])

    pipe([
        "call_variants",
        "--reference",
        args.reference,
        "--callers",
        args.callers,
        "--name",
        args.name,
        "--panel",
        args.panel,
        "--vep",
        args.vep,
        "--min-vaf",
        args.min_vaf,
        "--min-alt-reads",
        args.min_alt_reads,
        "--output",
        ".",  # We have already changed directory into the current directory
        "--threads",
        threads,
        bam
    ])

    #vaf_plot = f"{args.name}.vaf.pdf"
    pipe([
        "vcf_stats",
        f"{args.name}.vardict.vcf",  # May need to change this depending on variant caller performance
        "--stats",
        stats
    ])
    #"--output", vaf_plot])

    print(pipe.durations, file=sys.stderr, flush=True)
Example #11
import pandas as pd
from pipeline import Pipe
import lightgbm as lgb

if __name__ == '__main__':

    # Input files
    train_path = 'data/archive/train_clean.csv'
    test_path = 'data/archive/test_clean.csv'

    # Intermediate files
    TRAIN_PATH_INTERMEDIATE = 'data/archive/train_intermediate.csv'
    TEST_PATH_INTERMEDIATE = 'data/archive/test_intermediate.csv'

    # Set boolean to use pre-made features or build on the fly
    firsttime = True
    if firsttime:
        training_pipe = Pipe(train_path, train_path)
        train = training_pipe.make('train_intermediate')
        train.to_csv(TRAIN_PATH_INTERMEDIATE, index=False)

        testing_pipe = Pipe(train_path, test_path)
        test = testing_pipe.make('test_intermediate')
        test.to_csv(TEST_PATH_INTERMEDIATE, index=False)
    else:
        train = pd.read_csv(TRAIN_PATH_INTERMEDIATE)
        test = pd.read_csv(TEST_PATH_INTERMEDIATE)

    # Preparing train and test dataset for ensemble layer
    featlist = train.columns.tolist()
    featlist.remove('is_listened')

    X = train[featlist].values  # .as_matrix() was removed in pandas 1.0
Example #12
def cfPipeline():
    """Cell free pipeline.
    """
    
    print(f"cfPipeline { __version__}", file=sys.stderr)
    
    parser = argparse.ArgumentParser()
    parser.add_argument('input_fastqs', nargs="+", help="Paths of input fastq or fastq.gz files. Order is important if paired end reads.")
    parser.add_argument("-r", "--reference", help="Path to reference genome or containing directory.", required=True)
    parser.add_argument("-n", "--name", help="Sample name used to name output files. Will be guessed from input fastq if not provided", default="")
    parser.add_argument("-p", "--panel", help="Path to covermi panel which must contain targets bedfile.", default="")
    parser.add_argument("-u", "--umi", help="UMI type (prism, thruplex_hv or thruplex) or empty strng if no umis.", default="")
    parser.add_argument("-v", "--vep", help="Path to vep datargs.", default="")
    parser.add_argument("-m", "--min-family-size", help="Minimum family size. Families smaller than this will be filtered", type=int, default=1)
    parser.add_argument("-f", "--min-vaf", help="Minimum variant allele frequency for a variant to be called when using VarDict.", type=float, default=None)
    parser.add_argument("-a", "--min-alt-reads", help="Minimum number of alt reads for a variant to be called.", type=float, default=2)
    parser.add_argument("-c", "--cnv", help="Whitespace separated list of target names, as specified in targets bedfile, over which to calculate copy number variation.", default="")
    parser.add_argument("-d", "--sizes", help="Whitespace separated list of reference names over which to calculate fragment size distribution.", default="")
    parser.add_argument("-b", "--translocations", help="Call translocations (supplementary reads aligned to different chromosomes).", action="store_const", const=True, default=False)
    parser.add_argument("-i", "--interleaved", help="Each input fastq contains alternating reads 1 and 2.", action="store_const", const=True, default=False)
    parser.add_argument("-o", "--output", help="Path to write output files to.", default=".")
    parser.add_argument("-t", "--threads", help="Number of threads to use, defaults to all available threads if not specified.", type=int, default=None)
    parser.add_argument("-s", "--sam-only", help="Quit after producing initial undeduplicated sam.", action="store_const", const=True, default=False)
    parser.add_argument("-C", "--callers", help="Variant callers to use. Valid values are varscan, vardict and mutect2. Defaults to 'varscan,vardict'.", default="varscan,vardict")
    parser.add_argument("-D", "--optical-duplicate-distance", help="Maximum pixel distance between two cluster to be considered optical duplicates.", default=None)
    args = parser.parse_args()

    threads = args.threads or run(["getconf", "_NPROCESSORS_ONLN"]).stdout.strip()

    if not args.name:
        args.name = guess_sample_name(args.input_fastqs)
        if not args.name:
            sys.exit("Ambiguous sample name")

    if " " in args.name:
        args.name = args.name.replace(" ", "_")

    if args.min_vaf is None:
        args.min_vaf = 0.01 if args.min_family_size == 1 else 0.001

    args.reference = os.path.abspath(args.reference)
    args.input_fastqs = [os.path.abspath(path) for path in args.input_fastqs]
    if args.panel:
        args.panel = os.path.abspath(args.panel)
    if args.vep:
        args.vep = os.path.abspath(args.vep)
    os.chdir(args.output)

    args.reference = (glob.glob(f"{args.reference}/*.fna") + glob.glob(f"{args.reference}/*.fa") + glob.glob(f"{args.reference}/*.fasta") + [args.reference])[0]
    ref_dir = os.path.dirname(args.reference)
    if glob.glob(f"{ref_dir}/*.sa"):
        bwa = "bwa"
    elif glob.glob(f"{ref_dir}/*.0123"):
        bwa = "bwa-mem2"
    else:
        sys.exit("Invalid bwa indexes")
    targets_bedfile = (glob.glob(f"{args.panel}/*.bed") + [None])[0] if args.panel else ""
    stats = f"{args.name}.stats.json"
    pipe = Pipe()


    # Remove umis and do some basic fastq qc
    interleaved_fastq = f"{args.name}.interleaved.fastq"
    command = ["udini", "--output", interleaved_fastq,
                        "--stats", stats,
                        "--umi", args.umi]
    if args.interleaved:
        command.append("--interleaved")
    pipe(command + args.input_fastqs)


    base_sam = f"{args.name}.base.sam"
    with open(base_sam, "wb") as f_out:
        pipe([bwa, "mem", "-t", threads, 
                          "-p", # interleaved paired end fastq
                          "-C", # Append fastq comment to sam
                          "-v", "1", # Output errors only 
                          args.reference, 
                          interleaved_fastq], stdout=f_out)
    os.unlink(interleaved_fastq)


    sorted_sam = f"{args.name}.sorted.sam"
    pipe(["samtools", "sort", "-o", sorted_sam,
                              "-@", threads,
                              base_sam])
    os.unlink(base_sam)

    if args.sam_only:
        return


    deduplicated_fastq = f"{args.name}.deduplicated.fastq"
    if args.optical_duplicate_distance is not None:
        optical_duplicate_distance = ["--optical-duplicate-distance", args.optical_duplicate_distance]
    else:
        optical_duplicate_distance = []
    pipe(["elduderino", "--output", deduplicated_fastq,
                        "--stats", stats,
                        "--min-family-size", args.min_family_size,
                        "--umi", args.umi] +
                        optical_duplicate_distance +
                        [sorted_sam])
    os.unlink(sorted_sam)


    deduplicated_sam = f"{args.name}.deduplicated.sam"
    with open(deduplicated_sam, "wb") as f_out:
        pipe([bwa, "mem", "-t", threads, 
                          "-p", # interleaved paired end fastq
                          "-C", # Append fastq comment to sam
                          "-Y", # Soft clip non-primary reads
                          "-v", "1", # Output errors only 
                          args.reference, 
                          deduplicated_fastq], stdout=f_out)
    os.unlink(deduplicated_fastq)


    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe(["samtools", "sort", "-n", # sort by name
                              "-o", namesorted_sam,
                              "-@", threads,
                              deduplicated_sam])
    os.unlink(deduplicated_sam)


    pipe(["size", "--stats", stats,
                  "--rnames", args.sizes,
                  "--output", f"{args.name}.sizes.pdf",
                  namesorted_sam])


    ontarget_sam = f"{args.name}.ontarget.sam"
    pipe(["ontarget", "--output", ontarget_sam,
                      "--bed", targets_bedfile,
                      "--stats", stats,
                      "--cnv", args.cnv,
                      "--threads", threads,
                      namesorted_sam])
    os.unlink(namesorted_sam)
    
    
    untrimmed_sam = f"{args.name}.untrimmed.sam"
    pipe(["samtools", "sort", "-o", untrimmed_sam,
                              "-@", threads, 
                              ontarget_sam])
    os.unlink(ontarget_sam)
    
    
    trimmed_sam = f"{args.name}.trimmed.sam"
    pipe(["trim", "--output", trimmed_sam,
                  "--reference", args.reference,
                  untrimmed_sam])
    os.unlink(untrimmed_sam)
    
    
    namesorted_sam = f"{args.name}.namesorted.sam"
    pipe(["samtools", "sort", "-n", # sort by name
                              "-o", namesorted_sam,
                              "-@", threads, 
                              trimmed_sam])
    os.unlink(trimmed_sam)


    fixed_sam = f"{args.name}.fixed.sam"
    pipe(["samtools", "fixmate", namesorted_sam, fixed_sam])
    os.unlink(namesorted_sam)


    if args.translocations:
        pipe(["breakpoint", "--output", f"{args.name}.translocations.tsv",
                            fixed_sam])


    no_read_groups_bam = f"{args.name}.no_read_groups.bam"
    pipe(["samtools", "sort", "-o", no_read_groups_bam,
                              "-@", threads,
                              fixed_sam])
    os.unlink(fixed_sam)


    bam = f"{args.name}.bam"
    # This step is only required to satisfy Mutect2 and possibly other gatk tools
    pipe(["gatk", "AddOrReplaceReadGroups", f"I={no_read_groups_bam}", f"O={bam}", "LB=lb", "PL=ILLUMINA", "PU=pu", f"SM={args.name}"])
    os.unlink(no_read_groups_bam)


    pipe(["samtools", "index", bam])


    if args.panel:
        pipe(["covermi_stats", "--panel", args.panel,
                               "--output", f"{args.name}.covermi.pdf",
                               "--stats", stats,
                               bam])


    pipe(["call_variants", "--reference", args.reference,
                           "--callers", args.callers,
                           "--name", args.name,
                           "--panel", args.panel,
                           "--vep", args.vep,
                           "--min-vaf", args.min_vaf,
                           "--min-alt-reads", 2,
                           "--output", ".", # We have already changed directory into the current directory
                           "--threads", threads,
                           bam])


    #vaf_plot = f"{args.name}.vaf.pdf"
    pipe(["vcf_stats", f"{args.name}.vardict.vcf", # May need to change this depending on variant caller performance
                       "--stats", stats])
                       #"--output", vaf_plot])

    print(pipe.durations, file=sys.stderr, flush=True)