Exemplo n.º 1
0
# -*- coding: utf-8 -*-

##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Imports~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Std lib
from os.path import join

# Third party lib
import pandas as pd

# Local imports
from NanoSnake.common import *
from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
FTP = FTPRemoteProvider()
HTTP = HTTPRemoteProvider()

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~check config file version~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Version of the configuration file layout this workflow expects
# (this is the config file version, not the snakemake version).
config_version = 7
# A missing "config_version" key reads as None and therefore never matches.
if config.get("config_version") != config_version:
    raise NanoSnakeError(
        "Wrong configuration file version. Please regenerate config with `--generate_template config -o`"
    )

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~Define samples sheet reference and getters~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Read the tab-separated sample sheet: lines starting with "#" are comments,
# blank lines are skipped and the first column becomes the index.
sample_df = pd.read_csv(
    config["sample_sheet"],
    sep="\t",
    index_col=0,
    comment="#",
    skip_blank_lines=True,
)
# Index labels of the sample sheet, in file order
sample_list = list(sample_df.index)

def get_fastq(wildcards):
    """Return the "fastq" column value of the sample sheet row named by ``wildcards.sample``."""
    sample_id = wildcards.sample
    return sample_df.loc[sample_id, "fastq"]
def get_fast5(wildcards):
    """Return the "fast5" column value of the sample sheet row named by ``wildcards.sample``."""
    sample_id = wildcards.sample
    return sample_df.loc[sample_id, "fast5"]
def get_seqsum (wildcards):
Exemplo n.º 2
0
## Pipeline for generating HG001, HG003, and HG004 specific homozygous SV callsets and genotyping UB flowcells

## URLs and snakemake modules for downloading input ref and vcfs
from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
FTP = FTPRemoteProvider()

# Remote locations are written without an "ftp://" scheme prefix.
# HG002 Tier 1 SV v0.6 VCF (path under the GIAB AshkenazimTrio analysis tree)
v06_url = "ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/analysis/NIST_SVs_Integration_v0.6/HG002_SVs_Tier1_v0.6.vcf.gz"
# HG001 (NA12878) PacBio pbsv VCF on hs37d5
HG001_pbsv_url = "ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/NA12878/analysis/PacBio_pbsv_05212019/HG001_hs37d5.pbsv.vcf.gz"
# Gzipped hs37d5 reference FASTA; consumed by rule get_hs37d5
hs37d5_url = "ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"

## Start of pipeline
# Default target rule: requests the homvar VCFs for HG003/HG004 and HG001,
# an "indels.0.pdf" file next to each, the clustered three-sample VCF and
# the svviz output for FAH59421 (the FAH71622 target is currently disabled).
rule all:
    input:
        "vcfs/HG003_HG004_giab_sv_v0.6_homvar.vcf",
        "vcfs/HG003_HG004_giab_sv_v0.6_homvar/indels.0.pdf",
        "vcfs/HG001_pbsv_homvar.vcf",
        "vcfs/HG001_pbsv_homvar/indels.0.pdf",
        "vcfs/HG001_HG003_HG004_homvar.clustered.vcf",
        "svviz_out_FAH59421"
#         "svviz_out_FAH71622"
        

# Fetch the gzipped hs37d5 reference through the FTP remote provider and
# write it decompressed to resources/hs37d5.fna.
rule get_hs37d5:
    input:
        FTP.remote(hs37d5_url)
    output:
        "resources/hs37d5.fna"
    shell:
        "gunzip -c {input} > {output}"

# Produce the .fai index of the reference via the samtools faidx wrapper
# (wrapper release 0.38.0).
rule index_ref:
    input:
        "resources/hs37d5.fna"
    output:
        "resources/hs37d5.fna.fai"
    wrapper:
        "0.38.0/bio/samtools/faidx"
Exemplo n.º 3
0
# Imports
from os.path import join
from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
FTP = FTPRemoteProvider()

# Input and output data
# Reference 1: Saccharomyces cerevisiae cDNA (Ensembl Genomes release 46)
ref1_input = "ftp://ftp.ensemblgenomes.org/pub/fungi/release-46/fasta/saccharomyces_cerevisiae/cdna/Saccharomyces_cerevisiae.R64-1-1.cdna.all.fa.gz"
ref1_output = "ref1.fa"
ref1_tsv = "ref1.tsv"
ref1_index = "{}.fai".format(ref1_output)

# Reference 2: Oryzias latipes cDNA (Ensembl release 99)
ref2_input = "ftp://ftp.ensembl.org/pub/release-99/fasta/oryzias_latipes/cdna/Oryzias_latipes.ASM223467v1.cdna.all.fa.gz"
ref2_output = "ref2.fa"
ref2_tsv = "ref2.tsv"
ref2_index = "{}.fai".format(ref2_output)

# Reference 3: Mus musculus ab initio cDNA (Ensembl release 99)
ref3_input = "ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.abinitio.fa.gz"
ref3_output = "ref3.fa"
ref3_tsv = "ref3.tsv"
ref3_index = "{}.fai".format(ref3_output)

# Rules
# Default target: FASTA, index and TSV for each of the three references
rule all:
    input:
        [
            ref1_output, ref1_index, ref1_tsv,
            ref2_output, ref2_index, ref2_tsv,
            ref3_output, ref3_index, ref3_tsv,
        ]

# Run the "get_transcriptome" wrapper on the FTP-hosted reference 1 FASTA,
# declaring the FASTA, TSV and index files as outputs.
rule get_transcriptome_yeast:
    input:
        ref=FTP.remote(ref1_input)
    output:
        ref=ref1_output,
        tsv=ref1_tsv,
        index=ref1_index
    log:
        "get_transcriptome_yeast.log"
    wrapper:
        "get_transcriptome"

rule get_transcriptome_medaka:
    input: ref=FTP.remote(ref2_input)
Exemplo n.º 4
0
# Imports
from os.path import join
from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
FTP = FTPRemoteProvider()
HTTP = HTTPRemoteProvider()

# Input and output data
# Reference 1: uncompressed FASTA from the configured data directory
ref1_input = join(config["data_dir"], "reference", "small_ref.fa")
ref1_output = "ref1.fa"
ref1_index = "{}.fai".format(ref1_output)

# Reference 2: gzipped FASTA from the configured data directory
ref2_input = join(config["data_dir"], "reference", "ref.fa.gz")
ref2_output = "ref2.fa"
ref2_index = "{}.fai".format(ref2_output)

# Reference 3: gzipped FASTA addressed by an ftp:// URL
ref3_input = "ftp://ftp.ensembl.org/pub/release-98/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna.chromosome.MT.fa.gz"
ref3_output = "ref3.fa"
ref3_index = "{}.fai".format(ref3_output)

# Reference 4: gzipped FASTA addressed by an https:// URL
ref4_input = "https://www.ebi.ac.uk/~aleg/extra/Danio_rerio.GRCz11.dna.chromosome.MT.fa.gz"
ref4_output = "ref4.fa"
ref4_index = "{}.fai".format(ref4_output)

# Rules
# Default target: FASTA and index for each of the four references
rule all:
    input:
        [
            ref1_output, ref1_index,
            ref2_output, ref2_index,
            ref3_output, ref3_index,
            ref4_output, ref4_index,
        ]

# Run the "get_genome" wrapper on the local reference 1 FASTA, declaring the
# FASTA and its index as outputs.
rule get_genome_from_fa:
    input:
        ref=ref1_input
    output:
        ref=ref1_output,
        index=ref1_index
    log:
        "get_genome_from_fa.log"
    wrapper:
        "get_genome"
Exemplo n.º 5
0
    # NOTE(review): the guarding statement is outside this view; judging by the
    # message, this branch handles an unusable sample sheet — confirm.
    # The message previously claimed the sheet *is* in the correct format.
    logger.error ("The provided sample sheet is not in the correct format, Please regenerate a template file with `--generate_template sample_sheet -o`")
    # Exit with a non-zero status so callers can tell the run failed
    # (a bare sys.exit() reports success).
    sys.exit(1)
logger.debug(sample_df)
# Index labels of the sample sheet, in file order
sample_list = list(sample_df.index)

logger.info("Define number of chunks")
# Use config["pbt_alignment_split"]["n_chunks"] when it is present and can be
# converted to an integer; otherwise fall back to 4 chunks. Only lookup and
# conversion failures are caught, so KeyboardInterrupt and SystemExit are no
# longer swallowed as they were by the former bare `except:`.
try:
    nchunk = int(config["pbt_alignment_split"]["n_chunks"])
except (LookupError, TypeError, ValueError):
    nchunk = 4
# Chunk identifiers 0 .. nchunk-1
chunk_list = list(range(nchunk))

logger.info("Specify way to download reference files")

def _remote_or_local(location):
    """Wrap *location* in an FTP or HTTP remote object based on its prefix.

    Strings starting with "ftp" go through ``FTP().remote``, strings starting
    with "http" go through ``HTTP().remote``; anything else is returned as is.
    """
    if location.startswith("ftp"):
        return FTP().remote(location)
    if location.startswith("http"):
        return HTTP().remote(location)
    return location

# Genome reference, then annotation, each taken from the configuration
ref = _remote_or_local(config["genome"])

gff3 = _remote_or_local(config["annotation"])

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~Define all output depending on config file~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
logger.info("Define conditional target files")
# The target list starts with one filtered alignment BAM per sample
target_files = list(
    expand(
        join("results", "main", "filtered_alignments", "{sample}.bam"),
        sample=sample_list,
    )
)

if config["dna_methylation_call"] is True:
Exemplo n.º 6
0
# Imports
from os.path import join
from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
FTP = FTPRemoteProvider()

# Input and output data
# Case 1: gzipped GFF3 from the configured data directory
gff3_input = join(config["data_dir"], "reference", "ref.gff3.gz")
gff3_output_1 = "ref_1.gff3"
gtf_output_1 = "ref_1.gtf"

# Case 2: gzipped GFF3 addressed by an ftp:// URL
gff3_input_ftp = "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/gff3/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.45.gff3.gz"
gff3_output_2 = "ref_2.gff3"
gtf_output_2 = "ref_2.gtf"

# Rules
# Default target: GFF3 and GTF outputs of both annotation rules
rule all:
    input:
        [
            gff3_output_1, gtf_output_1,
            gff3_output_2, gtf_output_2,
        ]

# Run the "get_annotation" wrapper on the local GFF3 file, declaring a GFF3
# and a GTF output; no extra options are passed (opt is empty).
rule get_annotation_from_local:
    input:
        gff3=gff3_input
    output:
        gff3=gff3_output_1,
        gtf=gtf_output_1
    params:
        opt=""
    log:
        "get_annotation_from_local.log"
    wrapper:
        "get_annotation"

# Run the "get_annotation" wrapper on the FTP-hosted GFF3 file, declaring a
# GFF3 and a GTF output; no extra options are passed (opt is empty).
rule get_annotation_from_ftp:
    input:
        gff3=FTP.remote(gff3_input_ftp)
    output:
        gff3=gff3_output_2,
        gtf=gtf_output_2
    params:
        opt=""
    log:
        "get_annotation_from_ftp.log"
    wrapper:
        "get_annotation"
Exemplo n.º 7
0
# -*- coding: utf-8 -*-

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Imports~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Std lib
from os.path import join

# Third party lib
import pandas as pd

# Local imports
from pycoSnake.common import *
from snakemake.remote.FTP import RemoteProvider as FTPRemoteProvider
from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
FTP = FTPRemoteProvider()
HTTP = HTTPRemoteProvider()

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~check config file version~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Version of the configuration file layout this workflow expects
# (this is the config file version, not the snakemake version).
config_version = 6
# A missing "config_version" key reads as None and therefore never matches.
if config.get("config_version") != config_version:
    raise pycoSnakeError(
        "Wrong configuration file version. Please regenerate config with `--generate_template config -o`"
    )

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~Define samples sheet reference and getters~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Read the tab-separated sample sheet: lines starting with "#" are comments,
# blank lines are skipped and the first column becomes the index.
sample_df = pd.read_csv(
    config["sample_sheet"],
    sep="\t",
    index_col=0,
    comment="#",
    skip_blank_lines=True,
)
# Index labels of the sample sheet, in file order
sample_list = list(sample_df.index)

def get_fastq1(wildcards):
    """Return the "fastq1" column value of the sample sheet row named by ``wildcards.sample``."""
    sample_id = wildcards.sample
    return sample_df.loc[sample_id, "fastq1"]
def get_fastq2(wildcards):
    """Return the "fastq2" column value of the sample sheet row named by ``wildcards.sample``."""
    sample_id = wildcards.sample
    return sample_df.loc[sample_id, "fastq2"]