def sample_get_rawfile_detail(sample):
    from pymisca.events import LinkEvent
    from pymisca.ext import f as _f
    # Load the fastq-validation helpers as a module and point them at this sample.
    node = pyext.file__asModule('/home/feng/envs/0726-polyq/src/validate_fastq.py')
    node.rawMeta = rawMeta()
    node.DATA_ACC = sample['data_acc']
    node.WORKDIR = WORKDIR()
    # pyext.path.Path('/home/feng/envs/0726-polyq/WORKDIR.submit/').realpath()
    # node.WORKDIR = pyext.path.Path('/home/feng/envs/0830-polyq/WORKDIR/').realpath()
    node.valid_fastq()
    sample.rawfile_nodes = nodes = node.combined_valid_fastq()['OUTPUT_NODES']
    sample.rawfile_files_orig = [x['OUTPUT_FILE'] for x in nodes]
    # sample.rawfile_files = [x['OUTPUT_FILE'].relpath(WORKDIR()) for x in nodes]
    #### Relinking because GEO needs a flat directory tree
    sample.rawfile_files = [
        LinkEvent(
            x['OUTPUT_FILE'],
            WORKDIR() / "ftp" / _f('{sample.data_acc}.{x["OUTPUT_FILE"].basename()}'),
            1,
        ).dest.relpath(WORKDIR() / "ftp")
        for x in nodes]
    # print nodes[0]._data.keys()
    sample.rawfile_checksums = [x['FILE_MD5']['MD5_HEX'] for x in nodes]
    sample.rawfile_readlengths = ['75' for x in nodes]  # read length hardcoded to 75 for this run
    sample.rawfile_is_paired = 'paired-end' if len(nodes) > 1 else 'single'
    template = u'''
!Sample_raw_file_name = {{','.join(sample.rawfile_files)}}
!Sample_raw_file_type = fastq
!Sample_raw_file_checksum = {{','.join(sample.rawfile_checksums)}}
!Sample_raw_file_read_length = {{','.join(sample.rawfile_readlengths)}}
!Sample_raw_file_single_or_paired-end = {{sample.rawfile_is_paired}}
!Sample_raw_file_instrument_model = NextSeq 500
'''
    return pyext.jf2(template)
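# --- Illustrative sketch (not part of the original pipeline) -------------------
# GEO uploads expect a flat directory, which is why the raw files above are
# re-linked into WORKDIR()/ftp as "<data_acc>.<basename>". The helper below is a
# plain-stdlib sketch of that relinking; the overwrite behaviour attributed to
# LinkEvent(..., 1) is an assumption, and this is not the pymisca API.
def _example_flat_ftp_link(src_path, data_acc, ftp_dir):
    """Symlink src_path into ftp_dir as '<data_acc>.<basename>'; return the name relative to ftp_dir."""
    import os
    if not os.path.isdir(ftp_dir):
        os.makedirs(ftp_dir)
    dest = os.path.join(ftp_dir, '{0}.{1}'.format(data_acc, os.path.basename(src_path)))
    if os.path.lexists(dest):
        os.remove(dest)  # assumed equivalent of the force/overwrite flag passed to LinkEvent
    os.symlink(os.path.realpath(src_path), dest)
    return os.path.relpath(dest, ftp_dir)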
def sample_rnaseq_processing_protocol(sample):
    from pymisca.events import CopyEvent, LinkEvent
    from pymisca.ext import f as _f
    import os
    OUTDIR = WORKDIR() / sample['data_acc'] / 'supp'
    # sample.data_acc_control = _get_data_acc_control(sample)
    rec = df_mappedData_rnaseq().loc[sample['data_acc']]
    # Copy each supplementary file under <data_acc>/supp, then expose it in the
    # flat ftp/ directory as "<data_acc>.supp.<basename>".
    for attrName, key in [
        ('file_count', 'txt'),
        ('file_bam', 'bam'),
        # ('file_npk', 'narrowPeak')
    ]:
        fullname = rec[key]
        sample[attrName + '_orig'] = fullname
        if pyext.pd.isnull(fullname):
            sample[attrName] = 'NA'
        else:
            basename = os.path.basename(fullname)
            sample[attrName] = LinkEvent(
                CopyEvent(fullname, OUTDIR / basename).dest,
                WORKDIR() / _f("ftp/{sample.data_acc}.supp.{basename}"),
                1,
            ).dest.relpath(WORKDIR() / 'ftp')
    template = u'''
!Sample_data_processing = Raw fastq files were uploaded to the Bluebee Genomics Platform and analysed with the \
QuantSeq FWD analysis pipeline (https://www.lexogen.com/quantseq-data-analysis/). Briefly, the raw reads \
were trimmed with BBDuk and aligned to the GTF-annotated genome with STAR.
!Sample_data_processing = Supplementary_files_format_and_content: *.txt: TSV table containing transcript abundances from STAR.
!Sample_data_processing = Supplementary_files_format_and_content: *.bam: Genomic alignments by STAR.
!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_count}}
'''
    res = pyext.jf2(template)
    return res
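# --- Illustrative sketch (not part of the original pipeline) -------------------
# pyext.jf2 is assumed to render the "{{ ... }}" placeholders in the templates
# above against the caller's namespace, jinja2-style. A dependency-free sketch of
# the same idea for the two supplementary-file lines, assuming only attribute
# access on `sample`:
def _example_render_supp_lines(sample):
    """Render the supplementary-file lines the way jf2 is assumed to render them."""
    lines = [
        u'!Sample_supplementary_file_1 = {0}'.format(sample.file_bam),
        u'!Sample_supplementary_file_2 = {0}'.format(sample.file_count),
    ]
    return u'\n'.join(lines)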
def sample_chipseq_processing_protocol(sample):
    from pymisca.events import CopyEvent, LinkEvent
    from pymisca.ext import f as _f
    import os

    def _get_data_acc_control(sample):
        # Treated-accession prefix -> control accession, one "<treated>,<control>" pair per line.
        buf = '''
        198C,195CS13
        176C,176CS21
        182C,176CS21
        189C,176CS21
        192C,192CS19
        '''.replace(' ', '')
        mapper = dict([x.strip().split(',') for x in buf.splitlines() if x.strip()])
        res = mapper.get(sample.data_acc.split('S')[0])
        return res

    OUTDIR = WORKDIR() / sample['data_acc'] / 'supp'
    sample.data_acc_control = _get_data_acc_control(sample)
    rec = df_mappedData_chipseq().loc[sample['data_acc']]
    # Copy each supplementary file under <data_acc>/supp, then link it into the
    # flat ftp/ directory as "<data_acc>.supp.<basename>".
    for attrName, key in [('file_bw', 'bw'),
                          ('file_bam', 'bam'),
                          ('file_npk', 'narrowPeak')]:
        fullname = rec[key]
        sample[attrName + '_orig'] = fullname
        if pyext.pd.isnull(fullname):
            sample[attrName] = 'NA'
        else:
            basename = os.path.basename(fullname)
            sample[attrName] = LinkEvent(
                CopyEvent(fullname, OUTDIR / basename).dest,
                WORKDIR() / _f("ftp/{sample.data_acc}.supp.{basename}"),
                1,
            ).dest.relpath(WORKDIR() / 'ftp')
    template = u'''
!Sample_data_processing = Adapters were trimmed off from raw reads with Trimmomatic with argument "ILLUMINACLIP:$FA_ADAPTER:6:30:10 LEADING:3 TRAILING:3 MINLEN:36 SLIDINGWINDOW:4:15". \
Raw reads were mapped to the genome "TAIR10" with Bowtie2 under argument "--no-mixed --no-discordant --no-unal -k2". Any read that mapped to more than one genomic location was discarded. \
PCR duplicate reads were removed with Picard using default settings.
!Sample_data_processing = Genomic binding profile was quantified in RPKM (Reads Per Kilobase per Million mapped reads) using a bin size of 10bp with "deeptools.bamCoverage".
!Sample_data_processing = For each treated ChIP-Seq library, peaks were called against a control {{sample.data_acc_control}} using MACS2 with argument "--keep-dup 1 -p 0.1".
!Sample_data_processing = Supplementary_files_format_and_content: *.bam: Genomic alignments that were sorted, deduplicated and filtered for uniquely mapped reads.
!Sample_data_processing = Supplementary_files_format_and_content: *_RPKM.bw: RPKM-normalised bigwig track at 10bp resolution.
!Sample_data_processing = Supplementary_files_format_and_content: *.narrowPeak: MACS2-called peaks as described above.
!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_bw}}
!Sample_supplementary_file_3 = {{sample.file_npk}}
'''
    res = pyext.jf2(template)
    return res
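# --- Illustrative sketch (not part of the original pipeline) -------------------
# The treated->control mapping in _get_data_acc_control keys on the accession
# prefix before the first "S". The same lookup with an explicit dict (values
# copied from the buffer above), handy for eyeballing the pairing in isolation:
def _example_control_lookup(data_acc):
    """Return the control accession paired with a treated ChIP sample, or None."""
    mapper = {
        '198C': '195CS13',
        '176C': '176CS21',
        '182C': '176CS21',
        '189C': '176CS21',
        '192C': '192CS19',
    }
    return mapper.get(data_acc.split('S')[0])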
def sample_rnaseq_processing_protocol(sample):
    # NOTE: same name as the QuantSeq/STAR definition above; if both live in the
    # same module, this later definition shadows it. It additionally handles the
    # bw/count/ct supplementary files and the Hisat2/Tophat protocol variants.
    from pymisca.events import CopyEvent, LinkEvent
    from pymisca.ext import f as _f
    import os
    OUTDIR = WORKDIR() / sample['data_acc'] / 'supp'
    # sample.data_acc_control = _get_data_acc_control(sample)
    rec = df_mappedData_rnaseq().loc[sample['data_acc']]
    for attrName, key in [
        ('file_bam', 'bam'),
        ('file_bw', 'bw'),
        ('file_count', 'count'),
        ('file_ct', 'ct'),
        # ('file_count', 'txt'),
        # ('file_npk', 'narrowPeak')
    ]:
        fullname = rec[key]
        sample[attrName + '_orig'] = fullname
        if pyext.pd.isnull(fullname) or (fullname == []):
            sample[attrName] = 'NA'
        else:
            basename = os.path.basename(fullname)
            sample[attrName] = LinkEvent(
                CopyEvent(fullname, OUTDIR / basename).dest,
                WORKDIR() / _f("ftp/{sample.data_acc}.supp.{basename}"),
                1,
            ).dest.relpath(WORKDIR() / 'ftp')
    # Two alternative protocols: StringTie counts (.count) vs htseq-count (.ct);
    # exactly one of the two is expected to be present for each sample.
    if sample['file_ct'] == 'NA':
        assert sample['file_count'] != 'NA'
        template = u'''
!Sample_data_processing = Adapters were trimmed off from raw reads with Trimmomatic with argument "ILLUMINACLIP:$FA_ADAPTER:6:30:10 LEADING:3 TRAILING:3 MINLEN:36 SLIDINGWINDOW:4:15".
!Sample_data_processing = Raw reads were aligned with Hisat2 with arguments "--no-mixed --rna-strandness RF --dta --fr" to produce a SAM file.
!Sample_data_processing = Duplicate reads were removed with Picard using default settings.
!Sample_data_processing = Alignments in the SAM file were assembled into transcript abundances with StringTie with argument "--rf".
!Sample_data_processing = Supplementary_files_format_and_content: .bam: HISAT2-aligned and Picard-deduplicated genomic alignments.
!Sample_data_processing = Supplementary_files_format_and_content: .stringtie.count: TSV table containing abundance of transcripts with bed-formatted coordinates.
!Sample_data_processing = Supplementary_files_format_and_content: RPKM.bw: RPKM-normalised bigwig files.
!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_count}}
!Sample_supplementary_file_3 = {{sample.file_bw}}
'''
    else:
        assert sample['file_ct'] != 'NA'
        template = u'''
!Sample_data_processing = Adapters were trimmed off using Trimmomatic with "ILLUMINACLIP:$FA_ADAPTER:2:10:5:1".
!Sample_data_processing = The trimmed reads were aligned using Tophat with "--max-multihits --library-type fr-firststrand --no-mixed".
!Sample_data_processing = Duplicate reads were removed with Picard using default settings.
!Sample_data_processing = Alignments in the BAM file were assembled into transcript abundances using htseq-count with "-r name -s no -f bam -t exon -i gene_id" against the GTF annotation.
!Sample_data_processing = Supplementary_files_format_and_content: *.bam: Tophat-aligned and Picard-deduplicated genomic alignments.
!Sample_data_processing = Supplementary_files_format_and_content: _htseq_count.ct: TSV table containing htseq-counted transcript abundances.
!Sample_supplementary_file_1 = {{sample.file_bam}}
!Sample_supplementary_file_2 = {{sample.file_ct}}
'''
    res = pyext.jf2(template)
    return res
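# --- Illustrative sketch (not part of the original pipeline) -------------------
# A hedged sketch of how the per-sample blocks above might be concatenated into a
# single GEO SOFT metadata file. The `samples` iterable, its dict-like .get, and
# the "assay_type" field used to dispatch between ChIP-seq and RNA-seq protocols
# are assumptions, not part of this module.
def _example_write_soft(samples, out_path):
    """Write the raw-file details plus the matching processing protocol for each sample."""
    import io
    blocks = []
    for sample in samples:
        blocks.append(sample_get_rawfile_detail(sample))
        if sample.get('assay_type') == 'chipseq':  # hypothetical field name
            blocks.append(sample_chipseq_processing_protocol(sample))
        else:
            blocks.append(sample_rnaseq_processing_protocol(sample))
    with io.open(out_path, 'w', encoding='utf8') as fh:
        fh.write(u'\n'.join(blocks))
    return out_path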