Exemplo n.º 1
0
    def run_cuffquant_ERCC_k(self):
        sh_file = "%s/s08.1.cuffquant_k.ERCC.sh" % (self.script_dir)
        sh_work_file = "%s/s08.1.cuffquant_k.ERCC_work.sh" % (self.script_dir)

        cflk_dir = self['sftw_name'].cflk_dir

        if not os.path.isdir(self.cuffquant_ercc_k):
            os.mkdir(self.cuffquant_ercc_k)

        sh_info = """
cflk_dir=$1
in_bam=$2
gtf_file=$3
out_dir=$4

$cflk_dir/cuffquant      \\
   -p 8  -u              \\
   -o $out_dir           \\
   $gtf_file             \\
   $in_bam
      """
        sh_work = ""
        for samp in self['samp']:
            brief_name = self['samp_info']['samp_brief'][samp]
            in_bam = "%s/%s/accepted_hits.bam" % (self.tophat, brief_name)
            out_dir = "%s/%s" % (self.cuffquant_ercc_k, brief_name)
            sh_work += "sh %s   %s %s %s %s \n" % (sh_file, cflk_dir, in_bam,
                                                   self['infile']['anno_file'],
                                                   out_dir)

        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=8)
Exemplo n.º 2
0
    def merge_novo_known_GTF(self):
        sh_file = "%s/Generate_Transcriptome.sh" % (self.script_dir)
        sh_work_file = "%s/Generate_Transcriptome_work.sh" % (self.script_dir)

        sh_info = """
known_GTF=$1
unknown_GTF=$2
merge_GTF=$3
merge_ERCC_GTF=$4

sed 's/XLOC_/novoXLOC_/g' $unknown_GTF | sed 's/TCONS_/novoTCONS_/g' >$unknown_GTF.tmp
grep -P "^chr" $known_GTF | cat /dev/stdin $unknown_GTF.tmp | bedtools sort -i /dev/stdin | grep -P "^chr" >$merge_GTF
rm $unknown_GTF.tmp
grep -P "^ERCC|^RGC" $known_GTF | cat $merge_GTF /dev/stdin >$merge_ERCC_GTF
      """

        known_GTF = self['infile']['anno_file']
        unknown_GTF = "%s/novo_lnc_raw.combined.FPKM0.5_rep0.25.multiExon.gtf" % (
            self.data_dir)
        merge_GTF = "%s/all.exon.sort.gtf" % (self.data_dir)
        merge_ERCC_GTF = "%s/all.exon.sort.ERCC.gtf" % (self.data_dir)
        self['infile']['anno_file_merge'] = merge_GTF
        self['infile']['anno_file_merge_ERCC'] = merge_ERCC_GTF

        sh_work = "sh %s   %s %s %s %s" % (sh_file, known_GTF, unknown_GTF,
                                           merge_GTF, merge_ERCC_GTF)
        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=1)
   def run_trim(self):
      home_dir     = os.path.abspath('./')
      
      cln_dir      = self['dir_name']['clean_data']
      trim_dir     = self['dir_name']['trim_data']
      script_dir   = "%s/scripts"         % (home_dir)
      bin_dir      = "%s/bin"             % (home_dir)
      
      sh_file      = "%s/s01.trim.sh"      % (script_dir)
      sh_work_file = "%s/s01.trim_work.sh" % (script_dir)
      py_trim      = "%s/step1.trim.py"    % (bin_dir)
      
      sh_info = """
py_trim=$1
in_fq1=$2
in_fq2=$3
out_dir=$4
out_prefix=$5

python $py_trim $in_fq1 $in_fq2 $out_dir $out_prefix
      """
      sh_work = ""
      for samp in self['sample']:
         brief_name = self['sam_info']['samp_brief'][samp]
         in_fq1      = "%s/%s/1.cln.fq.gz" % ( cln_dir ,samp )
         in_fq2      = "%s/%s/2.cln.fq.gz" % ( cln_dir ,samp )
         out_dir     = "%s"                % ( trim_dir      )
         out_prefix  = brief_name
         sh_work += "sh %s  %s %s %s %s %s\n" % ( sh_file,  py_trim, in_fq1, in_fq2, out_dir, out_prefix )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
   def run_cuffquant(self):
      sh_file      = "%s/s08.cuffquant.sh"      % (self.script_dir)
      sh_work_file = "%s/s08.cuffquant_work.sh" % (self.script_dir)
      
      if not os.path.isdir( self.cuffquant ):
         os.mkdir( self.cuffquant )
      
      sh_info = """
in_bam=$1
gtf_file=$2
out_dir=$3

/data/Analysis/huboqiang/software/cufflinks-2.2.1.Linux_x86_64/cuffquant      \\
   -p 8  -u                                                                   \\
   -o $out_dir                                                                \\
   $gtf_file                                                                  \\
   $in_bam
      """
      sh_work = ""
      for samp in self['samp']:
         brief_name = self['samp_info']['samp_brief'][samp]
         in_bam      = "%s/%s/accepted_hits.bam"   % ( self.tophat,    brief_name   ) 
         out_dir     = "%s/%s"                     % ( self.cuffquant ,brief_name   )
         sh_work += "sh %s  %s %s %s \n" % ( sh_file,  in_bam, self['infile']['anno_file'], out_dir)
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
Exemplo n.º 5
0
    def run_cuffcomp_novo_trans(self):

        sh_file = "%s/s06.1.cuffcompare_novo.sh" % (self.script_dir)
        sh_work_file = "%s/s06.1.cuffcompare_novo_work.sh" % (self.script_dir)

        cflk_dir = self['sftw_name'].cflk_dir

        sh_info = """
cflk_dir=$1
out_prefix=$2
shift
shift

$cflk_dir/cuffcompare    \\
   -o  $out_prefix       \\
   -T  $@                \\
      """
        sh_work = ""
        out_prefix = "%s/novo_lnc_raw" % (self.data_dir)
        l_in_samp = [
            "%s/%s/transcripts.gtf" %
            (self.cufflink_u, self['samp_info']['samp_brief'][samp])
            for samp in self['samp']
        ]
        sh_work = "sh %s  %s %s %s" % (sh_file, cflk_dir, out_prefix,
                                       " ".join(l_in_samp))

        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=8)
Exemplo n.º 6
0
    def run_cufflinks_u(self):

        sh_file = "%s/s05.cufflinks_GenomeMapped.sh" % (self.script_dir)
        sh_work_file = "%s/s05.cufflinks_GenomeMapped_work.sh" % (
            self.script_dir)

        cflk_dir = self['sftw_name'].cflk_dir

        sh_info = """
cflk_dir=$1
in_bam=$2
gtf_file=$3
out_dir=$4

$cflk_dir/cufflinks      \\
   -p 8  -u              \\
   -o $out_dir           \\
   $in_bam
      """
        sh_work = ""
        for samp in self['samp']:
            brief_name = self['samp_info']['samp_brief'][samp]
            in_bam = "%s/%s/accepted_hits.genome.sort.bam" % (self.tophat,
                                                              brief_name)
            out_dir = "%s/%s" % (self.cufflink_u, brief_name)
            sh_work += "sh %s  %s  %s %s %s \n" % (sh_file, cflk_dir, in_bam,
                                                   self['infile']['anno_file'],
                                                   out_dir)

        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=8)
Exemplo n.º 7
0
    def run_cuffnorm_ERCC_k(self):
        sh_file = "%s/s09.1.cuffnorm.ERCC_k.sh" % (self.script_dir)
        sh_work_file = "%s/s09.1.cuffnorm.ERCC_k_work.sh" % (self.script_dir)
        if not os.path.isdir(self.cuffnorm_ercc_k):
            os.mkdir(self.cuffnorm_ercc_k)

        cflk_dir = self['sftw_name'].cflk_dir

        l_brief = []
        l_cxb = []
        for samp in self['samp']:
            brief_name = self['samp_info']['samp_brief'][samp]
            l_brief.append(brief_name)
            l_cxb.append("%s/%s/abundances.cxb" %
                         (self.cuffquant_ercc_k, brief_name))

        list_brief = ",".join(l_brief)
        list_cxb = " ".join(l_cxb)

        sh_info = """
cflk_dir=$1

$cflk_dir/cuffnorm        \\
   -p 8  -o %s  -L %s     \\
   %s                     \\
   %s
      """ % (self.cuffnorm_ercc_k, list_brief, self['infile']['anno_file'],
             list_cxb)

        sh_work = "sh %s  %s" % (sh_file, cflk_dir)
        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=8)
Exemplo n.º 8
0
    def __get_HTS_clean_split(self):
        sh_file = "%s/p.HTSeq_split.sh" % (self.script_dir)
        sh_work_file = "%s/p.HTSeq_split_work.sh" % (self.script_dir)
        sh_info = """
infile=$1
out_Refseq=$2
out_NONCODE=$3
out_NSMB=$4

grep -v -P '^NONHSAG|XLOC_' $infile >$out_Refseq
head -n 1 $infile >$out_NONCODE && grep -P '^NONHSAG' $infile >>$out_NONCODE
head -n 1 $infile >$out_NSMB    && grep -P '^XLOC'    $infile >>$out_NSMB

      """

        infile = "%s/merge.dexseq_clean.gene.xls" % (self.HTS)
        out_Refseq = "%s/merge.dexseq_clean_refseq.gene.xls" % (self.HTS)
        out_NONCODE = "%s/merge.dexseq_clean_NONCODE.gene.xls" % (self.HTS)
        out_NSMB = "%s/merge.dexseq_clean_NSMB.gene.xls" % (self.HTS)
        sh_work = "sh %s  %s %s %s %s " % (sh_file, infile, out_Refseq,
                                           out_NONCODE, out_NSMB)
        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=1)
Exemplo n.º 9
0
   def run_cuffcomp_novo_trans(self):

      sh_file      = "%s/s06.1.cuffcompare_novo.sh"      % (self.script_dir)
      sh_work_file = "%s/s06.1.cuffcompare_novo_work.sh" % (self.script_dir)
      
      cflk_dir     = self['sftw_name'].cflk_dir
      
      sh_info = """
cflk_dir=$1
out_prefix=$2
shift
shift

$cflk_dir/cuffcompare    \\
   -o  $out_prefix       \\
   -T  $@                \\
      """
      sh_work = ""
      out_prefix  = "%s/novo_lnc_raw"           % ( self.data_dir )
      l_in_samp   = [  "%s/%s/transcripts.gtf"  % ( self.cufflink_u,self['samp_info']['samp_brief'][samp] ) for samp in self['samp'] ]
      sh_work = "sh %s  %s %s %s" % ( sh_file, cflk_dir, out_prefix, " ".join(l_in_samp) )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
Exemplo n.º 10
0
   def merge_novo_known_GTF(self):
      sh_file        =  "%s/Generate_Transcriptome.sh"       %  (self.script_dir)
      sh_work_file   =  "%s/Generate_Transcriptome_work.sh"  %  (self.script_dir)
      
      sh_info = """
known_GTF=$1
unknown_GTF=$2
merge_GTF=$3
merge_ERCC_GTF=$4

sed 's/XLOC_/novoXLOC_/g' $unknown_GTF | sed 's/TCONS_/novoTCONS_/g' >$unknown_GTF.tmp
grep -P "^chr" $known_GTF | cat /dev/stdin $unknown_GTF.tmp | bedtools sort -i /dev/stdin | grep -P "^chr" >$merge_GTF
rm $unknown_GTF.tmp
grep -P "^ERCC|^RGC" $known_GTF | cat $merge_GTF /dev/stdin >$merge_ERCC_GTF
      """
      
      known_GTF     = self['infile']['anno_file']
      unknown_GTF   = "%s/novo_lnc_raw.combined.FPKM0.5_rep0.25.multiExon.gtf" % ( self.data_dir )
      merge_GTF     = "%s/all.exon.sort.gtf"         % ( self.data_dir )
      merge_ERCC_GTF= "%s/all.exon.sort.ERCC.gtf"    % ( self.data_dir )
      self['infile']['anno_file_merge']      = merge_GTF
      self['infile']['anno_file_merge_ERCC'] = merge_ERCC_GTF
      
      sh_work = "sh %s   %s %s %s %s" % (  sh_file,  known_GTF , unknown_GTF, merge_GTF, merge_ERCC_GTF )
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=1 )
   def run_cufflinks_u(self):

      sh_file      = "%s/s05.cufflinks_GenomeMapped.sh"      % (self.script_dir)
      sh_work_file = "%s/s05.cufflinks_GenomeMapped_work.sh" % (self.script_dir)
      
      sh_info = """
in_bam=$1
gtf_file=$2
out_dir=$3

/data/Analysis/huboqiang/software/cufflinks-2.2.1.Linux_x86_64/cufflinks      \\
   -p 8  -u                                                                   \\
   -o $out_dir                                                                \\
   $in_bam
      """
      sh_work = ""
      for samp in self['samp']:
         brief_name = self['samp_info']['samp_brief'][samp]
         in_bam      = "%s/%s/accepted_hits.genome.sort.bam"% ( self.tophat,    brief_name   ) 
         out_dir     = "%s/%s"                              % ( self.cufflink_u,brief_name   )
         sh_work += "sh %s  %s %s %s \n" % ( sh_file,  in_bam, self['infile']['anno_file'], out_dir)
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
Exemplo n.º 12
0
   def run_cuffnorm_ERCC_k(self):
      sh_file      = "%s/s09.1.cuffnorm.ERCC_k.sh"      % (self.script_dir)
      sh_work_file = "%s/s09.1.cuffnorm.ERCC_k_work.sh" % (self.script_dir)
      if not os.path.isdir( self.cuffnorm_ercc_k ):
         os.mkdir( self.cuffnorm_ercc_k )
      
      cflk_dir     = self['sftw_name'].cflk_dir

      l_brief = []
      l_cxb   = []
      for samp in self['samp']:
         brief_name = self['samp_info']['samp_brief'][samp]
         l_brief.append( brief_name  )
         l_cxb.append(   "%s/%s/abundances.cxb" % (self.cuffquant_ercc_k,brief_name) )
      
      
      list_brief = ",".join( l_brief )
      list_cxb   = " ".join( l_cxb   )
      
      sh_info = """
cflk_dir=$1

$cflk_dir/cuffnorm        \\
   -p 8  -o %s  -L %s     \\
   %s                     \\
   %s
      """ % ( self.cuffnorm_ercc_k, list_brief, self['infile']['anno_file'], list_cxb )
      
      sh_work = "sh %s  %s" % (sh_file, cflk_dir)
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
Exemplo n.º 13
0
   def run_cuffquant_ERCC_k(self):
      sh_file      = "%s/s08.1.cuffquant_k.ERCC.sh"      % (self.script_dir)
      sh_work_file = "%s/s08.1.cuffquant_k.ERCC_work.sh" % (self.script_dir)
      
      cflk_dir     = self['sftw_name'].cflk_dir
      
      if not os.path.isdir( self.cuffquant_ercc_k ):
         os.mkdir( self.cuffquant_ercc_k )
      
      sh_info = """
cflk_dir=$1
in_bam=$2
gtf_file=$3
out_dir=$4

$cflk_dir/cuffquant      \\
   -p 8  -u              \\
   -o $out_dir           \\
   $gtf_file             \\
   $in_bam
      """
      sh_work = ""
      for samp in self['samp']:
         brief_name = self['samp_info']['samp_brief'][samp]
         in_bam      = "%s/%s/accepted_hits.bam"   % ( self.tophat,           brief_name   ) 
         out_dir     = "%s/%s"                     % ( self.cuffquant_ercc_k ,brief_name   )
         sh_work += "sh %s   %s %s %s %s \n" % ( sh_file,  cflk_dir,in_bam, self['infile']['anno_file'], out_dir)
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
Exemplo n.º 14
0
   def SRA2fastq(self):
      home_dir    = os.path.abspath('./')
      raw_dir     = self['dir_name']['raw_data']
      fq_dir      = self['dir_name']['fastq_data']
      
      if not os.path.isdir( fq_dir ):
         os.mkdir( fq_dir )
      
      script_dir   = "%s/scripts"         % (home_dir)
      
      fqDump       = self['sftw_name'].fastqDump
      
      sh_file       = "%s/scripts/s01.SRA2Fastq.sh"      % (home_dir)
      sh_work_file  = "%s/scripts/s01.SRA2Fastq_work.sh" % (home_dir)
      
      sh_info = """
samp_name=$1
fqDump=$2
raw_dir=$3
fq_dir=$4

$fqDump --split-files --gzip --outdir $fq_dir/${samp_name} $raw_dir/${samp_name}.sra 

mv $fq_dir/${samp_name}/${samp_name}_1.fastq.gz $fq_dir/${samp_name}/${samp_name}.1.fq.gz && \\
mv $fq_dir/${samp_name}/${samp_name}_2.fastq.gz $fq_dir/${samp_name}/${samp_name}.2.fq.gz
      """
      
      sh_work = ""
      for samp_name in self['sample']:
         sh_work  += " sh %s  %s %s %s %s\n" % ( sh_file,  samp_name,fqDump,  raw_dir,fq_dir  )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
Exemplo n.º 15
0
   def __get_HTS_clean_split(self):
      sh_file        =  "%s/p.HTSeq_split.sh"       %  (self.script_dir)
      sh_work_file   =  "%s/p.HTSeq_split_work.sh"  %  (self.script_dir)
      sh_info = """
infile=$1
out_Refseq=$2
out_NONCODE=$3
out_NSMB=$4
inNeo=$5
outNeo=$6

grep -v -P '^NONHSAG|XLOC_' $infile >$out_Refseq
head -n 1 $infile >$out_NONCODE && grep -P '^NONHSAG' $infile >>$out_NONCODE
head -n 1 $infile >$out_NSMB    && grep -P '^XLOC'    $infile >>$out_NSMB

head -n 1 $inNeo >$outNeo
for i in `cut -f 1 %s/novo_lnc_raw.combined.FPKM0.5_rep0.25.multiExon.genlen | uniq`;do grep -w $i $inNeo ;done >>$outNeo
      """ % ( self.data_dir )
      
      infile      = "%s/merge.dexseq_clean.gene.xls"          % ( self.HTS )
      out_Refseq  = "%s/merge.dexseq_clean_refseq.gene.xls"   % ( self.HTS )
      out_NONCODE = "%s/merge.dexseq_clean_NONCODE.gene.xls"  % ( self.HTS )
      out_NSMB    = "%s/merge.dexseq_clean_NSMB.gene.xls"     % ( self.HTS )
      inNeo       = "%s/merge.dexseq_NeoRaw.gene.xls"         % ( self.HTS )
      outNeo      = "%s/merge.dexseq_NeoPass.gene.xls"        % ( self.HTS )
      sh_work = "sh %s  %s %s %s %s %s %s" % ( sh_file,infile,out_Refseq,out_NONCODE,out_NSMB,inNeo,outNeo )
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=1 )
Exemplo n.º 16
0
   def __get_HTS_clean_split(self):
      sh_file        =  "%s/p.HTSeq_split.sh"       %  (self.script_dir)
      sh_work_file   =  "%s/p.HTSeq_split_work.sh"  %  (self.script_dir)
      sh_info = """
infile=$1
out_Refseq=$2
out_NONCODE=$3
out_NSMB=$4
inNeo=$5
outNeo=$6

grep -v -P '^NONHSAG|XLOC_' $infile >$out_Refseq
head -n 1 $infile >$out_NONCODE && grep -P '^NONHSAG' $infile >>$out_NONCODE
head -n 1 $infile >$out_NSMB    && grep -P '^XLOC'    $infile >>$out_NSMB

head -n 1 $inNeo >$outNeo
for i in `cut -f 1 %s/novo_lnc_raw.combined.FPKM0.5_rep0.25.multiExon.genlen | uniq`;do grep -w $i $inNeo ;done >>$outNeo
      """ % ( self.data_dir )
      
      infile      = "%s/merge.dexseq_clean.gene.xls"          % ( self.HTS )
      out_Refseq  = "%s/merge.dexseq_clean_refseq.gene.xls"   % ( self.HTS )
      out_NONCODE = "%s/merge.dexseq_clean_NONCODE.gene.xls"  % ( self.HTS )
      out_NSMB    = "%s/merge.dexseq_clean_NSMB.gene.xls"     % ( self.HTS )
      inNeo       = "%s/merge.dexseq_NeoRaw.gene.xls"         % ( self.HTS )
      outNeo      = "%s/merge.dexseq_NeoPass.gene.xls"        % ( self.HTS )
      sh_work = "sh %s  %s %s %s %s %s %s" % ( sh_file,infile,out_Refseq,out_NONCODE,out_NSMB,inNeo,outNeo )
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=1 )
Exemplo n.º 17
0
   def run_cufflinks_u(self):

      sh_file      = "%s/s05.cufflinks_GenomeMapped.sh"      % (self.script_dir)
      sh_work_file = "%s/s05.cufflinks_GenomeMapped_work.sh" % (self.script_dir)
      
      cflk_dir     = self['sftw_name'].cflk_dir
      
      sh_info = """
cflk_dir=$1
in_bam=$2
gtf_file=$3
out_dir=$4

$cflk_dir/cufflinks      \\
   -p 8  -u              \\
   -o $out_dir           \\
   $in_bam
      """
      sh_work = ""
      for samp in self['samp']:
         brief_name = self['samp_info']['samp_brief'][samp]
         in_bam      = "%s/%s/accepted_hits.genome.sort.bam"% ( self.tophat,    brief_name   ) 
         out_dir     = "%s/%s"                              % ( self.cufflink_u,brief_name   )
         sh_work += "sh %s  %s  %s %s %s \n" % ( sh_file,  cflk_dir, in_bam, self['infile']['anno_file'], out_dir)
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
Exemplo n.º 18
0
   def run_hisat_mannual(self):
      home_dir     = os.path.abspath('./')
      
      fq_dir       = self['dir_name']['fastq_data']
      hisat_dir    = self['dir_name']['hisat_mannual_dir']
      
      if not os.path.isdir(  hisat_dir):
         os.mkdir( hisat_dir )
      
      script_dir   = "%s/scripts"         % (home_dir)
      
      hisat        = self['sftw_name'].hisat
      samtools_exe = self['sftw_name'].samtools
      
      sh_file      = "%s/s02.4.hisatMannual.sh"      % (script_dir)
      sh_work_file = "%s/s02.4.hisatMannual_work.sh" % (script_dir)
      
      sh_info = """
hisat=$1
fq_dir=$2
samp_name=$3
brief_name=$4
hisat_dir=$5
genome=$6
splice_file=$7
samtools_exe=$8

$hisat         -p 8 -x $genome --phred64        \\
   -1 $fq_dir/$samp_name/$samp_name.1.fq.gz     \\
   -2 $fq_dir/$samp_name/$samp_name.2.fq.gz     \\
   -S /dev/null                                 \\
   --novel-splicesite-outfile $splice_file      \\
   2>$hisat_dir/$brief_name/log              && \\
$hisat         -p 8 -x $genome --phred64        \\
   -1 $fq_dir/$samp_name/$samp_name.1.fq.gz     \\
   -2 $fq_dir/$samp_name/$samp_name.2.fq.gz     \\
   -S /dev/stdout                               \\
   --novel-splicesite-infile  $splice_file      \\
   2>$hisat_dir/$brief_name/log.2              |\\
awk '{if($1 ~ /^@/) print $0; else{ for(i=1;i<=NF;i++) if($i!~/^XS/) printf("%s\\t",$i);else XS0=$i;  XS1=((and($2, 0x10) && and($2, 0x40)) || (and($2,0x80) && !and($2,0x10)))?"XS:A:+":"XS:A:-"; print XS1 } }' | awk '{if(length($10)==length($11)){print $0}}' | $samtools_exe view -Sb -q 1 - >$hisat_dir/$brief_name/accepted_hits.raw.bam &&\\ 
$samtools_exe sort -m 2000000000 $hisat_dir/$brief_name/accepted_hits.raw.bam $hisat_dir/$brief_name/accepted_hits
      """ 
      sh_work = ""
      for samp in self['sample']:
         brief_name = self['sam_info']['samp_brief'][samp]
         if not os.path.isdir( "%s/%s" % (hisat_dir,brief_name) ):
            os.mkdir( "%s/%s" % (hisat_dir,brief_name) )
         genome     = "%s.hisat"          % ( self['infile']['genome_file'] )
         splice_file= "%s/%s.spliceSites" % ( hisat_dir,  brief_name        )
         sh_work += "sh %s  %s %s %s %s  %s %s %s %s\n" % ( sh_file, hisat, fq_dir, samp, brief_name, hisat_dir, genome, splice_file, samtools_exe  )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=6 )
    def run_CIRCexplorer(self):
        sh_file = "%s/s10.CIRCexplorer.sh" % (self.script_dir)
        sh_work_file = "%s/s10.CIRCexplorer_work.sh" % (self.script_dir)

        if not os.path.isdir(self.CIRCexplorer):
            os.mkdir(self.CIRCexplorer)

        py_CIRCexplorer = "/data/Analysis/huboqiang/software/CIRCexplorer/CIRCexplorer_PE.py"
        py_CIRCexplorer_PE_Check = "/datc/huboqiang/cir_dyj_V2/bin/CIRCexplorer_PE_check.py"
        sh_info = """
py_CIRCexplorer=$1
in_bam=$2
genome=$3
ref_file=$4
out_file=$5
samp=$6

py_CIRCexplorer_PE_check=$7
in_raw_bam=$8

[ ! -d $out_file/$samp ] && mkdir -p $out_file/$samp

#python $py_CIRCexplorer                                                 \\
#   -f $in_bam                                                           \\
#   -g $genome                                                           \\
#   -r $ref_file                                                         \\
#   --tmp                                                                \\
#   -o $out_file/$samp/CIRCexplorer
   
python $py_CIRCexplorer_PE_check                                        \\
   --raw_bam      $in_raw_bam                                           \\
   --out_prefix   $out_file/$samp/CIRCexplorer_circ_PE                  \\
   $out_file/$samp/CIRCexplorer_circ.txt
      """
        sh_work = ""
        for samp in self['samp']:
            brief_name = self['sam_info']['samp_brief'][samp]

            in_bam = "%s/%s/accepted_hits.bam" % (self.tophat_fusion,
                                                  brief_name)
            genome = self['infile']['genome_file']
            ref_file = self['infile']['ref_file']
            out_file = self.CIRCexplorer
            in_raw_bam = "%s/%s/accepted_hits.bam" % (self.tophat, brief_name)

            sh_work += "sh %s  %s %s %s %s %s %s   %s %s \n" % (
                sh_file, py_CIRCexplorer, in_bam, genome, ref_file, out_file,
                brief_name, py_CIRCexplorer_PE_Check, in_raw_bam)

        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
Exemplo n.º 20
0
    def run_HTSeq_known(self):

        sh_file = "%s/s04.HTSeq_known.sh" % (self.script_dir)
        sh_work_file = "%s/s04.HTSeq_known_work.sh" % (self.script_dir)

        py_exe = self['sftw_name'].py
        samtools_exe = self['sftw_name'].samtools
        deseq_exe = self['sftw_name'].deseq

        sh_info = """
py_exe=$1
samtools_exe=$2
deseq_exe=$3
tophat_dir=$4
samp_name=$5
HTS_k_dir=$6
known_GTF=$7

$samtools_exe view  -H              $tophat_dir/$samp_name/accepted_hits.bam  > $tophat_dir/$samp_name/accepted_hits.header.sam
$samtools_exe sort  -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.bam    $tophat_dir/$samp_name/accepted_hits.sort_name
$samtools_exe view  -o    $tophat_dir/$samp_name/accepted_hits.sort_name.sam    $tophat_dir/$samp_name/accepted_hits.sort_name.bam 

[ ! -d $HTS_k_dir/$samp_name ] && mkdir -p $HTS_k_dir/$samp_name

$py_exe $deseq_exe                                                                                                                             \\
   -s no -f sam -a 10                                                                                                                        \\
   -o $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam                                                                                \\
   $tophat_dir/$samp_name/accepted_hits.sort_name.sam  $known_GTF            >$HTS_k_dir/$samp_name/$samp_name.dexseq.txt                 && \\
grep -v -P '^ERCC-|^RGC-|MIR|SNORD|Mir|Snord' $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_clean.txt			&& \\
grep    -P '^ERCC-|^RGC-'                     $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_ERCC_RGCPloyA.txt	&& \\
grep "__no_feature" $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam | grep -v chrM |                                                 \\
	cat           $tophat_dir/$samp_name/accepted_hits.header.sam /dev/stdin | 		                                                         \\
	$samtools_exe view -Sb /dev/stdin >$tophat_dir/$samp_name/accepted_hits.genome.bam                                                          && \\
$samtools_exe sort  -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.bam    $tophat_dir/$samp_name/accepted_hits.genome.sort          
rm             $tophat_dir/$samp_name/accepted_hits.sort_name.sam $tophat_dir/$samp_name/accepted_hits.sort_name.bam   $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam  $tophat_dir/$samp_name/accepted_hits.genome.bam
      """
        sh_work = ""
        for samp in self['samp']:
            tophat_dir = self.tophat
            samp_name = self['samp_info']['samp_brief'][samp]
            known_GTF = self['infile']['anno_file']

            sh_work += "sh %s  %s %s %s  %s %s %s %s\n" % (
                sh_file, py_exe, samtools_exe, deseq_exe, self.tophat,
                samp_name, self.HTS_k, known_GTF)

        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=8)
Exemplo n.º 21
0
    def run_tophat(self):
        home_dir = os.path.abspath('./')

        cln_dir = self['dir_name']['clean_data']
        tophat_dir = self['dir_name']['tophat_dir']

        if not os.path.isdir(tophat_dir):
            os.mkdir(tophat_dir)

        script_dir = "%s/scripts" % (home_dir)
        bin_dir = "%s/bin" % (home_dir)

        tophat_py = self['sftw_name'].tophat

        sh_file = "%s/s02.tophat.sh" % (script_dir)
        sh_work_file = "%s/s02.tophat_work.sh" % (script_dir)

        sh_info = """
tophat_py=$1
cln_dir=$2
samp_name=$3
brief_name=$4
tophat_dir=$5
genome=$6
gtf_file=$7
PE2=$8

$tophat_py  \\
   -p 8 -G $gtf_file                                                 \\
   --library-type fr-unstranded                                      \\
   --transcriptome-index /datc/huboqiang/cir_dyj_V2/Database/refseqGene.ERCC_RGCPloyA.exon.sort \\
   -o $tophat_dir/$brief_name                                        \\
   $genome                                                           \\
   $cln_dir/$samp_name/1.cln.fq.gz  $PE2
      """
        sh_work = ""
        for samp in self['sample']:
            brief_name = self['sam_info']['samp_brief'][samp]
            PE2 = ""
            if self['sam_info']['data_type'][samp] == "PE":
                PE2 = "%s/%s/2.cln.fq.gz" % (cln_dir, samp)
            sh_work += "sh %s  %s %s %s %s  %s %s %s %s\n" % (
                sh_file, tophat_py, cln_dir, samp, brief_name, tophat_dir,
                self['infile']['genome_file'], self['infile']['anno_file'],
                PE2)

        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=6)
   def run_CIRCexplorer(self):
      sh_file      = "%s/s10.CIRCexplorer.sh"      % (self.script_dir)
      sh_work_file = "%s/s10.CIRCexplorer_work.sh" % (self.script_dir)
      
      if not os.path.isdir( self.CIRCexplorer ):
         os.mkdir( self.CIRCexplorer )
      
      py_CIRCexplorer         = "/data/Analysis/huboqiang/software/CIRCexplorer/CIRCexplorer_PE.py"
      py_CIRCexplorer_PE_Check= "/datc/huboqiang/cir_dyj_V2/bin/CIRCexplorer_PE_check.py"
      sh_info = """
py_CIRCexplorer=$1
in_bam=$2
genome=$3
ref_file=$4
out_file=$5
samp=$6

py_CIRCexplorer_PE_check=$7
in_raw_bam=$8

[ ! -d $out_file/$samp ] && mkdir -p $out_file/$samp

#python $py_CIRCexplorer                                                 \\
#   -f $in_bam                                                           \\
#   -g $genome                                                           \\
#   -r $ref_file                                                         \\
#   --tmp                                                                \\
#   -o $out_file/$samp/CIRCexplorer
   
python $py_CIRCexplorer_PE_check                                        \\
   --raw_bam      $in_raw_bam                                           \\
   --out_prefix   $out_file/$samp/CIRCexplorer_circ_PE                  \\
   $out_file/$samp/CIRCexplorer_circ.txt
      """
      sh_work = ""
      for samp in self['samp']:
         brief_name = self['sam_info']['samp_brief'][samp]
         
         in_bam      = "%s/%s/accepted_hits.bam"   % ( self.tophat_fusion, brief_name )
         genome      =  self['infile']['genome_file']
         ref_file    =  self['infile']['ref_file']
         out_file    = self.CIRCexplorer
         in_raw_bam  = "%s/%s/accepted_hits.bam"   % ( self.tophat, brief_name )
         
         sh_work += "sh %s  %s %s %s %s %s %s   %s %s \n" % ( sh_file,  py_CIRCexplorer, in_bam, genome,  ref_file, out_file, brief_name,  py_CIRCexplorer_PE_Check,in_raw_bam )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
Exemplo n.º 23
0
   def run_HTSeq_known(self):
      
      sh_file        =  "%s/s03.HTSeq_known.sh"       %  (self.script_dir)
      sh_work_file   =  "%s/s03.HTSeq_known_work.sh"  %  (self.script_dir)
      
      py_exe         = self['sftw_name'].py
      samtools_exe   = self['sftw_name'].samtools
      deseq_exe      = self['sftw_name'].deseq
      
      sh_info = """
py_exe=$1
samtools_exe=$2
deseq_exe=$3
tophat_dir=$4
samp_name=$5
HTS_k_dir=$6
known_GTF=$7

$samtools_exe view  -H              $tophat_dir/$samp_name/accepted_hits.bam  > $tophat_dir/$samp_name/accepted_hits.header.sam
$samtools_exe sort  -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.bam    $tophat_dir/$samp_name/accepted_hits.sort_name
$samtools_exe view  -o    $tophat_dir/$samp_name/accepted_hits.sort_name.sam    $tophat_dir/$samp_name/accepted_hits.sort_name.bam 

[ ! -d $HTS_k_dir/$samp_name ] && mkdir -p $HTS_k_dir/$samp_name

$py_exe $deseq_exe                                                                                                                             \\
   -s no -f sam -a 10                                                                                                                        \\
   -o $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam                                                                                \\
   $tophat_dir/$samp_name/accepted_hits.sort_name.sam  $known_GTF            >$HTS_k_dir/$samp_name/$samp_name.dexseq.txt                 && \\
grep -v -P '^ERCC-|^RGC-|MIR|SNORD|Mir|Snord' $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_clean.txt			&& \\
grep    -P '^ERCC-|^RGC-'                     $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_ERCC_RGCPloyA.txt	&& \\
grep "__no_feature" $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam | grep -v chrM |                                                 \\
	cat           $tophat_dir/$samp_name/accepted_hits.header.sam /dev/stdin | 		                                                         \\
	$samtools_exe view -Sb /dev/stdin >$tophat_dir/$samp_name/accepted_hits.genome.bam                                                          && \\
$samtools_exe view -Sb           $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam >$tophat_dir/$samp_name/accepted_hits.sort_name.gene.bam   && \\
$samtools_exe sort  -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.bam          $tophat_dir/$samp_name/accepted_hits.genome.sort          
rm             $tophat_dir/$samp_name/accepted_hits.sort_name.sam $tophat_dir/$samp_name/accepted_hits.sort_name.bam     $tophat_dir/$samp_name/accepted_hits.genome.bam $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam
      """
      sh_work = ""
      for samp in self['samp']:
         tophat_dir  =  self.tophat
         samp_name   =  self['samp_info']['samp_brief'][samp]
         known_GTF   =  self['infile']['anno_file']

         sh_work += "sh %s  %s %s %s  %s %s %s %s\n" % ( sh_file,  py_exe,samtools_exe,deseq_exe,  self.tophat, samp_name, self.HTS_k, known_GTF)
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
Exemplo n.º 24
0
    def run_repeat_count(self):

        sh_file = "%s/s13.Repeat_Count.sh" % (self.script_dir)
        sh_work_file = "%s/s13.Repeat_Count_work.sh" % (self.script_dir)
        py_Repeat_Intersect2Count = "%s/Repeat_Intersect2Count.py" % (
            self.bin_dir)

        samtools_exe = self['sftw_name'].samtools
        bedtools_exe = self['sftw_name'].bedtools
        py_exe = self['sftw_name'].py

        if not os.path.isdir(self.repeatCount):
            os.mkdir(self.repeatCount)

        sh_info = """
samtools_exe=$1
bedtools_exe=$2
py_exe=$3
in_bam=$4
gtf_bed=$5
py_Repeat_Intersect2Count=$6
out_dir=$7

$samtools_exe view -F 0x0004 $in_bam |                     \\
   grep -v ERCC-00* | grep -v RGC-CRE|                      \\
   grep -v RGC-GFP  | grep -v RGC-mRFP |grep "\\bNH:i:1\\b" | \\
   awk '{OFS="\\t"; print $3,$4,$4+length($10),$1 }' >${out_dir}/repeat_result.bed

$bedtools_exe intersect -sorted -loj -a $gtf_bed -b ${out_dir}/repeat_result.bed | \\
   $py_exe $py_Repeat_Intersect2Count /dev/stdin >${out_dir}/repeat_count.bed
      """
        sh_work = ""
        for samp in self['samp']:
            brief_name = self['samp_info']['samp_brief'][samp]
            in_bam = "%s/%s/accepted_hits.bam" % (self.tophat, brief_name)
            out_dir = "%s/%s" % (self.repeatCount, brief_name)
            sh_work += "sh %s  %s %s %s  %s %s %s %s\n" % (
                sh_file, samtools_exe, bedtools_exe, py_exe, in_bam,
                self['infile']['rmsk_bed'], py_Repeat_Intersect2Count, out_dir)

            if not os.path.isdir(out_dir):
                os.mkdir(out_dir)

        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=8)
Exemplo n.º 25
0
   def run_repeat_count(self):

      sh_file      = "%s/s09.Repeat_Count.sh"      % (self.script_dir)
      sh_work_file = "%s/s09.Repeat_Count_work.sh" % (self.script_dir)
      py_Repeat_Intersect2Count = "%s/Repeat_Intersect2Count.py" % (self.bin_dir)
      
      samtools_exe   = self['sftw_name'].samtools
      bedtools_exe   = self['sftw_name'].bedtools
      py_exe         = self['sftw_name'].py
      
      if not os.path.isdir( self.repeatCount ):
         os.mkdir( self.repeatCount )
      
      sh_info = """
samtools_exe=$1
bedtools_exe=$2
py_exe=$3
in_bam=$4
gtf_bed=$5
py_Repeat_Intersect2Count=$6
out_dir=$7

$samtools_exe view -F 0x0004 $in_bam |                \\
   grep -v ERCC-00* | grep -v RGC-CRE|                \\
   grep -v RGC-GFP  | grep -v RGC-mRFP |grep NH:i:1 | \\
   awk '{OFS="\\t"; print $3,$4,$4+length($10),$1 }' >${out_dir}/repeat_result.bed

sort -S 10%  -k1V -k2n -k3n ${out_dir}/repeat_result.bed ${out_dir}/repeat_result.sort.bed

$bedtools_exe intersect -sorted -loj -a $gtf_bed -b ${out_dir}/repeat_result.sort.bed | \\
   $py_exe $py_Repeat_Intersect2Count /dev/stdin >${out_dir}/repeat_count.bed
      """
      sh_work = ""
      for samp in self['samp']:
         brief_name = self['samp_info']['samp_brief'][samp]
         in_bam      = "%s/%s/accepted_hits.bam"         % ( self.tophat        ,brief_name   ) 
         out_dir     = "%s/%s"                           % ( self.repeatCount   ,brief_name   )
         sh_work += "sh %s  %s %s %s  %s %s %s %s\n" % ( sh_file,  samtools_exe,bedtools_exe,py_exe,  in_bam, self['infile']['rmsk_bed'], py_Repeat_Intersect2Count, out_dir)
         
         if not os.path.isdir( out_dir ):
            os.mkdir( out_dir )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
#      my_job.running_SGE( vf="500m",maxjob=100 )
   def run_tophat(self):
      home_dir     = os.path.abspath('./')
      
      cln_dir      = self['dir_name']['clean_data']
      tophat_dir   = self['dir_name']['tophat_dir']
      
      if not os.path.isdir(  tophat_dir):
         os.mkdir( tophat_dir )
      
      script_dir   = "%s/scripts"         % (home_dir)
      bin_dir      = "%s/bin"             % (home_dir)
      
      tophat_py    = self['sftw_name'].tophat
      
      sh_file      = "%s/s02.tophat.sh"      % (script_dir)
      sh_work_file = "%s/s02.tophat_work.sh" % (script_dir)
      
      sh_info = """
tophat_py=$1
cln_dir=$2
samp_name=$3
brief_name=$4
tophat_dir=$5
genome=$6
gtf_file=$7
PE2=$8

$tophat_py  \\
   -p 8 -G $gtf_file                                                 \\
   --library-type fr-unstranded                                      \\
   --transcriptome-index /datc/huboqiang/cir_dyj_V2/Database/refseqGene.ERCC_RGCPloyA.exon.sort \\
   -o $tophat_dir/$brief_name                                        \\
   $genome                                                           \\
   $cln_dir/$samp_name/1.cln.fq.gz  $PE2
      """ 
      sh_work = ""
      for samp in self['sample']:
         brief_name = self['sam_info']['samp_brief'][samp]
         PE2 = ""
         if self['sam_info']['data_type'][samp ] == "PE":
            PE2 = "%s/%s/2.cln.fq.gz" % (cln_dir,samp)
         sh_work += "sh %s  %s %s %s %s  %s %s %s %s\n" % ( sh_file, tophat_py, cln_dir, samp, brief_name, tophat_dir, self['infile']['genome_file'],self['infile']['anno_file'],PE2  )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=6 )
   def run_tophat_mannual(self):
      home_dir     = os.path.abspath('./')
      
      fq_dir       = self['dir_name']['fastq_data']
      tophat_dir   = self['dir_name']['tophat_mannual_dir']
      
      if not os.path.isdir(  tophat_dir):
         os.mkdir( tophat_dir )
      
      script_dir   = "%s/scripts"         % (home_dir)
      
      tophat_py    = self['sftw_name'].tophat
      samtools_exe = self['sftw_name'].samtools
      
      sh_file      = "%s/s02.2.tophatMannual.sh"      % (script_dir)
      sh_work_file = "%s/s02.2.tophatMannual_work.sh" % (script_dir)
      
      sh_info = """
tophat_py=$1
fq_dir=$2
samp_name=$3
brief_name=$4
tophat_dir=$5
genome=$6
gtf_file=$7
samtools_exe=$8

$tophat_py  \\
   -p 8                                                              \\
   --read-edit-dist 3                                                \\
   --read-realign-edit-dist 3                                        \\
   --phred64-quals                                                   \\
   -o $tophat_dir/$brief_name                                        \\
   $genome                                                           \\
   $fq_dir/$samp_name/$samp_name.1.fq.gz
      """ 
      sh_work = ""
      for samp in self['sample']:
         brief_name = self['sam_info']['samp_brief'][samp]
         sh_work += "sh %s  %s %s %s %s  %s %s %s %s\n" % ( sh_file, tophat_py, fq_dir, samp, brief_name, tophat_dir, self['infile']['genome_file'],self['infile']['anno_file'],samtools_exe  )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=6 )
Exemplo n.º 28
0
    def run_QC(self):
        home_dir = os.path.abspath('./')
        raw_dir = self['dir_name']['raw_data']
        cln_dir = self['dir_name']['clean_data']

        if not os.path.isdir(cln_dir):
            os.mkdir(cln_dir)

        script_dir = "%s/scripts" % (home_dir)
        bin_dir = "%s/bin" % (home_dir)

        pl_exe = self['sftw_name'].pl
        pl_QC = "%s/bin/QC.pl" % (home_dir)

        sh_file = "%s/scripts/QC.sh" % (home_dir)
        sh_work_file = "%s/scripts/QC_work.sh" % (home_dir)

        sh_info = """
pl_exe=$1
pl_QC=$2
in_dir=$3
out_dir=$4
samp=$5
data_type=$6

$pl_exe $pl_QC --indir $in_dir --outdir $out_dir --sample $samp --end $data_type
      """

        sh_work = ""
        for samp in self['sample']:
            if not os.path.isdir("%s/%s" % (cln_dir, samp)):
                os.mkdir("%s/%s" % (cln_dir, samp))
            in_dir = raw_dir
            out_dir = cln_dir
            data_type = 2
            if self['sam_info']['data_type'][samp] == "SE":
                data_type = 1
            sh_work += " sh %s  %s %s %s %s %s %d\n" % (
                sh_file, pl_exe, pl_QC, in_dir, out_dir, samp, data_type)

        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=8)
Exemplo n.º 29
0
   def run_cuffnorm_ERCC(self,stage):
      sh_file      = "%s/s08.1.cuffnorm.ERCC.sh"      % (self.script_dir)
      sh_work_file = "%s/s08.1.cuffnorm.ERCC_work.sh" % (self.script_dir)
      if not os.path.isdir( self.cuffnorm_ercc ):
         os.mkdir( self.cuffnorm_ercc )
      
      cflk_dir     = self['sftw_name'].cflk_dir
      np_stage= np.array(stage,dtype="string")
      
      l_brief = []
      l_cxb   = []
      for samp in self['samp']:
         brief_name = self['samp_info']['samp_brief'][samp]
         l_brief.append( brief_name  )
         l_cxb.append(   "%s/%s/abundances.cxb" % (self.cuffquant_ercc,brief_name) )
      
      
      l_brief = np.array( l_brief,dtype="string" )
      l_cxb   = np.array( l_cxb  ,dtype="string" )
      
      np_stage= np.array( stage,dtype="string" )
      
      sh_info = """
cflk_dir=$1

$cflk_dir/cuffnorm           \\
   -p 8  -o %s.Tophat  -L %s \\
   %s                        \\
   %s

$cflk_dir/cuffnorm           \\
   -p 8  -o %s.Hisat   -L %s \\
   %s                        \\
   %s

python %s %s.Tophat/genes.fpkm_table %s.Hisat/genes.fpkm_table | awk '{OFS="\\t";print $1,$2,$4,$3,$5}' >%s/genes.fpkm_table

      """ % ( self.cuffnorm_ercc, ",".join( l_brief[np_stage=="Tophat"] ), self['infile']['anno_file_merge_ERCC'], " ".join( l_cxb[np_stage=="Tophat"] ), self.cuffnorm_ercc, ",".join( l_brief[np_stage=="Hisat"] ), self['infile']['anno_file_merge_ERCC'], " ".join( l_cxb[np_stage=="Hisat"]   ),  self.mrg_py,self.cuffnorm_ercc,self.cuffnorm_ercc,self.cuffnorm_ercc )
      
      sh_work = "sh %s  %s" % (sh_file, cflk_dir)
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
Exemplo n.º 30
0
    def run_HTSeq_unknown(self):

        sh_file = "%s/s07.HTSeq_unknown.sh" % (self.script_dir)
        sh_work_file = "%s/s07.HTSeq_unknown_work.sh" % (self.script_dir)

        py_exe = self['sftw_name'].py
        samtools_exe = self['sftw_name'].samtools
        deseq_exe = self['sftw_name'].deseq

        if not os.path.isdir(self.HTS_u):
            os.mkdir(self.HTS_u)

        sh_info = """
py_exe=$1
samtools_exe=$2
deseq_exe=$3
tophat_dir=$4
samp_name=$5
HTS_u_dir=$6
unknown_GTF=$7

$samtools_exe view  -H              $tophat_dir/$samp_name/accepted_hits.genome.sort.bam          > $tophat_dir/$samp_name/accepted_hits.header.sam
$samtools_exe sort  -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.sort.bam            $tophat_dir/$samp_name/accepted_hits.genome.sort_name
$samtools_exe view  -o              $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam  $tophat_dir/$samp_name/accepted_hits.genome.sort_name.bam 

[ ! -d $HTS_u_dir/$samp_name ] && mkdir -p $HTS_u_dir/$samp_name

$py_exe $deseq_exe                                                                                                                           \\
   -s no -f sam -a 10                                                                                                                        \\
   $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam  $unknown_GTF            >$HTS_u_dir/$samp_name/$samp_name.dexseq_NeoRaw.txt
      """
        sh_work = ""
        for samp in self['samp']:
            tophat_dir = self.tophat
            samp_name = self['samp_info']['samp_brief'][samp]
            unknown_GTF = "%s/novo_lnc_raw.combined.gtf" % (self.data_dir)
            sh_work += "sh %s  %s %s %s  %s %s %s %s\n" % (
                sh_file, py_exe, samtools_exe, deseq_exe, self.tophat,
                samp_name, self.HTS_u, unknown_GTF)

        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=8)
   def run_HTSeq_known(self):
      
      sh_file        =  "%s/s04.HTSeq_known.sh"       %  (self.script_dir)
      sh_work_file   =  "%s/s04.HTSeq_known_work.sh"  %  (self.script_dir)
      
      py_deseq       =  "/data/Analysis/huboqiang/bin/htseq-count"
      
      sh_info = """
tophat_dir=$1
samp_name=$2
py_deseq=$3
HTS_k_dir=$4
known_GTF=$5

samtools view  -H              $tophat_dir/$samp_name/accepted_hits.bam  > $tophat_dir/$samp_name/accepted_hits.header.sam
samtools sort  -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.bam    $tophat_dir/$samp_name/accepted_hits.sort_name
samtools view  -o    $tophat_dir/$samp_name/accepted_hits.sort_name.sam    $tophat_dir/$samp_name/accepted_hits.sort_name.bam 

[ ! -d $HTS_k_dir/$samp_name ] && mkdir -p $HTS_k_dir/$samp_name

python $py_deseq                                                                                                                             \\
   -s no -f sam -a 10                                                                                                                        \\
   -o $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam                                                                                \\
   $tophat_dir/$samp_name/accepted_hits.sort_name.sam  $known_GTF            >$HTS_k_dir/$samp_name/$samp_name.dexseq.txt                 && \\
grep -v -P '^ERCC-|^RGC-|MIR|SNORD|Mir|Snord' $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_clean.txt			&& \\
grep    -P '^ERCC-|^RGC-'                     $HTS_k_dir/$samp_name/$samp_name.dexseq.txt > $HTS_k_dir/$samp_name/$samp_name.dexseq_ERCC_RGCPloyA.txt	&& \\
grep "__no_feature" $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam | grep -v chrM |                                                 \\
	cat           $tophat_dir/$samp_name/accepted_hits.header.sam /dev/stdin | 		                                                         \\
	samtools view -Sb /dev/stdin >$tophat_dir/$samp_name/accepted_hits.genome.bam                                                          && \\
samtools sort  -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.bam    $tophat_dir/$samp_name/accepted_hits.genome.sort          
rm             $tophat_dir/$samp_name/accepted_hits.sort_name.sam $tophat_dir/$samp_name/accepted_hits.sort_name.bam   $tophat_dir/$samp_name/accepted_hits.sort_name.gene.sam  $tophat_dir/$samp_name/accepted_hits.genome.bam
      """
      sh_work = ""
      for samp in self['samp']:
         tophat_dir  =  self.tophat
         samp_name   =  self['samp_info']['samp_brief'][samp]
         known_GTF   =  self['infile']['anno_file']

         sh_work += "sh %s  %s %s %s %s %s\n" % ( sh_file,  self.tophat, samp_name, py_deseq, self.HTS_k, known_GTF)
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
   def run_QC(self):
      home_dir    = os.path.abspath('./')
      raw_dir     = self['dir_name']['raw_data']
      cln_dir     = self['dir_name']['clean_data']
      
      if not os.path.isdir( cln_dir ):
         os.mkdir( cln_dir )
      
      script_dir   = "%s/scripts"         % (home_dir)
      bin_dir      = "%s/bin"             % (home_dir)
      
      pl_exe      = self['sftw_name'].pl
      pl_QC       = "%s/bin/QC.pl" % ( home_dir )
      
      sh_file       = "%s/scripts/QC.sh"      % (home_dir)
      sh_work_file  = "%s/scripts/QC_work.sh" % (home_dir)
      
      sh_info = """
pl_exe=$1
pl_QC=$2
in_dir=$3
out_dir=$4
samp=$5
data_type=$6

$pl_exe $pl_QC --indir $in_dir --outdir $out_dir --sample $samp --end $data_type
      """
      
      sh_work = ""
      for samp in self['sample']:
         if not os.path.isdir( "%s/%s" % (cln_dir,samp) ):
            os.mkdir(          "%s/%s" % (cln_dir,samp) )
         in_dir    = raw_dir
         out_dir   = cln_dir
         data_type = 2
         if self['sam_info']['data_type'][samp ] == "SE":
            data_type = 1
         sh_work  += " sh %s  %s %s %s %s %s %d\n" % ( sh_file, pl_exe,  pl_QC, in_dir,out_dir,samp,data_type )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
Exemplo n.º 33
0
   def run_HTSeq_unknown(self):
      
      sh_file        =  "%s/s07.HTSeq_unknown.sh"       %  (self.script_dir)
      sh_work_file   =  "%s/s07.HTSeq_unknown_work.sh"  %  (self.script_dir)
      
      py_exe         = self['sftw_name'].py
      samtools_exe   = self['sftw_name'].samtools
      deseq_exe      = self['sftw_name'].deseq
      
      if not os.path.isdir( self.HTS_u ):
         os.mkdir( self.HTS_u )
      
      sh_info = """
py_exe=$1
samtools_exe=$2
deseq_exe=$3
tophat_dir=$4
samp_name=$5
HTS_u_dir=$6
unknown_GTF=$7

$samtools_exe view  -H              $tophat_dir/$samp_name/accepted_hits.genome.sort.bam          > $tophat_dir/$samp_name/accepted_hits.header.sam
$samtools_exe sort  -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.sort.bam            $tophat_dir/$samp_name/accepted_hits.genome.sort_name
$samtools_exe view  -o              $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam  $tophat_dir/$samp_name/accepted_hits.genome.sort_name.bam 

[ ! -d $HTS_u_dir/$samp_name ] && mkdir -p $HTS_u_dir/$samp_name

$py_exe $deseq_exe                                                                                                                           \\
   -s no -f sam -a 10                                                                                                                        \\
   $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam  $unknown_GTF            >$HTS_u_dir/$samp_name/$samp_name.dexseq_NeoRaw.txt
      """
      sh_work = ""
      for samp in self['samp']:
         tophat_dir  =  self.tophat
         samp_name   =  self['samp_info']['samp_brief'][samp]
         unknown_GTF =  "%s/novo_lnc_raw.combined.gtf" % ( self.data_dir )
         sh_work += "sh %s  %s %s %s  %s %s %s %s\n" % ( sh_file,  py_exe,samtools_exe,deseq_exe,  self.tophat, samp_name, self.HTS_u, unknown_GTF)
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
Exemplo n.º 34
0
   def makeGTF_withoutERCC(self):
      sh_file        =  "%s/RemoveERCC.sh"       %  (self.script_dir)
      sh_work_file   =  "%s/RemoveERCC_work.sh"  %  (self.script_dir)
      
      sh_info = """
known_GTF=$1
remove_ERCC_GTF=$2

grep -P "^chr" $known_GTF >$remove_ERCC_GTF
      """
      
      known_GTF      = self['infile']['anno_file']
      remove_ERCC_GTF= "%s.sort.gtf"    % ( ".".join( self['infile']['anno_file'].split(".")[:-3] )  )
      self['infile']['anno_file_remove_ERCC'] = remove_ERCC_GTF
      
      sh_work = "sh %s   %s %s" % (  sh_file,  known_GTF , remove_ERCC_GTF )
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=1 )
   def run_cuffcomp_novo_trans(self):

      sh_file      = "%s/s06.1.cuffcompare_novo.sh"      % (self.script_dir)
      sh_work_file = "%s/s06.1.cuffcompare_novo_work.sh" % (self.script_dir)
      
      sh_info = """
out_prefix=$1
shift 

/data/Analysis/huboqiang/software/cufflinks-2.2.1.Linux_x86_64/cuffcompare    \\
   -o  $out_prefix                                                            \\
   -T  $@                                                                     \\
      """
      sh_work = ""
      out_prefix  = "%s/novo_lnc_raw"           % ( self.data_dir )
      l_in_samp   = [  "%s/%s/transcripts.gtf"  % ( self.cufflink_u,self['samp_info']['samp_brief'][samp] ) for samp in self['samp'] ]
      sh_work = "sh %s   %s %s" % ( sh_file, out_prefix, " ".join(l_in_samp) )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
Exemplo n.º 36
0
    def makeGTF_withoutERCC(self):
        sh_file = "%s/RemoveERCC.sh" % (self.script_dir)
        sh_work_file = "%s/RemoveERCC_work.sh" % (self.script_dir)

        sh_info = """
known_GTF=$1
remove_ERCC_GTF=$2

grep -P "^chr" $known_GTF >$remove_ERCC_GTF
      """

        known_GTF = self['infile']['anno_file']
        remove_ERCC_GTF = "%s.sort.gtf" % (".".join(
            self['infile']['anno_file'].split(".")[:-3]))
        self['infile']['anno_file_remove_ERCC'] = remove_ERCC_GTF

        sh_work = "sh %s   %s %s" % (sh_file, known_GTF, remove_ERCC_GTF)
        my_job = m_jobs.running_jobs(sh_file, sh_work_file)
        my_job.load_sh_file(sh_info)
        my_job.load_sh_work_file(sh_work)
        my_job.running_multi(cpu=1)
   def SRA2fastq(self):
      home_dir    = os.path.abspath('./')
      raw_dir     = self['dir_name']['raw_data']
      fq_dir      = self['dir_name']['fastq_data']
      
      if not os.path.isdir( fq_dir ):
         os.mkdir( fq_dir )
      
      script_dir   = "%s/scripts"         % (home_dir)
      
      fqDump       = self['sftw_name'].fastqDump
      python_exe   = self['sftw_name'].py
      fq_cvt_py     = "%s/qual_cvt.py"                   % (self['bin_dir'])
      
      sh_file       = "%s/scripts/s01.SRA2Fastq.sh"      % (home_dir)
      sh_work_file  = "%s/scripts/s01.SRA2Fastq_work.sh" % (home_dir)
      sh_info = """
samp_name=$1
fqDump=$2
raw_dir=$3
fq_dir=$4
fq_cvt_py=$5
python_exe=$6

$fqDump --split-files --gzip --outdir $raw_dir/${samp_name} $raw_dir/${samp_name}.sra 

#mv $fq_dir/${samp_name}/${samp_name}_1.fastq.gz $fq_dir/${samp_name}/${samp_name}.1.fq.gz

$python_exe $fq_cvt_py $raw_dir/${samp_name}/${samp_name}_1.fastq.gz $fq_dir/${samp_name}/${samp_name}.1.fq.gz 59 64
      """
      
      sh_work = ""
      for samp_name in self['sample']:
         sh_work  += " sh %s  %s %s %s %s  %s %s\n" % ( sh_file,  samp_name,fqDump,  raw_dir,fq_dir, fq_cvt_py, python_exe  )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=8 )
   def run_tophat_fusion(self):
      home_dir     = os.path.abspath('./')
      
      cln_dir      = self['dir_name']['clean_data']
      trim_dir     = self['dir_name']['trim_data']
      tophat_dir   = self['dir_name']['tophat_dir']
      fusion_dir   = self['dir_name']['tophat_fusion']
      
      script_dir   = "%s/scripts"         % (home_dir)
      bin_dir      = "%s/bin"             % (home_dir)
      
      sh_file      = "%s/s03.tophat_fusion.sh"      % (script_dir)
      sh_work_file = "%s/s03.tophat_fusion_work.sh" % (script_dir)
      
      sh_info = """
tophat_dir=$1
brief_name=$2
fusion_dir=$3
genome=$4

/data/Analysis/huboqiang/software/bedtools-2.17.0/bin/bedtools bamtofastq -i $tophat_dir/$brief_name/unmapped.bam -fq /dev/stdout | gzip - >$tophat_dir/$brief_name/unmapped.fq.gz

/data/Analysis/huboqiang/software/tophat-2.0.12.Linux_x86_64/tophat  \\
   --fusion-search --keep-fasta-order --bowtie1                      \\
   --no-coverage-search -p 8                                         \\
   -o $fusion_dir/$brief_name                                        \\
   $genome                                                           \\
   $tophat_dir/$brief_name/unmapped.fq.gz
      """
      sh_work = ""
      for samp in self['sample']:
         brief_name = self['sam_info']['samp_brief'][samp]
         sh_work += "sh %s  %s %s %s %s \n" % ( sh_file,   tophat_dir,brief_name,fusion_dir,self['infile']['genome_file']  )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_SGE( vf="7g",maxjob=5 )
   def run_HTSeq_unknown(self):
      
      sh_file        =  "%s/s07.HTSeq_unknown.sh"       %  (self.script_dir)
      sh_work_file   =  "%s/s07.HTSeq_unknown_work.sh"  %  (self.script_dir)
      
      py_deseq       =  "/data/Analysis/huboqiang/bin/htseq-count"
      
      if not os.path.isdir( self.HTS_u ):
         os.mkdir( self.HTS_u )
      
      sh_info = """
tophat_dir=$1
samp_name=$2
py_deseq=$3
HTS_u_dir=$4
unknown_GTF=$5

samtools view  -H              $tophat_dir/$samp_name/accepted_hits.genome.sort.bam          > $tophat_dir/$samp_name/accepted_hits.header.sam
samtools sort  -n -m 200000000 $tophat_dir/$samp_name/accepted_hits.genome.sort.bam            $tophat_dir/$samp_name/accepted_hits.genome.sort_name
samtools view  -o              $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam  $tophat_dir/$samp_name/accepted_hits.genome.sort_name.bam 

[ ! -d $HTS_u_dir/$samp_name ] && mkdir -p $HTS_u_dir/$samp_name

python $py_deseq                                                                                                                             \\
   -s no -f sam -a 10                                                                                                                        \\
   $tophat_dir/$samp_name/accepted_hits.genome.sort_name.sam  $unknown_GTF            >$HTS_u_dir/$samp_name/$samp_name.dexseq_NeoRaw.txt
      """
      sh_work = ""
      for samp in self['samp']:
         tophat_dir  =  self.tophat
         samp_name   =  self['samp_info']['samp_brief'][samp]
         unknown_GTF =  "%s/novo_lnc_raw.combined.gtf" % ( self.data_dir )
         sh_work += "sh %s  %s %s %s %s %s\n" % ( sh_file,  self.tophat, samp_name, py_deseq, self.HTS_u, unknown_GTF)
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
   def run_tophat(self):
      home_dir     = os.path.abspath('./')
      
      cln_dir      = self['dir_name']['clean_data']
      trim_dir     = self['dir_name']['trim_data']
      tophat_dir   = self['dir_name']['tophat_dir']
      script_dir   = "%s/scripts"         % (home_dir)
      bin_dir      = "%s/bin"             % (home_dir)
      
      sh_file      = "%s/s02.tophat.sh"      % (script_dir)
      sh_work_file = "%s/s02.tophat_work.sh" % (script_dir)
      
      sh_info = """
trim_dir=$1
brief_name=$2
tophat_dir=$3
genome=$4
gtf_file=$5

/data/Analysis/huboqiang/software/tophat-2.0.12.Linux_x86_64/tophat  \\
   -a 6 --microexon-search -m 2                                      \\
   -p 8 -G $gtf_file                                                 \\
   --library-type fr-unstranded                                      \\
   --transcriptome-index /datc/huboqiang/cir_dyj_V2/Database/refseqGene.ERCC_RGCPloyA.exon.sort \\
   -o $tophat_dir/$brief_name                                        \\
   $genome                                                           \\
   $trim_dir/TRIMED_${brief_name}.1.clean.fq.gz                      \\
   $trim_dir/TRIMED_${brief_name}.2.clean.fq.gz
      """
      sh_work = ""
      for samp in self['sample']:
         brief_name = self['sam_info']['samp_brief'][samp]
         sh_work += "sh %s  %s %s %s %s %s\n" % ( sh_file,   trim_dir, brief_name, tophat_dir, self['infile']['genome_file'],self['infile']['anno_file']  )
      
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
   def __get_HTS_clean_split(self):
      sh_file        =  "%s/p.HTSeq_split.sh"       %  (self.script_dir)
      sh_work_file   =  "%s/p.HTSeq_split_work.sh"  %  (self.script_dir)
      sh_info = """
infile=$1
out_Refseq=$2
out_NONCODE=$3
out_NSMB=$4

grep -v -P '^NONHSAG|XLOC_' $infile >$out_Refseq
head -n 1 $infile >$out_NONCODE && grep -P '^NONHSAG' $infile >>$out_NONCODE
head -n 1 $infile >$out_NSMB    && grep -P '^XLOC'    $infile >>$out_NSMB

      """ 
      
      infile      = "%s/merge.dexseq_clean.gene.xls"          % ( self.HTS )
      out_Refseq  = "%s/merge.dexseq_clean_refseq.gene.xls"   % ( self.HTS )
      out_NONCODE = "%s/merge.dexseq_clean_NONCODE.gene.xls"  % ( self.HTS )
      out_NSMB    = "%s/merge.dexseq_clean_NSMB.gene.xls"     % ( self.HTS )
      sh_work = "sh %s  %s %s %s %s " % ( sh_file,infile,out_Refseq,out_NONCODE,out_NSMB )
      my_job = m_jobs.running_jobs(sh_file,sh_work_file)
      my_job.load_sh_file(      sh_info )
      my_job.load_sh_work_file( sh_work )
      my_job.running_multi( cpu=1 )