def f_process_one_CTCF(loc_bam, head_dir, node_base_dir):
    """Re-header one BAM with chr-prefixed names and build its tag directory.

    The header of ``head_dir/loc_bam`` is rewritten (``SN:1`` -> ``SN:chr1``,
    ``SN:MT`` -> ``SN:chrM``) into a per-individual scratch directory below
    ``node_base_dir``; ``makeTagDirectory`` is then run on the re-headered
    copy, the resulting tag directory is copied into ``head_dir`` and the
    scratch directory is removed.

    Args:
        loc_bam: BAM file name (not a path) inside ``head_dir``. The
            individual id is built from the 2nd '-'-separated field of the
            name and the 4th '-'-separated field of the part before the
            first '.'.
        head_dir: Directory holding the input BAM; also receives the tag
            directory.
        node_base_dir: Parent directory of the per-individual scratch dir.

    Raises:
        IndexError: If ``loc_bam`` has fewer '-'-separated fields than the
            naming scheme above requires.
    """
    bam_stem = loc_bam.split('.')[0]
    individual_id = loc_bam.split('-')[1] + '_' + bam_stem.split('-')[3]
    node_dir = node_base_dir + '/' + individual_id
    my.f_ensure_make_dir(node_dir)

    src_bam = '%s/%s' % (head_dir, loc_bam)
    node_bam = '%s/%s' % (node_dir, loc_bam)

    # Backslashes are doubled so the shell receives the literal sed pattern
    # '\(...\)' and back-reference '\1' without relying on Python passing
    # through an unrecognised escape sequence.
    add_chr_cmd = (
        "samtools view -H %s"
        " | sed -e 's/SN:\\([0-9XY]\\)/SN:chr\\1/' -e 's/SN:MT/SN:chrM/'"
        " | samtools reheader - %s > %s" % (src_bam, src_bam, node_bam))
    my.f_shell_cmd(add_chr_cmd)

    mkdir_cmd = 'makeTagDirectory %s/%s %s' % (node_dir, individual_id,
                                               node_bam)
    my.f_shell_cmd(mkdir_cmd)

    # Copy the tag directory back next to the input, then drop the scratch dir.
    copy_cmd = 'cp -r %s/%s %s; rm -r %s' % (node_dir, individual_id,
                                             head_dir, node_dir)
    my.f_shell_cmd(copy_cmd)
Exemplo n.º 2
0
    def test_basic(self):
        """Smoke-run the fastq preparation on the bundled yy1 peaks and chr22 VCF."""
        other_col = 9
        chr_str = 'chr22'

        peak_file = '%s/deepsea/tests/data/yy1.sorted.bed' % project_dir
        vcf_file = '%s/deepsea/tests/data/%s.merge.head.vcf.gz' % (
            project_dir, chr_str)

        # A uniquely named scratch path is built first, but the shared tmp
        # directory assigned right after it is the one that is actually used.
        tmp_dir = '%s/deepsea/tmp/%s/' % (
            project_dir, my.f_generate_tmp_file_name('t'))
        tmp_dir = '%s/deepsea/tmp/' % (project_dir)
        my.f_ensure_make_dir(tmp_dir)

        # No assertion follows: the test passes as long as the call does not raise.
        fastq_file = f_prepare_deepsea_fastq_based_on_vcf(peak_file,
                                                          vcf_file,
                                                          tmp_dir)
Exemplo n.º 3
0
    def test_empty_vcf_overlap_with_bed(self):
        """Preparing fastq input must fail loudly when peaks and VCF do not overlap.

        The DNase peak file and the example VCF are expected to share no
        positions, so the call has to raise an exception whose message
        contains 'Empty overlap between features and vcf file'.
        """
        peak_file = '/homed/home/shi/expression_var//data/raw_data/tf/encode_peaks/uw-gm12878-dnase.narrowPeak'
        vcf_file = '%s/deepsea/examples/deepsea/example.vcf' % project_dir
        tmp_dir = '/tmp/tmpegec75'
        my.f_ensure_make_dir(tmp_dir)

        with self.assertRaises(Exception) as context:
            f_prepare_deepsea_fastq_based_on_vcf(peak_file,
                                                 vcf_file,
                                                 tmp_dir,
                                                 debug=False)

        # Compare against the message text: an exception object itself does
        # not support substring membership tests.
        self.assertIn('Empty overlap between features and vcf file',
                      str(context.exception))
Exemplo n.º 4
0
    def test_dnase(self):
        """Smoke-run the fastq preparation on the processed GM12878 DNase peaks."""
        other_col = 9
        chr_str = 'chr22'

        # Processed ENCODE narrowPeak file on the original author's cluster.
        peak_file = '/homed/home/shi/expression_var/data/raw_data/tf/encode_peaks/processed/uw-gm12878-dnase.narrowPeak'
        vcf_file = '%s/deepsea/tests/data/%s.merge.head.vcf.gz' % (
            project_dir, chr_str)

        tmp_dir = '%s/deepsea/tmp/' % (project_dir)
        my.f_ensure_make_dir(tmp_dir)

        # No assertion follows: the test passes as long as the call does not raise.
        fastq_file = f_prepare_deepsea_fastq_based_on_vcf(peak_file,
                                                          vcf_file,
                                                          tmp_dir)
        a = 0
Exemplo n.º 5
0
# In debug mode, cut the peak table down to the rows selected by `.ix[1:3, :]`
# and then call the break-point helper (presumably drops into a debugger --
# TODO confirm what f_add_break_point does).
if f_judge_debug(debug):
    peak_file_df_rmdup = peak_file_df_rmdup.ix[1:3, :]
    f_add_break_point()

# Only run when the substring 'vcf' occurs in the vcf_file path.
if 'vcf' in vcf_file:

    for loc_tf in peak_file_df_rmdup.index:
        # A TF whose final '<tf>.out.evalue' file already exists in outdir is skipped.
        final_file = '%s/%s.out.evalue' % (outdir, loc_tf)
        if os.path.isfile(final_file):
            logging.info('Skip %s: %s' % (loc_tf, final_file))
            continue

        try:
            # Debug runs reuse a fixed working directory; normal runs get a
            # fresh one from mkdtemp().
            if f_judge_debug(debug):
                tempdir = './tmp/aaa/'
                my.f_ensure_make_dir(tempdir)
            else:
                tempdir = mkdtemp()
            tmp_dir = tempdir
            # Per-TF values looked up by row label in the peak table.
            peak_file = peak_file_df_rmdup.ix[loc_tf, 'file_path']
            #vcf_file = '%s/deepsea/tests/data/chr22.merge.head.vcf.gz'%(project_dir)
            deepsea_tf = peak_file_df_rmdup.ix[loc_tf, 'deepsea_tf']

            # NOTE(review): nothing has been copied at this point in the
            # visible code; the message only reports the working directory.
            print "Successfully copied input to working directory " + tempdir

            try:

                # Generate the peak fastq in a separate python2.7 process.
                #logging.info("python2.7 p_generate_peak_fastq.py --vcf_file %s --peak_file %s --tmp_dir %s --hg19_file %s" % (vcf_file, peak_file, tmp_dir, hg19_file))
                my.f_shell_cmd(
                    "python2.7 p_generate_peak_fastq.py --vcf_file %s --peak_file %s --tmp_dir %s --hg19_file %s"
                    % (vcf_file, peak_file, tmp_dir, hg19_file))
Exemplo n.º 6
0
def main():
    """Write the sample metadata table and build one Sailfish job per person.

    Steps:
      1. Resolve ``out_dir``, ``node_dir`` and ``test_flag``: from the command
         line when the module has no docstring, from fixed defaults otherwise.
      2. Read gender (column 4) and population (column 2) per person from
         ``our_sample.list``; the first line seen for a person wins.
      3. Intersect those people with the first column of the GEUVADIS sdrf
         table and collect, per person, the fastq file names from column 29.
      4. For every person write a ``person<TAB>fastqs<TAB>node output dir``
         line to the metadata file, and, unless the gene-level result already
         exists or a fastq is missing, build the shell script that runs
         Sailfish and submit it via ``qsub_a_command`` when ``test_flag`` is
         false.
    """
    if __doc__ is None:
        parser.add_argument('--out_dir',
                            help='Out',
                            default='%s/qsub_445samples/' % project_dir)
        parser.add_argument('--test_flag', help='Test flag', default='T')
        opts = parser.parse_args()
        out_dir = opts.out_dir
        test_flag = (opts.test_flag == 'T')
        # Node-local scratch directory named after the scheduler job id.
        node_dir = "/state/partition1/shi/tmp_depth/%s/" % my.f_shell_cmd(
            'echo $JOB_ID', quiet=True).replace('\n', '')
    else:
        out_dir = '%s/qsub_445samples/' % project_dir
        node_dir = out_dir + '/node/'
        test_flag = True
    my.f_ensure_make_dir(out_dir)
    FQ_dir = '%s/fastq/' % project_dir
    geuvadis_meta = '%s/metaData/E-GEUV-1.sdrf.txt' % project_dir
    our_study = '%s/metaData/our_sample.list' % project_dir
    metadata = '%s/metadata' % project_dir

    # person -> gender / population; the first line seen for a person wins.
    gender = {}
    pop = {}
    with open(our_study, 'r') as study_fh:
        for line in study_fh:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on items[3]
            items = line.strip().split('\t')
            person = items[0]
            gender.setdefault(person, items[3])
            pop.setdefault(person, items[1])
    our_people = set(gender)

    # The sdrf table is needed twice, so read it once and close the file.
    with open(geuvadis_meta, 'r') as meta_fh:
        meta_rows = [line.strip().split('\t') for line in meta_fh]

    geu1 = set(row[0] for row in meta_rows)
    of_interest = geu1.intersection(our_people)
    print(of_interest)
    print(len(of_interest))

    person_to_fq = {}
    for items in meta_rows:
        person = items[0]
        if person not in of_interest:
            continue
        curr_fq = items[28]
        person_to_fq.setdefault(person, set()).add(
            FQ_dir + os.path.basename(curr_fq))

    print(person_to_fq)

    # The context manager guarantees the metadata lines are flushed to disk.
    with open(metadata, 'w') as metadata_file:
        for person in person_to_fq:
            # Set iteration order is arbitrary; sorting makes the -1/-2 mate
            # assignment below deterministic (assumes the mate files sort in
            # mate order, e.g. '*_1' before '*_2' -- TODO confirm).
            fastqs = sorted(person_to_fq[person])

            out_curr = node_dir + person + '.sailfish/'
            metadata_file.write(person + '\t' + ','.join(fastqs) + '\t' +
                                out_curr + '\n')

            # Sailfish index is chosen per population and gender.
            cur_gender = gender[person]
            cur_pop = pop[person]
            index_dir = '~/expression_var/data/raw_data/pop/%s_dir' % cur_pop
            sailfish_idx = '%s/gencode.v19.annotation.PC.lincRNA.gtf.splicedExon.N' % index_dir + cur_gender + '.fa.dedup.fa_IDX_sailfish'
            library_type = '"T=PE:O=><"'  #T=PE:O=><:S=SA

            # If the output is there, don't launch the job again.
            final_out_file = '%s/%s.sailfish/%squant.gene_level.sf' % (
                out_dir, person, person)
            if os.path.isfile(final_out_file):
                print('Got the results of %s' % person)
                continue
            print('Sailfish %s' % person)
            my.f_remove_dir('%s/%s.sailfish' % (out_dir, person))

            # Paired-end quantification needs both mates on disk.
            if len(fastqs) < 2:
                print('Missing person %s: expected 2 fastq files, got %d' %
                      (person, len(fastqs)))
                continue
            missing = [fq for fq in fastqs[:2] if not os.path.isfile(fq)]
            if missing:
                for fq in missing:
                    print('Missing person %s: %s' % (person, fq))
                continue

            loc_fastqs = [
                os.path.join(out_curr, os.path.basename(fastq_file))
                for fastq_file in fastqs
            ]
            sailfish_exe = '~/packages/Sailfish-0.6.3-Linux_x86-64/bin/sailfish'
            sailfish_cmd = (sailfish_exe + ' quant -i ' + sailfish_idx +
                            ' -l ' + library_type +
                            ' -1 <(gunzip -c ' + loc_fastqs[0] +
                            ') -2 <(gunzip -c ' + loc_fastqs[1] +
                            ') -o ' + out_curr + ' -f')
            gtf = '%s/GENCODE_v19_2014-06-03/gencode.v19.annotation.PC.lincRNA.gtf' % project_dir

            # Job script: stage fastqs on the node, quantify, collapse to gene
            # level, clean up, and move the result directory to out_dir.
            cmds = [
                '#!/usr/bin/env bash',
                'mkdir -p %s' % out_curr,
                'cp -u %s %s' % (' '.join(fastqs), out_curr),
                sailfish_cmd,
                'cd ' + out_curr,
                '%s/TranscriptsToGenes.sh --gtf-file ' % script_dir + gtf +
                ' --exp-file ' + out_curr + '/quant.sf' + ' --res-file ' +
                person + 'quant.gene_level.sf',
                'mv ' + out_curr + '/quant.sf' + ' ' + out_curr + '/' +
                person + 'quant.sf',
                'rm %s/*.fastq.gz' % (out_curr),
                'rm %s/reads.*' % (out_curr),
                'mv %s %s/' % (out_curr, out_dir),
                'rm -r %s' % (out_curr),
            ]
            print('\n'.join(cmds))
            if not test_flag:
                qsub_a_command('qqqq'.join(cmds),
                               out_dir + person + '_script.sh', 'qqqq', '10G')
Exemplo n.º 7
0
def f_novo_mapping(fastq_dir,
                   fastq_gz_file,
                   data_dir,
                   wgs_dir,
                   cell,
                   local_bin,
                   desination_dir,
                   mode=None,
                   short_read=False):
    """Map one gzipped fastq with novoalign and deliver the sorted BAM.

    The fastq is copied into ``data_dir`` and unpacked, mapped against
    ``wgs_dir/<cell>.nix``, converted to a sorted BAM with samtools, and the
    stats file plus sorted BAM are copied to ``desination_dir``. All
    intermediate files in ``data_dir`` are removed afterwards, and the
    directory itself is removed when it ends up empty.

    Args:
        fastq_dir: Directory that holds ``fastq_gz_file``.
        fastq_gz_file: Name of the gzipped fastq file.
        data_dir: Working directory (created if needed).
        wgs_dir: Directory with the novoalign ``<cell>.nix`` index.
        cell: Cell name used to pick the index file.
        local_bin: Directory containing the ``novoalign`` executable.
        desination_dir: Where the stats file and sorted BAM are copied; a
            ``/test/`` sub-directory is used when 'test' is in ``mode``.
        mode: Optional string or collection. 'test' adds ``-#1k``; 'unique'
            adds ``-r None``. ``None`` means no extra options.
        short_read: When true (and the file name does not contain 'methy'),
            ``-l 20`` is passed to novoalign.

    Returns:
        'ILMFQ' if the first mapping attempt reported exit status 0,
        'default' if only the fallback attempt did, '' otherwise.
    """
    my.f_ensure_make_dir(data_dir)
    my.f_copy_to_dir(fastq_dir, fastq_gz_file, data_dir)
    my.f_unzip_targz(data_dir + "/" + fastq_gz_file)
    fastq_file = data_dir + "/" + fastq_gz_file.replace(".gz", "")

    cell_nix_file = wgs_dir + "/" + cell + ".nix"
    file_prefix = fastq_file.replace(".fastq", "")
    map_stats_file = file_prefix + ".stats.txt"
    map_bam_file = file_prefix + ".sam.map"

    # mode defaults to None; membership tests on None would raise TypeError.
    requested_modes = mode if mode is not None else ()

    mode_string = ''
    if 'test' in requested_modes:
        mode_string = " -#1k "
        desination_dir = desination_dir + '/test/'
        my.f_ensure_make_dir(desination_dir)

    if 'unique' in requested_modes:
        mode_string = mode_string + " -r None"

    if mode_string == '' and mode is not None:
        logging.error('Unkonwn mode: %s ' % mode)

    cmd_args = (local_bin, cell_nix_file, fastq_file, mode_string,
                map_stats_file, map_bam_file)

    # NOTE(review): only the last variant appends '; echo $?'. If
    # my.f_shell_pipe returns the command's stdout, the methy and short-read
    # variants can never yield '0\n', so the fallback below always re-runs
    # the mapping (for short reads without '-l 20') -- confirm and fix.
    if re.match(".*methy.*", fastq_gz_file) is not None:
        map_cmd = "%s/novoalign  -d %s -f %s -o SAM %s 2> %s > %s" % cmd_args
    elif short_read:
        map_cmd = "%s/novoalign  -d %s -f %s -l 20 -o SAM %s 2> %s > %s" % cmd_args
    else:
        map_cmd = "%s/novoalign -F ILMFQ -d %s -f %s -o SAM %s 2> %s > %s; echo $?" % cmd_args

    logging.info("Map cmd: " + map_cmd)
    first_try = my.f_shell_pipe(map_cmd)

    map_stats = ''
    if first_try == '0\n':
        map_stats = 'ILMFQ'
    else:
        logging.warning('Novo output: first try faild ' + first_try)

        # Fallback: let novoalign pick the input format itself.
        map_cmd = "%s/novoalign  -d %s -f %s -o SAM %s 2> %s > %s;echo $?" % cmd_args
        logging.info("2nd Map cmd: " + map_cmd)
        second_try = my.f_shell_pipe(map_cmd)
        logging.info('Sencond try: ' + second_try)
        if second_try == '0\n':
            map_stats = 'default'

    sort_cmd = "samtools view -bS %s | samtools sort - %s" % (
        map_bam_file, file_prefix + ".sorted")
    my.f_shell_pipe(sort_cmd)

    # Deliver the results, then clean the working directory.
    prefix_name = os.path.basename(file_prefix)
    my.f_copy_to_dir(data_dir, "%s.stats.txt" % prefix_name, desination_dir)
    my.f_copy_to_dir(data_dir, "%s.sorted.bam" % prefix_name, desination_dir)

    os.remove(map_stats_file)
    os.remove("%s.sorted.bam" % file_prefix)
    os.remove(fastq_file)
    os.remove(map_bam_file)

    leftovers = os.listdir(data_dir)
    if not leftovers:
        logging.info('Empty dir: %s ' % data_dir)
        os.rmdir(data_dir)
    else:
        logging.warning("Not empty dir")
        logging.warning(leftovers)

    return map_stats
Exemplo n.º 8
0
home_dir = os.path.expanduser('~')
lib_dir = '%s/python/' % home_dir
import sys

sys.path.insert(0, lib_dir)
sys.path.insert(0, '%s/expression_var/python/' % home_dir)
import pandas as pd
import p_mymodule as my
from p_project_metadata import *
from p_generate_peak_fastq import chipseq_region

# Rebuild a processed copy of the peak file of every TF in the de-duplicated
# peak table: merge overlapping peaks, split peaks with several maxima, trim
# the binding regions and save the result under the original file name.
peak_file_df_rmdup = f_get_peak_file_df_rmdup(project_dir)

print(peak_file_df_rmdup.head())

processed_dir = '%s/data/raw_data/tf/encode_peaks/processed/' % project_dir
my.f_ensure_make_dir(processed_dir)

for loc_tf in peak_file_df_rmdup.tf:
    print('Process %s' % loc_tf)
    # Label-based lookup; `.loc` replaces `.ix`, which newer pandas releases
    # no longer provide. Assumes the table is indexed by TF name, as the
    # original label lookup already required.
    peak_file = peak_file_df_rmdup.loc[loc_tf, 'file_path']
    tf_region = chipseq_region(file_path=peak_file)
    tf_region.merge_overlapped_peaks()
    tf_region.split_peaks_with_multiple_peakMax(debug=False)
    print(tf_region.binding_df.head())
    tf_region.bed_trim_binding_regions()
    tf_region.save_bed('%s/%s' % (processed_dir, os.path.basename(peak_file)))
Exemplo n.º 9
0
    reload(fun)
    reload(loc)
    cofactor_list_raw = []
    feature_list = []
    #feature_list=["methy"]
    #tf_list=["inputigg","inputstd","ctcf"]
    #tf_list=["ctcf","znf143","bhlhe40","ebf1"]
    #tf_list=["znf143","ctcf","ebf1"]
    #tf_list=['brca1', 'chd2', 'elk1', 'max', 'maz', 'mxi1', 'nfya', 'nfyb', 'rad21', 'rfx5', 'smc3', 'stat3',  'tbp', 'usf2']

# Re-import the helper module so edits made during an interactive session
# take effect.
reload(fun)
# Print the value of $HOME as seen by the shell, with the newline removed.
print my.f_shell_cmd('echo $HOME', quiet=True).replace('\n', '')
# NOTE(review): the return value is discarded; presumably this checks or
# reports the unique entries of guest_cells -- confirm against p_mymodule.
my.f_unique_element_in_list(guest_cells)
guest_cell = guest_cells[0]
guest_extract_flag = guest_cell != cell  # True when the guest cell differs from the host cell, i.e. the run predicts the impact of variation.
my.f_ensure_make_dir(dnase_dir)
logging.info('Node dir name: ' + dnase_dir)
# Co-factor names are matched in lower case.
cofactor_list = [tf_name.lower() for tf_name in cofactor_list_raw]
logging.info(cofactor_list)

reload(my)
reload(fun)
# When running on the cluster, copy each TF's BAM files to the local node.
#import ipdb; ipdb.set_trace()

# One glob pattern per TF / feature / co-factor; "-u" is passed through to
# f_copy_to_dir (presumably the cp "update" flag -- TODO confirm).
for tf in tf_list + feature_list + cofactor_list:
    tf_bam_pattern = "*%s*bam" % tf
    print tf_bam_pattern
    f_copy_to_dir(target_dir, tf_bam_pattern, dnase_dir, "-u")
if guest_extract_flag == True:
Exemplo n.º 10
0
    tf_list = ['PU1', 'RPB2']
elif embl_number == '3656':
    tf_list = ['RNA']
else:
    print 'Wrong embl number'

if embl_number != '':  #TF binding data

    cell_list = list(
        set(index_data['Characteristics[coriell id]'].str.replace(
            'NA', 'NA').tolist()))

    my.f_print_list(cell_list)
    for tf in tf_list:
        data_dir = os.path.join(head_dir, tf)
        my.f_ensure_make_dir(data_dir)
        for cell in cell_list:
            data_pattern = '%s_%s' % (cell.replace('NA', ''), tf)
            #fastq_prefix=my.f_create_file_prefix(cell, tf, lab, 'Rep1')
            #fastq_field = 30
            #download_state=f_grep_wget_from_given_embl_file(data_index_file, data_pattern, data_dir, fastq_prefix, download_col = fastq_field, test_flag = test_flag, quiet = True, debug = False)
            bam_field = 36
            bam_prefix = my.f_create_file_prefix(cell, tf, lab)
            #import ipdb; ipdb.set_trace()
            download_state = f_grep_wget_from_given_embl_file(
                data_index_file,
                data_pattern,
                data_dir,
                bam_prefix,
                download_pattern='ftp.*bam',
                test_flag=test_flag,