forked from Multiscale-Genomics/mg-process-fastq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_chipseq.py
155 lines (113 loc) · 4.85 KB
/
process_chipseq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/python
"""
.. Copyright 2017 EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import argparse, urllib2, gzip, shutil, shlex, subprocess, os.path, json
from .. import Tool, Workflow, Metadata
from common import common
from dmp import dmp
import tool
import os
# Optional dependency: fall back to the project's mock decorators when the
# PyCOMPSs API is not installed, so the module can still be imported.
try:
    from pycompss.api.parameter import FILE_IN, FILE_OUT
    from pycompss.api.task import task
    from pycompss.api.constraint import constraint
except ImportError:
    # Single-argument parenthesised print emits the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("[Warning] Cannot import \"pycompss\" API packages.")
    print(" Using mock decorators.")
    from dummy_pycompss import *

# Hard dependency: abort with a non-zero exit status when pysam is missing.
try:
    import pysam
except ImportError:
    print("[Error] Cannot import \"pysam\" package. Have you installed it?")
    exit(-1)
# ------------------------------------------------------------------------------
class process_chipseq(Workflow):
    """
    Functions for processing ChIP-seq FastQ files. Files are aligned,
    filtered and then analysed for peak calling.
    """

    def run(self, file_ids, metadata):
        """
        Main run function for processing ChIP-seq FastQ data. Pipeline aligns
        the FASTQ files to the genome using BWA. MACS 2 is then used for peak
        calling.

        Parameters
        ----------
        file_ids : list
            List of file locations, read positionally as:
            [0] genome FASTA, [1] ChIP-seq FASTQ, [2] background FASTQ
        metadata : list
            Not read by this method.

        Returns
        -------
        outputfiles : tuple
            (filtered bam, filtered background bam, peak_bed, summits_bed,
            narrowPeak, broadPeak, gappedPeak), i.e. the two biobambam
            results followed by the five values returned by MACS2.
        """
        # TODO - Handle multiple file and background files
        genome_fa = file_ids[0]
        file_loc = file_ids[1]
        file_bgd_loc = file_ids[2]

        # Align the sample and the background reads with the same aligner
        # instance. Both calls unpack the (files, metadata) pair returned by
        # the tool; previously the sample result was left packed and a plain
        # string was unpacked instead, which raised ValueError.
        bwa = tool.bwaAlignerTool(self.configuration)
        out_bam, out_bam_meta = bwa.run((genome_fa, file_loc), ())
        out_bgd_bam, out_bgd_bam_meta = bwa.run((genome_fa, file_bgd_loc), ())

        # TODO - Multiple files need merging into a single bam file

        # Filter the bams
        # NOTE(review): `(x)` is just `x`, not a one-element tuple, so the
        # tool receives a bare value here while the other tools receive
        # tuples - confirm what biobambam.run expects as its first argument.
        b3f = tool.biobambam(self.configuration)
        b3f_file_out = b3f.run((out_bam[0]), ())
        b3f_file_bgd_out = b3f.run((out_bgd_bam[0]), ())

        # MACS2 to call peaks
        macs2 = tool.macs2(self.configuration)
        peak_bed, summits_bed, narrowPeak, broadPeak, gappedPeak = macs2.run(
            (b3f_file_out, b3f_file_bgd_out), ()
        )

        return (
            b3f_file_out,
            b3f_file_bgd_out,
            peak_bed,
            summits_bed,
            narrowPeak,
            broadPeak,
            gappedPeak,
        )
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Command-line entry point: register the three input files with the DMP
    # under the "test" user and launch the ChIP-seq workflow on them.

    # Set up the command line parameters
    # NOTE(review): the help text of --file / --bgd_file says "Project ID",
    # but the values are passed on as file locations - confirm and reword.
    parser = argparse.ArgumentParser(description="ChIP-seq peak calling")
    parser.add_argument("--species", help="Species (homo_sapiens)")
    parser.add_argument("--genome", help="Genome FASTA file")
    parser.add_argument("--file", help="Project ID of the dataset")
    parser.add_argument("--bgd_file", help="Project ID of the dataset")

    # Get the matching parameters from the command line. The attribute names
    # must match the option names declared above; the parser defines no
    # data_dir / tmp_dir options, so reading those raised AttributeError.
    args = parser.parse_args()

    species = args.species  # parsed but not used further down
    genome_fa = args.genome
    file_loc = args.file
    file_bg_loc = args.bgd_file

    #
    # MuG Tool Steps
    # --------------
    #
    # 1. Create data files
    #    (the inputs are taken as given on the command line)

    # 2. Register the data with the DMP
    from dmp import dmp
    da = dmp()

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(da.get_files_by_user("test"))

    # NOTE(review): the reads are registered with file type "fasta" and a
    # hard-coded 9606 (presumably the human taxon ID) regardless of
    # --species - confirm against the DMP API.
    genome_file = da.set_file("test", genome_fa, "fasta", "Assembly", 9606, None)
    file_in = da.set_file("test", file_loc, "fasta", "ChIP-seq", 9606, None)
    file_bg_in = da.set_file("test", file_bg_loc, "fasta", "ChIP-seq", 9606, None)

    print(da.get_files_by_user("test"))

    # 3. Instantiate and launch the App
    from basic_modules import WorkflowApp
    app = WorkflowApp()
    results = app.launch(process_chipseq, [genome_file, file_in, file_bg_in], {})

    print(da.get_files_by_user("test"))