/
run.py
executable file
·298 lines (254 loc) · 11 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#!/usr/bin/env python
from __future__ import print_function
import operator
import re
import sys
import os
import time
import logging
import pypeline.yaml
import pypeline.logger
from pypeline.pipeline import Pypeline
from pypeline.common.console import \
print_err, \
print_info
from epiomix.tools.commonutils import check_path
from epiomix.config import parse_config, __version__
from epiomix.epimakefile import epicreatemkfile
from epiomix.epimakefile.epivalidmkfile import read_epiomix_makefile
from epiomix.nodes.execute import GeneralExecuteNode
from epiomix.tools import checkchromprefix, \
checkmappabilitychrom, \
getminmax
from epiomix.tools.bamdatastructure import BamCollect, \
MakeCollect, \
MakefileError
from epiomix.nodes.gccorrect import \
GccorrectNode, \
CreateGCModelNode
from epiomix.nodes.cleanbedfiles import \
CleanFilesNode, \
SplitBedFileNode, \
MergeDataFilesNode
# Analyses that generate one node per (BAM, bed) pair; these names must match
# the per-analysis option keys used in the makefile.
ANALYSES = ['Phasogram', 'WriteDepth', 'NucleoMap', 'MethylMap']
class VersionError(StandardError):
    '''Raised when the python interpreter or the pysam module is unsuitable.'''
    # NOTE: StandardError exists only in python 2; this script targets 2.7.x.
    pass
def check_python_version():
    '''Abort with VersionError unless running on python 2.7.3+ (and not 3.x).'''
    # 34014192 == 0x020703F0, i.e. sys.hexversion of the 2.7.3 final release
    required_hexversion = 34014192
    if sys.hexversion < required_hexversion:
        raise VersionError("python must be at least version {}. "
                           "Current version {}\n".format(
                               (2, 7, 3), sys.version_info))
    # python 3 passes the hexversion check but is explicitly unsupported
    if sys.version_info.major == 3:
        raise VersionError(
            "epiPALEOMIX only works with python 2.7.3+. Not python3")
def check_pysam_module_version():
    '''Ensure pysam is importable and at least version 0.8.0.

    Raises ImportError when pysam is absent, VersionError when too old.
    '''
    minimum = [0, 8, 0]
    try:
        import pysam
    except ImportError:
        raise ImportError("It seems 'pysam' is not installed.\n"
                          "\tpip install pysam\n")
    # component-wise list comparison; equivalent to the original map() list
    installed = [int(part) for part in pysam.__version__.split('.')]
    if installed < minimum:
        raise VersionError("Pysam version must be at least version ({}). "
                           "Current version ({})\n"
                           "\tpip install pysam --upgrade".format(
                               '.'.join(map(str, minimum)),
                               pysam.__version__))
def check_bed_exist(config, infile):
    '''Return previously split bed chunks for *infile*, ordered by index.

    Scans config.temp_local for files named "<base>_0<N>.bed" left behind by
    an earlier SplitBedFileNode run and returns their full paths sorted by N.
    Reusing existing splits keeps reruns stable when the thread count (and
    hence the number of chunks) changes between runs.

    Returns an empty list when no split files exist yet.
    '''
    filena, _ = os.path.splitext(os.path.basename(infile))
    # compile once and search once per file; re.escape guards against regex
    # metacharacters in the base name, and the dot before "bed" is escaped
    # (the original pattern's bare '.' matched any character) and anchored
    reg = re.compile(r'{}_0([0-9]+)\.bed$'.format(re.escape(filena)))
    bedfiles = []
    for fname in os.listdir(config.temp_local):
        match = reg.search(fname)
        if match:
            bedfiles.append((fname, int(match.group(1))))
    return [os.path.join(config.temp_local, path)
            for path, _ in sorted(bedfiles, key=operator.itemgetter(1))]
def split_bedfiles(config, d_make):
    '''Create bed-splitting nodes, reusing split chunks from earlier runs.

    For every bed entry: optionally build a mappability-filtering node, then
    either reuse already-split chunk files found in the temp dir (the entry in
    d_make.bedfiles is rewritten in-place to the list of chunk paths) or
    schedule a SplitBedFileNode depending on the filter node.

    Returns the list of SplitBedFileNodes created (possibly empty).

    Mutating d_make.bedfiles during the loop is safe because
    checkbedfiles_ext iterates over a python 2 .items() list snapshot.
    '''
    ## maybe put this part into d_make class as we go. easier
    enabl_filter, uniqueness, mappapath = d_make.getfilterinfo()
    filtnode, nodes = [], []
    for bedn, in_bedp in checkbedfiles_ext(d_make.bedfiles):
        # filtnode is rebuilt for each bed when filtering is enabled;
        # otherwise it stays [] so SplitBedFileNode gets no extra dependency
        if enabl_filter and mappapath:
            filtnode = [CleanFilesNode(config, d_make, bedn,
                                       mappapath, uniqueness)]
        # NOTE(review): in_bedp is unused; the lookup deliberately goes back
        # through d_make.bedfiles so a prior in-place rewrite is observed
        bedexists = check_bed_exist(config, d_make.bedfiles[bedn])
        if bedexists:
            # reuse the previous run's split files instead of re-splitting
            d_make.bedfiles[bedn] = bedexists
        else:
            nodes.append(SplitBedFileNode(config, d_make, bedn,
                                          dependencies=filtnode))
    return nodes
def chromused_coerce_to_string(bam):
    '''Normalize the GCcorrect options in-place on *bam*.

    --ChromUsed is coerced to a string; --NoRegions is coerced to an int,
    where an 'all' string (any case) means "no limit" (large sentinel).

    Raises MakefileError when either option is missing, or when --NoRegions
    is neither an int nor an 'all' string.
    '''
    gcopts = bam.opts['GCcorrect']
    # The original passed MakefileError as the dict.get() default, which
    # silently *stored the exception class* when the key was missing instead
    # of raising; check presence explicitly and raise as clearly intended.
    if '--ChromUsed' not in gcopts:
        raise MakefileError('GCcorrect: --ChromUsed option is missing')
    gcopts['--ChromUsed'] = str(gcopts['--ChromUsed'])
    if '--NoRegions' not in gcopts:
        raise MakefileError('GCcorrect: --NoRegions option is missing')
    noregions = gcopts['--NoRegions']
    if isinstance(noregions, str) and noregions.lower() == 'all':
        noregions = int(1e7)  # effectively "all regions"
    elif isinstance(noregions, int):
        noregions = int(noregions)
    else:
        raise MakefileError('--NoRegions: "%s" is incorrect. Must be of a'
                            ' string of {All, all, ALL} or an'
                            ' positive integer' % (noregions,))
    gcopts['--NoRegions'] = noregions
def checkbedfiles_ext(bedfiles):
    '''Yield (name, value) pairs whose value looks like bed-file input.

    A value qualifies when it is a single path ending in ".bed", or a list
    in which every path ends in ".bed" (an empty list also qualifies, since
    all() of nothing is True). Anything else is silently skipped.
    '''
    for name, value in bedfiles.items():
        single = isinstance(value, str) and value.endswith('.bed')
        multi = (isinstance(value, list) and
                 all(p.endswith('.bed') for p in value))
        if single or multi:
            yield name, value
def main_anal_to_run(bedinfo, opts):
    '''Yield the enabled analyses that apply to the given bed file.

    An analysis qualifies when it is one of the known ANALYSES, its
    'Enabled' flag is set, and the bed name is absent from its
    'ExcludeBed' list.
    '''
    bedname, _bed_paths = bedinfo
    for analysis, options in opts.iteritems():
        if analysis not in ANALYSES:
            continue
        if options['Enabled'] and bedname not in options['ExcludeBed']:
            yield analysis
def update_excludebed(d_make, d_bam):
    '''Normalize each analysis' ExcludeBed option in-place to a list.

    Bed names are suffixed with "MappaOnly" when a MappabilityFilter bed is
    configured, matching the renamed outputs of the filtering step. Accepts
    a single name (str), a list of names, or None (meaning no exclusions).

    Raises MakefileError for any other ExcludeBed type.
    '''
    fmt = "{}".format
    if d_make.bedfiles.get('MappabilityFilter', False):
        fmt = "{}MappaOnly".format
    for anal, opts in d_bam.opts.iteritems():
        if anal in ANALYSES:
            excl_bed = opts.get('ExcludeBed')
            if isinstance(excl_bed, str):
                opts['ExcludeBed'] = [fmt(excl_bed)]
            elif isinstance(excl_bed, list):
                opts['ExcludeBed'] = [fmt(bed) for bed in excl_bed]
            elif excl_bed is None:
                opts['ExcludeBed'] = []
            else:
                # fixed: original message ran the sentences together
                # ("incorrect.Must ...") due to a missing space
                raise MakefileError('Exclude bed in %s is incorrect. '
                                    'Must be a str, list, or None' % (anal,))
def getdequelen(d_bam):
    '''Derive deque-length bounds from the BAM read-length statistics.

    Calls getminmax.main exactly once — the original issued the same call
    twice with inconsistent variable names, doubling the BAM scan for no
    benefit — and stores the upper bound as --DequeLength for both the
    WriteDepth and NucleoMap analyses.

    Returns (minimum read length, upper/95th-percentile bound).
    '''
    rlmin, upperbound, top_ninetyfive = getminmax.main(d_bam.baminfo)
    d_bam.opts['WriteDepth']['--DequeLength'] = upperbound
    d_bam.opts['NucleoMap']['--DequeLength'] = upperbound
    return rlmin, top_ninetyfive
def calc_gcmodel(d_bam):
    '''Build the GC-correction model node chain for one BAM, if enabled.

    Always sets the deque lengths via getdequelen as a side effect. When
    GCcorrect is enabled, validates the mappability file, spawns one
    GccorrectNode per sampled read length and returns a single
    CreateGCModelNode (in a list) depending on them; otherwise returns [].
    '''
    rlmin, rlmax = getdequelen(d_bam)
    if d_bam.opts['GCcorrect'].get('Enabled', False):
        chromused_coerce_to_string(d_bam)
        assert os.path.exists(d_bam.prefix.get('--MappabilityPath')), \
            ("If GCcorrection is enabled, a valid --MappabilityPath"
             " file must be provided as well")
        checkmappabilitychrom.main([d_bam.prefix['--MappabilityPath'],
                                    d_bam.opts['GCcorrect']['--ChromUsed']])
        # python 2 integer division throughout: resolution stays an int
        # (required by the xrange step below) and is forced to be odd
        resolution = (rlmax - rlmin)/5
        resolution = resolution if resolution % 2 == 1 else resolution - 1
        resolutionhalf = resolution/2
        # sample read lengths across [rlmin, rlmax) at 'resolution' spacing
        gcdependencies = [GccorrectNode(d_bam, rl, resolutionhalf)
                          for rl in xrange(rlmin+resolutionhalf,
                                           rlmax, resolution)]
        return [CreateGCModelNode(d_bam, dependencies=gcdependencies)]
    return []
def check_chrom_prefix(d_make):
    '''Verify chromosome-name prefixes agree between each BAM, the
    reference fasta, and every bed file declared in the makefile.'''
    fastapath = d_make.prefix.get('--FastaPath')
    for _bam_name, opts in d_make.makefile['BamInputs'].items():
        bampath = opts['BamInfo']['BamPath']
        for _bedname, bedpath in checkbedfiles_ext(d_make.bedfiles):
            checkchromprefix.main([bampath, fastapath, bedpath])
def run_analyses(anal, d_bam, bedinfo, gcnode, splitbednode):
    '''Create one analysis node per split bed chunk and merge their output.

    Returns the MergeDataFilesNode depending on every per-chunk node.
    '''
    bedname, bedpaths = bedinfo
    chunknodes = [
        GeneralExecuteNode(anal, d_bam, '{}_0{}'.format(bedname, idx),
                           path, gcnode, splitbednode)
        for idx, path in enumerate(bedpaths)]
    return MergeDataFilesNode(d_bam, anal, bedname, dependencies=chunknodes)
def make_outputnames(config, make):
    '''Derive and create the per-makefile output/temp dirs on *config*.

    Directory names come from the makefile's Statistics Filename:
    OUT_<name> for results and TEMPORARYFILES_<name> for intermediates.
    Sets config.makefiledest and config.temp_local as side effects.
    '''
    statsfile = make["Statistics"]["Filename"]
    stem = os.path.splitext(os.path.basename(statsfile))[0]
    config.makefiledest = os.path.join(config.destination, 'OUT_' + stem)
    config.temp_local = os.path.join(config.destination,
                                     'TEMPORARYFILES_' + stem)
    check_path(config.makefiledest)
    check_path(config.temp_local)
def create_nodes(config, makefiles):
    '''Build the full node graph for every makefile / BAM / bed combination.

    For each makefile: prepare output dirs, validate chromosome prefixes,
    split bed files, then per BAM build GC-model nodes and one analysis
    node per (bed, enabled analysis). If no analysis nodes result, the
    split/GC nodes themselves become the top nodes so the work still runs.

    Returns the list of top-level nodes to hand to the pipeline.
    '''
    topnodes = []
    for makefile in read_epiomix_makefile(makefiles):
        make_outputnames(config, makefile)
        d_make = MakeCollect(makefile)
        check_chrom_prefix(d_make)
        splitbednode = split_bedfiles(config, d_make)
        # accumulated across BAMs: the original fallback referenced the
        # loop-local 'gcnode', which is a NameError when 'BamInputs' is
        # empty and silently covered only the *last* BAM otherwise
        gcnodes = []
        for bam_name, opts in d_make.makefile['BamInputs'].items():
            d_bam = BamCollect(config, bam_name, opts, d_make)
            gcnode = calc_gcmodel(d_bam)
            gcnodes.extend(gcnode)
            update_excludebed(d_make, d_bam)
            for bedinfo in checkbedfiles_ext(d_make.bedfiles):
                for anal in main_anal_to_run(bedinfo, opts):
                    topnodes.append(
                        run_analyses(anal, d_bam, bedinfo, gcnode,
                                     splitbednode))
        if not topnodes:
            topnodes.extend(splitbednode + gcnodes)
    return topnodes
def run(config, makefiles):
    '''Configure logging, assemble the node graph and execute the pipeline.

    Returns a shell exit code: 0 on success (or when only listing output
    files / required executables), 1 when the pipeline reports failure.
    '''
    check_path(config.temp_root)
    # one log file per invocation, disambiguated by timestamp + counter
    template = time.strftime("epiPALEOMIX_pipe.%"
                             "Y%m%d_%H%M%S_%%02i.log")
    pypeline.logger.initialize(config, template)
    logger = logging.getLogger(__name__)
    pipeline = Pypeline(config=config)
    topnodes = create_nodes(config, makefiles)
    assert topnodes, "No analyses to run. Check %s" % (makefiles)
    pipeline.add_nodes(topnodes)
    # the two listing modes print and return without running anything
    if config.list_output_files:
        logger.info("Printing output files ...")
        pipeline.print_output_files()
        return 0
    if config.list_executables:
        logger.info("Printing required executables ...")
        pipeline.print_required_executables()
        return 0
    logger.info("Running Epipaleomix pipeline ...")
    succeeded = pipeline.run(dry_run=config.dry_run,
                             max_running=config.max_threads,
                             progress_ui=config.progress_ui)
    return 0 if succeeded else 1
def _print_usage():
    '''Print the command-line usage summary via print_info.'''
    basename = os.path.basename(sys.argv[0])
    if basename == "run.py":
        basename = "epiPALEOMIX"
    print_info("epiPALEOMIX Pipeline %s\n" % (__version__,))
    print_info("Usage:")
    # one template per sub-command, each expecting the program name
    templates = (
        " -- %s help -- Display this message",
        " -- %s dryrun [...] -- Perform dry run of pipeline on "
        "provided makefiles.",
        " -- %s makefile [...] -- Generate makefile template to"
        " std.out.",
        " -- %s run [...] -- Run pipeline on provided "
        "makefiles.",
    )
    for template in templates:
        print_info(template % basename)
def main(argv):
    '''Entry point: validate interpreter/pysam, parse config, dispatch.

    Returns a shell exit code: 0 on success, 1 on usage or config errors.
    '''
    check_python_version()
    check_pysam_module_version()
    try:
        config, args = parse_config(argv)
        # any "dry*" sub-command forces a dry run before dispatch below
        if args and args[0].startswith("dry"):
            config.dry_run = True
    # 'except E as e' is valid from python 2.6 onward; the old comma form
    # ('except RuntimeError, error') is a syntax error under python 3 tools
    except RuntimeError as error:
        print_err(error)
        return 1
    commands = ("makefile", "mkfile", "run", "dry_run", "dry-run", "dryrun")
    if (len(args) == 0) or (args[0] not in commands):
        _print_usage()
        return 1
    elif args[0] in ("mkfile", "makefile"):
        return epicreatemkfile.main(args[1:])
    elif not args[1:]:
        # a valid command was given but no makefile paths followed it
        _print_usage()
        print_err("\nPlease specify at least one makefile!")
        return 1
    return run(config, args[1:])
if __name__ == '__main__':
    # propagate main()'s return code to the shell as the exit status
    sys.exit(main(sys.argv[1:]))