-
Notifications
You must be signed in to change notification settings - Fork 6
/
assign_top_hit.py
executable file
·160 lines (140 loc) · 5.71 KB
/
assign_top_hit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#! /usr/bin/env python
"""
Takes BLAST results in m8 format and picks the best hit for each read.
There are two count methods available, and these are not listed in the
options below: 'tophit' and 'toporg'. The other count methods (first,
most, etc) are unavailable.
First, only the best scores are used, but if there is a tie (aka ambiguous
hit), then a winner is assigned using the abundances found from unambiguous
hits. When 'tophit' is selected, the hit with the highest overall abundance
(from unambiguous reads) is used. When 'toporg' is used, the hit which belongs
to the most abundant taxon is used.
If the -P or --proportional flag is given, then ambiguous hits are resolved
so that the overall proportion of hit (when using tophit) or taxon (for
toporg) abundance is changed the least.
filter_top_pct defaults to 0. It can be altered, but that is not recommended.
"""
import argparse
import logging
import os
import sys
from urllib.parse import unquote_plus
from edl import redistribute
from edl.hits import ACCS, FilterParams, add_taxon_arguments, \
readMaps
from edl.util import add_IO_arguments, add_universal_arguments, \
inputIterator, setup_logging
def _open_output_handle(infile_name, arguments):
    """Return the handle that pooled-mode hits for *infile_name* go to.

    Only used when several input files are processed together, so
    ``arguments.output_file`` (when given) is always treated as a suffix
    appended to the input file name. With ``arguments.cwd`` set, the
    directory part of the input name is dropped first so the output lands
    in the current directory. Without an output file, ``sys.stdout`` is
    returned.
    """
    if arguments.output_file is None:
        return sys.stdout
    if arguments.cwd:
        # strip path info first
        infile_base = os.path.split(infile_name)[1]
        return open("./" + infile_base + arguments.output_file, 'w')
    return open(infile_name + arguments.output_file, 'w')


def main():
    """Parse the command line and write the chosen top hit for every read.

    With a single input file, or with -i/--individualFiles, each input is
    processed on its own and the selected hit lines are written to the
    matching output handle. Otherwise all input files are pooled through
    ``redistribute.multipleFileWrapper`` so the choices are made from the
    combined counts, and each selected hit is routed back to the output
    belonging to the file it came from.
    """
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            'filter_top_pct': 0,
            'parseStyle': ACCS,
            'countMethod': 'tophit'},
        choices={
            'countMethod': (
                'tophit',
                'toporg')})
    parser.add_argument(
        "-P",
        "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="Assign reads that have multiple equal top hits to taxa such "
        "that the overal proportion of taxa is consistent with the "
        "unambiguious hits. This is meant for use with the 'toporg' "
        "count method.")
    parser.add_argument(
        "-i",
        "--individualFiles",
        dest="individual",
        default=False,
        action="store_true",
        help="Use this flag to process files independently. Normally, "
        "counts from all files are pooled for making choices.")
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # load necessary maps
    params = FilterParams.create_from_arguments(arguments)

    # 'tophit' gives no taxonomy and just maps to accessions for
    # redistribution; 'toporg' translates hits to organisms before finding
    # the most abundant, so it needs the taxonomy and the hit-to-taxon map.
    taxon_kwargs = {}
    if arguments.countMethod == 'toporg':
        (taxonomy, hitStringMap) = readMaps(arguments)
        taxon_kwargs = {'taxonomy': taxonomy, 'hitStringMap': hitStringMap}

    wta = not arguments.proportional

    def pick_hits(hit_source, return_lines):
        # Single call site shared by both modes; only the hit source and
        # the return_lines flag differ between them.
        return redistribute.pickBestHitByAbundance(
            hit_source,
            filterParams=params,
            return_lines=return_lines,
            winnerTakeAll=wta,
            parseStyle=arguments.parseStyle,
            **taxon_kwargs)

    if len(arguments.input_files) <= 1 or arguments.individual:
        # loop over input
        for (inhandle, outhandle) in inputIterator(arguments):
            logging.debug(
                "Reading from %s and writing to %s", inhandle, outhandle)
            for line in pick_hits(inhandle, return_lines=True):
                outhandle.write(line)
        return

    # process all files at once
    multifile = redistribute.multipleFileWrapper(arguments.input_files)

    # Build a map from input file name to output handle
    outputMap = {}
    try:
        for infile_name in arguments.input_files:
            outputMap[infile_name] = \
                _open_output_handle(infile_name, arguments)

        for (read, hit) in pick_hits(multifile, return_lines=False):
            # The text before the first "/" of the read name is looked up
            # (after URL-unquoting) as the input file name; the same prefix
            # is dropped from the hit line before it is written out.
            infile_name, read = read.split("/", 1)
            outhandle = outputMap[unquote_plus(infile_name)]
            outhandle.write(hit.line.split("/", 1)[1])
    finally:
        # Close the files opened above even if reading or writing failed.
        # Without an output file every handle is sys.stdout, which must
        # stay open.
        if arguments.output_file is not None:
            for outhandle in outputMap.values():
                outhandle.close()
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()