/
find_corpus.py
193 lines (161 loc) · 6.25 KB
/
find_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
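#####################################
# Set root_dir to the directory that holds the corpus text files (.txt format).
# Usage:
# >>> as ? in fig # "?" matches any single word
# >>> code * publicly # "*" matches any number of words (0-n)
# >>> (show|demonstrate|achieve) ? results # choose one of several words
# >>> show[ns]? ? results # uncertain inflection (the first "?", directly after "]", makes the character class optional; the second "?" matches any single word)
# >>> ### ? feature fusion # a leading "###" also displays the surrounding context
#####################################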
import os
import os.path as osp
import pdb
import re
import sys
import readline
from utils import get_file_paths_by_pattern, p, get_file_name
from utils import color_print
from utils import progress_bar
# --- Configuration -----------------------------------------------------------
root_dir = 'txt'     # directory handed to get_file_paths_by_pattern below
min_times = 1        # a result is displayed only if it occurs at least this often
max_length = 50      # matches longer than this many characters are discarded
max_display = 30     # upper bound on the number of distinct results displayed

# Usage text printed at start-up and again by the `help` command.
# This is a runtime string: it is shown to the user verbatim.
description="""
Usage:
>>> as ? in fig ? 匹配任意一个单词
>>> code * publicly * 匹配 1~n 个单词
>>> (show|provide) results (A|B|C) 匹配多个单词中的一个
>>> show[ns]? ? results 正则表达式匹配
>>> comparison is ?ed 匹配以ed结尾的单词
>>> ### ? feature fusion 在开头加上 ### 可以显示上下文的语境
>>> show 18 show + 行号 可以显示整段的上下文
>>> show all 显示上一次搜索所有的结果
>>> exit 退出
>>> figure 如果fi没有搜索结果试试将fi换成fi
"""
print(description)

# With -h/--help the usage text above is all that is wanted, so stop here.
# sys.exit() is used instead of the `site`-provided exit() helper, which is
# intended for the interactive shell and is not guaranteed to exist.
if len(sys.argv) > 1 and sys.argv[1] in ('--help', '-h'):
    sys.exit()

# Corpus files: the two patterns presumably select *.txt files directly inside
# root_dir and one directory level below it (helper lives in utils - confirm).
all_files = []
all_files.extend(get_file_paths_by_pattern(root_dir, '*.txt'))
all_files.extend(get_file_paths_by_pattern(root_dir, '*/*.txt'))

print('\033[1;37mpattern:\033[0m')

# State of the most recent search, kept so later commands can refer back to it.
results = {}   # matched phrase -> occurrence count
full = {}      # matched phrase -> list of long context passages
# Interactive search loop.
#
# Each round reads one line from the prompt, handles the built-in commands
# (exit / help / `show all` / `show <n>`), and otherwise treats the line as a
# search pattern: every corpus file is scanned, distinct matches are counted,
# the most frequent ones are printed and all of them are saved to result.txt.


def print_with_color(sentence: str):
    """Print one result phrase, colouring each word by how it was matched.

    A word that occurs as a substring of the current global ``pattern`` is
    printed in bold green; every other word is printed in bold, underlined
    yellow. Each word is followed by a space and no newline is written, so
    the caller can append the occurrence count on the same line.
    """
    print(' ', end='')
    for word in sentence.split():
        if word in pattern:
            print('\033[1;32m', end='')
        else:
            print('\033[4;33m', end='')
            print('\033[1;33m', end='')
        print(word, end='')
        print('\033[0m', end='')
        print(' ', end='')


def _translate_pattern(raw):
    """Convert the prompt shorthand into a regular expression.

    ``*`` becomes ``(.*)`` and ``?`` becomes ``\\w+``, except that a ``?``
    directly after ``]`` is kept as the regex "optional" operator. The
    substitution is done in a single pass so that no placeholder text can
    collide with what the user typed.
    """
    replacements = {']?': ']?', '*': '(.*)', '?': r'\w+'}
    return re.sub(r'\]\?|\*|\?', lambda m: replacements[m.group()], raw)


def _excerpt(text, start, end, before, after):
    """Return text[start-before : end+after], clamped at the start of text.

    Without the clamp a match closer than ``before`` characters to the start
    yields a negative index, which Python interprets as counting from the end.
    """
    return text[max(0, start - before): end + after]


def _highlight(snippet, key):
    """Flatten newlines to '+' and underline every occurrence of ``key``.

    ``key`` is matched literally (escaped) and case-insensitively; each
    occurrence is replaced by ``key`` itself wrapped in colour codes.
    """
    flattened = snippet.replace('\n', '+')
    if not key:
        return flattened
    marked = '\033[4;33m%s\033[0m' % key
    # A function replacement keeps backslashes in `key` from being read as
    # template escapes.
    return re.sub(re.escape(key), lambda _m: marked, flattened, flags=re.IGNORECASE)


def _print_full_context(line_no, ranked, passages, limit, min_count):
    """Print the long passage whose running number equals ``line_no``.

    Numbers are assigned exactly as in the context listing of the last
    search: results are walked in ranked order, only those that were
    displayed are considered, and every passage gets the next number.
    Returns True if a passage was printed.
    """
    idx = 0
    displayed = 0
    for key, times in ranked:
        if times < min_count or displayed >= limit:
            continue
        displayed += 1
        for passage in passages[key]:
            if idx == line_no:
                print(f'({idx}) ...', end='')
                print(_highlight(passage, key))
                return True
            idx += 1
    return False


ends_string = ['exit', 'end', 'quit', 'q']
# r'\w+' is what a lone "?" typed at the prompt turns into after translation.
help_string = ['help', 'h', r'\w+']
# Display limit used by the most recent search; `show <n>` must reuse it so
# that its numbering agrees with what was printed.
display_limit = max_display

while True:
    try:
        print('\033[1;37m', end='')
        pattern = input(' >>> ')
        print('\033[0m', end='')
    except KeyboardInterrupt:
        # Ctrl-C at the prompt only cancels the current line.
        print('\033[0m')
        continue
    except EOFError:
        # Ctrl-D / closed stdin: leave quietly instead of with a traceback.
        print('\033[0m')
        sys.exit()
    pattern = _translate_pattern(pattern)

    show_context = False
    if pattern in ends_string:
        sys.exit()
    if pattern in help_string:
        print(description)
        continue

    if pattern == 'show all':
        try:
            with open('result.txt', 'r') as f:
                lines = f.readlines()
        except FileNotFoundError:
            print('No saved results yet: run a search first.')
            continue
        print('\033[1;33m', end='')
        for line in lines:
            print(' ' + line.rstrip('\n'))
        print('\033[0m', end='')
        continue

    if pattern.startswith('### '):
        pattern = pattern[4:]
        show_context = True

    # `show <n>`: print the long passage numbered n in the last context search.
    # Only this exact shape is a command; anything else starting with "show"
    # (e.g. "show[ns]? \w+ results") is an ordinary search pattern.
    show_request = re.fullmatch(r'show\s+(\d+)', pattern)
    if show_request:
        if not full:
            print('No context available: run a "### ..." search first.')
        elif not _print_full_context(int(show_request.group(1)), results, full,
                                     display_limit, min_times):
            print('No context line %s in the last search.' % show_request.group(1))
        continue

    if not pattern.strip():
        continue
    try:
        regex = re.compile(pattern, flags=re.IGNORECASE)
    except re.error as err:
        print('Invalid pattern: %s' % err)
        continue

    context = {}   # phrase -> short excerpts (70 characters either side)
    full = {}      # phrase -> long excerpts (700 before, 1000 after)
    results = {}   # phrase -> occurrence count
    num_results = 0
    for file_idx, file in enumerate(all_files):
        try:
            progress_bar(file_idx, len(all_files), pre_msg='Processing...', msg='%d results' % num_results)
            try:
                with open(file, 'r') as handle:
                    text = handle.read()
            except (OSError, UnicodeDecodeError) as err:
                # One unreadable file should not abort the whole search.
                print('\nSkipping %s: %s' % (file, err))
                continue
            for match in regex.finditer(text):
                key = match.group().lower()
                # Zero-length matches carry no information; overlong ones are
                # almost always a runaway "*".
                if not key or len(key) > max_length:
                    continue
                if key not in results:
                    num_results += 1
                results[key] = results.get(key, 0) + 1
                if show_context:
                    start, end = match.span()
                    source = '... (' + get_file_name(file) + ')'
                    context.setdefault(key, []).append(_excerpt(text, start, end, 70, 70) + source)
                    full.setdefault(key, []).append(_excerpt(text, start, end, 700, 1000) + source)
        except KeyboardInterrupt:
            # Ctrl-C stops scanning; whatever was found so far is still shown.
            break

    color_print('Find %d Matching Results in %d Files:' % (len(results), len(all_files)), 7, end='\n')
    # Most frequent first; ties broken by the phrase itself (descending).
    results = sorted(results.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    # Context listings are long, so fewer results are shown in that mode. The
    # limit is computed per search instead of overwriting max_display.
    display_limit = min(20, max_display) if show_context else max_display
    a = 0
    idx = 0
    with open('result.txt', 'w') as f:
        for key, times in results:
            # Every result goes to the file so that `show all` really lists all.
            f.write('%s (%d times)\n' % (key, times))
            if times < min_times or a >= display_limit:
                continue
            print_with_color(key)
            print('(%d times)' % times)
            a += 1
            if show_context:
                for snippet in context[key]:
                    print(f'({idx}) ...', end='')
                    print(_highlight(snippet, key))
                    idx += 1
    print()
    print('\033[1;37mpattern:\033[0m')