-
Notifications
You must be signed in to change notification settings - Fork 1
/
prepare-data.py
43 lines (35 loc) · 1.37 KB
/
prepare-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from configparser import ConfigParser
from argparse import ArgumentParser
from multiprocessing import Pool
from supertagging.data import corpusparam, SupertagCorpusFile
from gridsearch import Grid
# Command-line interface: one positional configuration file plus optional
# grid-search specifications and the number of worker processes.
parser = ArgumentParser()
parser.add_argument("conf", help="configuration file for corpus, e.g. `example.conf`")
parser.add_argument("--grid", nargs="+", help="prepare corpora for a grid search", default=[])
parser.add_argument("-j", help="no. parallel processes for extraction", type=int, default=1)
args = parser.parse_args()

cp = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed; an empty result means the file was not readable.
# Report that directly instead of failing later with KeyError: 'Corpus'.
if not cp.read(args.conf):
    parser.error("cannot read configuration file: {}".format(args.conf))

# Merge both sections into one flat dict; keys from [Grammar] override
# same-named keys from [Corpus].
baseconfig = {**cp["Corpus"], **cp["Grammar"]}

# Only grid specifications whose name (the text before "=") is a corpus
# parameter are handled by this script; all other --grid entries are ignored.
# "core_attribs" is excluded from the accepted names.
corpuskeys = set(corpusparam.keys())
corpuskeys.remove("core_attribs")
gridparms = [gridc for gridc in args.grid if gridc.partition("=")[0] in corpuskeys]

if not gridparms:
    # No corpus parameter is varied: open a single SupertagCorpusFile for the
    # base configuration, report its number of tags and stop here.
    with SupertagCorpusFile(corpusparam(**baseconfig)) as corpusfile:
        print("extracted", len(corpusfile.grammar.tags))
    # Raise SystemExit directly rather than calling `exit()`: that helper is
    # installed by the `site` module and is absent under e.g. `python -S`.
    raise SystemExit

grid = Grid(gridparms)
def extract(gridpoint):
    """Open the corpus file for one grid point and return its tags.

    The module-level ``baseconfig`` is copied and the entries of
    ``gridpoint`` are written on top of it; the merged settings are passed
    to ``corpusparam`` and the result to ``SupertagCorpusFile``.

    Args:
        gridpoint: mapping of parameter names to the values of this grid
            point (anything providing ``.items()``).

    Returns:
        A pair ``(tags, gridpoint)`` where ``tags`` is
        ``corpusfile.grammar.tags``. If opening the corpus file raises a
        ``ValueError``, the error is printed and ``([], gridpoint)`` is
        returned instead.
    """
    merged = dict(baseconfig)
    merged.update(gridpoint.items())
    params = corpusparam(**merged)
    try:
        with SupertagCorpusFile(params) as extracted:
            # Return from inside the `with` so the tags are read while the
            # corpus file is still open.
            return extracted.grammar.tags, gridpoint
    except ValueError as err:
        # Best effort: report the failure and hand back an empty tag list
        # for this grid point.
        print(err)
        return [], gridpoint
# Run one extraction per grid point in a pool of worker processes and report
# each result as soon as it is available (results arrive in arbitrary order).
#
# The __main__ guard matters for multiprocessing start methods that re-import
# the main module in every worker (spawn, forkserver): without it each worker
# would try to create its own pool while bootstrapping and fail.
if __name__ == "__main__":
    # At least one worker is used even if a value below 1 was given for -j.
    # The context manager shuts the worker processes down when the loop ends
    # or an exception escapes from it.
    with Pool(max(1, args.j)) as pool:
        for tags, gp in pool.imap_unordered(extract, grid):
            print("extracted", len(tags), "for", gp)