/
extract_lex.py
131 lines (111 loc) · 5.31 KB
/
extract_lex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import warc
import sys, os
import re
from uuid import UUID
import argparse
import slugid
from collections import Counter
from tqdm import tqdm
import redis
# Options
parser = argparse.ArgumentParser(
    description='Extract Lexicons from WETs',
)

# Positional: one or more input files to process.
parser.add_argument(
    'files',
    metavar='<filepath>',
    nargs='+',
    help='path to file',
)

# Output format of the docID column.
parser.add_argument(
    '-b', '--binary',
    action='store_true',
    help='output docID as binary form',
)

# Integer handed to the docID generator as its "after" value.
parser.add_argument(
    '-s', '--startID',
    metavar='<number>',
    type=int,
    default=0,
    help='docID Assigment starting after ID',
)

# Language handling.
parser.add_argument(
    '--skipChinese',
    action='store_true',
    help='if set, will not parse chinese words',
)

# Redis connection: unix socket path and database index.
parser.add_argument(
    '-r', '--redis',
    metavar='<path/to/redis.sock>',
    default='/tmp/redis.sock',
    help='if set, will use redis server to store URL Table',
)
parser.add_argument(
    '-db', '--redisDB',
    metavar='<dbID>',
    type=int,
    default=0,
    help='The DB to use in redis server',
)

# Destination of the per-file docID summary (no default, so None if omitted).
parser.add_argument(
    '-w', '--docIDwet',
    metavar='<path/to/docIDwet.tsv>',
    help='If set, direct docID to wet summery to file',
)

# UUID
parser.add_argument(
    '-u', '--uuid',
    action='store_true',
    help='use UUID/ if not specified, use assign new ID mode',
)
parser.add_argument(
    '-c', '--compressuuid',
    action='store_true',
    help='compress UUID in a compact form, only valid in UUID mode',
)
args = parser.parse_args()

# jieba is only used for Chinese word segmentation, so the import is skipped
# entirely when --skipChinese is given.
if not args.skipChinese:
    import jieba

# NLP: build the language identifier from the model string exported by
# langid.langid, with probability normalisation switched on.
from langid.langid import LanguageIdentifier, model

Language = LanguageIdentifier.from_modelstring(model, norm_probs=True)
# constants

# Language codes that are tokenised by splitting on `latin_sep_words`.
space_devided_langs = [
    'en',
    'fr',
    'de',
    'it',
    'la',
    'es',
]

# Token separator for the languages above: any run of non-word characters.
latin_sep_words = re.compile(r"\W+")

# Matches a leading run of word characters and/or non-ASCII characters.
# It supersedes the earlier hand-written stop-word / escape-word lists
# (chinese_stop_words, global_escape_words), which are deprecated.
non_latin_words_pattern = re.compile(r"([^\u0000-\u007F]|\w)+")
from modules import NumberGenerator

# docIDDigits is handed to the ID generator and is also the byte width used
# when a docID is serialised in binary mode.
docIDDigits = 4
# NOTE(review): frequancyDigits is not referenced in the visible part of this
# script — confirm whether it is still needed.
frequancyDigits = 2

# Sequential docID source, configured with the --startID value as `after`.
docIdGenerator = NumberGenerator.Number(digits=docIDDigits, after=args.startID)

# Redis connection (over a unix socket) that stores the per-document metadata.
r = redis.Redis(unix_socket_path=args.redis, db=args.redisDB)

# The per-file docID summary is appended to the --docIDwet file when one is
# given, and written to stderr otherwise.
docIDwetFile = open(args.docIDwet, mode='a') if args.docIDwet else sys.stderr
# Main extraction loop.
#
# For every record of every input file: identify the language, tokenise the
# payload, count the tokens and print one "<word>\t<id> <count>" line per
# distinct word to stdout. In docID mode the document metadata is also stored
# in redis under the docID. After each file a
# "<startID>\t<endID>\t<file>" summary line is written to docIDwetFile.

# Highest docID assigned so far. It starts at --startID so the per-file
# summary is always printable, even when no docID has been generated yet
# (UUID mode, or a file without any usable record).
lastDocID = args.startID
try:
    for filepath in args.files:
        print("* Dealing:", filepath, file=sys.stderr)
        with warc.open(filepath, 'rb') as f:
            for (record, offset, _) in tqdm(f.browse(), unit='records'):
                URI = record.url
                if not URI:
                    continue
                content = record.payload.read()
                if not content:
                    continue

                (lang, _confidence) = Language.classify(content)

                # Decode the payload once. Calling str() on bytes would
                # tokenise its "b'...'" repr and pollute the lexicon with
                # fragments such as 'b', 'n' or 'xe2'.
                # NOTE(review): assumes payloads are UTF-8 — confirm.
                if isinstance(content, bytes):
                    text = content.decode('utf-8', errors='replace')
                else:
                    text = content

                if lang in space_devided_langs:
                    # re.split yields '' at the edges of the text; drop those
                    # so they are neither counted nor emitted as words.
                    words = [w for w in latin_sep_words.split(text) if w]
                elif lang == 'zh' and not args.skipChinese:
                    words = [w for w in jieba.cut(text, cut_all=False)
                             if non_latin_words_pattern.match(w)]
                else:
                    # other languages
                    continue

                docLength = len(words)
                words = list(Counter(words).items())

                if args.uuid:
                    # WARC-Record-ID is wrapped in a leading and a trailing
                    # character ([1:-1] strips them) before parsing.
                    recordID = UUID(record.header.get('WARC-Record-ID')[1:-1])
                    if args.compressuuid:
                        recordID = slugid.encode(recordID)
                        # Accept both bytes and str results from slugid.
                        if isinstance(recordID, bytes):
                            recordID = recordID.decode('ascii')
                    else:
                        # A UUID object has no .decode(); use its text form.
                        recordID = str(recordID)
                    for (word, count) in words:
                        print("{word}\t{uuid} {count}".format(
                            word=word, uuid=recordID, count=count
                        ))
                    continue

                docID = docIdGenerator.next()
                # Keep the integer ID for the summary; the binary branch
                # below only derives a serialised copy from it.
                lastDocID = docID
                # NOTE(review): newer redis-py releases deprecate hmset in
                # favour of hset(name, mapping=...) — switch once the minimum
                # supported redis-py version is confirmed.
                r.hmset(docID, {
                    'url' : URI,
                    'lang': lang,
                    'len' : docLength,
                    'off' : offset
                })
                if args.binary:
                    docIDBytes = docID.to_bytes(docIDDigits, 'big', signed=True)
                    for (word, count) in words:
                        sys.stdout.buffer.write(word.encode() +
                                                b'\t' + docIDBytes + b' ' +
                                                str(count).encode() +
                                                b'\n')
                else:
                    for (word, count) in words:
                        print("{word}\t{docID} {count}".format(
                            word=word, docID=docID, count=count
                        ), file=sys.stdout)

        # After each file:
        # NOTE(review): startID is the global --startID for every file, not
        # the first ID assigned within this file — confirm this is intended.
        print("{startID}\t{endID}\t{file}".format(startID=args.startID,
                                                  endID=lastDocID,
                                                  file=filepath),
              file=docIDwetFile, flush=True)
finally:
    # Only close the summary sink when it is a real file, never stderr.
    if args.docIDwet:
        docIDwetFile.close()