/
facc1.py
81 lines (62 loc) · 2.57 KB
/
facc1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python2
# coding: utf-8
import sys, os
for lib in os.listdir('./lib'):
sys.path.insert(1, lib)
import re
import warc
import textprocess as tp
def process_facc1_with_filename(facc1file, cluewebfile):
facc1_obj = open(facc1file, 'rb')
clueweb_obj = warc.open(cluewebfile, 'rb')
process_facc1_with_fileobj(facc1_obj, clueweb_obj)
facc1_obj.close()
clueweb_obj.close()
def process_facc1_with_fileobj(facc1_obj, clueweb_obj, logout=sys.stdout, logerr=sys.stderr):
nlpobj = tp.init_CoreNLPServer()
record = clueweb_obj.read_record()
entity_set = set()
is_a_new_record = True
for line in facc1_obj:
(trec_id, encoding, entity_name, entity_start, entity_end, _, __,
freebase_id) = line.strip().split('\t')
# We can iterate over the facc1 file along the clueweb09 file, because
# these files are all organized linearly and ordered by the WARC-TREC-ID.
while record is not None and 'warc-trec-id' not in record or record['warc-trec-id'] != trec_id:
record = clueweb_obj.read_record()
is_a_new_record = True
sentences = []
if record is None:
break
try:
if is_a_new_record:
is_a_new_record = False
html_data = tp.preprocess(record.payload)
# each time a new html file is parsed, a new entity_set must be presented.
entity_set.clear()
#logerr.flush()
#logout.flush()
sentences = tp.get_sentences_from_html_v2(html_data, nlpobj)
#tp.output_html(trec_id, html_data)
#tp.output_sentences(trec_id, sentences)
except:
logerr.write("\t".join((line.strip(), "failed_to_get_sentences", re.sub(r'\r\n', '', html_data))) + "\n")
continue
if freebase_id in entity_set:
continue
else:
entity_set.add(freebase_id)
# take the longest sentence from those the entity exists
try:
sentence = max((s for s in sentences if entity_name in s), key=len)
except ValueError:
logerr.write(line.strip() + "\tentity_not_found\n")
continue
logout.write("\t".join(x for x in (
trec_id, entity_name, freebase_id, re.sub(r'\t', u' ', sentence).encode('utf-8')
)) + "\n")
if __name__ == "__main__":
if len(sys.argv) == 3:
process_facc1_with_filename(sys.argv[1], sys.argv[2])
else:
print "Usage: %s <facc1 file> <clueweb file>" % sys.argv[0]