# ontology.py
import sys
import os
from elementtree import ElementTree
def get_articles(category, in_dir):
    """Collect the text of every article whose leading template matches *category*.

    Every entry of *in_dir* is parsed incrementally as XML.  For each ``text``
    element with non-empty content, the span between the first ``{{`` and the
    first ``}}`` is lower-cased and split on ``|``; the first field is treated
    as the template name.  Articles whose template name starts with *category*
    are collected.

    Side effects: progress is printed to stdout, and every template name seen
    is written (sorted, one per line) to a file named ``categories`` in the
    current working directory.

    Args:
        category: Prefix compared against the lower-cased, UTF-8 encoded
            template name.  Because the template name is lower-cased first, a
            prefix containing upper-case characters never matches.
        in_dir: Directory whose entries are all opened and parsed as XML.

    Returns:
        A list with the UTF-8 encoded text of each matching article, in the
        order the articles were encountered.
    """
    # NOTE(review): the original indentation was lost; the nesting of
    # ``categories.add``, ``root.clear()`` and the ``idx`` counter below is the
    # most plausible reconstruction -- confirm against the original file.
    articles = []
    categories = set()
    idx = 0
    files = os.listdir(in_dir)
    for idx_f, file_name in enumerate(files):
        print("%d / %d" % (idx_f + 1, len(files)))
        # ``with`` closes each input file once it has been parsed; the handle
        # was previously left open for every file in the directory.
        with open(os.path.join(in_dir, file_name)) as source:
            context = iter(ElementTree.iterparse(source, events=("start", "end")))
            # The first event is taken as the root element; it is kept so it
            # can be cleared while parsing.
            event, root = next(context)
            for event, elem in context:
                # Tags look like "{namespace}name".  rpartition also handles
                # tags without a namespace, where split('}')[1] raised
                # IndexError.
                tag = elem.tag.rpartition('}')[2]
                if event != "end" or tag != "text":
                    continue
                if not elem.text:
                    continue
                start = elem.text.find("{{")
                end = elem.text.find("}}")
                if start != -1 and end != -1:
                    parts = elem.text[start + 2:end].lower().split('|')
                    text_c = parts[0].encode('utf-8')
                    if text_c.startswith(category):
                        articles.append(elem.text.encode('utf-8'))
                    categories.add(text_c)
                # Drop the elements parsed so far so the tree does not grow
                # with the size of the input.
                root.clear()
                if idx % 1000 == 0:
                    print(idx)
                idx += 1
    with open("categories", "w") as f:
        # Sorted so the output file is deterministic across runs.
        for c in sorted(categories):
            f.write(c + "\n")
    return articles
def print_articles(articles):
    """Write the ``key=value`` fields of each article to the file ``ontology``.

    For every article, the text before the first ``}}`` is taken, all ``{{``
    markers are removed, and the remainder is split on ``|``.  A line
    ``article`` is written for each article, followed by two lines (stripped
    key, then stripped value) for every field that contains ``=`` and has a
    non-blank value.  The file is created in the current working directory and
    overwritten if it exists.

    Args:
        articles: Iterable of article texts (strings).

    Returns:
        None.
    """
    with open("ontology", "w") as out:
        for article in articles:
            # Only the text before the first "}}" is considered.
            body = article.split("}}", 1)[0].replace("{{", "")
            out.write("article\n")
            for field in body.split("|"):
                # Split on the first "=" only, so values that themselves
                # contain "=" (e.g. URLs with query strings) stay intact.
                key, sep, value = field.partition("=")
                value = value.strip()
                # Skip fields without "=" and fields whose value is empty or
                # whitespace-only.
                if not sep or not value:
                    continue
                out.write(key.strip() + "\n")
                out.write(value + "\n")
def main():
    """Run the extraction using the command-line arguments.

    ``sys.argv[1]`` is the directory with the input files and ``sys.argv[2]``
    is the path of a file whose first line holds the category name.  The
    matching articles are gathered with ``get_articles`` and written out with
    ``print_articles``; progress and the final count are printed to stdout.

    Returns:
        0 when fewer than two arguments are supplied (after printing usage),
        otherwise None.
    """
    if len(sys.argv) < 3:
        print("First command line argument must be directory with input files")
        # The second argument is opened as a file below, so say so here.
        print("Second command line argument must be a file whose first line "
              "is the category name")
        return 0
    in_dir = sys.argv[1]
    category_file = sys.argv[2]
    print("Input dir: " + in_dir)
    # ``with`` closes the category file; the handle was previously leaked.
    with open(category_file) as f:
        # Strip "\r" as well so a CRLF-terminated file yields a clean name.
        category = f.readline().rstrip('\r\n')
    print("Category: " + category)
    articles = get_articles(category, in_dir)
    print_articles(articles)
    print("articles in category: %d" % len(articles))
# Run the pipeline only when executed as a script, so that importing this
# module does not read sys.argv or write output files.
if __name__ == "__main__":
    main()