forked from roshan-research/openie
/
hamshahri.py
34 lines (27 loc) · 1014 Bytes
/
hamshahri.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from hazm import sent_tokenize, word_tokenize, Normalizer, HamshahriReader, POSTagger, DependencyParser
from InformationExtractor import InformationExtractor
from progress.bar import Bar
"""Extract open information triples from the Hamshahri corpus.

Streams every text in the corpus, normalizes it, groups texts into
batches, tokenizes each batch into sentences, POS-tags and
dependency-parses them, then writes the extracted information
(fields joined by ' - ', one blank line per sentence) to
'informations.txt'.
"""
hamshahri = HamshahriReader()
normalizer = Normalizer()
tagger = POSTagger()
parser = DependencyParser(tagger=tagger)
extractor = InformationExtractor()

# Number of normalized texts accumulated before each tag/parse pass.
BATCH_SIZE = 1000


def _process_batch(texts, output):
    """Tokenize, tag, parse a batch of normalized texts and write extractions.

    texts:  list of normalized corpus texts.
    output: writable text file receiving the extracted information.
    """
    sentences = []
    for text in texts:
        for sentence in sent_tokenize(text):
            words = word_tokenize(sentence)
            # Very short sentences (< 3 words) cannot hold a useful triple.
            if len(words) >= 3:
                sentences.append(words)
    tagged = tagger.batch_tag(sentences)
    parsed = parser.tagged_batch_parse(tagged)
    for sentence in parsed:
        for information in extractor.extract(sentence):
            print(*information, sep=' - ', file=output)
        print(file=output)


texts = []
# Context manager guarantees the output file is closed even if a
# tagging/parsing step raises mid-corpus.
with open('informations.txt', 'w') as output:
    # max=310000 is the (approximate) corpus size, used only for the
    # progress bar display — TODO confirm against the installed corpus.
    for text in Bar(max=310000).iter(hamshahri.texts()):
        texts.append(normalizer.normalize(text))
        if len(texts) <= BATCH_SIZE:
            continue
        _process_batch(texts, output)
        texts = []
    # BUG FIX: the original dropped the trailing partial batch
    # (up to BATCH_SIZE texts left over when the corpus ends).
    if texts:
        _process_batch(texts, output)