-
Notifications
You must be signed in to change notification settings - Fork 0
/
tag_prediction_texts.py
46 lines (37 loc) · 1.75 KB
/
tag_prediction_texts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from predict_this.text.prediction_text import PredictionText
from predict_this.text.word import Word
from predict_this.text.word import to_ascii
from predict_this.category.analyze_syntax import analyze
from itertools import izip
import io
import sys
def get_raw_prediction_text(index):
    """Return the raw text of prediction text ``index`` as a single string.

    Every line produced by ``PredictionText(index).lines()`` becomes one
    newline-terminated output line: the first element of the line is
    skipped and ``Word.original_word`` of each remaining element is joined
    with single spaces.

    :param index: identifier passed straight to ``PredictionText``.
    :return: the concatenated text; an empty string when there are no lines.
    """
    prediction_text = PredictionText(index)
    # Build every output line and join once, instead of growing a string
    # with ``+=`` inside the loop.
    return "".join(
        " ".join(map(Word.original_word, line[1:])) + "\n"
        for line in prediction_text.lines()
    )
def get_text_analysis(index):
    """Yield, one at a time, the analyzed lines of prediction text ``index``.

    The raw text is obtained from ``get_raw_prediction_text``, every "%"
    character is removed from it, and the result is handed to ``analyze``.
    Because this is a generator, the progress message is printed (and the
    analysis started) only when the first item is requested.

    :param index: integer identifier of the text (formatted with ``%d``).
    """
    # Parenthesized single-argument form prints exactly what the Python 2
    # print statement would.
    print("analyzing text %d with freeling.. (this may take a while, please wait)" % index)
    raw_text = get_raw_prediction_text(index)
    cleaned_text = raw_text.replace("%", "")
    for analysis in analyze(cleaned_text):
        yield analysis
# Flatten the analysis of every text into one list of word analyses,
# keeping text order, then line order, then word order.
analyzed_words = []
for text_index in (1, 2, 3, 4, 5, 7, 8):
    for analyzed_line in get_text_analysis(text_index):
        analyzed_words.extend(analyzed_line)
# Create a new CSV in which the "tag" and "lemma" columns are replaced with the analysis results.
def write_row(file, fields, separator=","):
    """Write ``fields`` to ``file`` as one separator-delimited line.

    The fields are joined with ``separator`` and a trailing newline is
    appended; no quoting or escaping of the field contents is performed.

    :param file: object with a ``write`` method that accepts the joined text.
    :param fields: iterable of strings forming the row.
    :param separator: string placed between consecutive fields.
    """
    record = separator.join(fields) + "\n"
    file.write(record)
# Copy the source CSV to a new file, overwriting the "tag" and "lemma"
# columns of each data row with values taken from analyzed_words. Rows and
# analyzed words are paired purely by position.
with io.open("predict_this/text/texts1234578.csv", "r", encoding="utf-8") as csvfile, \
        io.open("predict_this/text/texts1234578_2.csv", "w", encoding="utf-8") as new_csvfile:
    # Strip only the line terminator: slicing with [:-1] would eat a real
    # character whenever the last line of the file has no trailing newline.
    header = next(csvfile).rstrip("\n").split(",")
    tag_index = header.index("tag")
    lemma_index = header.index("lemma")
    write_row(new_csvfile, header)

    # NOTE(review): izip stops at the shorter input, so if the CSV and
    # analyzed_words differ in length the output is silently truncated.
    for i, (line, analyzed) in enumerate(izip(csvfile, analyzed_words), 1):
        row = line.rstrip("\n").split(",")
        # Sanity check that both sources are still aligned: the first CSV
        # column (with "%" removed) must equal the first analysis field.
        if row[0].replace("%", "") != analyzed[0]:
            print("line %d. %s != %s" % (i, row[0], analyzed[0]))
            sys.exit(1)
        # Only the part before the first "+" of each analysis field is kept.
        row[tag_index] = analyzed[2].split("+")[0]
        row[lemma_index] = to_ascii(analyzed[1].split("+")[0])
        write_row(new_csvfile, row)