Exemplo n.º 1
0

# Exactly one command-line argument is required: the path that is later
# passed to BookCleaner.
if len(sys.argv) != 2:
	# Usage error: report it and exit with a non-zero status so that shells
	# and calling scripts can tell the run did not happen.
	print("No path input")
	sys.exit(1)
path = sys.argv[1]

# Construct the BookCleaner with the command-line path; it is used below via
# getAllFilesCleaned().
bc = BookCleaner(path)

# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module is reloaded to get it back. The default
# codec for implicit str <-> unicode conversion is then set to UTF-8; the
# unicode(line, errors='replace') call further down decodes with this default.
# NOTE(review): this changes the default for the whole process, including any
# imported modules - confirm nothing else relies on the ASCII default.
reload(sys)
sys.setdefaultencoding('utf8')

#AGGREGATE RAW TEXT
print("BookCleaner is now aggregating all texts")
# Collect the text before opening the output file, so a failure inside the
# cleaner neither leaks a file handle nor leaves a truncated file behind.
cleaned_text = bc.getAllFilesCleaned()
# The context manager closes the file even if the write itself fails.
with open("big_training_raw.txt", "w") as raw_out:
	raw_out.write(cleaned_text)
#TOKENIZE
print("Tokenizing texts")

tk = Tokenizer()
# Both files are managed by context managers so they are closed even when
# tokenizing a line raises; the input handle was previously never closed.
with open("big_training_raw.txt") as raw_in:
	with open("tokenized_train.txt", "w") as tokenized_out:
		for line in raw_in:
			# Decode with the process default encoding, substituting the
			# replacement character for undecodable bytes instead of raising.
			line = unicode(line, errors='replace')
			# Each line's tokenizer output is written as-is, in input order.
			tokenized_out.write(tk.tokenizeAdvanced(line))