# Build the training corpus in two passes:
#   1. aggregate every cleaned text returned by BookCleaner into
#      "big_training_raw.txt";
#   2. run that file through Tokenizer line by line and write the result to
#      "tokenized_train.txt".
# Usage: exactly one command-line argument, the path handed to BookCleaner.

path = ''
if len(sys.argv) == 2:
    path = sys.argv[1]
else:
    print("No path input")
    sys.exit()

bc = BookCleaner(path)

# Python 2 only: make UTF-8 the interpreter's default codec. The unicode()
# call below passes no explicit encoding, so it relies on this default.
reload(sys)
sys.setdefaultencoding('utf8')

# AGGREGATE RAW TEXT
# The output file is opened (and truncated) before the aggregation runs, and
# the `with` block closes it even if getAllFilesCleaned() raises.
with open("big_training_raw.txt", "w+") as raw_out:
    print("BookCleaner is now aggregating all texts")
    raw_text = bc.getAllFilesCleaned()
    raw_out.write(raw_text)

# TOKENIZE
print("Tokenizing texts")
with open("tokenized_train.txt", "w+") as tokenized_out:
    # The raw file is read back through its own `with` block so the input
    # handle is released as well.
    with open("big_training_raw.txt") as raw_in:
        tk = Tokenizer()
        for line in raw_in:
            # Decode using the default codec; undecodable bytes are replaced
            # instead of raising UnicodeDecodeError.
            line = unicode(line, errors='replace')
            # NOTE(review): tokenizeAdvanced presumably returns the tokenized
            # line as a string ready to be written -- confirm in Tokenizer.
            tokenized = tk.tokenizeAdvanced(line)
            tokenized_out.write(tokenized)