import io
import sys

sys.path.insert(0, "..")  # put the parent directory on the path so predict_this can be imported

from predict_this.text.word import to_ascii

# Write a copy of the cleaned corpus in which every line has been passed
# through to_ascii, one output line per input line.
with io.open("corpus_cleaned.txt", "r", encoding="utf-8") as corpus:
    with open("corpus_cleaned_normalized.txt", "w") as ascii_corpus:
        for line in corpus:
            # Strip only the line terminator. Slicing with [:-1] would chop a
            # real character off a final line that has no trailing newline.
            ascii_corpus.write(to_ascii(line.rstrip("\n")) + "\n")
def get_text_analysis(index):
    """Yield the analyzed lines of prediction text number ``index``.

    The raw text is fetched with ``get_raw_prediction_text``, every "%"
    character is removed, and the result is handed to ``analyze``; each item
    that ``analyze`` produces is yielded unchanged.

    NOTE(review): ``analyze`` and ``get_raw_prediction_text`` are not defined
    in the visible part of this file -- presumably imported elsewhere; confirm.
    """
    print("analyzing text %d with freeling.. (this may take a while, please wait)" % index)
    raw_text = get_raw_prediction_text(index).replace("%", "")
    for analyzed_line in analyze(raw_text):
        yield analyzed_line


# Flatten the analysis of texts 1-5, 7 and 8 into a single list holding one
# entry per analyzed word, in text order.
analyzed_words = [word_analysis
                  for i in (1, 2, 3, 4, 5, 7, 8)
                  for line in get_text_analysis(i)
                  for word_analysis in line]


def write_row(file, fields, separator=","):
    """Write ``fields`` to ``file`` as one line, joined by ``separator``."""
    file.write(separator.join(fields) + "\n")


def _split_csv_line(line):
    """Split one raw CSV line into its comma-separated fields.

    Only the trailing newline is removed, so a final line that lacks one
    keeps its last character (``line[:-1]`` would silently drop it).
    """
    return line.rstrip("\n").split(",")


# Create a new csv in which the "tag" and "lemma" columns are replaced with
# the values taken from the analysis above.
# NOTE(review): izip is not imported in the visible part of this file --
# presumably ``from itertools import izip`` lives elsewhere; confirm.
with io.open("predict_this/text/texts1234578.csv", "r", encoding="utf-8") as csvfile:
    with io.open("predict_this/text/texts1234578_2.csv", "w", encoding="utf-8") as new_csvfile:
        header = _split_csv_line(next(csvfile))
        tag_index = header.index("tag")
        lemma_index = header.index("lemma")
        write_row(new_csvfile, header)

        # row_number counts data rows starting at 1 (the header is not counted).
        for row_number, (line, analyzed) in enumerate(izip(csvfile, analyzed_words), 1):
            row = _split_csv_line(line)

            # The first csv column (ignoring "%") must match the first element
            # of the word analysis; otherwise the two sources are out of step
            # and the script aborts.
            if row[0].replace("%", "") != analyzed[0]:
                print("line %d. %s != %s" % (row_number, row[0], analyzed[0]))
                sys.exit(1)

            # Keep only the part before the first "+" of the analyzed tag
            # (element 2) and lemma (element 1); the lemma is also passed
            # through to_ascii.
            row[tag_index] = analyzed[2].split("+")[0]
            row[lemma_index] = to_ascii(analyzed[1].split("+")[0])
            write_row(new_csvfile, row)