def fic2text(ident):
    """Return the tag names and the ASCII-only text of one fic.

    The text segments stored under ``data['fics']`` for *ident* are
    concatenated, each followed by a single space. Non-breaking spaces are
    turned into ordinary spaces and every other non-ASCII character is
    dropped. The Flesch reading-ease and Flesch-Kincaid grade scores of the
    resulting text are printed together with *ident*.

    Args:
        ident: Key identifying the fic in ``data['fics']`` / ``data['base']``.

    Returns:
        A ``(tags, rtext)`` tuple: ``tags`` is the list of every tag's
        ``"name"`` value across all genres, ``rtext`` the concatenated text.
    """
    # NOTE(review): SOURCE also contains a later ``fic2text(ident, master)``;
    # if both definitions live in the same module, the later one replaces
    # this one at import time -- confirm which is intended.
    textsegs = Loader.get_field(data['fics'], ident, 'fic')
    rtags = Loader.get_field(data['base'], ident, 'tags')

    # U+00A0 is mapped to a plain space *before* the ASCII round-trip so it
    # survives as whitespace instead of being silently dropped.
    # No punctuation padding or whitespace collapsing is applied: the text
    # is otherwise passed through as stored.
    rtext = "".join(
        line.replace(u'\xa0', ' ').encode('ascii', 'ignore').decode('ascii') + " "
        for line in textsegs
    )

    # rtags maps a genre to a list of tag records; only the names are kept.
    tags = [el["name"] for genre in rtags for el in rtags[genre]]

    reading_ease = textstat.flesch_reading_ease(rtext)
    reading_level = textstat.flesch_kincaid_grade(rtext)
    print(ident, reading_ease, reading_level)
    return tags, rtext
def fic2text(ident, master):
    """Return the cleaned, processed text of one fic as a single string.

    Each text segment stored under ``data['fics']`` for *ident* is passed
    through ``clean_line`` and then ``proc_line``; the results are
    concatenated with no separator.

    Args:
        ident: Key identifying the fic in ``data['fics']``.
        master: Not read by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The concatenated processed text.
    """
    textsegs = Loader.get_field(data['fics'], ident, 'fic')
    return "".join(proc_line(clean_line(line)) for line in textsegs)