def sub(tokens, author):
    """sub :: ([(word, POS)], String) -> [word]

    Replace every word of ``tokens`` with a randomly drawn candidate.

    For each ``(word, POS)`` pair the POS tag is ignored and
    ``synonym_freq.query`` is asked for ``(candidate, weight)`` pairs.  One
    candidate is drawn by walking the cumulative weights until they exceed a
    ``random.random()`` draw.

    The word bank is read the first time the function runs and the frequency
    table of ``author`` the first time that author is seen; both are kept in
    module-level caches.

    Returns:
        A list with one drawn candidate per input token, in input order.

    Raises:
        ValueError: if no candidate can be drawn for a word, i.e. the query
            produced no candidates or their total weight stayed at or below
            the random draw.
    """
    global _dick, _invdick, _loaded_freq_dick
    if _dick is None:
        _dick, _invdick = synonyms.read_wordbank("../data/wordbank.txt")
    if author not in _loaded_freq_dick:
        _loaded_freq_dick[author] = word_freq.load_file("../wordfreq/%s/sum.txt" % (author))  # tmp
    freq_table = _loaded_freq_dick[author]

    # Largest shortfall from 1.0 that is still treated as float rounding.
    rounding_slack = 1e-9

    def pick(word):
        candidates = synonym_freq.query(word, _dick, _invdick, freq_table)
        draw = random.random()
        cumulative = 0.0
        seen_any = False
        for candidate, weight in candidates:
            seen_any = True
            cumulative += weight
            if cumulative > draw:
                return candidate
        # NOTE(review): presumably the weights are normalised to sum to 1.
        # When rounding leaves the total a hair below 1.0 and the draw lands
        # in that gap, the last candidate is the one that should have won.
        if seen_any and cumulative >= 1.0 - rounding_slack:
            return candidate
        raise ValueError(
            "no substitute could be drawn for %r (author %r): "
            "total candidate weight %r did not exceed the random draw %r"
            % (word, author, cumulative, draw))

    return [pick(word) for word, _ in tokens]
def count(file_name, output_file=sys.stdout):
    """Print the most frequent tracked words of one frequency file.

    Loads ``file_name`` with ``word_freq.load_file``, keeps only the entries
    whose key also appears in the module-level ``words``, and writes to
    ``output_file`` a header line followed by up to 30 lines, highest
    frequency first.  Each line shows the word, ``freq/total`` and the
    resulting ratio, where ``total`` is the sum over the kept entries.
    """
    all_freqs = word_freq.load_file(file_name)

    # Restrict the loaded table to the words being tracked.
    tracked = dict((w, all_freqs[w]) for w in words if w in all_freqs)

    print >>output_file, 'File name:', file_name

    denominator = sum(tracked.values())
    ranked = sorted(tracked.items(), key=lambda pair: pair[1], reverse=True)
    for entry, occurrences in ranked[:30]:
        share = float(occurrences) / denominator
        print >>output_file, entry, '%d/%d =' % (occurrences, denominator), share
import codecs
import re
import os

import word_freq
import synonym_freq

# Author sub-directories of ../wordfreq/ that get a summed table.
dirs = ['cityup', '9knife', 'love0']

if __name__ == '__main__':
    # Input files are named <one or two digits>.txt.  The \Z anchor makes the
    # whole name match, so strays such as "1.txt~" or "3.txt.bak" are not
    # summed in (re.match alone only anchors the start).
    part_file = re.compile(r'\d{1,2}\.txt\Z')

    for author in dirs:
        summed = {}
        for subfile in os.listdir('../wordfreq/%s/' % (author)):
            if part_file.match(subfile) is None:
                continue
            part_freq = word_freq.load_file('../wordfreq/%s/%s' % (author, subfile))
            total = sum(part_freq.values())
            # Each file contributes relative frequencies, so every file
            # carries the same overall weight in the sum.
            for key in part_freq:
                weight = float(part_freq[key]) / total
                summed[key] = summed.get(key, 0.0) + weight

        # Written as "freq_dick = " followed by the dict's printed form.
        with codecs.open('../wordfreq/%s/sum.txt' % (author), 'w', encoding='utf8') as File:
            print >>File, 'freq_dick = '
            print >>File, summed