Пример #1
0
def sub(tokens, author):
    """sub :: ([(word, POS)], String) -> [word]

    Replace each token's word with a candidate picked at random from the
    weighted candidates returned by ``synonym_freq.query``.

    The word bank and the per-author frequency table are loaded lazily on
    first use and cached in the module-level globals ``_dick``, ``_invdick``
    and ``_loaded_freq_dick``.

    Args:
        tokens: iterable of ``(word, POS)`` pairs; the POS element is ignored.
        author: author name; selects ``../wordfreq/<author>/sum.txt``.

    Returns:
        A list holding one chosen word per input token, in input order.

    Raises:
        ValueError: if the query yields no candidates at all for a word.
    """

    global _dick, _invdick, _loaded_freq_dick

    # Load the word bank only once per process.
    if _dick is None:
        _dick, _invdick = synonyms.read_wordbank("../data/wordbank.txt")

    # Load each author's frequency table only once.
    if author not in _loaded_freq_dick:
        _loaded_freq_dick[author] = word_freq.load_file("../wordfreq/%s/sum.txt" % (author))  # tmp
    freq_dick = _loaded_freq_dick[author]

    def subsub(word):
        """Pick one candidate for ``word`` by walking the cumulative weights."""
        candidates = synonym_freq.query(word, _dick, _invdick, freq_dick)
        d = random.random()

        acc = 0.0
        last = None
        seen_any = False
        for c, w in candidates:
            acc += w
            if acc > d:
                return c
            last = c
            seen_any = True

        # NOTE(review): this assumes the weights are meant to sum to 1 --
        # confirm against synonym_freq.query.  Float rounding can leave the
        # running total a hair below the draw, so fall back to the final
        # candidate rather than failing.
        if seen_any:
            return last

        raise ValueError("no substitution candidates for %r" % (word,))

    return [subsub(word) for word, _ in tokens]
Пример #2
0
    def count(file_name, output_file = sys.stdout):
        """Report the most frequent entries of ``words`` found in a frequency file.

        Loads the table with ``word_freq.load_file``, keeps only the entries
        whose key appears in ``words`` (a name taken from the surrounding
        scope), and prints the file name followed by up to 30 lines of the
        form ``word freq/total = ratio``, highest frequency first.

        Args:
            file_name: path handed to ``word_freq.load_file``.
            output_file: file-like object the report is printed to.
        """
        loaded = word_freq.load_file(file_name)

        # Restrict the loaded table to the words we care about.
        selected = dict((w, loaded[w]) for w in words if w in loaded)

        print >>output_file, 'File name:', file_name
        total = sum(selected.values())

        # Highest frequency first; only the top 30 are reported.
        ranked = sorted(selected.items(), key=lambda pair: pair[1], reverse=True)
        for word, freq in ranked[:30]:
            print >>output_file, word, '%d/%d =' % (freq, total), float(freq) / total
Пример #3
0
import codecs
import re
import os

import word_freq
import synonym_freq

# Author directories under ../wordfreq/ that get processed.
dirs = ['cityup', '9knife', 'love0']

if __name__ == '__main__':
    # For each author, add up the relative frequencies from every numbered
    # table in the author's directory and dump the result to sum.txt.
    for author in dirs:
        merged = {}
        for entry in os.listdir('../wordfreq/%s/' % (author)):
            # Only names that start with one or two digits followed by
            # ".txt" are treated as per-file frequency tables.
            if re.match(r'\d{1,2}\.txt', entry) is not None:
                table = word_freq.load_file('../wordfreq/%s/%s' % (author, entry))

                # Normalise by the file's total so each file contributes
                # relative, not absolute, frequencies.
                total = sum(table.values())

                for key in table:
                    share = float(table[key]) / total
                    try:
                        merged[key] += share
                    except KeyError:
                        merged[key] = share

        with codecs.open('../wordfreq/%s/sum.txt' % (author), 'w', encoding='utf8') as out:
            print >>out, 'freq_dick = '
            print >>out, merged