#!/usr/bin/env python
# coding: utf-8
#
# Author: Peinan ZHANG
# Created at: 2014-10-24
"""Convert tab-separated bigram counts read from stdin into ratios.

Each input line holds a count, a first word and a second word separated by
tabs.  Every count is divided by the unigram frequency of the first word and
the resulting table is printed to stdout, largest ratio first.
"""

import sys
from collections import defaultdict


def _native(value):
    """Return *value* as the interpreter's native ``str`` type.

    On Python 2 a ``unicode`` object is encoded to a UTF-8 byte string (what
    the original ``.encode('utf-8')`` call produced); on Python 3 text is
    returned unchanged and ``bytes`` are decoded, so ``%s`` formatting never
    prints a ``b'...'`` literal.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value.encode('utf-8')


def calcBigramFreq(lines, unigram_freq):
    """Divide each bigram count by the unigram frequency of its first word.

    Args:
        lines: iterable of lines, each made of a count, a first word and a
            second word separated by tabs.  Lines may be UTF-8 byte strings
            or text; surrounding whitespace is stripped and blank lines are
            skipped.
        unigram_freq: mapping indexed by the first word of each bigram; its
            value is used as the divisor.

    Returns:
        dict mapping ``'first_word\\tsecond_word'`` to
        ``float(count) / unigram_freq[first_word]``.

    Raises:
        ValueError: a non-blank line does not have exactly three fields, or
            its count is not a number.
        KeyError: the first word is missing from ``unigram_freq``.
        ZeroDivisionError: the unigram frequency of the first word is zero.
    """
    bigram_freq = {}
    for line in lines:
        line = line.strip()
        if not line:
            # e.g. a trailing empty line on stdin; nothing to count.
            continue
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        count, c_word, n_word = line.split('\t')
        bigram_freq['%s\t%s' % (c_word, n_word)] = \
            float(count) / unigram_freq[c_word]
    return bigram_freq


if __name__ == '__main__':
    # Only the script entry point needs the project helper, so it is imported
    # here; calcBigramFreq stays importable on its own.
    from q010_sort_second_wFreq import sortSecWFreq

    unigram_freq = sortSecWFreq('data/medline.txt.sent.tok')
    bigram_freq = calcBigramFreq(sys.stdin, unigram_freq)
    for k, v in sorted(bigram_freq.items(), key=lambda x: x[1], reverse=True):
        sys.stdout.write('%f\t%s\n' % (v, _native(k)))
#!/usr/bin/env python
# coding: utf-8
#
# Author: Peinan ZHANG
# Created at: 2014-10-23
"""Print the most frequent entries of a frequency table built from a file."""

import sys


def _native(value):
    """Return *value* as the interpreter's native ``str`` type.

    On Python 2 a ``unicode`` object is encoded to a UTF-8 byte string (what
    the original ``.encode('utf-8')`` call produced); on Python 3 text is
    returned unchanged and ``bytes`` are decoded, so ``%s`` formatting never
    prints a ``b'...'`` literal.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value.encode('utf-8')


def topN(freqDict, N=100, c=True):
    """Format the ``N`` highest-valued entries of ``freqDict`` as lines.

    Args:
        freqDict: mapping from a text key to a numeric value.
        N: maximum number of entries to return; zero or a negative number
            yields an empty list.
        c: when true each line is ``'%3d %s\\n' % (value, key)``; when false
            each line is the key alone followed by a newline.

    Returns:
        list of newline-terminated strings ordered by descending value.
        Entries with equal values keep the order in which ``freqDict``
        yields them.
    """
    ranked = sorted(freqDict.items(), key=lambda x: x[1], reverse=True)
    # max() keeps a negative N from turning into an "all but the last" slice.
    top = ranked[:max(N, 0)]
    if c:
        return ['%3d %s\n' % (v, _native(k)) for k, v in top]
    return ['%s\n' % _native(k) for k, v in top]


if __name__ == '__main__':
    # The helper must be in scope here: it is called before topN() runs, so
    # an import inside topN() cannot provide it.
    from q010_sort_second_wFreq import sortSecWFreq

    for item in topN(sortSecWFreq(sys.argv[1])):
        sys.stdout.write(item)
#!/usr/bin/env python
# coding: utf-8
#
# Author: Peinan ZHANG
# Created at: 2014-10-23
"""Print the most frequent character bigrams of a UTF-8 text file."""

import os
import sys


def mkBigramFile(filepath, outpath='bigram.temp'):
    """Write every character bigram of ``filepath`` to ``outpath``.

    Each input line is stripped of surrounding ASCII whitespace and decoded
    as UTF-8.  Lines of at most two characters (including empty lines) are
    copied through unchanged; for longer lines every pair of adjacent
    characters is written on its own line.

    Args:
        filepath: path of the UTF-8 encoded input file.
        outpath: path of the file to create or overwrite.

    Raises:
        IOError / OSError: either file cannot be opened.
        UnicodeDecodeError: a line is not valid UTF-8.
    """
    # Binary mode on both sides: decoding and encoding happen exactly once at
    # the I/O boundary and behave the same on Python 2 and Python 3.  Both
    # handles are closed by the ``with`` blocks.
    with open(outpath, 'wb') as bigramFile:
        with open(filepath, 'rb') as source:
            for raw in source:
                line = raw.strip().decode('utf-8')
                if len(line) <= 2:
                    bigramFile.write(line.encode('utf-8') + b'\n')
                    continue
                for first, second in zip(line, line[1:]):
                    bigramFile.write((first + second).encode('utf-8') + b'\n')


if __name__ == '__main__':
    # Only the script entry point needs the project helpers, so they are
    # imported here; mkBigramFile stays importable on its own.
    from q010_sort_second_wFreq import sortSecWFreq
    from q027_top100 import topN

    try:
        mkBigramFile(sys.argv[1])
        for item in topN(sortSecWFreq('bigram.temp')):
            sys.stdout.write(item)
    finally:
        # Remove the scratch file even when counting or printing fails.
        if os.path.exists('bigram.temp'):
            os.remove('bigram.temp')