def corp(files):
    """Load each file in *files* and collect the raw and normalized results.

    Every entry of *files* is announced on stdout as ``load <entry>`` and
    then read with ``md.load``.  The loaded value is kept as-is and is also
    passed through ``normalize``.

    Returns a 2-tuple ``(loaded, normalized)`` of two parallel lists, both
    in the same order as *files*.
    """
    loaded = []
    normalized = []
    for path in files:
        # Single-argument parenthesised print: emits exactly what the
        # Python 2 print statement does for this string.
        print("load %s" % path)
        text = md.load(path)
        loaded.append(text)
        normalized.append(normalize(text))
    return loaded, normalized
cols = map(lambda x: x.strip(), row.split(',')) if len(cols) == 0 or len(cols) == 1: continue us = cols[2:] us = dict(zip(map(lambda x: x.strip(), us), [1] * len(us))) id = cols[0] word = cols[1][1:-1] onehot = map(lambda t: 1 if t in us else 0, top) f.write('%s,%s,%s\n' % (id, word, ','.join(map(str, onehot)))) if __name__ == '__main__': blacklist = ['postnauka.ru', 'elementy.ru', 'm.nkj.ru', 'www.nkj.ru'] filename1 = sys.argv[1] filename2 = sys.argv[2] data1 = md.load(filename1).split('\n') data2 = md.load(filename2).split('\n') urls = {} data = data1 + data2 for row in data: cols = row.split(',') if len(cols) == 0: continue us = cols[2:] count(urls, map(lambda x: x.strip(), us)) for b in blacklist: if b in urls: del urls[b] print len(urls) top = sorted(urls, key=urls.__getitem__, reverse=True)[:50] print top