Пример #1
0
def corp(files):
    """Load each entry of *files* and collect it alongside its normalized form.

    Args:
        files: Iterable of values accepted by ``md.load`` (presumably file
            paths -- confirm against ``md``).

    Returns:
        A ``(res, norm)`` tuple of two parallel lists in input order:
        ``res`` holds whatever ``md.load`` returned for each entry and
        ``norm`` holds the result of ``normalize`` applied to it.
    """
    res = []
    norm = []
    for f in files:
        # Parenthesized single-argument form: prints the same text under
        # Python 2 and, unlike the bare print statement, parses on Python 3.
        print("load %s" % f)
        t = md.load(f)
        res.append(t)
        # Normalize right after loading so the per-file call order is
        # load -> normalize, exactly as callers have observed so far.
        norm.append(normalize(t))
    return (res, norm)
Пример #2
0
        cols = map(lambda x: x.strip(), row.split(','))
        if len(cols) == 0 or len(cols) == 1:
            continue
        us = cols[2:]
        us = dict(zip(map(lambda x: x.strip(), us), [1] * len(us)))
        id = cols[0]
        word = cols[1][1:-1]
        onehot = map(lambda t: 1 if t in us else 0, top)
        f.write('%s,%s,%s\n' % (id, word, ','.join(map(str, onehot))))


if __name__ == '__main__':
    # Script entry: tally the values found from the third comma-separated
    # field onward in two input files, drop blacklisted keys, and report
    # the 50 keys with the highest tallies.

    # Keys removed from the tally before ranking.
    blacklist = ['postnauka.ru', 'elementy.ru', 'm.nkj.ru', 'www.nkj.ru']
    filename1 = sys.argv[1]
    filename2 = sys.argv[2]
    data1 = md.load(filename1).split('\n')
    data2 = md.load(filename2).split('\n')
    urls = {}
    data = data1 + data2
    for row in data:
        # str.split always yields at least one field, so no emptiness guard
        # is needed; rows with fewer than three fields give an empty list.
        cols = row.split(',')
        us = cols[2:]
        # `count` is defined elsewhere; presumably it accumulates per-item
        # tallies into `urls` -- confirm. list(...) keeps the argument a
        # real list on Python 3 as well, matching what Python 2's map gave.
        count(urls, list(map(lambda x: x.strip(), us)))
    for b in blacklist:
        # pop with a default removes the key if present and is a no-op
        # otherwise.
        urls.pop(b, None)
    # Parenthesized single-argument prints behave the same on Python 2 and
    # also parse on Python 3.
    print(len(urls))
    # Keys ordered by their stored value, largest first, capped at 50.
    top = sorted(urls, key=urls.__getitem__, reverse=True)[:50]
    print(top)