Exemplo n.º 1
0
def similar_top_opt3(vec, words, topn=200, nthreads=4, freq=None):
    """Compute the top-*topn* nearest neighbours of each word in *words*.

    Parameters
    ----------
    vec : word2vec-style model exposing ``vocab``, ``syn0norm`` and
        ``init_sims()``.
    words : iterable of str; words missing from the vocabulary are skipped.
    topn : int, number of neighbours kept per word.
    nthreads : int, worker threads for the distances-to-neighbours step.
    freq : optional array-like; when given, similarities are re-weighted
        by ``log(freq)``.

    Returns
    -------
    Mapping from word to its neighbour list, as produced by
    ``dists2neighbours`` (an ``OrderedDict`` in the threaded path).
    """
    # Ensure the unit-normalised vectors (syn0norm) exist.
    vec.init_sims()

    indices = [vec.vocab[w].index for w in words if w in vec.vocab]
    vecs = vec.syn0norm[indices]
    # Cosine similarity of the selected words against the whole vocabulary.
    dists = np.dot(vecs, vec.syn0norm.T)

    if freq is not None:
        # Frequency re-weighting; assumes freq broadcasts along the
        # vocabulary axis -- TODO confirm against callers.
        dists = dists * np.log(freq)

    if nthreads == 1:
        res = dists2neighbours(vec, dists, indices, topn)
    else:
        batchsize = int(ceil(1. * len(indices) / nthreads))
        # stderr.write() works on both Python 2 and 3, unlike the former
        # Python-2-only ``print >> stderr`` statement.
        stderr.write("dists2neighbours for %d words in %d threads, batchsize=%d\n" % (
            len(indices), nthreads, batchsize))

        def ppp(i):
            # Process one contiguous batch of distance rows / word indices.
            return dists2neighbours(vec, dists[i:i + batchsize],
                                    indices[i:i + batchsize], topn)

        lres = parallel_map(ppp,
                            range(0, len(indices), batchsize),
                            threads=nthreads)
        # Merge the per-batch result dicts, preserving batch order.
        res = OrderedDict()
        for lr in lres:
            res.update(lr)

    return res
Exemplo n.º 2
0
def similar_top_opt3(vec, words, topn=200, nthreads=4, freq=None):
    """Compute the top-*topn* nearest neighbours of each word in *words*.

    Parameters
    ----------
    vec : word2vec-style model exposing ``vocab``, ``syn0norm`` and
        ``init_sims()``.
    words : iterable of str; words missing from the vocabulary are skipped.
    topn : int, number of neighbours kept per word.
    nthreads : int, worker threads for the distances-to-neighbours step.
    freq : optional array-like; when given, similarities are re-weighted
        by ``log(freq)``.

    Returns
    -------
    Mapping from word to its neighbour list, as produced by
    ``dists2neighbours`` (an ``OrderedDict`` in the threaded path).

    Note: leftover debug ``print`` statements (shape tracing) were removed
    to match the sibling implementation of this function.
    """
    # Ensure the unit-normalised vectors (syn0norm) exist.
    vec.init_sims()

    indices = [vec.vocab[w].index for w in words if w in vec.vocab]
    vecs = vec.syn0norm[indices]
    # Cosine similarity of the selected words against the whole vocabulary.
    dists = np.dot(vecs, vec.syn0norm.T)

    if freq is not None:
        # Frequency re-weighting; assumes freq broadcasts along the
        # vocabulary axis -- TODO confirm against callers.
        dists = dists * np.log(freq)

    if nthreads == 1:
        res = dists2neighbours(vec, dists, indices, topn)
    else:
        batchsize = int(ceil(1. * len(indices) / nthreads))
        # stderr.write() works on both Python 2 and 3, unlike the former
        # Python-2-only ``print >> stderr`` statement.
        stderr.write("dists2neighbours for %d words in %d threads, batchsize=%d\n" % (
            len(indices), nthreads, batchsize))

        def ppp(i):
            # Process one contiguous batch of distance rows / word indices.
            return dists2neighbours(vec, dists[i:i + batchsize],
                                    indices[i:i + batchsize], topn)

        lres = parallel_map(ppp,
                            range(0, len(indices), batchsize),
                            threads=nthreads)
        # Merge the per-batch result dicts, preserving batch order.
        res = OrderedDict()
        for lr in lres:
            res.update(lr)

    return res
Exemplo n.º 3
0
def similar_top_opt3(wvectors, cvectors, words, topn=200, nthreads=4):
    """Compute, for each word in *words*, its top-*topn* nearest context
    vectors: word vectors come from *wvectors*, the similarity targets
    from *cvectors*.

    Parameters
    ----------
    wvectors, cvectors : word2vec-style models exposing ``vocab``,
        ``syn0norm`` and ``init_sims()``.
    words : iterable of str; words missing from ``wvectors.vocab`` are
        skipped.
    topn : int, number of neighbours kept per word.
    nthreads : int, worker threads for the distances-to-neighbours step.

    Returns
    -------
    Mapping from word to its neighbour list, as produced by
    ``dists2neighbours`` (an ``OrderedDict`` in the threaded path).
    """
    # Ensure unit-normalised vectors exist on both models.
    wvectors.init_sims()
    cvectors.init_sims()

    indices = [wvectors.vocab[w].index for w in words if w in wvectors.vocab]
    wvecs = wvectors.syn0norm[indices]
    # Similarity of the selected word vectors against all context vectors.
    dists = np.dot(wvecs, cvectors.syn0norm.T)

    if nthreads == 1:
        res = dists2neighbours(wvectors, cvectors, dists, indices, topn)
    else:
        batchsize = int(ceil(1. * len(indices) / nthreads))
        # stderr.write() works on both Python 2 and 3, unlike the former
        # Python-2-only ``print >> stderr`` statement.
        stderr.write("dists2neighbours for %d words in %d threads, batchsize=%d\n" % (
            len(indices), nthreads, batchsize))

        def ppp(i):
            # Process one contiguous batch of distance rows / word indices.
            return dists2neighbours(wvectors, cvectors,
                                    dists[i:i + batchsize],
                                    indices[i:i + batchsize], topn)

        lres = parallel_map(ppp,
                            range(0, len(indices), batchsize),
                            threads=nthreads)
        # Merge the per-batch result dicts, preserving batch order.
        res = OrderedDict()
        for lr in lres:
            res.update(lr)

    return res
Exemplo n.º 4
0
                print(pmid, colored("ya descargado", "yellow"))


if __name__ == "__main__":
    # Command-line entry point: read a list of articles from the input file
    # and process each line, optionally in parallel.
    parser = argparse.ArgumentParser()
    # "entrada" = input file; argparse opens it directly via type=open.
    parser.add_argument("entrada", help="Archivo de entrada", type=open)
    parser.add_argument("-p",
                        "--parallel",
                        help="Ejecuta en forma paralela",
                        action="store_true")
    parser.add_argument("--super1",
                        help="Usa el Super Scraper 1 (Sci-hub)",
                        action="store_true")
    parser.add_argument("--super2",
                        help="Usa el Super Scraper 2 (Libgen)",
                        action="store_true")
    args = parser.parse_args()

    # NOTE(review): `downloaded`, `use_super1` and `use_super2` are not read
    # anywhere in this visible block -- presumably process_line() reads them
    # as module globals; confirm before renaming or removing them.
    downloaded = load_downloaded()

    in_parallel = args.parallel
    use_super1 = args.super1
    use_super2 = args.super2

    f = args.entrada
    if in_parallel:
        # Fan each input line out to the project's parallel_map helper.
        parallel.parallel_map(process_line, f.readlines())
    else:
        for line in f:
            process_line(line)
Exemplo n.º 5
0
socket.setdefaulttimeout(timeout)

def _do_crawl(url, args):
    try:
        import urllib2
        data = urllib2.urlopen(url).read()
        return True, data
    except:
        return False, None

def callback(url, ret, args, data):
    """Write one crawl result to the output file held in ``args['output']``.

    On success (*ret* truthy) the page body has its newlines flattened to
    spaces and is written as one ``url<TAB>body`` line; failures are
    silently skipped.
    """
    if ret:
        fp = args['output']
        # Flatten the body to a single line so one record == one line.
        data = ' '.join(data.split('\n'))
        # fp.write() replaces the Python-2-only ``print >>fp`` statement
        # and behaves identically (print appended the trailing newline).
        fp.write(url + '\t' + data + '\n')

if __name__ == '__main__':
    # CLI entry point: crawl every URL in `urllist`, writing results to
    # `outputfile` using `threadnum` worker threads.
    import sys
    if len(sys.argv) != 4:
        # sys.stdout.write() works on both Python 2 and 3, unlike the
        # Python-2-only ``print`` statement used here before.
        sys.stdout.write('usage: prog urllist outputfile threadnum\n')
        sys.exit(1)
    urllist = sys.argv[1]
    outputfile = sys.argv[2]
    threadnum = int(sys.argv[3])
    # One URL per line; ``with`` guarantees the list file is closed
    # (the original leaked the handle returned by open()).
    with open(urllist) as listfile:
        l = [x.strip() for x in listfile]
    args = {}
    # ``with`` guarantees the output file is flushed and closed even if
    # the crawl raises, replacing the manual fp.close().
    with open(outputfile, 'w') as fp:
        args['output'] = fp
        parallel_map(l, _do_crawl, callback, args, threadnum)