    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)
    for name in d:
        fp = "s3n://bucket-data/" + name
        p = d[name]
        data = read_song_text(sc, fp, p)
        # sparse vector from data
        data_sparse_list = data.map(lambda x:(x[0][1], x[1]))
        rdd_1 = data.map(lambda x:(x[0][0], x[1]))

        # initialize the parameters
        m, n, b, c = 1000, 1000, 25, 2
        # create the model for LSH
        model = lsh.run(data_sparse_list, p, m, n, b, c)

        print ("start printing filename %s" % name)
        # Get similar buckets
        cnt = model.buckets.count()
        # print result time taken
        timetaken = (time.time() - start_time)
        print('Found %s clusters.' % cnt)
        print("--- %s seconds ---" % timetaken)

        # store (cluster count, elapsed seconds) for this file
        d[name] = (cnt, timetaken)

    # write log file
    f = open('output.txt', 'a')
    mystr = str(time.time())
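The snippet breaks off just after opening the log file. A minimal sketch of how the write might continue, reusing the f and mystr already created above; the exact log layout is an assumption, not part of the original code:

    # assumption: one timestamp header, then one line per input file
    f.write(mystr + '\n')
    for name, (cnt, secs) in d.items():
        f.write('%s\t%s clusters\t%.2f seconds\n' % (name, cnt, secs))
    f.close()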
Example #2
File: driver.py  Project: wan/pyspark-lsh
        help = "Number of times to hash the elements. Larger numbers diversify " +
            "signatures, increasing likelihood similar vectors will be hashed together. " +
            "This is also the length of the signature. [DEFAULT: 1000]")
    parser.add_argument("-b", "--bands", type = int, default = 25,
        help = "Number of bands. Each band will have (n / b) elements. Larger " +
            "numbers of elements increase confidence in element similarity. [DEFAULT: 25]")
    parser.add_argument("-c", "--minbucketsize", type = int, default = 2,
        help = "Minimum bucket size (0 to disable). Buckets with fewer than this " +
            "number of elements will be dropped. [DEFAULT: 2]")

    args = vars(parser.parse_args())
    sc = SparkContext(conf = SparkConf())

    # Read the input data.
    print now(), 'Starting'
    raw_lines, data = read_text(sc, args['input'])
    p = 65537
    m, n, b, c = args['bins'], args['numrows'], args['bands'], args['minbucketsize']
    vector_buckets = lsh.run(data, p, m, n, b, c)

    bucket_ids = vector_buckets.map(lambda (vector, bucket): bucket).distinct()

    print now(), 'Found %s clusters.' % bucket_ids.count()

    bucket_vectors = vector_buckets.map(lambda (vector, bucket): (bucket, vector)).groupByKey()
    for (bucket, vectors) in bucket_vectors.collect():
        print 'Bucket %s' % bucket
        for vector in vectors:
            print '\tDocument %s: %s ...' % (vector, raw_lines[vector][:100])
        print '*' * 40
Example #3
    parser = argparse.ArgumentParser(description = 'Spark LSH',
        epilog = 'lol lsh', add_help = 'How to use',
        prog = 'python driver.py <arguments>')
    parser.add_argument("-i", "--input", required = True,
        help = "Input directory of text files.")

    # Optional parameters.
    parser.add_argument("-m", "--bins", type = int, default = 1000,
        help = "Number of bins into which to hash the data. Smaller numbers " +
            "increase collisions, producing larger clusters. [DEFAULT: 1000]")
    parser.add_argument("-n", "--numrows", type = int, default = 1000,
        help = "Number of times to hash the elements. Larger numbers diversify " +
            "signatures, increasing likelihood similar vectors will be hashed together. " +
            "This is also the length of the signature. [DEFAULT: 1000]")
    parser.add_argument("-b", "--bands", type = int, default = 25,
        help = "Number of bands. Each band will have (n / b) elements. Larger " +
            "numbers of elements increase confidence in element similarity. [DEFAULT: 25]")
    parser.add_argument("-c", "--minbucketsize", type = int, default = 2,
        help = "Minimum bucket size (0 to disable). Buckets with fewer than this " +
            "number of elements will be dropped. [DEFAULT: 2]")

    args = vars(parser.parse_args())
    sc = SparkContext(conf = SparkConf())

    # Read the input data.
    data = read_text(sc, args['input'])
    p = 65537
    m, n, b, c = args['bins'], args['numrows'], args['bands'], args['minbucketsize']
    model = lsh.run(data, p, m, n, b, c)
    print 'Found %s clusters.' % model.buckets.count()
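With this parser, each band covers n / b elements of the signature (40 with the defaults n = 1000 and b = 25). A typical invocation of the driver, assuming the defaults and a placeholder input directory, would be:

    python driver.py -i ./text_corpus -m 1000 -n 1000 -b 25 -c 2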
Example #4
        help="Number of times to hash the elements. Larger numbers diversify "
        +
        "signatures, increasing likelihood similar vectors will be hashed together. "
        + "This is also the length of the signature. [DEFAULT: 1000]")
    parser.add_argument(
        "-b",
        "--bands",
        type=int,
        default=25,
        help="Number of bands. Each band will have (n / b) elements. Larger " +
        "numbers of elements increase confidence in element similarity. [DEFAULT: 25]"
    )
    parser.add_argument(
        "-c",
        "--minbucketsize",
        type=int,
        default=2,
        help="Minimum bucket size (0 to disable). Buckets with fewer than this "
        + "number of elements will be dropped. [DEFAULT: 2]")

    args = vars(parser.parse_args())
    sc = SparkContext(conf=SparkConf())

    # Read the input data.
    data = read_text(sc, args['input'])
    p = 65537
    m, n, b, c = args['bins'], args['numrows'], args['bands'], args[
        'minbucketsize']
    model = lsh.run(data, p, m, n, b, c)
    print 'Found %s clusters.' % model.buckets.count()