Example #1
File: graph.py  Project: fulaphex/apollo
def evaluate_communities(args):
    log = logging.getLogger("evalcc")
    model = CommunitiesModel().load(args.input)
    patch_tables(args)
    spark = create_spark("evalcc-%s" % uuid4(), **args.__dict__)
    log.info("Preparing the communities' RDD")
    items = []
    for i, c in progress_bar(enumerate(model.communities), log,
                             expected_size=len(model.communities)):
        for m in c:
            if m < len(model.id_to_element):
                items.append(Row(sha1=model.id_to_element[m], community=i))
    log.info("Running")
    items_in_spark = spark.sparkContext.parallelize(items).toDF()
    bags = spark \
        .read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=args.tables["bags"], keyspace=args.keyspace) \
        .load()
    log.info("Loaded the bags, calculating the vocabulary")
    vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect()
    vocabulary = {v: i for i, v in enumerate(vocabulary)}
    log.info("Vocabulary size: %d", len(vocabulary))
    element_to_id = {e: i for i, e in enumerate(model.id_to_element)}
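    # Join the community assignments with the bags on sha1, group the
    # (element id, token id, value) triples by community, evaluate each community, and
    # sum the four resulting metric components element-wise.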
    metrics = items_in_spark.join(bags, "sha1").rdd \
        .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \
        .groupByKey() \
        .map(CommunityEvaluator(args.threshold, len(vocabulary))) \
        .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)])
    log.info("Total misses: %d", metrics[0])
    log.info("Average normalized misses: %f", metrics[1] / len(model.communities))
    log.info("Total loss: %f", metrics[2])
    log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities)))
Example #2
File: graph.py  Project: warenlg/apollo
def evaluate_communities(args):
    log = logging.getLogger("evalcc")
    model = CommunitiesModel().load(args.input)
    patch_tables(args)
    spark = create_spark("evalcc-%s" % uuid4(), **args.__dict__)
    log.info("Preparing the communities' RDD")
    items = []
    for i, c in progress_bar(enumerate(model.communities), log,
                             expected_size=len(model.communities)):
        for m in c:
            if m < len(model.id_to_element):
                items.append(Row(sha1=model.id_to_element[m], community=i))
    log.info("Running")
    items_in_spark = spark.sparkContext.parallelize(items).toDF()
    bags = spark \
        .read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=args.tables["bags"], keyspace=args.keyspace) \
        .load()
    log.info("Loaded the bags, calculating the vocabulary")
    vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect()
    vocabulary = {v: i for i, v in enumerate(vocabulary)}
    log.info("Vocabulary size: %d", len(vocabulary))
    element_to_id = {e: i for i, e in enumerate(model.id_to_element)}
    metrics = items_in_spark.join(bags, "sha1").rdd \
        .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \
        .groupByKey() \
        .map(CommunityEvaluator(args.threshold, len(vocabulary))) \
        .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)])
    log.info("Total misses: %d", metrics[0])
    log.info("Average normalized misses: %f", metrics[1] / len(model.communities))
    log.info("Total loss: %f", metrics[2])
    log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities)))
Example #3
    def setUp(self):
        data = [Row(to_index="to_index%d" % i, value=i) for i in range(10)]
        self.data = data
        self.sc = create_spark("test")
        self.data_rdd = self.sc.sparkContext \
            .parallelize(range(len(data))) \
            .map(lambda x: data[x])
Example #4
def create_engine(session_name,
                  repositories,
                  repository_format=EngineDefault.REPOSITORY_FORMAT,
                  bblfsh=EngineDefault.BBLFSH,
                  engine=EngineDefault.VERSION,
                  config=SparkDefault.CONFIG,
                  packages=SparkDefault.JAR_PACKAGES,
                  spark=SparkDefault.MASTER_ADDRESS,
                  spark_local_dir=SparkDefault.LOCAL_DIR,
                  spark_log_level=SparkDefault.LOG_LEVEL,
                  dep_zip=SparkDefault.DEP_ZIP,
                  memory=SparkDefault.MEMORY):

    config += (get_bblfsh_dependency(bblfsh), )
    packages += (get_engine_package(engine), )
    session = create_spark(session_name,
                           spark=spark,
                           spark_local_dir=spark_local_dir,
                           config=config,
                           packages=packages,
                           spark_log_level=spark_log_level,
                           dep_zip=dep_zip,
                           memory=memory)
    logging.getLogger("engine").info("Initializing engine on %s", repositories)
    return Engine(session, repositories, repository_format)
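A minimal usage sketch, not taken from any of the projects above: the session name and repository path are hypothetical, and every other parameter falls back to the EngineDefault/SparkDefault values shown in the signature.

# Hypothetical call: only the two required arguments are supplied explicitly.
engine = create_engine("engine-demo", "/data/repositories")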
Example #5
File: __init__.py  Project: sniperkit/ml
def create_spark_for_test(name="test"):
    config = []
    packages = []
    bblfsh = "localhost"
    engine = get_engine_version()
    add_engine_dependencies(engine=engine, config=config, packages=packages)
    add_bblfsh_dependencies(bblfsh=bblfsh, config=config)
    return create_spark(name, config=config, packages=packages)
Example #6
    def test_error(self):
        with self.assertRaises(ValueError):
            create_or_load_ordered_df(argparse.Namespace(docfreq_in=None), 10,
                                      None)

        with self.assertRaises(ValueError):
            session = create_spark("test_df_util")
            uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
                .link(Moder("file")) \
                .link(UastRow2Document()) \
                .link(UastDeserializer()) \
                .link(Uast2BagFeatures(IdentifiersBagExtractor()))
            create_or_load_ordered_df(argparse.Namespace(docfreq_in=None),
                                      None, uast_extractor)
Example #7
    def test_create(self):
        session = create_spark("test_df_util")
        uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
            .link(UastRow2Document())
        ndocs = uast_extractor.link(Counter()).execute()
        uast_extractor = uast_extractor.link(UastDeserializer()) \
            .link(Uast2BagFeatures([IdentifiersBagExtractor()]))
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = os.path.join(tmpdir, "df.asdf")
            args = argparse.Namespace(docfreq_in=None, docfreq_out=tmp_path, min_docfreq=1,
                                      vocabulary_size=1000)
            df_model = create_or_load_ordered_df(args, ndocs, uast_extractor)
            self.assertEqual(df_model.docs, ndocs)
            self.assertTrue(os.path.exists(tmp_path))
Example #8
File: basic.py  Project: absognety/ml
def create_parquet_loader(session_name, repositories,
                          config=SparkDefault.CONFIG,
                          packages=SparkDefault.PACKAGES,
                          spark=SparkDefault.MASTER_ADDRESS,
                          spark_local_dir=SparkDefault.LOCAL_DIR,
                          spark_log_level=SparkDefault.LOG_LEVEL,
                          memory=SparkDefault.MEMORY,
                          dep_zip=False):
    config = assemble_spark_config(config=config, memory=memory)
    session = create_spark(session_name, spark=spark, spark_local_dir=spark_local_dir,
                           config=config, packages=packages, spark_log_level=spark_log_level,
                           dep_zip=dep_zip)
    log = logging.getLogger("parquet")
    log.info("Initializing on %s", repositories)
    parquet = ParquetLoader(session, repositories)
    return parquet
Example #9
File: test_tfidf.py  Project: absognety/ml
    def setUp(self):
        self.sc = create_spark("test")

        df = DocumentFrequencies().construct(10,
                                             {str(i): i
                                              for i in range(1, 5)})
        self.tfidf = TFIDF(df=df)

        class Columns:
            """
            Stores column names for return value.
            """
            token = "t"
            document = "d"
            value = "v"

        self.tfidf.Columns = Columns
Example #10
    def test_create(self):
        session = create_spark("test_quant_util")
        extractor = ChildrenBagExtractor()
        with tempfile.NamedTemporaryFile(mode="r+b",
                                         suffix="-quant.asdf") as tmp:
            path = tmp.name
            uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
                .link(Moder("file")) \
                .link(UastRow2Document()) \
                .link(UastDeserializer())
            create_or_apply_quant(path, [extractor], uast_extractor)
            self.assertIsNotNone(extractor.levels)
            self.assertTrue(os.path.exists(path))
            model_levels = QuantizationLevels().load(
                source=path)._levels["children"]
            for key in model_levels:
                self.assertListEqual(list(model_levels[key]),
                                     list(extractor.levels[key]))
Example #11
File: graph.py  Project: warenlg/apollo
def detect_communities(args):
    log = logging.getLogger("cmd")
    ccsmodel = ConnectedComponentsModel().load(args.input)
    log.info("Building the connected components")
    ccs = defaultdict(list)
    for i, c in enumerate(ccsmodel.id_to_cc):
        ccs[c].append(i)
    buckmat = ccsmodel.id_to_buckets
    buckindices = buckmat.indices
    buckindptr = buckmat.indptr
    total_nvertices = buckmat.shape[0]
    linear = args.edges in ("linear", "1")
    graphs = []
    communities = []
    if not linear:
        log.info("Transposing the matrix")
        buckmat_csc = buckmat.T.tocsr()
    fat_ccs = []
    for vertices in ccs.values():
        if len(vertices) == 1:
            continue
        if len(vertices) == 2:
            communities.append(vertices)
            continue
        fat_ccs.append(vertices)
    log.info("Building %d graphs", len(fat_ccs))
    for vertices in progress_bar(fat_ccs, log, expected_size=len(fat_ccs)):
        if linear:
            edges = []
            weights = []
            bucket_weights = buckmat.sum(axis=0)
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    bucket = buckindices[j]
                    weights.append(bucket_weights[0, bucket])
                    bucket += total_nvertices
                    buckets.add(bucket)
                    edges.append((str(i), str(bucket)))
        else:
            edges = set()
            weights = None
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    buckets.add(buckindices[j])
            for bucket in buckets:
                buckverts = \
                    buckmat_csc.indices[buckmat_csc.indptr[bucket]:buckmat_csc.indptr[bucket + 1]]
                for i, x in enumerate(buckverts):
                    for y in buckverts:
                        if x < y:
                            edges.add((str(x), str(y)))
            buckets.clear()
            edges = list(edges)
        graph = Graph(directed=False)
        graph.add_vertices(list(map(str, vertices + list(buckets))))
        graph.add_edges(edges)
        graph.edge_weights = weights
        graphs.append(graph)
    log.info("Launching the community detection")
    detector = CommunityDetector(algorithm=args.algorithm, config=args.params)
    if not args.no_spark:
        spark = create_spark("cmd-%s" % uuid4(), **args.__dict__).sparkContext
        communities.extend(spark.parallelize(graphs).flatMap(detector).collect())
    else:
        communities.extend(chain.from_iterable(progress_bar(
            (detector(g) for g in graphs), log, expected_size=len(graphs))))
    log.info("Overall communities: %d", len(communities))
    log.info("Average community size: %.1f", numpy.mean([len(c) for c in communities]))
    log.info("Median community size: %.1f", numpy.median([len(c) for c in communities]))
    log.info("Max community size: %d", max(map(len, communities)))
    log.info("Writing %s", args.output)
    CommunitiesModel().construct(communities, ccsmodel.id_to_element).save(args.output)
Example #12
    def setUp(self):
        self.sc = create_spark("test")
        self.bag2tf = BagFeatures2TermFreq()
Example #13
File: hasher.py  Project: fulaphex/apollo
def hash_batches(args):
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))

    # Check batches
    if not loader:
        return

    htnum, band_size = calc_hashtable_params(
        args.threshold, args.size, args.false_positive_weight, args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    spark_args = filter_kwargs(args.__dict__, create_spark)
    spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
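        # The MinHash generator is created lazily on the first batch, once the vocabulary size
        # is known; if a parameters file already exists its variables are loaded into the
        # generator, otherwise freshly sampled parameters are saved to args.params.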
        for i, bow in enumerate(loader):
            if voc_size is None:
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(
                    voc_size, args.size, seed=args.seed, devices=args.devices,
                    verbosity=args.mhc_verbosity,
                    deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError("The vocabulary sizes do not match: %d != %d"
                                 % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        libMHCUDA.minhash_cuda_fini(gen)
Example #14
File: __init__.py  Project: pareion/ml
def create_spark_for_test(name="test"):
    packages = (get_engine_package(get_engine_version()), )
    config = (get_bblfsh_dependency("localhost"), )
    return create_spark(name, config=config, packages=packages)
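A sketch of how such a helper might be consumed in a test case, not taken from any of the projects above; it assumes create_spark returns a PySpark session exposing sparkContext, as the other examples here suggest, and the test class and assertion are hypothetical.

import unittest


class SparkHelperTest(unittest.TestCase):
    def setUp(self):
        # Hypothetical test reusing the helper defined above.
        self.session = create_spark_for_test()

    def test_parallelize(self):
        rdd = self.session.sparkContext.parallelize(range(3))
        self.assertEqual(rdd.count(), 3)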
Example #15
def hash_batches(args):
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))

    # Check batches
    if not loader:
        return

    htnum, band_size = calc_hashtable_params(args.threshold, args.size,
                                             args.false_positive_weight,
                                             args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    spark = create_spark("hash-%s" % uuid4(), **args.__dict__).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(voc_size,
                                                  args.size,
                                                  seed=args.seed,
                                                  devices=args.devices,
                                                  verbosity=args.mhc_verbosity,
                                                  deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs,
                                                       model.ln_cs,
                                                       model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(
                        args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError(
                    "The vocabulary sizes do not match: %d != %d" %
                    (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(
                htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        libMHCUDA.minhash_cuda_fini(gen)
Example #16
    def setUp(self):
        self.sc = create_spark("test")
Example #17
    def setUp(self):
        self.sc = create_spark("test")
        self.bag2df = BagFeatures2DocFreq()
Example #18
File: graph.py  Project: fulaphex/apollo
def detect_communities(args):
    log = logging.getLogger("cmd")
    ccsmodel = ConnectedComponentsModel().load(args.input)
    log.info("Building the connected components")
    ccs = defaultdict(list)
    for i, c in enumerate(ccsmodel.id_to_cc):
        ccs[c].append(i)
    buckmat = ccsmodel.id_to_buckets
    buckindices = buckmat.indices
    buckindptr = buckmat.indptr
    total_nvertices = buckmat.shape[0]
    linear = args.edges in ("linear", "1")
    graphs = []
    communities = []
    if not linear:
        log.info("Transposing the matrix")
        buckmat_csc = buckmat.T.tocsr()
    fat_ccs = []
    for vertices in ccs.values():
        if len(vertices) == 1:
            continue
        if len(vertices) == 2:
            communities.append(vertices)
            continue
        fat_ccs.append(vertices)
    log.info("Building %d graphs", len(fat_ccs))
    for vertices in progress_bar(fat_ccs, log, expected_size=len(fat_ccs)):
        if linear:
            edges = []
            weights = []
            bucket_weights = buckmat.sum(axis=0)
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    bucket = buckindices[j]
                    weights.append(bucket_weights[0, bucket])
                    bucket += total_nvertices
                    buckets.add(bucket)
                    edges.append((str(i), str(bucket)))
        else:
            edges = set()
            weights = None
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    buckets.add(buckindices[j])
            for bucket in buckets:
                buckverts = \
                    buckmat_csc.indices[buckmat_csc.indptr[bucket]:buckmat_csc.indptr[bucket + 1]]
                for i, x in enumerate(buckverts):
                    for y in buckverts:
                        if x < y:
                            edges.add((str(x), str(y)))
            buckets.clear()
            edges = list(edges)
        graph = Graph(directed=False)
        graph.add_vertices(list(map(str, vertices + list(buckets))))
        graph.add_edges(edges)
        graph.edge_weights = weights
        graphs.append(graph)
    log.info("Launching the community detection")
    detector = CommunityDetector(algorithm=args.algorithm, config=args.params)
    if not args.no_spark:
        spark = create_spark("cmd-%s" % uuid4(), **args.__dict__).sparkContext
        communities.extend(spark.parallelize(graphs).flatMap(detector).collect())
    else:
        communities.extend(chain.from_iterable(progress_bar(
            (detector(g) for g in graphs), log, expected_size=len(graphs))))
    log.info("Overall communities: %d", len(communities))
    log.info("Average community size: %.1f", numpy.mean([len(c) for c in communities]))
    log.info("Median community size: %.1f", numpy.median([len(c) for c in communities]))
    log.info("Max community size: %d", max(map(len, communities)))
    log.info("Writing %s", args.output)
    CommunitiesModel().construct(communities, ccsmodel.id_to_element).save(args.output)
Example #19
def create_spark_for_test(name="test"):
    if sys.version_info >= (3, 7):
        raise SkipTest("Python 3.7 is not yet supported.")
    packages = (get_engine_package(get_engine_version()), )
    config = (get_bblfsh_dependency("localhost"), )
    return create_spark(name, config=config, packages=packages)