Example #1
File: basic.py  Project: sniperkit/ml
def create_uast_source(args,
                       session_name,
                       select=HeadFiles,
                       language_selector=None,
                       extract_uast=True):
    if args.parquet:
        parquet_loader_args = filter_kwargs(args.__dict__,
                                            create_parquet_loader)
        start_point = create_parquet_loader(session_name,
                                            **parquet_loader_args)
        root = start_point
        if extract_uast and "uast" not in [
                col.name for col in start_point.execute().schema
        ]:
            raise ValueError("The parquet files do not contain UASTs.")
    else:
        engine_args = filter_kwargs(args.__dict__, create_engine)
        root = create_engine(session_name, **engine_args)
        if language_selector is None:
            language_selector = LanguageSelector(languages=args.languages)
        start_point = Ignition(root, explain=args.explain) \
            .link(select()) \
            .link(language_selector)
        if extract_uast:
            start_point = start_point.link(UastExtractor())
    return root, start_point
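
A minimal usage sketch for create_uast_source, assuming it is imported alongside the helpers it calls. Only parquet, languages and explain are read directly in the snippet; the repositories attribute is an assumption about what create_engine() additionally needs (mirroring the positional "/tmp" in the warmup example at the end of this page), so treat this as an illustration rather than the project's actual CLI wiring.

import argparse

# Hypothetical caller (sketch): build a UAST extraction pipeline.
args = argparse.Namespace(
    parquet=False,                   # take the engine branch instead of reading parquet files
    repositories="/path/to/repos",   # assumed create_engine() parameter, passed via filter_kwargs
    languages=["Python"],            # forwarded to LanguageSelector
    explain=False,                   # forwarded to Ignition
)
root, start_point = create_uast_source(args, "uast-demo")
# Further transformers would be chained with .link(...), exactly as the snippet itself does.
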
Example #2
def create_file_source(args: argparse.Namespace, session_name: str):
    if args.parquet:
        parquet_loader_args = filter_kwargs(args.__dict__, create_parquet_loader)
        root = create_parquet_loader(session_name, **parquet_loader_args)
        file_source = root.link(LanguageSelector.maybe(languages=args.languages,
                                                       blacklist=args.blacklist))
    else:
        engine_args = filter_kwargs(args.__dict__, create_engine)
        root = Ignition(create_engine(session_name, **engine_args), explain=args.explain)
        file_source = root.link(DzhigurdaFiles(args.dzhigurda))
        if args.languages is not None:
            file_source = file_source \
                .link(LanguageExtractor()) \
                .link(LanguageSelector(languages=args.languages, blacklist=args.blacklist))

    return root, file_source
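
A sketch of calling this variant, for comparison with Example #1. dzhigurda and blacklist are read unconditionally in the engine branch; every attribute not visible in the snippet (such as repositories) is an assumption.

import argparse

# Hypothetical caller (sketch); attributes not read in the snippet are assumptions.
args = argparse.Namespace(
    parquet=False,
    repositories="/path/to/repos",   # assumed create_engine() input, as in Example #1
    languages=["Java", "Python"],
    blacklist=False,                 # treat the languages list as a whitelist
    dzhigurda=0,                     # forwarded to DzhigurdaFiles (presumably: how far back from HEAD to go)
    explain=False,
)
root, file_source = create_file_source(args, "files-demo")
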
Example #3
File: merge_coocc.py  Project: sniperkit/ml
def merge_coocc_spark(df, filepaths, log, args):
    session_name = "merge_coocc-%s" % uuid4()
    session = create_spark(session_name, **filter_kwargs(args.__dict__, create_spark))
    spark_context = session.sparkContext
    global_index = spark_context.broadcast(df.order)

    coocc_rdds = []

    def local_to_global(local_index):
        """
        Converts token index of co-occurrence matrix to the common index.
        For example index, 5 correspond to `get` token for a current model.
        And `get` have index 7 in the result.
        So we convert 5 to `get` via tokens list and `get` to 7 via global_index mapping.
        If global_index do not have `get` token, it returns -1.
        """
        return global_index.value.get(tokens.value[local_index], -1)

    for path, coocc in load_and_check(filepaths, log):
        rdd = coocc.matrix_to_rdd(spark_context)  # rdd structure: ((row, col), weight)
        log.info("Broadcasting tokens order for %s model...", path)
        tokens = spark_context.broadcast(coocc.tokens)
        coocc_rdds.append(
            rdd.map(lambda row: ((local_to_global(row[0][0]),
                                  local_to_global(row[0][1])),
                                 np.uint32(row[1])))
               .filter(lambda row: row[0][0] >= 0))

    log.info("Calculating the union of cooccurrence matrices...")
    rdd = spark_context \
        .union(coocc_rdds) \
        .reduceByKey(lambda x, y: min(MAX_INT32, x + y))
    CooccModelSaver(args.output, df)(rdd)
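
The index remapping performed by local_to_global is easiest to see without Spark. Below is a toy recreation of the docstring's example with plain lists and dicts; the tokens and indices are made up.

# Toy recreation of the remapping described in the docstring above (no Spark involved).
tokens_of_model = ["put", "set", "del", "add", "pop", "get"]   # local index -> token
global_index = {"get": 7, "set": 3, "add": 11}                 # token -> index in the merged model

def local_to_global_toy(local_index):
    return global_index.get(tokens_of_model[local_index], -1)

assert local_to_global_toy(5) == 7    # local 5 -> "get" -> global 7
assert local_to_global_toy(0) == -1   # "put" is absent from the merged vocabulary

# In the Spark job, pairs remapped to -1 are dropped by the .filter(...) step
# before the matrices are summed with reduceByKey (capped at MAX_INT32).
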
Example #4
def evaluate_communities(args):
    log = logging.getLogger("evalcc")
    model = CommunitiesModel().load(args.input)
    configure(args)
    spark = create_spark("evalcc-%s" % uuid4(),
                         **filter_kwargs(args.__dict__, create_spark))
    log.info("Preparing the communities' RDD")
    items = []
    for i, c in progress_bar(enumerate(model.communities),
                             log,
                             expected_size=len(model.communities)):
        for m in c:
            if m < len(model.id_to_element):
                items.append(Row(sha1=model.id_to_element[m], community=i))
    log.info("Running")
    items_in_spark = spark.sparkContext.parallelize(items).toDF()
    bags = spark \
        .read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=args.tables["bags"], keyspace=args.keyspace) \
        .load()
    log.info("Loaded the bags, calculating the vocabulary")
    vocabulary = bags.drop(
        "sha1", "value").distinct().rdd.map(lambda x: x.item).collect()
    vocabulary = {v: i for i, v in enumerate(vocabulary)}
    log.info("Vocabulary size: %d", len(vocabulary))
    element_to_id = {e: i for i, e in enumerate(model.id_to_element)}
    metrics = items_in_spark.join(bags, "sha1").rdd \
        .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \
        .groupByKey() \
        .map(CommunityEvaluator(args.threshold, len(vocabulary))) \
        .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)])
    log.info("Total misses: %d", metrics[0])
    log.info("Average normalized misses: %f",
             metrics[1] / len(model.communities))
    log.info("Total loss: %f", metrics[2])
    log.info("Average normalized loss: %f",
             numpy.sqrt(metrics[3] / len(model.communities)))
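
Judging by the log statements, the four accumulated values are total misses, normalized misses, total loss and squared normalized loss; the final reduce adds them element-wise across communities. A minimal illustration of that combine step, detached from Spark (the numbers are fabricated):

# Element-wise combination used in the reduce above, on two fabricated metric vectors.
def combine(v1, v2):
    return [v1[i] + v2[i] for i in range(4)]

community_a = [3, 0.25, 1.5, 0.0625]
community_b = [1, 0.50, 0.5, 0.2500]
assert combine(community_a, community_b) == [4, 0.75, 2.0, 0.3125]
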
Example #5
File: hasher.py  Project: fulaphex/apollo
def hash_batches(args):
    log = logging.getLogger("hash")
    log.info("Loading files from %s", args.input)
    loader = BOWLoader(args.input)
    log.info("%d batches", len(loader))

    # Check batches
    if not loader:
        return

    htnum, band_size = calc_hashtable_params(
        args.threshold, args.size, args.false_positive_weight, args.false_negative_weight)
    log.info("Number of hash tables: %d", htnum)
    log.info("Band size: %d", band_size)
    cassandra_utils.configure(args)
    spark_args = filter_kwargs(args.__dict__, create_spark)
    spark = create_spark("hash-%s" % uuid4(), **spark_args).sparkContext
    import libMHCUDA  # delayed import which requires CUDA and friends
    tables = args.tables
    gen = voc_size = None
    try:
        for i, bow in enumerate(loader):
            if voc_size is None:
                voc_size = bow.matrix.shape[-1]
                log.info("Initializing the generator")
                deferred = os.path.isfile(args.params)
                gen = libMHCUDA.minhash_cuda_init(
                    voc_size, args.size, seed=args.seed, devices=args.devices,
                    verbosity=args.mhc_verbosity,
                    deferred=deferred)
                if deferred:
                    model = WeightedMinHashParameters().load(args.params)
                    libMHCUDA.minhash_cuda_assign_vars(gen, model.rs, model.ln_cs, model.betas)
                else:
                    log.info("Writing %s", args.params)
                    params = libMHCUDA.minhash_cuda_retrieve_vars(gen)
                    WeightedMinHashParameters().construct(*params).save(args.params)
            if bow.matrix.shape[-1] != voc_size:
                raise ValueError("The vocabulary sizes do not match: %d != %d"
                                 % (bow.matrix.shape[-1], voc_size))
            log.info("Processing batch %d / %d", i + 1, len(loader))
            # Modify features if needed
            # TODO(vmarkovtsev): port to the new structure
            # batches = modify_feature_weights(batches, args)
            hashes = libMHCUDA.minhash_cuda_calc(gen, bow.matrix)
            job = [(k, h) for k, h in zip(bow.documents, hashes)]
            log.info("Saving the hashtables")
            df = spark.parallelize(job).flatMap(HashExploder(htnum, band_size)).toDF()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables"], keyspace=args.keyspace) \
                .save()
            df.write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashtables2"], keyspace=args.keyspace) \
                .save()
            log.info("Saving the hashes")
            spark.parallelize(job) \
                .map(lambda x: Row(sha1=x[0], value=bytearray(x[1].data))) \
                .toDF() \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table=tables["hashes"], keyspace=args.keyspace) \
                .save()
    finally:
        libMHCUDA.minhash_cuda_fini(gen)
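
The banding step is hidden inside HashExploder, whose source is not shown on this page. Standard MinHash LSH banding splits each document's hash array into htnum bands of band_size values and emits one row per band; the class below is an illustrative stand-in written under that assumption, and its field names are guesses rather than apollo's actual schema.

from pyspark.sql import Row

class BandExploder:  # illustrative stand-in for HashExploder (assumption, not apollo's code)
    def __init__(self, htnum, band_size):
        self.htnum = htnum
        self.band_size = band_size

    def __call__(self, record):
        key, hashes = record  # (document sha1, numpy array of MinHash values)
        for hti in range(self.htnum):
            band = hashes[hti * self.band_size:(hti + 1) * self.band_size]
            # Documents that share a band value inside the same hashtable become candidate pairs.
            yield Row(sha1=key, hashtable=hti, value=bytearray(band.data))
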
Example #6
def detect_communities(args):
    log = logging.getLogger("cmd")
    ccsmodel = ConnectedComponentsModel().load(args.input)
    log.info("Building the connected components")
    ccs = defaultdict(list)
    for i, c in enumerate(ccsmodel.id_to_cc):
        ccs[c].append(i)
    buckmat = ccsmodel.id_to_buckets
    buckindices = buckmat.indices
    buckindptr = buckmat.indptr
    total_nvertices = buckmat.shape[0]
    linear = args.edges in ("linear", "1")
    graphs = []
    communities = []
    if not linear:
        log.info("Transposing the matrix")
        buckmat_csc = buckmat.T.tocsr()
    fat_ccs = []
    for vertices in ccs.values():
        if len(vertices) == 1:
            continue
        if len(vertices) == 2:
            communities.append(vertices)
            continue
        fat_ccs.append(vertices)
    log.info("Building %d graphs", len(fat_ccs))
    for vertices in progress_bar(fat_ccs, log, expected_size=len(fat_ccs)):
        if linear:
            edges = []
            weights = []
            bucket_weights = buckmat.sum(axis=0)
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    bucket = buckindices[j]
                    weights.append(bucket_weights[0, bucket])
                    bucket += total_nvertices
                    buckets.add(bucket)
                    edges.append((str(i), str(bucket)))
        else:
            edges = set()
            weights = None
            buckets = set()
            for i in vertices:
                for j in range(buckindptr[i], buckindptr[i + 1]):
                    buckets.add(buckindices[j])
            for bucket in buckets:
                buckverts = \
                    buckmat_csc.indices[buckmat_csc.indptr[bucket]:buckmat_csc.indptr[bucket + 1]]
                for i, x in enumerate(buckverts):
                    for y in buckverts:
                        if x < y:
                            edges.add((str(x), str(y)))
            buckets.clear()
            edges = list(edges)
        graph = Graph(directed=False)
        graph.add_vertices(list(map(str, vertices + list(buckets))))
        graph.add_edges(edges)
        graph.edge_weights = weights
        graphs.append(graph)
    log.info("Launching the community detection")
    detector = CommunityDetector(algorithm=args.algorithm, config=args.params)
    if not args.no_spark:
        spark = create_spark("cmd-%s" % uuid4(),
                             **filter_kwargs(args.__dict__,
                                             create_spark)).sparkContext
        communities.extend(
            spark.parallelize(graphs).flatMap(detector).collect())
    else:
        communities.extend(
            chain.from_iterable(
                progress_bar((detector(g) for g in graphs),
                             log,
                             expected_size=len(graphs))))
    log.info("Overall communities: %d", len(communities))
    log.info("Average community size: %.1f",
             numpy.mean([len(c) for c in communities]))
    log.info("Median community size: %.1f",
             numpy.median([len(c) for c in communities]))
    log.info("Max community size: %d", max(map(len, communities)))
    log.info("Writing %s", args.output)
    CommunitiesModel().construct(communities,
                                 ccsmodel.id_to_element).save(args.output)
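
In the linear branch above, the element-to-bucket incidence matrix is walked in CSR form: buckindptr[i]:buckindptr[i + 1] delimits the buckets of vertex i, and bucket ids are shifted by total_nvertices so they cannot collide with vertex ids in the bipartite graph. A small self-contained illustration of that walk (the matrix is made up):

import numpy as np
from scipy.sparse import csr_matrix

# 3 vertices x 2 buckets; a 1 means "this vertex hashes into this bucket".
buckmat = csr_matrix(np.array([[1, 0],
                               [1, 1],
                               [0, 1]], dtype=np.uint8))
total_nvertices = buckmat.shape[0]

edges = []
for i in range(total_nvertices):
    for j in range(buckmat.indptr[i], buckmat.indptr[i + 1]):
        bucket = buckmat.indices[j] + total_nvertices  # shift bucket ids past vertex ids
        edges.append((str(i), str(bucket)))

assert edges == [("0", "3"), ("1", "3"), ("1", "4"), ("2", "4")]
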
Example #7
def warmup(args):
    engine_args = filter_kwargs(args.__dict__, create_engine)
    create_engine("warmup", "/tmp", **engine_args)