def train(
    rdd: RDD,
    torch_obj: str,
    server: Server,
    iters: int = 10,
    partition_shuffles: int = 1,
    verbose: int = 1,
    early_stop_patience: int = -1,
    mini_batch: int = -1,
    validation_pct: float = 0.0
) -> Dict:
    """Train the serialized torch object across the RDD's partitions and return the final state dict."""
    try:
        torch_obj, _ = load_base_torch(torch_obj)
        master_url = str(server.master_url)

        for i in range(partition_shuffles):
            # Train on each partition; foreach forces evaluation of the lazy mapPartitions.
            rdd.mapPartitions(
                lambda x: handle_model(
                    x,
                    torch_obj=torch_obj,
                    master_url=master_url,
                    iters=iters,
                    verbose=verbose,
                    early_stop_patience=early_stop_patience,
                    mini_batch=mini_batch,
                    validation_pct=validation_pct
                )
            ).foreach(lambda x: x)

            # Reshuffle the data across partitions between training rounds.
            if partition_shuffles - i > 1:
                num_partitions = rdd.getNumPartitions()
                rdd = rdd.repartition(num_partitions)

        # Pull the final parameters from the parameter server, then shut it down.
        state_dict = get_state_dict(master_url)
        server.stop_server()

        return state_dict

    except Exception as e:
        server.stop_server()
        raise e
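# A minimal, self-contained sketch of the mapPartitions training pattern that train()
# relies on: each partition runs its own fitting loop and the driver combines the
# per-partition results afterwards. The helpers fit_partition and average_states are
# hypothetical stand-ins defined here for illustration, not part of the library above;
# a real worker would presumably run SGD against the parameter server at master_url.
from pyspark.sql import SparkSession

def fit_partition(rows):
    # Toy "training": fit a mean to this partition's values.
    rows = list(rows)
    if rows:
        yield sum(rows) / len(rows)

def average_states(states):
    # Stand-in for get_state_dict(): combine the per-partition results on the driver.
    return sum(states) / len(states)

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[2]").appName("train-sketch").getOrCreate()
    data = spark.sparkContext.parallelize(range(100), numSlices=4)
    partition_states = data.mapPartitions(fit_partition).collect()
    print(average_states(partition_states))
    spark.stop()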
def __init__(self, ctx, resource_read=None, query=None, **kwargs):
    kwargs = make_es_config(kwargs, resource_read=resource_read, query=query)
    kwargs = as_java_object(ctx._gateway, kwargs)
    jrdd = helper(ctx).esJsonRDD(ctx._jsc, kwargs)
    rdd = RDD(jrdd, ctx, NoOpSerializer())

    # read the rdd in batches of two (first key then value / doc)
    def pairwise(iterable):
        iterator = iter(iterable)
        return izip(iterator, iterator)

    kvRdd = rdd.mapPartitions(pairwise, True)
    super(EsRDD, self).__init__(kvRdd._jrdd, ctx)
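# A small standalone sketch of the pairwise() trick above: the underlying RDD appears
# to emit a flat stream of alternating keys and JSON documents, and zipping an iterator
# with itself turns that into (key, doc) tuples. In Python 3, itertools.izip is simply
# the built-in zip; the sample stream below is made up for illustration.
def pairwise(iterable):
    iterator = iter(iterable)
    return zip(iterator, iterator)  # consumes two elements per output pair

flat_stream = ["id-1", '{"name": "a"}', "id-2", '{"name": "b"}']
print(list(pairwise(flat_stream)))
# [('id-1', '{"name": "a"}'), ('id-2', '{"name": "b"}')]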
def rdd(self): """Returns the content as an :class:`pyspark.RDD` of :class:`Row`. """ if self._lazy_rdd is None: jrdd = self._jdf.javaToPython() rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer())) schema = self.schema def applySchema(it): cls = _create_cls(schema) return map(cls, it) self._lazy_rdd = rdd.mapPartitions(applySchema) return self._lazy_rdd
def rdd(self): """ Return the content of the :class:`DataFrame` as an :class:`RDD` of :class:`Row` s. """ if not hasattr(self, '_lazy_rdd'): jrdd = self._jdf.javaToPython() rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer())) schema = self.schema def applySchema(it): cls = _create_cls(schema) return itertools.imap(cls, it) self._lazy_rdd = rdd.mapPartitions(applySchema) return self._lazy_rdd
def son(baskets: RDD, support_threshold_total=support_threshold_total) -> list:

    def pcy_for_list(partition: list, support_threshold_total=support_threshold_total) -> dict:
        # partition = baskets of this chunk
        num_baskets_chunk = len(partition)
        # Scale the global support threshold down to this chunk's share of the baskets.
        support_threshold = math.ceil(support_threshold_total * num_baskets_chunk / num_baskets)

        # first pass: count singletons and hash pairs into buckets
        singleton_counts = {}
        bucket_counts = {}
        for basket in partition:
            for item in basket:
                singleton_counts[item] = singleton_counts.get(item, 0) + 1
            pairs = generate_combination(basket, size=2)
            for pair in pairs:
                key = hash_pair(pair)
                bucket_counts[key] = bucket_counts.get(key, 0) + 1
        # Turn bucket counts into a bitmap: 1 if the bucket is frequent, 0 otherwise.
        for key, value in bucket_counts.items():
            if value >= support_threshold:
                bucket_counts[key] = 1
            else:
                bucket_counts[key] = 0
        frequent_itemsets = {}
        for key, value in singleton_counts.items():
            if value >= support_threshold:
                frequent_itemsets[key] = None  # store all frequent singletons
        # print("singleton_counts", singleton_counts)
        # print("frequent singletons", frequent_itemsets)
        del singleton_counts
        gc.collect()

        # second pass: count only pairs that pass the PCY candidate check
        itemset_counts = {}
        for basket in partition:
            pairs = generate_combination(basket, size=2)
            for pair in pairs:
                if qualified_as_candidate_pair(pair, frequent_itemsets, bitmap=bucket_counts):
                    key = tuple(pair)
                    itemset_counts[key] = itemset_counts.get(key, 0) + 1
        for key, value in itemset_counts.items():
            if value >= support_threshold:
                frequent_itemsets[key] = None  # store all frequent pairs
        # print("pair counts", itemset_counts)
        del itemset_counts
        gc.collect()

        # more passes for larger-size itemsets
        size = 3
        num_frequent_itemsets = len(frequent_itemsets)
        while True:
            itemset_counts = {}
            for basket in partition:
                itemsets = generate_combination_with_filter(basket, frequent_itemsets, size)
                for itemset in itemsets:
                    key = tuple(itemset)
                    itemset_counts[key] = itemset_counts.get(key, 0) + 1
            for key, value in itemset_counts.items():
                if value >= support_threshold:
                    frequent_itemsets[key] = None  # store all frequent itemsets of this size
            del itemset_counts
            gc.collect()
            current_num_frequent_itemsets = len(frequent_itemsets)
            # print("frequent_itemsets", frequent_itemsets)
            if current_num_frequent_itemsets == num_frequent_itemsets:
                # no more new frequent itemsets
                # print("break")
                break
            num_frequent_itemsets = current_num_frequent_itemsets
            size += 1
        # print("frequent_itemsets", frequent_itemsets)
        return frequent_itemsets

    # First stage: locally frequent itemsets per partition become global candidates.
    # num_baskets is captured by pcy_for_list via closure.
    num_baskets = baskets.count()
    candidate_itemsets = dict.fromkeys(
        baskets.mapPartitions(lambda _: pcy_for_list(list(_), support_threshold_total)).distinct().collect(), 0)
    # print("candidate_itemsets", candidate_itemsets)

    # Second stage: recount the candidates over the full RDD against the true threshold.
    def qualified_as_candidate_itemset(itemset):
        try:
            _ = candidate_itemsets[itemset]
            return True
        except KeyError:
            return False

    singleton_counts = baskets.\
        flatMap(lambda basket: basket).\
        filter(lambda item: qualified_as_candidate_itemset(item)).\
        map(lambda _: (_, 1)).\
        reduceByKey(lambda x, y: x + y).\
        filter(lambda pair: pair[1] >= support_threshold_total).keys().collect()
    frequent_itemsets = [sorted(singleton_counts)]
    del singleton_counts
    gc.collect()

    size = 2
    while True:
        frequent_itemsets_for_particular_size = baskets.\
            flatMap(lambda _: generate_combination_with_filter(_, candidate_itemsets, size)).\
            filter(lambda _: qualified_as_candidate_itemset(tuple(_))).\
            map(lambda _: (tuple(_), 1)).\
            reduceByKey(lambda x, y: x + y).\
            filter(lambda pair: pair[1] >= support_threshold_total).keys().collect()
        if frequent_itemsets_for_particular_size == []:
            break
        else:
            frequent_itemsets.append(sorted(frequent_itemsets_for_particular_size))
            size += 1
    del frequent_itemsets_for_particular_size
    gc.collect()

    return frequent_itemsets
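# A simplified, self-contained sketch of the two-stage SON pattern that son() implements:
# stage 1 mines locally frequent itemsets per partition with a scaled-down threshold,
# stage 2 recounts those candidates over the full RDD. It uses plain
# itertools.combinations instead of the PCY hashing helpers above, so it illustrates
# the control flow only and is not a drop-in replacement.
import math
from itertools import combinations
from pyspark.sql import SparkSession

def local_frequent(partition, total_support, total_baskets, max_size=2):
    baskets = [tuple(sorted(b)) for b in partition]
    # Scale the global support threshold to this partition's share of the data.
    threshold = math.ceil(total_support * len(baskets) / total_baskets)
    candidates = set()
    for size in range(1, max_size + 1):
        counts = {}
        for basket in baskets:
            for itemset in combinations(basket, size):
                counts[itemset] = counts.get(itemset, 0) + 1
        candidates |= {k for k, v in counts.items() if v >= threshold}
    return iter(candidates)

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[2]").appName("son-sketch").getOrCreate()
    sc = spark.sparkContext
    baskets = sc.parallelize([["a", "b"], ["a", "b", "c"], ["a", "c"], ["b", "c"]], 2)
    support, n = 2, baskets.count()

    # Stage 1: locally frequent itemsets become global candidates.
    candidates = set(baskets.mapPartitions(
        lambda p: local_frequent(list(p), support, n)).distinct().collect())

    # Stage 2: exact counts for the candidates only, filtered by the true threshold.
    frequent = (baskets
                .flatMap(lambda b: [i for s in (1, 2) for i in combinations(sorted(b), s)])
                .filter(lambda i: i in candidates)
                .map(lambda i: (i, 1))
                .reduceByKey(lambda x, y: x + y)
                .filter(lambda kv: kv[1] >= support)
                .keys()
                .collect())
    print(sorted(frequent))
    spark.stop()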