def naive_multiplication_rdd(mat_a: pyspark.RDD, mat_b: pyspark.RDD, is_triangle=False):
    """
    mat_a is the left matrix
    mat_b is the right matix
    :param mat_a:
    :param mat_b:
    :param is_triangle:
    :return:
    """
    if is_triangle:
        # Mirror each entry to both (j, i) and (i, j); the averaging below keeps
        # diagonal entries (emitted twice under the same key) at their original value.
        left_rdd = (
            mat_a.flatMap(lambda x: [((x.j, x.i), x.value), ((x.i, x.j), x.value)])
                .aggregateByKey(zeroValue=(0.0, 0.0),
                                seqFunc=lambda x, y: (x[0]+y, x[1]+1),
                                combFunc=lambda x, y: (x[0] + y[0], x[1]+y[1]))
                .mapValues(lambda x: x[0] / x[1])
                .map(lambda x: (x[0][0], (x[0][1], x[1])))
        )
    else:
        left_rdd = mat_a.map(lambda x: (x.j, (x.i, x.value)))

    right_rdd = mat_b.map(lambda x: (x.i, (x.j, x.value)))
    combined_rdd = (left_rdd.join(right_rdd).map(lambda x: x[1])
        .map(lambda x: ((x[0][0], x[1][0]), x[0][1]*x[1][1]))
        .reduceByKey(lambda x, y: x+y)
        .map(lambda x: distributed.MatrixEntry(i=x[0][0], j=x[0][1], value=x[1]))
    )
    return combined_rdd
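# A minimal usage sketch (not part of the original source): assumes an active
# SparkContext `sc` and that `distributed` is pyspark.mllib.linalg.distributed,
# as naive_multiplication_rdd above implies.
from pyspark import SparkContext
from pyspark.mllib.linalg import distributed

sc = SparkContext.getOrCreate()
a = sc.parallelize([distributed.MatrixEntry(0, 0, 1.0),
                    distributed.MatrixEntry(0, 1, 2.0)])
b = sc.parallelize([distributed.MatrixEntry(0, 0, 3.0),
                    distributed.MatrixEntry(1, 0, 4.0)])
# Entry (0, 0) of A * B is 1.0 * 3.0 + 2.0 * 4.0 = 11.0
print(naive_multiplication_rdd(a, b).collect())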
    def __init_parameters(self, train: RDD):
        """
        _n_buckets/_n_items:
            The number of distinct buckets/items in the train RDD.
        _bucket_block_size/_cross_block_size/_item_block_size:
            The size of blocks when dividing buckets/cross buckets/items into blocks.
        _n_bucket_block/_n_cross_block/_n_item_block:
            The number of blocks when dividing buckets/cross buckets/items into blocks.
        """
        self._n_buckets = train.map(lambda u: u[0]).distinct().count()
        if self._n_buckets <= self._k:
            self._k = float("inf")

        # For the bucket dimension.
        if self._bucket_block_size is None:
            # Interpret bucket_block_size from n_bucket_block
            self._bucket_block_size = self._n_buckets // self._n_bucket_block + 1
        else:
            self._n_bucket_block = self._n_buckets // self._bucket_block_size + 1

        # For the cross dimension.
        if self._cross_block_size is None:
            self._cross_block_size = self._n_buckets // self._n_cross_block + 1
        else:
            self._n_cross_block = self._n_buckets // self._cross_block_size + 1

        # For the item dimension
        self._n_items = train.map(lambda u: u[1]).distinct().count()
        if self._item_block_size is None:
            self._item_block_size = self._n_items // self._n_item_block + 1
        else:
            self._n_item_block = self._n_items // self._item_block_size + 1
        return self
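# A worked illustration (hypothetical numbers) of the block arithmetic above:
# given the block count, the block size is derived so that all buckets are covered.
n_buckets, n_bucket_block = 10, 3
bucket_block_size = n_buckets // n_bucket_block + 1   # 10 // 3 + 1 = 4
assert n_bucket_block * bucket_block_size >= n_buckets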
def join_multiple_keys(left: RDD, right: RDD, n: int) -> RDD:
    """
    Join RDDs with multiple keys.
        ((key1, key2, ...), value_left) x (key_i, value_right_i) ->
        ((key1, key2, ...), (value_left, value_right_1, value_right_2, ...))
    :param left: RDD<tuple<int>, value>
    :param right: RDD<int, value>
    :param n: int, the length of the key in left-RDD
    :return: joint RDD.
    """
    left = left.map(
        lambda u: (-1, (u[0], (u[1],)))
    )  # (_, (tuple<key>, tuple<value>))
    right = right.map(
        lambda u: (u[0], (u[1],))
    ).cache()  # (_, tuple<value>)
    for key_order in range(n):
        left = left.map(
            # bind key_order as a default argument so each iteration's lambda
            # keeps its own key position instead of the loop's final value
            lambda u, k=key_order: (u[1][0][k], u[1])  # (_, (tuple<key>, tuple<value>))
        ).join(
            right  # (_, ((tuple<key>, tuple<value>), tuple<value>))
        ).map(
            lambda u: (-1, (u[1][0][0], u[1][0][1] + u[1][1]))
        )  # (_, (tuple<key>, tuple<value>))

    left = left.map(
        lambda u: u[1]
    )  # (tuple<key>, tuple<value>)
    return left
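# A minimal usage sketch (not part of the original source): assumes an active
# SparkContext `sc`. Each key position of the composite key is joined against
# the lookup RDD in turn.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
left = sc.parallelize([((1, 2), "pair_a"), ((2, 3), "pair_b")])
right = sc.parallelize([(1, "one"), (2, "two"), (3, "three")])
# expected contents (order may vary):
# [((1, 2), ('pair_a', 'one', 'two')), ((2, 3), ('pair_b', 'two', 'three'))]
print(join_multiple_keys(left, right, n=2).collect())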
def cStress(rdd: RDD) -> RDD:

    # TODO: TWH Temporary
    ecg_sampling_frequency = 64.0
    rip_sampling_frequency = 64.0
    accel_sampling_frequency = 64.0 / 6.0

    # Timestamp correct datastreams
    ecg_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['ecg'],
                          sampling_frequency=ecg_sampling_frequency)))
    rip_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['rip'],
                          sampling_frequency=rip_sampling_frequency)))

    accelx_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accelx'],
                          sampling_frequency=accel_sampling_frequency)))
    accely_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accely'],
                          sampling_frequency=accel_sampling_frequency)))
    accelz_corrected = rdd.map(lambda ds: (
        ds['participant'],
        timestamp_correct(datastream=ds['accelz'],
                          sampling_frequency=accel_sampling_frequency)))

    accel_group = accelx_corrected.join(accely_corrected).join(
        accelz_corrected).map(fix_two_joins)
    accel = accel_group.map(lambda ds: (
        ds[0],
        autosense_sequence_align(datastreams=[ds[1][0], ds[1][1], ds[1][2]],
                                 sampling_frequency=accel_sampling_frequency)))

    # Accelerometer Feature Computation
    accel_features = accel.map(
        lambda ds: (ds[0], accelerometer_features(ds[1], window_length=10.0)))

    # rip features
    peak_valley = rip_corrected.map(
        lambda ds: (ds[0], rip.compute_peak_valley(rip=ds[1])))
    rip_features = peak_valley.map(
        lambda ds: (ds[0], rip_feature_computation(ds[1][0], ds[1][1])))

    # r-peak datastream computation
    ecg_rr_rdd = ecg_corrected.map(
        lambda ds: (ds[0], compute_rr_intervals(ds[1], ecg_sampling_frequency)))
    ecg_features = ecg_rr_rdd.map(
        lambda ds: (ds[0], ecg_feature_computation(ds[1], window_size=60,
                                                   window_offset=60)))

    # return rip_features.join(ecg_features).join(accel_features).map(fix_two_joins)
    return ecg_features
 def evaluate(truth: RDD, prediction: RDD) -> float:
     """
     Calculate RMSE between truth and predictions.
     :param truth: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
     :param prediction: RDD<Hashable, Hashable, float> = RDD<(bucket, item, rating)>
     :return: float = RMSE
     """
     truth = truth.map(lambda u: ((u[0], u[1]), u[2]))
     prediction = prediction.map(lambda u: ((u[0], u[1]), u[2]))
     return truth.join(prediction).map(lambda u:
                                       (u[1][0] - u[1][1])**2).mean()**0.5
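# A minimal usage sketch (not part of the original source): assumes an active
# SparkContext `sc` and that `evaluate` above is reachable as a plain function.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
truth = sc.parallelize([("u1", "i1", 4.0), ("u1", "i2", 2.0)])
prediction = sc.parallelize([("u1", "i1", 3.0), ("u1", "i2", 4.0)])
# RMSE over the shared (bucket, item) pairs: sqrt(((4-3)**2 + (2-4)**2) / 2) ~= 1.58
print(evaluate(truth, prediction))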
Example #6
 def __preprocessRdd(self, rdd: RDD):
     rddc = rddCorrector()
     rdd = rdd.map(lambda l: rddc.correct(l))
     if rdd is not None and not rdd.isEmpty():
         rdd = rdd.map(lambda l: l.replace("<tweet>", ""))
         rdd = rdd.map(lambda l: l.replace("</tweet>", ""))
         df = DataFrameWorks().convertDataFrame(rdd, self.__spark)
         df = CleanText().clean(df, self.__spark)
         return df
     return None
    def convertDataFrame(self, rdd: RDD, SqlObject) -> DataFrame:
        """RDD to DataFrame"""
        #rdd = rdd.map(lambda l: l.replace("½",""))

        rdd = rdd.map(lambda l: (l[:19], l[19:]))
        schema = [StructField("id", StringType(), False),
                  StructField("rawData", StringType(), False),
                  StructField("preprocessedData", ArrayType(elementType=StringType(), containsNull=True), True),
                  StructField("sentiment", FloatType(), True)]
        final_struct = StructType(fields=schema)
        rdd = rdd.map(lambda l: (l[0], l[1], [None], None))
        return SqlObject.createDataFrame(rdd, schema=final_struct)
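# A hedged usage sketch (not part of the original source): assumes an active
# SparkSession `spark` and that DataFrameWorks is the class defining
# convertDataFrame above. Each raw line is a 19-character id followed by the text.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
lines = spark.sparkContext.parallelize(["1234567890123456789hello world"])
DataFrameWorks().convertDataFrame(lines, spark).show()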
Example #8
def java_to_python_rdd(sc, rdd, is_pair, is_json):
    jrdd = sc._jvm.SerDe.javaToPython(rdd)
    output = RDD(jrdd, sc)
    if is_pair:
        if is_json:
            return output.map(lambda x: (x.split("\t")[0], json.loads(x.split("\t")[1])))
        else:
            return output.map(lambda x: (x.split("\t")[0], x.split("\t")[1]))

    if is_json:
        return output.map(lambda x: json.loads(x))
    return output
Example #9
def extract_items(wikidata_items: RDD, b_property_map: Broadcast,
                  b_item_page_map: Broadcast):
    def parse_item(item):
        property_map = b_property_map.value
        item_page_map = b_item_page_map.value
        if "enwiki" in item["sitelinks"]:
            page_title = item["sitelinks"]["enwiki"]["title"]
        else:
            return None, None

        claims = {}
        for prop_id, property_claims in item["claims"].items():
            if prop_id in property_map:
                prop_name = property_map[prop_id]
                parsed_claims = []
                for c in property_claims:
                    if "datavalue" in c["mainsnak"]:
                        c = c["mainsnak"]["datavalue"]["value"]
                        if type(c) == dict and "entity-type" in c:
                            claim_item_id = c["id"]
                            if claim_item_id in item_page_map:
                                c = item_page_map[c["id"]]
                            else:
                                continue
                        parsed_claims.append(c)
                claims[prop_name] = parsed_claims
        return page_title, claims

    return (wikidata_items.map(parse_item)
            .filter(lambda pc: pc[0] is not None)
            .reduceByKey(lambda x, y: x)
            .collectAsMap())
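# A hedged usage sketch (not part of the original source): assumes an active
# SparkContext `sc`; the item dict below is a minimal stand-in for a Wikidata record.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
b_props = sc.broadcast({"P31": "instance of"})
b_pages = sc.broadcast({"Q5": "Human"})
item = {
    "sitelinks": {"enwiki": {"title": "Douglas Adams"}},
    "claims": {"P31": [{"mainsnak": {"datavalue": {
        "value": {"entity-type": "item", "id": "Q5"}}}}]},
}
# {'Douglas Adams': {'instance of': ['Human']}}
print(extract_items(sc.parallelize([item]), b_props, b_pages))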
Example #10
 def __call__(self, rdd: RDD) -> RDD:
     def select_fields(row):
         return Row(**{f: getattr(row, f) for f in self.fields})
     res = rdd.map(select_fields)
     if self.explained:
         self._log.info("toDebugString():\n%s", res.toDebugString().decode())
     return res
Example #11
def transform_online_retail(
    sc: SparkSession,
    raw_rdd: RDD,
    schema: str,
    max_month: Optional[int] = None,
) -> DataFrame:
    """Method to transform online retail dataset to its correct dataformats, specific
    for online retail

    :return:
    """

    # initial transformation of the raw RDD
    raw_rdd = raw_rdd.map(lambda retail: (
        retail[0],  # InvoiceNo
        retail[1],  # StockCode
        retail[2] if retail[2] != '' else None,  # Description
        int(retail[3]),  # Quantity
        datetime.strptime(retail[4], '%d/%m/%Y %H:%M') if
        int(retail[4].split('/')[1]) < max_month
        else datetime.strptime(retail[4], '%m/%d/%Y %H:%M'),  # InvoiceDate
        float(retail[5]),  # UnitPrice
        int(retail[6]) if retail[6] != '' else None,  # CustomerID
        retail[7] if retail[7] != '' else None)  # Country
    )

    return sc.createDataFrame(
        raw_rdd,
        schema=schema
    )
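# A hedged usage sketch (not part of the original source): assumes an active
# SparkSession `spark`; the schema string and sample row below are illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
retail_schema = ("InvoiceNo string, StockCode string, Description string, Quantity int, "
                 "InvoiceDate timestamp, UnitPrice double, CustomerID int, Country string")
raw = spark.sparkContext.parallelize([
    ("536365", "85123A", "WHITE HANGING HEART", "6",
     "01/12/2010 08:26", "2.55", "17850", "United Kingdom"),
])
transform_online_retail(spark, raw, retail_schema, max_month=13).show()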
Example #12
    def __compute_signature(self, data: RDD) -> RDD:
        """
        Compute signature for items.
        :param data: RDD<(Hashable, Iterator<Hashable>)>
            = RDD<(item, content)>
        :return: RDD<(Hashable, tuple<int>)>
            = RDD<(item, signature)>
        """
        hashing_range = self.__hashing_range
        signature_length = self.__signature_length
        random_seed = self.__seed
        min_hash_func = self.__min_hash

        def _signature(key_values: (Hashable, Iterator)) -> (Hashable, tuple):
            """
            Compute signature for each item
            :return (Hashable, tuple<int>)
                = (item, signature)
            """
            item, content = key_values
            signature = [hashing_range for _ in range(signature_length)]
            for element in content:
                for index_i, hashed_value in enumerate(
                        min_hash_func(element, signature_length, hashing_range,
                                      random_seed)):
                    signature[index_i] = min(hashed_value, signature[index_i])
            return item, tuple(signature)

        return data.map(_signature)
Example #13
 def _compute_tfid(texts: RDD) -> RDD:
     tf = HashingTF().transform(texts.map(lambda t: t.words))
     tf.cache()
     idf = IDF().fit(tf)
     tfidfs = idf.transform(tf)
     text_tfs = texts.zip(tfidfs)
     return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))
    def run(self, data_rdd: RDD, query_rdd: RDD, n_dim: int) -> RDD:  # type: ignore
        empty_result_rdd = query_rdd.map(lambda idx_coords: (idx_coords[0], 0))

        data_rdd = data_rdd.map(
            lambda idx_coords: ((), idx_coords[1], (DATA, idx_coords[0]))
        )
        query_rdd = query_rdd.map(
            lambda idx_coords: ((), idx_coords[1], (QUERY, idx_coords[0]))
        )
        rdd = data_rdd.union(query_rdd)

        for _ in range(n_dim):
            rdd = self.assign_next_label(rdd=rdd)  # type: ignore

        rdd = empty_result_rdd.union(self.get_results_by_label(rdd))  # type: ignore
        rdd = self.aggregate_results_by_query(rdd).sortByKey()  # type: ignore
        return rdd
Example #15
  def parseDNSInfo(pcap_packets: RDD) -> RDD:
    timer = Timer()

    rddDns = pcap_packets.map(lambda bytes_packet: DNSInfo(bytes_packet[0], bytes_packet[1])).filter(
      lambda dns: not dns.notDns and dns.sip not in Global.TRUSTED_DNS and dns.dip not in Global.TRUSTED_DNS)

    log.info(f'Time spent on parsing chunk = {timer.elapsed()}')
    return rddDns
Example #16
File: wikidata.py Project: Pinafore/qb
def extract_item_map(wikidata_items: RDD):
    def parse_item(item):
        if 'en' in item['labels']:
            label = item['labels']['en']['value']
            return item['id'], label
        else:
            return None
    return wikidata_items.map(parse_item).filter(lambda i: i is not None).collectAsMap()
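# A minimal usage sketch (not part of the original source): assumes an active
# SparkContext `sc`; items without an English label are dropped by the filter.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
items = sc.parallelize([
    {"id": "Q42", "labels": {"en": {"value": "Douglas Adams"}}},
    {"id": "Q1", "labels": {}},
])
# {'Q42': 'Douglas Adams'}
print(extract_item_map(items))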
    def __blocking_matrix(self,
                          train: RDD = None,
                          test: RDD = None,
                          similarity=None) -> RDD:
        """
        Divide matrix into blocks for the purpose of reduce key number.
        :param train: RDD<(Hashable, Hashable, float)>
            = RDD<bucket, item, rating>
        :param test: RDD<(Hashable, Hashable)>
            = RDD<bucket, item>
        :param similarity: RDD<(Hashable, Hashable, float)>
            RDD<bucket, bucket, similarity>
        :return: RDD<(int, int)(Hashable, Hashable, float)>
            = RDD<(bucket_block, item_block), (bucket, item, rating)> or
              RDD<(bucket_block, bucket_block), (bucket, bucket, similarity)>
        """
        seed = self._seed
        n_bucket_block = self._n_bucket_block
        n_item_block = self._n_item_block
        n_cross_block = self._n_cross_block

        if train is not None:
            train = train.map(lambda u: (
                (hash2int(u[0], max_value=n_cross_block, seed=seed),
                 hash2int(u[1], max_value=n_item_block, seed=seed)), u)).cache()
            train.count()
            return train

        if test is not None:
            test = test.map(lambda u: (
                (hash2int(u[0], max_value=n_bucket_block, seed=seed),
                 hash2int(u[1], max_value=n_item_block, seed=seed)), u)).cache()
            test.count()
            return test

        if similarity is not None:
            similarity = similarity.flatMap(
                lambda u: [(u[0], u[1], u[2]), (u[1], u[0], u[2])]
            ).map(lambda u: (
                (hash2int(u[0], max_value=n_bucket_block, seed=seed),
                 hash2int(u[1], max_value=n_cross_block, seed=seed)), u)
            ).cache()
            similarity.count()
            return similarity
Example #18
def extract_item_map(wikidata_items: RDD):
    def parse_item(item):
        if "en" in item["labels"]:
            label = item["labels"]["en"]["value"]
            return item["id"], label
        else:
            return None

    return wikidata_items.map(parse_item).filter(
        lambda i: i is not None).collectAsMap()
Example #19
    def calc_llr(ss: SparkSession, from_rdd: RDD, to_rdd: RDD) -> RDD:
        """
        Расчет матрицы co-/cross- occurence LLR для списка списков объектов для рекомендаций A -> B
        Вначале вычисляется произведение матриц left_rdd.Transpose * right_rdd.
        Затем LogLikelihoodRatio для каждой ячейки.
        :param ss: Spark Session объект
        :param from_rdd: rdd с разреженной матрицей со статистикой по объектам A. Формат значений - (ID списка, ID объекта A)
        :param to_rdd: rdd с разреженной матрицей со статистикой по объектам B. Формат значений - (ID списка, ID объекта B)
        :return: Разреженная матрица co-/cross- occurence LLR между треками
        """
        logger.info("Calculating co-/cross- occurence LLR matrix...")

        sc = ss.sparkContext

        lists_count = from_rdd.map(itemgetter(0)).distinct().count()
        bc_lists_count = sc.broadcast(lists_count)

        from_items_counts = from_rdd.map(itemgetter(1)).countByValue()
        bc_from_items_counts = sc.broadcast(from_items_counts)

        to_items_counts = to_rdd.map(itemgetter(1)).countByValue()
        bc_to_items_counts = sc.broadcast(to_items_counts)

        def llr_cell(x):
            i, j, cooc_count = x
            i_count = bc_to_items_counts.value[i]
            j_count = bc_from_items_counts.value[j]
            res_llr = llr_sqrt(cooc_count,
                               j_count - cooc_count,
                               i_count - cooc_count,
                               bc_lists_count.value - i_count - j_count + cooc_count)
            return i, j, res_llr

        llr_rdd = to_rdd \
            .join(from_rdd) \
            .map(lambda x: (x[1], 1)) \
            .reduceByKey(add) \
            .map(lambda x: (x[0][0], x[0][1], x[1])) \
            .map(llr_cell)

        logger.info("Co-/cross- occurence LLR matrix calculated")

        return llr_rdd
Example #20
def extract_item_map(wikidata_items: RDD):
    def parse_item(item):
        if 'en' in item['labels']:
            label = item['labels']['en']['value']
            return item['id'], label
        else:
            return None

    return wikidata_items.map(parse_item).filter(
        lambda i: i is not None).collectAsMap()
 def evaluate(self, lables_and_predictions: RDD):
     TP = lables_and_predictions.map(
         lambda x: (set(x[0]), set(p for p, w in x[1][:self._pred_n]))
     ).filter(
         lambda x: len(x[0].intersection(x[1])) > self._intersect_n)
     accuracy = 100.0 * TP.count() / lables_and_predictions.count()
     if self._verbose:
         print('accuracy: ', accuracy)
     self._results.append(accuracy)
     return accuracy
Example #22
 def evaluate(self, lables_and_predictions: RDD):
     TP = lables_and_predictions.map(
         lambda x: (set(x[0]), set(p for p, w in x[1][:self._pred_n]))
     ).filter(
         lambda x: len(x[0].intersection(x[1])) > self._intersect_n)
     accuracy = 100.0 * TP.count() / lables_and_predictions.count()
     if self._verbose:
         print('accuracy: ', accuracy)
     self._results.append(accuracy)
     return accuracy
Example #23
def to_pandas_df(rdd: RDD, string_conversion=False, init_condition: dict = None):
    if init_condition is not None and string_conversion is False:
        # Typefull
        return to_spark(rdd=rdd, init_condition=init_condition).toPandas()
    elif init_condition is None and string_conversion is True:
        # String
        return rdd.map(lambda d: Row(**{k: str(v) for k, v in d.items()})).toDF()
    else:
        # Typeless
        return to_pandas(rdd)
Example #24
def lp_to_simple_rdd(lp_rdd: RDD,
                     categorical: bool = False,
                     nb_classes: int = None):
    """Convert a LabeledPoint RDD into an RDD of feature-label pairs

    :param lp_rdd: LabeledPoint RDD of features and labels
    :param categorical: boolean, whether labels should be one-hot encoded when returned
    :param nb_classes: int, number of total classes
    :return: Spark RDD with feature-label pairs
    """
    if categorical:
        if not nb_classes:
            labels = np.asarray(lp_rdd.map(lambda lp: lp.label).collect(),
                                dtype='int32')
            nb_classes = np.max(labels) + 1
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features),
                                     encode_label(lp.label, nb_classes)))
    else:
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features), lp.label))
    return rdd
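# A hedged usage sketch (not part of the original source): assumes an active
# SparkContext `sc` and that from_vector/encode_label come from the same module
# as lp_to_simple_rdd above.
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext.getOrCreate()
lp_rdd = sc.parallelize([LabeledPoint(0, [1.0, 2.0]), LabeledPoint(1, [3.0, 4.0])])
# Yields (features, one-hot label) pairs, e.g. (array([1., 2.]), array([1., 0.]))
print(lp_to_simple_rdd(lp_rdd, categorical=True).collect())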
Example #25
def from_labeled_point(rdd: RDD,
                       categorical: bool = False,
                       nb_classes: int = None):
    """Convert a LabeledPoint RDD back to a pair of numpy arrays

    :param rdd: LabeledPoint RDD
    :param categorical: boolean, whether labels should be one-hot encoded when returned
    :param nb_classes: optional int, indicating the number of class labels
    :return: pair of numpy arrays, features and labels
    """
    features = np.asarray(
        rdd.map(lambda lp: from_vector(lp.features)).collect())
    labels = np.asarray(rdd.map(lambda lp: lp.label).collect(), dtype='int32')
    if categorical:
        if not nb_classes:
            nb_classes = np.max(labels) + 1
        temp = np.zeros((len(labels), nb_classes))
        for i, label in enumerate(labels):
            temp[i, label] = 1.
        labels = temp
    return features, labels
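# A hedged usage sketch (not part of the original source): assumes an active
# SparkContext `sc`. With categorical=True the labels come back one-hot encoded.
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext.getOrCreate()
lp_rdd = sc.parallelize([LabeledPoint(0, [1.0, 2.0]), LabeledPoint(1, [3.0, 4.0])])
features, labels = from_labeled_point(lp_rdd, categorical=True)
print(features.shape, labels.shape)  # (2, 2) (2, 2)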
Example #26
    def toDf(cls, spatialPairRDD: RDD, sparkSession: SparkSession):
        """

        :param spatialPairRDD:
        :param sparkSession:
        :return:
        """
        spatialPairRDD_mapped = spatialPairRDD.map(
            lambda x: [x[0].geom, *x[0].getUserData().split("\t"), x[1].geom, *x[1].getUserData().split("\t")]
        )
        df = sparkSession.createDataFrame(spatialPairRDD_mapped)
        return df
Example #27
def calculate_node_tree(
    config: dict,
    record_rdd: RDD,
    spark: SparkSession,
    repo_type: str = "ecflow",
) -> dict:
    """Calculate node tree for each date in record RDDs.

    Parameters
    ----------
    config: dict
        processor's config
    record_rdd: RDD
        records RDD
    spark: SparkSession
        spark session
    repo_type: ["ecflow", "sms"]
        repo type

    Returns
    -------
    dict
        bunch map dict, key: date, value: bunch map
    """

    # **STEP**: map to (date, node_path)
    #   record object => (date, node_path) distinct
    def node_path_map(record):
        return record.date, record.node_path

    date_node_path_rdd = record_rdd.map(node_path_map).distinct()

    # **STEP**: group by date
    #   (date, node_path) => (record_date, list of record_fullname)
    date_node_path_list_rdd = date_node_path_rdd.groupByKey()

    # **STEP**: collect
    date_with_node_path_list = date_node_path_list_rdd.collect()

    # **STEP**: generate bunch
    logger.info("Generating bunch...")
    bunch_class = get_bunch_class(repo_type)
    bunch_map = {}
    for (day, node_path_list) in date_with_node_path_list:
        bunch = bunch_class()
        for node_path in node_path_list:
            if node_path is not None:
                bunch.add_node(node_path)
        logger.info(f"Generating bunch...done for {day}")
        bunch_map[day] = bunch

    return bunch_map
Example #28
 def __call__(self, head: RDD):
     if self.keymap is None:
         return head.coalesce(self.partitions, self.shuffle)
     # partitionBy the key extracted using self.keymap
     try:
         # this checks if keymap is an identity
         probe = self.keymap("probe")
     except:  # noqa: E722
         probe = None
     if probe != "probe":
         head = head.map(lambda x: (self.keymap(x), x))
     return head \
         .partitionBy(self.partitions) \
         .map(lambda x: x[1])
    def _check_data(train: RDD = None, test: RDD = None) -> RDD:
        # Data-type check
        if isinstance(train, RDD):
            is_legal_train = train.map(
                lambda u: len(u) >= 3 and u[0] is not None and u[1] is not None
                and isinstance(u[2], Number)).reduce(lambda u1, u2: u1 and u2)
            if not is_legal_train:
                raise ValueError(
                    "Parameter train should be an RDD<(user, item, rating)>")
            num_partitions_of_train = train.getNumPartitions()
            return train

        if isinstance(test, RDD):
            is_legal_test = test.map(lambda u: len(u) >= 2 and u[0] is not None
                                     and u[1] is not None).reduce(
                                         lambda u1, u2: u1 and u2)
            if not is_legal_test:
                raise ValueError(
                    "Parameter test should be an RDD<(user, item)>")
            num_partitions_of_test = test.getNumPartitions()
            return test

        raise ValueError("RDD train/test need to be input.")
Example #30
def to_spark_df(rdd: RDD, spark: SparkSession = None, init_condition: dict = None):
    if init_condition is not None and spark is not None:
        # Typefull
        return to_spark(rdd, init_condition)
    elif spark is None and init_condition is None:
        # String
        return rdd.map(lambda d: Row(**{k: str(v) for k, v in d.items()})).toDF()
    else:
        # Typeless
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
        warnings.simplefilter(action='ignore', category=UserWarning)
        pdf_from_rdd: DataFrame = to_pandas(rdd)
        result = spark.createDataFrame(pdf_from_rdd)
        del pdf_from_rdd
        return result
Example #31
def make_dataframe_from_alerts(rdd: RDD, colnames: list) -> DataFrame:
    """ Make a Dataframe from a RDD of alerts and columns names.

    Parameters
    ----------
    rdd: Apache Spark RDD of dictionaries
        RDD whose elements are dictionaries (decoded alerts)
    colnames: list of str
        List containing the keys to include. For nested levels, chain the
        keys with a colon: firstdic:seconddic:key

    Returns
    ----------
    out: DataFrame
        Dataframe from the input RDD and columns names.
    """
    return rdd.map(lambda x: tuple(ret(x, k) for k in colnames)).toDF(colnames)
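# A hedged usage sketch (not part of the original source): assumes an active
# SparkSession `spark` and that `ret` resolves the colon-chained keys as the
# docstring above describes; the alert dict is a minimal stand-in.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
alerts = spark.sparkContext.parallelize([{"objectId": "ZTF1", "candidate": {"ra": 10.0}}])
make_dataframe_from_alerts(alerts, ["objectId", "candidate:ra"]).show()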
 def __calculate_similarity(train: RDD, lsh_params: dict,
                            maximum_num_partitions: int) -> RDD:
     """
     Calculate Jaccard Similarity from train-RDD.
     :param train: RDD<(Hashable, Hashable, float)>
     :return: RDD<Hashable, Hashable, float>
         = RDD<bucket, bucket, similarity>
     """
     train = train.map(lambda u: (u[0], u[1]))\
         .groupByKey().map(lambda u: (u[0], list(u[1]))).cache()
     similarity_among_buckets = JaccardSimilarity(
         **lsh_params).predict(train).cache()
     if similarity_among_buckets.getNumPartitions() > maximum_num_partitions:
         similarity_among_buckets = similarity_among_buckets.coalesce(
             maximum_num_partitions).cache()
     return similarity_among_buckets
Example #33
    def normal_order(self, terms: RDD, **kwargs):
        """Normal order the terms in the RDD."""

        if len(kwargs) > 0:
            raise ValueError('Invalid keyword arguments', kwargs)

        symms = self.symms
        swapper = self.swapper
        resolvers = self.resolvers

        init = terms.map(lambda term: _NOState(
            pivot=1, front=2, term=term.canon4normal(symms.value)))

        res = nest_bind(
            init,
            lambda x: _sort_vec(x, swapper=swapper, resolvers=resolvers.value),
            full_balance=self.full_balance)

        return res.map(lambda x: x.term)
Example #34
File: tree.py Project: chewy6i/spark
 def predict(self, x):
     """
     Predict the label of one or more examples.
     :param x:  Data point (feature vector),
                or an RDD of data points (feature vectors).
     """
     pythonAPI = self._sc._jvm.PythonMLLibAPI()
     if isinstance(x, RDD):
         # Bulk prediction
         if x.count() == 0:
             return self._sc.parallelize([])
         dataBytes = _get_unmangled_double_vector_rdd(x, cache=False)
         jSerializedPreds = \
             pythonAPI.predictDecisionTreeModel(self._java_model,
                                                dataBytes._jrdd)
         serializedPreds = RDD(jSerializedPreds, self._sc, NoOpSerializer())
         return serializedPreds.map(lambda bytes: _deserialize_double(bytearray(bytes)))
     else:
         # Assume x is a single data point.
         x_ = _serialize_double_vector(x)
         return pythonAPI.predictDecisionTreeModel(self._java_model, x_)
 def evaluate(self, lables_and_predictions: RDD):
     result = lables_and_predictions.map(
         lambda p: _hamming_loss(p[0], p[1])).mean()
     self._results.append(result)
     return result