示例#1
0
def calculate_weights(rdd_in: RDD) -> RDD:
    """Append each row's rounded percentage of the value total to the RDD.

    Side effect: appends the computed total to the module-level
    ``total_counts_for_the_four`` list.

    :param rdd_in: RDD of 2-tuples ``(key, numeric value)``.
    :return: RDD of 3-tuples ``(key, value, round(100 * value / total))``.
    """
    # Materialize locally under a distinct name — the original rebound
    # ``rdd_in`` to a list, contradicting its RDD annotation.
    rows = rdd_in.collect()
    total: int = compute_sum_of_values(rows)
    total_counts_for_the_four.append(total)

    # Re-distribute and append each row's share of the total as a rounded percent.
    return SC.parallelize(rows).map(lambda x:
                                    (x[0], x[1], round(100 * x[1] / total)))
示例#2
0
def assert_rdd_equal(expected: Collection,
                     result: RDD,
                     check_order: bool = True):
    """
    Compare two RDD or one RDD with a Collection

    :param expected: A Collection to compare. For convenience, doesn't need to be a RDD.
    :param result: The RDD to compare.
    :param check_order: Compare the order of values.
    :raises AssertionError: If the lengths or the contents differ.
    """

    # Normalize both sides to plain lists before comparing.
    if isinstance(expected, RDD):
        expected = expected.collect()
    else:
        expected = list(expected)  # plain copy, not a pass-through comprehension

    result = result.collect()

    # length comparison
    msg = f'RDD length {len(result)} does not match expected {len(expected)}'
    assert len(expected) == len(result), msg

    # value comparison (with messages so failures are diagnosable)
    if check_order:
        assert expected == result, \
            f'RDD values {result!r} do not match expected {expected!r}'
    else:
        # Counter ignores ordering but still respects multiplicity.
        assert Counter(expected) == Counter(result), \
            f'RDD values {result!r} do not match expected {expected!r} (order ignored)'
示例#3
0
def run(log: logging.Log4j, config: Dict[str, str], n_cols: int,
        vrn_rdd_tfm: RDD, results_rdd: RDD, prices_rdd_tfm: RDD) -> None:
    """Runner of Load phase.

    Loads the transformed RDDs back into the respective worksheets in GSheets:
    - vrn_rdd_tfm - "VRNCleaned" worksheet
    - results_rdd - "Results" worksheet
    - prices_rdd_tfm - "Prices" worksheet

    Args:
        log: Log4j object
        config: Key-value mappings of config values
        n_cols: Number of columns in original VRN worksheet
        vrn_rdd_tfm: Transformed VRN RDD
        results_rdd: Results RDD
        prices_rdd_tfm: Transformed car prices RDD
    """

    # config values used
    spreadsheet_id = config[constants.CONFIG_GSHEET_SPREADSHEET_ID_DEV]
    ws_title_vrn_cleaned = config[constants.CONFIG_GSHEET_WS_VRN_CLEANED]
    ws_title_results = config[constants.CONFIG_GSHEET_WS_RESULTS]
    ws_title_prices = config[constants.CONFIG_GSHEET_WS_PRICES]

    # load VRN RDD and save to "VRNCleaned" worksheet
    vrn_data_tfm_flattened = vrn_rdd_tfm.collect()
    # Split this list into chunks, where each chunk is the number of elements per row
    vrn_data_tfm = list(genhelpers._chunks(vrn_data_tfm_flattened, n_cols))
    vrn_resp = gsheet.save_to_worksheet(spreadsheet_id, ws_title_vrn_cleaned,
                                        vrn_data_tfm, False)
    # Bug fix: previously logged ws_title_results for the VRN load,
    # mis-attributing this response to the "Results" worksheet.
    _log_load_resp(log, ws_title_vrn_cleaned, vrn_resp)

    # load results RDD and save to "Results" worksheet
    results_data_flattened = results_rdd.collect()
    # Split this list into chunks, where each chunk is the number of elements per row
    results_data = list(genhelpers._chunks(results_data_flattened, n_cols))
    results_resp = gsheet.save_to_worksheet(spreadsheet_id, ws_title_results,
                                            results_data, False)
    _log_load_resp(log, ws_title_results, results_resp)

    # load prices RDD and save to "Prices" worksheet; this RDD is already
    # row-shaped, so no chunking is needed before saving.
    prices_data_tfm = prices_rdd_tfm.collect()
    prices_resp = gsheet.save_to_worksheet(spreadsheet_id, ws_title_prices,
                                           prices_data_tfm, True)
    _log_load_resp(log, ws_title_prices, prices_resp)
def writeFile(rdd: RDD):
    """Merge this batch's (key, value) counts into the persisted result file.

    Loads the previously saved counts via ``open_result()``, adds each
    collected value under its quote-stripped key, and rewrites the JSON
    file at module-level ``path``. Empty batches leave the file untouched.

    :param rdd: RDD of 2-tuples ``(key, numeric count)``.
    """
    # Skip empty batches entirely — don't rewrite the file for nothing.
    if rdd.count() == 0:
        return

    result_dic = open_result()
    print(result_dic)
    # collect() already yields a plain list of tuples; the original
    # str()/ast.literal_eval round-trip was redundant and fragile.
    for item in rdd.collect():
        clean_key = str(item[0]).replace("'", '')  # normalize key once
        # dict.get collapses the has-key/new-key branches into one line.
        result_dic[clean_key] = result_dic.get(clean_key, 0) + item[1]

    # Context manager guarantees the file is closed even if the write fails.
    with open(path, 'w', encoding='utf-8') as result:
        result.write(
            json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))
def writeFile(rdd: RDD, f_type):
    """Merge this batch's (key, value) counts into the result file for ``f_type``.

    Loads the previously saved counts via ``open_result(f_type)``, adds each
    collected value under its quote-stripped key, caches the merged dict in
    the module-level ``save1``/``save2`` snapshot, and rewrites the JSON file
    at ``path[f_type]``. Empty batches leave everything untouched.

    :param rdd: RDD of 2-tuples ``(key, numeric count)``.
    :param f_type: Output selector: 0 updates ``save1``/``path[0]``,
        1 updates ``save2``/``path[1]``.
    """
    global save1
    global save2

    # Skip empty batches entirely — don't rewrite the file for nothing.
    if rdd.count() == 0:
        return

    result_dic = open_result(f_type)
    # collect() already yields a plain list of tuples; the original
    # str()/ast.literal_eval round-trip was redundant and fragile.
    for item in rdd.collect():
        clean_key = str(item[0]).replace("'", '')  # normalize key once
        # dict.get collapses the has-key/new-key branches into one line.
        result_dic[clean_key] = result_dic.get(clean_key, 0) + item[1]

    # Keep an in-memory snapshot per output type (branches are exclusive).
    if f_type == 0:
        save1 = result_dic
    elif f_type == 1:
        save2 = result_dic

    # Context manager guarantees the file is closed even if the write fails.
    with open(path[f_type], 'w', encoding='utf-8') as result:
        result.write(
            json.dumps(result_dic).encode('gb18030').decode('unicode_escape'))
 def m_o(ctx: SparkContext, data: pr.RDD) -> None:
     """Sanity-check that *ctx* is a SparkContext and *data* holds exactly one element."""
     assert isinstance(ctx, SparkContext)
     collected = data.collect()
     assert len(collected) == 1