def test_convert_row_to_dict(self):
    row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
    self.assertEqual(1, row.asDict()['l'][0].a)
    df = self.sc.parallelize([row]).toDF()
    df.registerTempTable("test")
    row = self.sqlCtx.sql("select l, d from test").head()
    self.assertEqual(1, row.asDict()["l"][0].a)
    self.assertEqual(1.0, row.asDict()['d']['key'].c)
def test_convert_row_to_dict(self):
    row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
    self.assertEqual(1, row.asDict()['l'][0].a)
    df = self.sc.parallelize([row]).toDF()
    with self.tempView("test"):
        df.createOrReplaceTempView("test")
        row = self.spark.sql("select l, d from test").head()
        self.assertEqual(1, row.asDict()["l"][0].a)
        self.assertEqual(1.0, row.asDict()['d']['key'].c)
def compareRows(rowA: Row, rowB: Row):
    # Usage: compareRows(rowA, rowB)
    # Compares two Rows by their dictionary representations.
    if rowA is None and rowB is None:
        return True
    elif rowA is None or rowB is None:
        return False
    else:
        return rowA.asDict() == rowB.asDict()
def are_rows_approx_equal(r1: Row, r2: Row, precision: float) -> bool:
    if r1 is None and r2 is None:
        return True
    if (r1 is None and r2 is not None) or (r2 is None and r1 is not None):
        return False
    d1 = r1.asDict()
    d2 = r2.asDict()
    allEqual = True
    for key in d1.keys() & d2.keys():
        if isinstance(d1[key], float) and isinstance(d2[key], float):
            if abs(d1[key] - d2[key]) > precision:
                allEqual = False
        elif d1[key] != d2[key]:
            allEqual = False
    return allEqual
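# A minimal, illustrative usage of the two comparison helpers above; the Row
# values and tolerance here are made up for demonstration and are not part of
# the original code.
from pyspark.sql import Row

left = Row(name="Alice", score=0.5000001)
right = Row(name="Alice", score=0.5)
assert compareRows(left, left)                    # identical rows compare equal via asDict()
assert are_rows_approx_equal(left, right, 1e-3)   # float fields compared within the given tolerance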
def validate_row(self, row: Row) -> Dict:
    """ Validate data frame row """
    data = row.asDict(recursive=True)
    schema_data = {key: value for key, value in data.items()
                   if key not in self._count_columns}
    duplicate_counts_data = [data[column] for column in self._count_columns]
    try:
        # Validate schema using marshmallow
        rvalue = self._schema.load(schema_data, *self._args, **self._kwargs)
        # Validate uniqueness
        if sum(duplicate_counts_data) > len(duplicate_counts_data):
            raise ValidationError("duplicate row")
    except ValidationError as err:
        # Return errors
        rvalue = {
            self._error_column_name: json.dumps(
                {
                    "row": data,
                    "errors": err.messages,
                }
            )
        }
    return rvalue
def run_etl(source, output_path, spark=None):
    """
    Run Spark ETL of source file.

    :param source (string) - name of source type (should be module in intake/sources/)
    :param output_path (string) - where to write parquet output
    :param spark - spark session
    """
    if not spark:
        spark = SparkSession.builder.getOrCreate()
    config = yaml.safe_load(
        pkg_resources.resource_stream(f'intake.sources.{source}', f'{source}_config.yml'))
    file_path = config['source']
    src_type = file_path.split('.')[-1]
    header_keys = config['header_keys']
    ignore_symbol = config['ignore_symbol']
    spark.sparkContext.addFile(file_path)
    data_path = SparkFiles.get(file_path.split('/')[-1])
    rdd = spark.sparkContext.textFile(data_path)
    # Use mapPartitions for structuring rows to only load keys once per
    # partition. Alternatively, we can consider broadcasting the header_keys
    # to workers...
    # TODO - refactor column renames/yyyymmdd index creation as more data sources are added...
    df = rdd.mapPartitions(lambda partition: filter_helper(partition,
                                                           header=','.join(list(header_keys.keys())),
                                                           ignore_symbol=ignore_symbol)) \
        .mapPartitions(lambda partition: structure_as_row(partition, header_keys, src_type)) \
        .map(lambda row: create_yyyymmdd_index(row.asDict())).toDF()
    df = column_rename_factory(df, source)
    df.write.mode("overwrite").parquet(output_path)  # Always overwrite with latest dataset
def read_glove_vecs(glove_file, output_path):
    rdd = sc.textFile(glove_file)
    row = Row("glovevec")
    df = rdd.map(row).toDF()
    split_col = F.split(F.col('glovevec'), " ")
    df = df.withColumn('word', split_col.getItem(0))
    df = df.withColumn('splitted', split_col)
    vec_udf = F.udf(lambda row: [float(i) for i in row[1:]], ArrayType(FloatType()))
    df = df.withColumn('vec', vec_udf(F.col('splitted')))
    df = df.drop('splitted', "glovevec")
    w = Window.orderBy(["word"])
    qdf = df.withColumn('vec', F.concat_ws(',', 'vec')).withColumn("id", F.row_number().over(w))
    path = '{}/words'.format(output_path)
    qdf.coalesce(1).write.format('csv').option("sep", "\t").option('header', 'true').save(path)
    print('Words saved to: "{}"'.format(path))
    list_words = list(map(lambda row: row.asDict(), qdf.collect()))
    word_to_vec_map = {item['word']: item['vec'] for item in list_words}
    words_to_index = {item['word']: item["id"] for item in list_words}
    index_to_words = {item["id"]: item['word'] for item in list_words}
    return words_to_index, index_to_words, word_to_vec_map
def row_mapper(row: Row, stage: Stage, definition: Definition) -> dict:
    managed_cols = ['val', 'measure_time']
    val = row['val']
    measure_time = row['measure_time']
    output_metric = OutputMetric(definition.metric,
                                 val=val,
                                 measure_time=measure_time,
                                 horizontal_level=stage.horizontal_level,
                                 vertical_level=stage.vertical_level)
    functional_variables = definition.metric.func_vars.copy()
    row_dict = row.asDict()
    columns = row_dict.keys()
    for col in columns:
        if FUNCTIONAL_VARIABLE_NAME_PREFIX.match(col) and col not in functional_variables:
            functional_variables.append(col)
    for col in functional_variables:
        func_key = FUNCTIONAL_VARIABLE_NAME_PREFIX.sub('', col)
        if col in row_dict.keys():
            output_metric.add_func_var(StructuredValue(row[col], func_key))
    managed_cols.extend(functional_variables)
    group_map_keys = row_dict.keys() - managed_cols
    for col in group_map_keys:
        output_metric.add_group_value(StructuredValue(row[col], col))
    return output_metric.asdict()
def add_derived_columns(self, row: Row) -> Row:
    row_dict = row.asDict()
    num_reviews = 0
    good_review_count = 0
    if row['REVIEW']:
        num_reviews = len(row['REVIEW'])
        for review in row['REVIEW']:
            if int(float(review['star_rating'])) >= 4:
                good_review_count += 1
    num_checkins = 0
    if row['CHECKIN']:
        num_checkins = len(row['CHECKIN'][0]['timestamps'].split(', '))
    num_tips = 0
    if row['TIP']:
        num_tips = len(row['TIP'])
    row_dict['num_checkins'] = num_checkins
    row_dict['num_tips'] = num_tips
    row_dict['num_reviews'] = num_reviews
    row_dict['good_review_count'] = good_review_count
    row_dict['business_name'] = row['BUSINESS'][0]['name']
    row_dict['state'] = row['BUSINESS'][0]['state']
    row_dict['city'] = row['BUSINESS'][0]['city']
    row_dict['is_rfn'] = any(
        c for c in row['BUSINESS'][0]['categories'].split(',')
        if c.lower().strip() in ('restaurants', 'food', 'nightlife')
    )
    # Testing removing columns to see how this df and the source df join
    for col in ['BUSINESS', 'REVIEW', 'TIP', 'CHECKIN']:
        row_dict.pop(col)
    return Row(**row_dict)
def sql_row_func_api(spark):
    print("Start running Row and Functions API")
    # row
    row = Row(name="Alice", age=11)
    print(row)
    print(row.name, row.age)
    Person = Row("name", "age")
    print(Person)
    print(Person("Alice", 11))
    print("row API finished")
    # asDict
    row = Row(key=1, value=Row(name='a', age=2))
    res = (row.asDict() == {'key': 1, 'value': Row(age=2, name='a')})
    print(res)
    print("asDict API finished")
    # drop and fill
    df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
    df.na.drop().show()
    df.na.fill(50).show()
    print("drop and fill API finished")
    # replace
    df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
    df.na.replace('Alice', None).show()
    print("replace API finished")
    print("Finish running Row and Functions API")
def recode(self, row: Row) -> Row:
    """
    Input:
        row: original dataframe Row
    Output:
        dataframe Row with recode variables added
    """
    row_dict: dict = row.asDict()
    try:
        relship: int = int(row[self.relgq_varname_list[0]])
        qgqtyp: str = str(row[self.relgq_varname_list[1]])
        final_pop: int = int(row["final_pop"])
        qsex: str = str(row[self.sex_varname_list[0]])
        qage: int = int(row[self.age_varname_list[0]])
        cenhisp: str = str(row[self.hispanic_varname_list[0]])
        cenrace: str = str(row[self.cenrace_das_varname_list[0]])
        row_dict[CC.ATTR_RELGQ] = relgq_recode(relship, qgqtyp, final_pop)
        row_dict[CC.ATTR_SEX] = sex_recode(qsex)
        row_dict[CC.ATTR_AGE] = age_recode(qage)
        row_dict[CC.ATTR_HISP] = hispanic_recode(cenhisp)
        row_dict[CC.ATTR_CENRACE_DAS] = cenrace_recode(cenrace)
        return Row(**row_dict)
    except Exception:
        raise Exception(f"Unable to recode row: {str(row_dict)}")
def recode(self, row: Row) -> Row:
    row_dict = row.asDict()
    for rname, rfunc in self.recode_funcs.items():
        row_dict[rname] = rfunc(row_dict)
    # for key in row_dict:
    #     if key not in self.
    #         row_dict.pop(k)
    return Row(**row_dict)
def assert_row_equal(left: Row, right: Row, check_field_order: bool = True):
    """
    Compare two pyspark.sql.Row objects.

    :param left: A Row to compare.
    :param right: Another Row to compare.
    :param check_field_order: Compare the order of fields or ignore it.
    """
    left_d = left.asDict()
    right_d = right.asDict()
    # fields comparison
    if not left_d.keys() == right_d.keys():
        # Something's not right, check which set is different
        extra_l = left_d.keys() - right_d.keys()
        extra_r = right_d.keys() - left_d.keys()
        if extra_l and extra_r:
            msg = ('Both rows contain extra elements\n'
                   ' + where left={l_fields}\n'
                   ' + where right={r_fields}')
            raise AssertionError(msg.format(l_fields=extra_l, r_fields=extra_r))
        elif extra_l and not extra_r:
            msg = 'Left row contains extra elements: {l_fields}'
            raise AssertionError(msg.format(l_fields=extra_l))
        else:
            msg = 'Right row contains extra elements: {r_fields}'
            raise AssertionError(msg.format(r_fields=extra_r))
    # values comparison
    msg = ('Values for {field} do not match\n'
           ' + where left={l_value}\n'
           ' + where right={r_value}')
    for key in left_d.keys():
        assert left_d[key] == right_d[key], msg.format(field=key,
                                                       l_value=left_d[key],
                                                       r_value=right_d[key])
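# A tiny, illustrative check with the assertion helper above; the field names
# and values are made up for demonstration.
from pyspark.sql import Row

assert_row_equal(Row(a=1, b='x'), Row(a=1, b='x'))   # passes silently
# assert_row_equal(Row(a=1, b='x'), Row(a=2, b='x')) # would raise AssertionError on field 'a'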
def recode(self, row):
    """
    Input:
        row: original dataframe Row
    Output:
        dataframe Row with recode variables added
    """
    row = Row(**row.asDict(), RACE0=int(row[self.race]) - 1)
    row = gqtype1940_recode(row, self.gqtype)
    return votingage_recode(row, self.age)
def _process_experimenter_id(experimenter_metadata: Row):
    """
    Some experimenter IDs need to be replaced on the metadata values.
    """
    experimenter_metadata = experimenter_metadata.asDict()
    if experimenter_metadata["value"] in Constants.EXPERIMENTER_IDS:
        experimenter_metadata["value"] = Constants.EXPERIMENTER_IDS[
            experimenter_metadata["value"]]
    if experimenter_metadata["value"] is not None:
        experimenter_metadata["value"] = (hashlib.md5(
            experimenter_metadata["value"].encode()).hexdigest()[:5].upper())
    return experimenter_metadata["value"]
def recode(self, row):
    """
    Input:
        row: an original dataframe Row
    Output:
        a dataframe Row with recode variables added
    """
    hhgq1940 = map_to_hhgq(int(row[self.gq[0]]), int(row[self.gq[1]]))
    geocode = geocode_recode(row, self.geocode)
    row = Row(**row.asDict(), hhgq1940=hhgq1940, geocode=geocode)
    return row
def data_preparation(filename, plant, state):
    '''
    This function creates an RDD in which every element is a tuple with the
    state as first element and a dictionary representing a vector of plants as
    a second element:
         (name of the state, {dictionary})

    The dictionary should contain the plant names as keys. The corresponding
    value should be 1 if the plant occurs in the state of the tuple and 0
    otherwise.
    You are strongly encouraged to use the RDD created here in the remainder
    of the assignment.

    Return value: True if the plant occurs in the state and False otherwise.
    Test: tests/test_data_preparation.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(plant_name=p[0], states=p[1:]))
    global data_df
    data_df = spark.createDataFrame(rdd_data)
    data_df.cache()
    all_plants = data_df.select(data_df.plant_name).rdd.flatMap(lambda x: x).collect()
    rdd = createDict(data_df, all_plants)
    global data_f
    data_f = spark.createDataFrame(rdd)
    data_f.cache()
    dict_op = getFromDict(state)
    row = Row(**dict_op[0][0])
    if plant in row.asDict().keys() and row.asDict()[plant] == 1:
        return True
    else:
        return False
def get_derived_columns(self, row: Row) -> Row:
    row_dict = row.asDict()
    num_reviews = 0
    good_review_count = 0
    num_tips = 0
    if row['REVIEW']:
        num_reviews = len(row['REVIEW'])
        for review in row['REVIEW']:
            if int(float(review['star_rating'])) >= 4:
                good_review_count += 1
    if row['TIP']:
        num_tips = len(row['TIP'])
    row_dict['num_tips'] = num_tips
    row_dict['num_reviews'] = num_reviews
    row_dict['good_review_count'] = good_review_count
    # Testing removing columns to see how this df and the source df join
    for col in ['USER', 'REVIEW', 'TIP']:
        row_dict.pop(col)
    return Row(**row_dict)
def good_experience_count(self, row: Row) -> Row:
    # TODO time range
    row_dict = row.asDict()
    good_review_count_2017 = 0
    if row['REVIEW']:
        for review in row['REVIEW']:
            if (int(float(review['star_rating'])) >= 4
                    and datetime.datetime.strptime(review['timestamp'], ts_format).year == 2017):
                good_review_count_2017 += 1
    checkin_count_2017 = 0
    if row['CHECKIN']:
        for ts in row['CHECKIN'][0]['timestamps'].split(', '):
            if datetime.datetime.strptime(ts, ts_format).year == 2017:
                checkin_count_2017 += 1
    tip_count_2017 = 0
    if row['TIP']:
        for tip in row['TIP']:
            if datetime.datetime.strptime(tip['timestamp'], ts_format).year == 2017:
                tip_count_2017 += 1
    row_dict['good_experience_count_2017'] = good_review_count_2017 + checkin_count_2017 + tip_count_2017
    row_dict['business_name'] = row['BUSINESS'][0]['name']
    row_dict['state'] = row['BUSINESS'][0]['state']
    return Row(**row_dict)
def recode(self, row: Row) -> Row:
    """
    Input:
        row: original dataframe Row
    Output:
        dataframe Row with recode variables added
    """
    row_dict: dict = row.asDict()
    try:
        qgqtyp: str = str(row[self.hhgq_varname_list[0]])
        qage: int = int(row[self.votingage_varname_list[0]])
        cenhisp: str = str(row[self.hispanic_varname_list[0]])
        cenrace: str = str(row[self.cenrace_das_varname_list[0]])
        row_dict[CC.ATTR_HHGQ] = hhgq_recode(qgqtyp)
        row_dict[CC.ATTR_VOTING_AGE] = votingage_recode(qage)
        row_dict[CC.ATTR_HISP] = hispanic_recode(cenhisp)
        row_dict[CC.ATTR_CENRACE_DAS] = cenrace_recode(cenrace)
        return Row(**row_dict)
    except Exception:
        raise Exception(f"Unable to recode row: {str(row_dict)}")
print('dataB is ...')
print(dataB)

########################################

row_x = Row(id=1, name='Alan', dob='1962-11-25', chelsea_fan=True)

# create a list of field values from a Row object using the list function
# [1, 'Alan', '1962-11-25', True]
print('List of row field values is ...')
print(list(row_x))

# create a dictionary of field names/values from a Row object using the asDict method
# {'id': 1, 'name': 'Alan', 'dob': '1962-11-25', 'chelsea_fan': True}
print('Dictionary of row field names/values is ...')
print(row_x.asDict())

# create a list of field values from a Row object using the list() function or the asDict().values() method
# [1, 'Alan', '1962-11-25', True]
print('List of row field values is ...')
print(list(row_x))
print(list(row_x.asDict().values()))

# a field value can be obtained from a row by one of 2 methods
# 'Alan'
# 'Alan'
print('A row field value is ...')
print(row_x['name'])
print(row_x.name)

# the presence of a field name in a row can be determined using the in operator
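# A plausible completion of the truncated example above (not in the original
# snippet): for a Row created with keyword arguments, `in` tests field names.
# True
print('name' in row_x)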
# In[203]:

Person("Alice", 11)


# In[204]:

Row(name="Alice", age=11).asDict() == {'name': 'Alice', 'age': 11}


# In[205]:

row = Row(key=1, value=Row(name='a', age=2))


# In[206]:

row.asDict() == {'key': 1, 'value': Row(age=2, name='a')}


# In[207]:

row.asDict(True) == {'key': 1, 'value': {'name': 'a', 'age': 2}}


# In[208]:

df.na.drop().show()


# In[223]:

df.na.fill({'age': 50, 'name': 'unknown'}).show()


# In[224]:
def recode(self, row):
    """
    Input:
        row: original dataframe Row
    Output:
        dataframe Row with recode variables added
    """
    # Get the values for all the variables needed for recoding
    gq = int(row[self.gq_varnames[0]])
    gqtype = int(row[self.gq_varnames[1]])
    sex = int(row[self.sex])
    age = int(row[self.age])
    hispan = int(row[self.hispan])
    race = int(row[self.race])
    citizen = int(row[self.citizen])

    # HHGQ for 1940s is recoded from GQ and GQTYPE (obtained prior by joining with the unit table)
    hhgq1940 = map_to_hhgq(gq, gqtype)

    # 1940s SEX has 2 values, which are re-indexed to 0 and 1 by subtracting 1, to match the 1940 DHCP Schema
    assert sex in [1, 2], f'Incorrect sex in input data for row: {str(row)}'
    sex1940 = sex - 1

    # 1940s AGE has a maximum of 120. Top code at 115 to match the 1940 DHCP Schema
    assert 0 <= age <= 120, f'Incorrect age in input data for row: {str(row)}'
    age1940 = 115 if age > 115 else age

    # 1940s HISPAN has 5 values. Map Not Hispanic (0) to Not Hispanic (0), and all others to Hispanic (1)
    # to match the 1940 DHCP Schema
    assert hispan in [0, 1, 2, 3, 4], f'Incorrect hispan in input data for row: {str(row)}'
    hispanic1940 = 0 if hispan == 0 else 1

    # 1940s RACE has 6 values, which are re-indexed by subtracting 1, to match the 1940 DHCP Schema
    assert race in [1, 2, 3, 4, 5, 6], f'Incorrect race in input data for row: {str(row)}'
    cenrace1940 = race - 1

    # 1940s CITIZEN has 5 values, recoded as follows to match the 1940 DHCP Schema:
    #   N/A (0)                                          -> Citizen (1)
    #   Born abroad of American parents (1)              -> Citizen (1)
    #   Naturalized citizen (2)                          -> Citizen (1)
    #   Not a citizen (3)                                -> Not a Citizen (0)
    #   Not a citizen, but has received first papers (4) -> Not a Citizen (0)
    assert citizen in [0, 1, 2, 3, 4], f'Incorrect citizen in input data for row: {str(row)}'
    citizen1940 = 1 if citizen in [0, 1, 2] else 0

    row = Row(**row.asDict(),
              hhgq1940=hhgq1940,
              sex1940=sex1940,
              age1940=age1940,
              hispanic1940=hispanic1940,
              cenrace1940=cenrace1940,
              citizen1940=citizen1940)
    return row
def _update_fields(x: Row, mapped_fields: FieldMap) -> Row:
    # ``pyspark.sql.Row`` objects are not mutable, so need to reconstruct
    all_fields = x.asDict()
    all_fields.update(mapped_fields)
    return Row(**all_fields)
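# A minimal, hypothetical usage of _update_fields above: the DataFrame,
# column names, and replacement values are illustrative only, and a plain
# dict stands in for the FieldMap type.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([Row(id=1, label="raw"), Row(id=2, label="raw")])
# Because Rows are immutable, each update goes through asDict() and builds a new Row.
updated = df.rdd.map(lambda r: _update_fields(r, {"label": "mapped"})).toDF()
updated.show()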
def convert_one(row: Row) -> Row:
    # For now place the .xgb right next to the svmrank files. Naming/path
    # options could be added if needed later.
    out_path = row.path + '.xgb'
    _convert_xgboost_remote(row.path, out_path)
    return Row(**dict(row.asDict(), vec_format='xgboost', path=out_path))