Example #1
 def test_convert_row_to_dict(self):
     row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
     self.assertEqual(1, row.asDict()['l'][0].a)
     df = self.sc.parallelize([row]).toDF()
     df.registerTempTable("test")
     row = self.sqlCtx.sql("select l, d from test").head()
     self.assertEqual(1, row.asDict()["l"][0].a)
     self.assertEqual(1.0, row.asDict()['d']['key'].c)
Example #2
File: tests.py Project: sky-junjun/spark
 def test_convert_row_to_dict(self):
     row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
     self.assertEqual(1, row.asDict()['l'][0].a)
     df = self.sc.parallelize([row]).toDF()
     df.registerTempTable("test")
     row = self.sqlCtx.sql("select l, d from test").head()
     self.assertEqual(1, row.asDict()["l"][0].a)
     self.assertEqual(1.0, row.asDict()['d']['key'].c)
Example #3
    def test_convert_row_to_dict(self):
        row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
        self.assertEqual(1, row.asDict()['l'][0].a)
        df = self.sc.parallelize([row]).toDF()

        with self.tempView("test"):
            df.createOrReplaceTempView("test")
            row = self.spark.sql("select l, d from test").head()
            self.assertEqual(1, row.asDict()["l"][0].a)
            self.assertEqual(1.0, row.asDict()['d']['key'].c)
Example #4
    def test_convert_row_to_dict(self):
        row = Row(l=[Row(a=1, b='s')], d={"key": Row(c=1.0, d="2")})
        self.assertEqual(1, row.asDict()['l'][0].a)
        df = self.sc.parallelize([row]).toDF()

        with self.tempView("test"):
            df.createOrReplaceTempView("test")
            row = self.spark.sql("select l, d from test").head()
            self.assertEqual(1, row.asDict()["l"][0].a)
            self.assertEqual(1.0, row.asDict()['d']['key'].c)
Example #5
def compareRows(rowA: Row, rowB: Row):
    # Usage: compareRows(rowA, rowB)
    # Compares two Rows by their dict representations
    if rowA is None and rowB is None:
        return True
    elif rowA is None or rowB is None:
        return False
    else:
        return rowA.asDict() == rowB.asDict()
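
A quick usage sketch for the helper above (standalone, assuming only pyspark.sql.Row; the sample rows are made up):

from pyspark.sql import Row

r1 = Row(id=1, name='a')
r2 = Row(name='a', id=1)
print(compareRows(r1, r2))      # True: field order does not affect the dict comparison
print(compareRows(r1, None))    # False
print(compareRows(None, None))  # True
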
Example #6
def are_rows_approx_equal(r1: Row, r2: Row, precision: float) -> bool:
    if r1 is None and r2 is None:
        return True
    if r1 is None or r2 is None:
        return False
    d1 = r1.asDict()
    d2 = r2.asDict()
    allEqual = True
    for key in d1.keys() & d2.keys():
        if isinstance(d1[key], float) and isinstance(d2[key], float):
            if abs(d1[key] - d2[key]) > precision:
                allEqual = False
        elif d1[key] != d2[key]:
            allEqual = False
    return allEqual
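
A usage sketch (standalone, rows made up; note the function only compares keys present in both rows):

from pyspark.sql import Row

a = Row(id=1, score=0.30000001)
b = Row(id=1, score=0.30000004)
print(are_rows_approx_equal(a, b, 1e-6))                      # True: floats differ by less than the precision
print(are_rows_approx_equal(a, Row(id=2, score=0.3), 1e-6))   # False: non-float field differs
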
Example #7
    def validate_row(self, row: Row) -> Dict:
        """
            Validate data frame row
        """
        data = row.asDict(recursive=True)
        schema_data = {key: value for key, value in data.items() if key not in self._count_columns}
        duplicate_counts_data = [data[column] for column in self._count_columns]
        try:
            # Validate schema using marshmallow
            rvalue = self._schema.load(schema_data, *self._args, **self._kwargs)
            # Validate uniqueness
            if sum(duplicate_counts_data) > len(duplicate_counts_data):
                raise ValidationError("duplicate row")
        except ValidationError as err:
            # Return errors
            rvalue = {
                self._error_column_name: json.dumps(
                    {
                        "row": data,
                        "errors": err.messages,
                    }
                )
            }

        return rvalue
Example #8
def run_etl(source, output_path, spark=None):
    """
    Run Spark ETL of source file.

    :param source (string) - name of source type (should be module in intake/sources/)
    :param output_path (string) - where to write parquet output
    :param spark - spark context
    """
    if not spark:
        spark = SparkSession.builder.getOrCreate()

    config = yaml.safe_load(
        pkg_resources.resource_stream(f'intake.sources.{source}',
                                      f'{source}_config.yml'))
    file_path = config['source']
    src_type = file_path.split('.')[-1]
    header_keys = config['header_keys']
    ignore_symbol = config['ignore_symbol']

    spark.sparkContext.addFile(file_path)
    data_path = SparkFiles.get(file_path.split('/')[-1])
    rdd = spark.sparkContext.textFile(data_path)

    # Use mapPartitions for structuring rows to only load
    # keys once per partition. Alternatively, we can consider
    # broadcasting the header_keys to workers...
    # TODO - refactor column renames/yyyymmdd index creation as add more data sources...
    df = rdd.mapPartitions(lambda partition: filter_helper(partition, header=','.join(list(header_keys.keys())), ignore_symbol=ignore_symbol)) \
        .mapPartitions(lambda partition: structure_as_row(partition, header_keys, src_type)) \
        .map(lambda row: create_yyyymmdd_index(row.asDict())).toDF()

    df = column_rename_factory(df, source)
    df.write.mode("overwrite").parquet(
        output_path)  # Always overwrite with latest dataset
Example #9
def read_glove_vecs(glove_file, output_path):
    rdd = sc.textFile(glove_file)
    row = Row("glovevec")
    df = rdd.map(row).toDF()
    split_col = F.split(F.col('glovevec'), " ")
    df = df.withColumn('word', split_col.getItem(0))
    df = df.withColumn('splitted', split_col)
    vec_udf = F.udf(lambda row: [float(i) for i in row[1:]],
                    ArrayType(FloatType()))
    df = df.withColumn('vec', vec_udf(F.col('splitted')))
    df = df.drop('splitted', "glovevec")
    w = Window.orderBy(["word"])
    qdf = df.withColumn('vec', F.concat_ws(',', 'vec')) \
            .withColumn("id", F.row_number().over(w))

    path = '{}/words'.format(output_path)
    qdf.coalesce(1).write.format('csv') \
        .option("sep", "\t").option('header', 'true') \
        .save(path)
    print('Words saved to: "{}"'.format(path))
    list_words = list(map(lambda row: row.asDict(), qdf.collect()))
    word_to_vec_map = {item['word']: item['vec'] for item in list_words}
    words_to_index = {item['word']: item["id"] for item in list_words}
    index_to_words = {item["id"]: item['word'] for item in list_words}
    return words_to_index, index_to_words, word_to_vec_map
Example #10
def row_mapper(row: Row, stage: Stage, definition: Definition) -> dict:
    managed_cols = ['val', 'measure_time']
    val = row['val']
    measure_time = row['measure_time']

    output_metric = OutputMetric(definition.metric,
                                 val=val,
                                 measure_time=measure_time,
                                 horizontal_level=stage.horizontal_level,
                                 vertical_level=stage.vertical_level)

    functional_variables = definition.metric.func_vars.copy()
    row_dict = row.asDict()
    columns = row_dict.keys()

    for col in columns:
        if FUNCTIONAL_VARIABLE_NAME_PREFIX.match(
                col) and col not in functional_variables:
            functional_variables.append(col)

    for col in functional_variables:
        func_key = FUNCTIONAL_VARIABLE_NAME_PREFIX.sub('', col)
        if col in row_dict.keys():
            output_metric.add_func_var(StructuredValue(row[col], func_key))

    managed_cols.extend(functional_variables)
    group_map_keys = row_dict.keys() - managed_cols

    for col in group_map_keys:
        output_metric.add_group_value(StructuredValue(row[col], col))

    return output_metric.asdict()
Example #11
 def add_derived_columns(self, row: Row) -> Row:
     row_dict = row.asDict()
     num_reviews = 0
     good_review_count = 0
     if row['REVIEW']:
         num_reviews = len(row['REVIEW'])
         for review in row['REVIEW']:
             if int(float(review['star_rating'])) >= 4:
                 good_review_count += 1
     num_checkins = 0
     if row['CHECKIN']:
         num_checkins = len(row['CHECKIN'][0]['timestamps'].split(', '))
     num_tips = 0
     if row['TIP']:
         num_tips = len(row['TIP'])
     row_dict['num_checkins'] = num_checkins
     row_dict['num_tips'] = num_tips
     row_dict['num_reviews'] = num_reviews
     row_dict['good_review_count'] = good_review_count
     row_dict['business_name'] = row['BUSINESS'][0]['name']
     row_dict['state'] = row['BUSINESS'][0]['state']
     row_dict['city'] = row['BUSINESS'][0]['city']
     row_dict['is_rfn'] = any(
         c for c in row['BUSINESS'][0]['categories'].split(',') if
         c.lower().strip() in ('restaurants', 'food', 'nightlife')
     )
     # Testing removing columns to see how this df and source df joins
     for col in ['BUSINESS', 'REVIEW', 'TIP', 'CHECKIN']:
         row_dict.pop(col)
     return Row(**row_dict)
Example #12
def sql_row_func_api(spark):
    
    print("Start running Row and Functions API")
    
    # row
    row = Row(name="Alice", age=11)
    print(row)
    print(row.name, row.age)
    Person = Row("name", "age")
    print(Person)
    print(Person("Alice", 11))
    print("row API finished")
    
    # asDict
    row = Row(key=1, value=Row(name='a', age=2))
    res = (row.asDict() == {'key': 1, 'value': Row(age=2, name='a')})
    print(res)
    print("asDict API finished")
    
    # drop and fill
    df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
    df.na.drop().show()
    df.na.fill(50).show()
    print("drop and fill API finished")
    
    # replace
    df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
    df.na.replace('Alice', None).show()
    print("replace API finished")
    
    print("Finish running SQL Column API")
Example #13
    def recode(self, row: Row) -> Row:
        """
            Input:
                row: original dataframe Row

            Output:
                dataframe Row with recode variables added
        """
        row_dict: dict = row.asDict()
        try:
            relship: int = int(row[self.relgq_varname_list[0]])
            qgqtyp: str = str(row[self.relgq_varname_list[1]])
            final_pop: int = int(row["final_pop"])

            qsex: str = str(row[self.sex_varname_list[0]])
            qage: int = int(row[self.age_varname_list[0]])
            cenhisp: str = str(row[self.hispanic_varname_list[0]])
            cenrace: str = str(row[self.cenrace_das_varname_list[0]])

            row_dict[CC.ATTR_RELGQ] = relgq_recode(relship, qgqtyp, final_pop)
            row_dict[CC.ATTR_SEX] = sex_recode(qsex)
            row_dict[CC.ATTR_AGE] = age_recode(qage)
            row_dict[CC.ATTR_HISP] = hispanic_recode(cenhisp)
            row_dict[CC.ATTR_CENRACE_DAS] = cenrace_recode(cenrace)

            return Row(**row_dict)
        except Exception:
            raise Exception(f"Unable to recode row: {str(row_dict)}")
Example #14
 def recode(self, row: Row) -> Row:
     row_dict = row.asDict()
     for rname, rfunc in self.recode_funcs.items():
         row_dict[rname] = rfunc(row_dict)
     # for key in row_dict:
     #     if key not in self.
     #         row_dict.pop(k)
     return Row(**row_dict)
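
A minimal, self-contained sketch of the pattern above; the recode_funcs mapping and column names here are made up for illustration, not taken from the original project:

from pyspark.sql import Row

class SimpleRecoder:
    def __init__(self):
        # each function derives a new column from the row's dict
        self.recode_funcs = {
            'age_bucket': lambda d: d['age'] // 10,
            'is_adult': lambda d: d['age'] >= 18,
        }

    def recode(self, row: Row) -> Row:
        row_dict = row.asDict()
        for rname, rfunc in self.recode_funcs.items():
            row_dict[rname] = rfunc(row_dict)
        return Row(**row_dict)

print(SimpleRecoder().recode(Row(name='Alice', age=34)).asDict())
# {'name': 'Alice', 'age': 34, 'age_bucket': 3, 'is_adult': True}  (key order may vary by Spark version)
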
Example #15
def assert_row_equal(left: Row, right: Row, check_field_order: bool = True):
    """
    Compare two pyspark.sql.Row objects.

    :param left: A Row to compare.
    :param right: Another Row to compare.
    :param check_field_order: Compare the order of fields or ignore it.
    """

    left_d = left.asDict()
    right_d = right.asDict()

    # fields comparison
    if not left_d.keys() == right_d.keys():
        # Something's not right, check which set is different
        extra_l = left_d.keys() - right_d.keys()
        extra_r = right_d.keys() - left_d.keys()

        if extra_l and extra_r:
            msg = ('Both rows contain extra elements\n'
                   ' +  where left={l_fields}\n'
                   ' +  where right={r_fields}')
            raise AssertionError(
                msg.format(l_fields=extra_l, r_fields=extra_r))

        elif extra_l:
            msg = 'Left row contains extra elements: {l_fields}'
            raise AssertionError(msg.format(l_fields=extra_l))

        else:
            msg = 'Right row contains extra elements: {r_fields}'
            raise AssertionError(msg.format(r_fields=extra_r))

    # values comparison
    msg = ('Values for {field} do not match\n'
           ' +  where left={l_value}\n'
           ' +  where right={r_value}')

    for key in left_d.keys():

        assert left_d[key] == right_d[key], msg.format(field=key,
                                                       l_value=left_d[key],
                                                       r_value=right_d[key])
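
Usage sketch for the assertion helper above (standalone; the failing call is wrapped in try/except so the snippet runs end to end):

from pyspark.sql import Row

assert_row_equal(Row(a=1, b='x'), Row(a=1, b='x'))  # passes silently

try:
    assert_row_equal(Row(a=1), Row(a=1, b='y'))
except AssertionError as exc:
    print(exc)  # Right row contains extra elements: {'b'}
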
Example #16
    def recode(self, row):
        """
            Input:
                row: original dataframe Row

            Output:
                dataframe Row with recode variables added
        """
        row = Row(**row.asDict(), RACE0=int(row[self.race]) - 1)
        row = gqtype1940_recode(row, self.gqtype)
        return votingage_recode(row, self.age)
Example #17
def _process_experimenter_id(experimenter_metadata: Row):
    """
    Some experimenter IDs need to be replaced on the metadata values.
    """
    experimenter_metadata = experimenter_metadata.asDict()
    if experimenter_metadata["value"] in Constants.EXPERIMENTER_IDS:
        experimenter_metadata["value"] = Constants.EXPERIMENTER_IDS[
            experimenter_metadata["value"]]
    if experimenter_metadata["value"] is not None:
        experimenter_metadata["value"] = (hashlib.md5(
            experimenter_metadata["value"].encode()).hexdigest()[:5].upper())
    return experimenter_metadata["value"]
Example #18
    def recode(self, row):
        """
            Input:
                row: an original dataframe Row

            Output:
                a dataframe Row with recode variables added
        """
        hhgq1940 = map_to_hhgq(int(row[self.gq[0]]), int(row[self.gq[1]]))
        geocode = geocode_recode(row, self.geocode)
        row = Row(**row.asDict(), hhgq1940=hhgq1940, geocode=geocode)
        return row
Example #19
def data_preparation(filename, plant, state):
    '''
    This function creates an RDD in which every element is a tuple with
    the state as its first element and a dictionary representing a vector
    of plants as its second element:
    (name of the state, {dictionary})

    The dictionary should contain the plant names as keys. The
    corresponding value should be 1 if the plant occurs in the state of
    the tuple and 0 otherwise.

    You are strongly encouraged to use the RDD created here in the 
    remainder of the assignment.

    Return value: True if the plant occurs in the state and False otherwise.
    Test: tests/test_data_preparation.py
    '''
    spark = init_spark()
    lines = spark.read.text(filename).rdd
    parts = lines.map(lambda row: row.value.split(","))
    rdd_data = parts.map(lambda p: Row(plant_name=p[0], states=p[1:]))
    global data_df
    data_df = spark.createDataFrame(rdd_data)
    data_df.cache()
    all_plants = data_df.select(
        data_df.plant_name).rdd.flatMap(lambda x: x).collect()
    rdd = createDict(data_df, all_plants)
    global data_f
    data_f = spark.createDataFrame(rdd)
    data_f.cache()
    dict_op = getFromDict(state)
    row = Row(**dict_op[0][0])
    if plant in row.asDict().keys() and row.asDict()[plant] == 1:
        return True
    else:
        return False
Example #20
 def get_derived_columns(self, row: Row) -> Row:
     row_dict = row.asDict()
     num_reviews = 0
     good_review_count = 0
     num_tips = 0
     if row['REVIEW']:
         num_reviews = len(row['REVIEW'])
         for review in row['REVIEW']:
             if int(float(review['star_rating'])) >= 4:
                 good_review_count += 1
     if row['TIP']:
         num_tips = len(row['TIP'])
     row_dict['num_tips'] = num_tips
     row_dict['num_reviews'] = num_reviews
     row_dict['good_review_count'] = good_review_count
     # Testing removing columns to see how this df and source df joins
     for col in ['USER', 'REVIEW', 'TIP']:
         row_dict.pop(col)
     return Row(**row_dict)
Example #21
 def good_experience_count(self, row: Row) -> Row:  # TODO time range
     row_dict = row.asDict()
     good_review_count_2017 = 0
     if row['REVIEW']:
         for review in row['REVIEW']:
             if int(float(review['star_rating'])) >= 4 and datetime.datetime.strptime(review['timestamp'], ts_format).year == 2017:
                 good_review_count_2017 += 1
     checkin_count_2017 = 0
     if row['CHECKIN']:
         for ts in row['CHECKIN'][0]['timestamps'].split(', '):
             if datetime.datetime.strptime(ts, ts_format).year == 2017:
                 checkin_count_2017 += 1
     tip_count_2017 = 0
     if row['TIP']:
         for tip in row['TIP']:
             if datetime.datetime.strptime(tip['timestamp'], ts_format).year == 2017:
                 tip_count_2017 += 1
     row_dict['good_experience_count_2017'] = good_review_count_2017 + checkin_count_2017 + tip_count_2017
     row_dict['business_name'] = row['BUSINESS'][0]['name']
     row_dict['state'] = row['BUSINESS'][0]['state']
     return Row(**row_dict)
Example #22
    def recode(self, row: Row) -> Row:
        """
            Input:
                row: original dataframe Row

            Output:
                dataframe Row with recode variables added
        """
        row_dict: dict = row.asDict()
        try:

            qgqtyp: str = str(row[self.hhgq_varname_list[0]])
            qage: int = int(row[self.votingage_varname_list[0]])
            cenhisp: str = str(row[self.hispanic_varname_list[0]])
            cenrace: str = str(row[self.cenrace_das_varname_list[0]])

            row_dict[CC.ATTR_HHGQ] = hhgq_recode(qgqtyp)
            row_dict[CC.ATTR_VOTING_AGE] = votingage_recode(qage)
            row_dict[CC.ATTR_HISP] = hispanic_recode(cenhisp)
            row_dict[CC.ATTR_CENRACE_DAS] = cenrace_recode(cenrace)

            return Row(**row_dict)
        except Exception:
            raise Exception(f"Unable to recode row: {str(row_dict)}")
Example #23
print('dataB is ...')
print(dataB)

########################################

row_x = Row(id=1, name='Alan', dob='1962-11-25', chelsea_fan=True)

# create a list of field values from a Row object using the list function
# [1, 'Alan', '1962-11-25', True]
print('List of row field values is ...')
print(list(row_x))

# create a dictionary of field names/values from a Row object using the asDict method
# {'id': 1, 'name': 'Alan', 'dob': '1962-11-25', 'chelsea_fan': True}
print('Dictionary of row field names/values is ...')
print(row_x.asDict())

# create a list of field values from a Row object using the list() function or the asDict().values() method
# [1, 'Alan', '1962-11-25', True]
print('List of row field values is ...')
print(list(row_x))
print(list(row_x.asDict().values()))

# a field value can be obtained from a row by one of 2 methods
# 'Alan'
# 'Alan'
print('A row field values is ...')
print(row_x['name'])
print(row_x.name)

# the presence of a field name in a row can be determined using the in operator
# True
print('name' in row_x)
Example #24
# In[203]:

Person("Alice", 11)

# In[204]:

Row(name="Alice", age=11).asDict() == {'name': 'Alice', 'age': 11}

# In[205]:

row = Row(key=1, value=Row(name='a', age=2))

# In[206]:

row.asDict() == {'key': 1, 'value': Row(age=2, name='a')}

# In[207]:

row.asDict(True) == {'key': 1, 'value': {'name': 'a', 'age': 2}}

# In[208]:

df.na.drop().show()

# In[223]:

df.na.fill({'age': 50, 'name': 'unknown'}).show()

# In[224]:
Example #25
    def recode(self, row):
        """
            Input:
                row: original dataframe Row

            Output:
                dataframe Row with recode variables added
        """
        # Get the values for all the variables needed for recoding
        gq = int(row[self.gq_varnames[0]])
        gqtype = int(row[self.gq_varnames[1]])
        sex = int(row[self.sex])
        age = int(row[self.age])
        hispan = int(row[self.hispan])
        race = int(row[self.race])
        citizen = int(row[self.citizen])

        # HHGQ for 1940s is recoded from the GQ and GQTYPE (obtained prior by joining with the unit table)
        hhgq1940 = map_to_hhgq(gq, gqtype)

        # 1940s SEX has 2 values, which are re-indexed to 0 and 1 by subtracting 1, to match the 1940 DHCP Schema
        assert sex in [1,
                       2], f'Incorrect sex in input data for row: {str(row)}'
        sex1940 = sex - 1

        # 1940s AGE has a maximum of 120. Top code at 115 to match the 1940 DHCP Schema
        assert 0 <= age <= 120, f'Incorrect age in input data for row: {str(row)}'
        age1940 = 115 if age > 115 else age

        # 1940s HISPAN has 5 values. Map Not Hispanic (0) to Not Hispanic (0), and all others to Hispanic (1)
        # to match the 1940 DHCP Schema
        assert hispan in [
            0, 1, 2, 3, 4
        ], f'Incorrect hispan in input data for row: {str(row)}'
        hispanic1940 = 0 if hispan == 0 else 1

        # 1940s RACE has 6 values, which are re-indexed by subtracting 1, to match the 1940 DHCP Schema
        assert race in [
            1, 2, 3, 4, 5, 6
        ], f'Incorrect race in input data for row: {str(row)}'
        cenrace1940 = race - 1

        # 1940s CITIZEN has 5 values, recoded as follows to match the 1940 DHCP Schema:
        # N/A                                          (0) -> Citizen (1)
        # Born abroad of American parents              (1) -> Citizen (1)
        # Naturalized citizen                          (2) -> Citizen (1)
        # Not a citizen                                (3) -> Not a Citizen (0)
        # Not a citizen, but has received first papers (4) -> Not a Citizen (0)
        assert citizen in [
            0, 1, 2, 3, 4
        ], f'Incorrect citizen in input data for row: {str(row)}'
        citizen1940 = 1 if citizen in [0, 1, 2] else 0

        row = Row(**row.asDict(),
                  hhgq1940=hhgq1940,
                  sex1940=sex1940,
                  age1940=age1940,
                  hispanic1940=hispanic1940,
                  cenrace1940=cenrace1940,
                  citizen1940=citizen1940)
        return row
Example #26
def _update_fields(x: Row, mapped_fields: FieldMap) -> Row:
    # ``pyspark.sql.Row`` objects are not mutable, so need to
    # reconstruct
    all_fields = x.asDict()
    all_fields.update(mapped_fields)
    return Row(**all_fields)
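
A usage sketch for the helper above (FieldMap is assumed to be a plain mapping of field name to replacement value; the sample row is made up):

from pyspark.sql import Row

row = Row(id=7, label='old', score=0.1)
print(_update_fields(row, {'label': 'new', 'score': 0.9}).asDict())
# {'id': 7, 'label': 'new', 'score': 0.9}
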
Example #27
 def convert_one(row: Row) -> Row:
     # For now place the .xgb right next to the svmrank files. Naming/path
     # options could be added if needed later.
     out_path = row.path + '.xgb'
     _convert_xgboost_remote(row.path, out_path)
     return Row(**dict(row.asDict(), vec_format='xgboost', path=out_path))