示例#1
0
    def gen_data_simple_schema(self, data_path, partition_date, num_rows,
                               file_format):
        """
        Create a data sample with a simple (flat) schema.

        Input
        - data_path: path where the partition will be created (string)
        - partition_date: partition date to be created (date)
        - num_rows: number of rows to be generated (integer, at least 1)
        - file_format: format of file to be generated (parquet or avro)

        The generated rows are coalesced into a single Spark partition and
        saved under data_path with the 'overwrite' save mode, partitioned by
        the 'date' column. A short summary (partition path, row count and
        schema) is printed afterwards.

        Raises
        - ValueError: if num_rows is smaller than 1
        """

        # Fail early with a clear message, before any data is generated or
        # anything under data_path is overwritten.
        if num_rows < 1:
            raise ValueError(
                'num_rows must be at least 1, got {}'.format(num_rows))

        person = Person('en')

        # Create a simple schema
        schema_df = StructType([
            StructField('identifier', StringType(), True),
            StructField('first_name', StringType(), True),
            StructField('last_name', StringType(), True),
            StructField('occupation', StringType(), True),
            StructField('age', IntegerType(), True),
            StructField('date', DateType(), True)
        ])

        # Generate all rows locally first; every row carries the same
        # partition_date so that they all land in one 'date' partition.
        rows = [[
            person.identifier(),
            person.first_name(),
            person.last_name(),
            person.occupation(),
            person.age(),
            partition_date
        ] for _ in range(num_rows)]

        # Build a single DataFrame instead of chaining one union per row.
        df = self.spark.createDataFrame(rows, schema_df)

        df.coalesce(1).write.partitionBy('date').mode('overwrite').format(
            file_format).save(data_path)

        print('Partition created: {data_path}/date={date}'.format(
            data_path=data_path, date=partition_date))
        print('# Rows:', df.count())
        print('Schema:')
        df.printSchema()
        print('\n')
示例#2
0
def getting_started_example():
    """Print one sample value from each of several fake-data providers.

    Walks through the Generic, Numbers, Person, Address, Business and
    Payment providers in turn, including temporary locale overrides and
    locale-specific extra providers, and prints every generated value to
    stdout. Nothing is returned.
    """
    # --- Generic provider with its default locale -----------------------
    # (Generic(locales.EN) is an alternative way to construct it.)
    default_generic = Generic()

    month = default_generic.datetime.month()
    print('Month =', month)
    moment = default_generic.datetime.datetime(
        start=1900, end=2035, timezone=None)  # Type: datetime.datetime.
    print('Datetime =', moment)
    imei = default_generic.code.imei()
    print('IMEI =', imei)
    fruit = default_generic.food.fruit()
    print('Fruit =', fruit)
    rna = default_generic.science.rna_sequence()
    print('RNA =', rna)

    # The same call before, inside and after a temporary locale override.
    text_provider = default_generic.text
    print('Word =', text_provider.word())
    with text_provider.override_locale(locales.FR):
        print('Word =', default_generic.text.word())
    print('Word =', default_generic.text.word())

    # --- Locale-specific extra providers --------------------------------
    usa_generic = Generic('en')
    usa_generic.add_provider(USASpecProvider)

    # NOTE: USASpecProvider has no cpf(); calling it raises AttributeError.
    print('SSN =', usa_generic.usa_provider.ssn())

    # (Generic(locales.PT_BR) is an alternative way to construct it.)
    brazil_generic = Generic('pt-br')
    brazil_generic.add_provider(BrazilSpecProvider)

    # NOTE: BrazilSpecProvider has no ssn(); calling it raises AttributeError.
    print('CPF =', brazil_generic.brazil_provider.cpf())

    # --- Numbers ----------------------------------------------------------
    number_provider = Numbers()

    any_int = number_provider.between()  # Type: int.
    print('Numbers =', any_int)
    ranged_int = number_provider.between(10, 10000000000000000)  # Type: int.
    print('Numbers =', ranged_int)

    # --- Person -----------------------------------------------------------
    korean_person = Person(locales.KO)

    female_name = korean_person.full_name(gender=Gender.FEMALE)
    print('Full name =', female_name)
    male_name_reversed = korean_person.full_name(
        gender=Gender.MALE, reverse=True)
    print('Full name =', male_name_reversed)

    with korean_person.override_locale(locales.RU):
        print('Full name =', korean_person.full_name())

    print('Full name =', korean_person.full_name())
    print('Telephone =', korean_person.telephone())
    print('Telephone =', korean_person.telephone(mask='(###)-###-####'))
    print('Identifier =', korean_person.identifier())
    print('Identifier =', korean_person.identifier(mask='######-#######'))

    # --- Address ----------------------------------------------------------
    german_address = Address('de')
    russian_address = Address('ru')

    print('Region =', german_address.region())
    print('Federal subject =', russian_address.federal_subject())
    print('Address =', german_address.address())
    print('Address =', russian_address.address())

    korean_address = Address('ko')

    # Evaluate the three parts left to right, exactly as a single print
    # call with three arguments would.
    province = korean_address.province()
    city = korean_address.city()
    street_address = korean_address.address()
    print('Address =', province, city, street_address)
    print('Zip code =', korean_address.zip_code())

    # --- Business ---------------------------------------------------------
    korean_business = Business('ko')

    # price() returns a str; the last five characters are dropped before
    # printing (the untrimmed value and a [:-2] trim are other options).
    full_price = korean_business.price(minimum=1.0, maximum=1000000000.0)
    print('Price =', full_price[:-5])  # Type: str.

    # --- Payment ----------------------------------------------------------
    payment_provider = Payment()

    card_number = payment_provider.credit_card_number(
        card_type=None)  # Type: str.
    print('Credit card =', card_number)
示例#3
0
    def person(
        cls,
        *,
        locale=Locales.EN,
        qualification=None,
        age=None,
        blood_type=None,
        email=None,
        first_name=None,
        last_name=None,
        gender=None,
        height=None,
        id=None,
        language=None,
        nationality=None,
        occupation=None,
        phone=None,
        title=None,
        university=None,
        weight=None,
        work_experience=None,
    ):
        '''
            Create a Person Data Entity object.

            All individual fields are automatically randomly generated based on locale. If provided, the corresponding values are overridden.

            Only None means "not provided": an explicit value is used as-is even when it is falsy (for example age=0, work_experience=0 or an empty string).

            Note:
                All individual fields are randomly generated. Don't expect correct correlation e.g. correct postal code for the generated city.

            Keyword Arguments:
                locale: Appropriate Random.locale.<locale_name> object. Default is Random.locale.EN
                qualification: Educational Qualification
                age: Age
                blood_type: Blood type
                email: Email address
                first_name: First name
                last_name: Last name
                gender: Gender
                height: Height
                id: Identifier
                language: Language
                nationality: Nationality
                occupation: Occupation
                phone: Phone number
                title: Title
                university: University
                weight: Weight
                work_experience: Work Experience

            Returns:
                The Person data entity built from the provided and generated values. Its name is first_name and last_name joined by a single space.
        '''
        person = Person(locale=locale)
        # Imported locally and aliased so that it does not clash with the
        # Person provider used above.
        from arjuna.engine.data.entity.person import Person as ArjPerson

        # A conditional expression (rather than "x is not None and x or ...")
        # keeps explicit falsy overrides such as 0 or '' instead of silently
        # replacing them with random values.
        first_name = (first_name
                      if first_name is not None else person.first_name())
        last_name = last_name if last_name is not None else person.last_name()
        return ArjPerson(
            qualification=(qualification if qualification is not None else
                           person.academic_degree()),
            age=age if age is not None else person.age(),
            blood_type=(blood_type
                        if blood_type is not None else person.blood_type()),
            email=email if email is not None else person.email(),
            first_name=first_name,
            last_name=last_name,
            name=first_name + " " + last_name,
            gender=gender if gender is not None else person.gender(),
            height=height if height is not None else person.height(),
            id=id if id is not None else person.identifier(),
            language=language if language is not None else person.language(),
            nationality=(nationality
                         if nationality is not None else person.nationality()),
            occupation=(occupation
                        if occupation is not None else person.occupation()),
            phone=phone if phone is not None else person.telephone(),
            title=title if title is not None else person.title(),
            university=(university
                        if university is not None else person.university()),
            weight=weight if weight is not None else person.weight(),
            work_experience=(work_experience if work_experience is not None
                             else person.work_experience()),
        )
示例#4
0
    def gen_data_remove_column(self, data_path, partition_date, num_rows,
                               file_format):
        """
        Create a data sample with a nested schema from which some columns
        have been removed.

        Input
        - data_path: path where the partition will be created (string)
        - partition_date: partition date to be created (date)
        - num_rows: number of rows to be generated (integer, at least 1)
        - file_format: format of file to be generated (parquet or avro)

        The removed columns are kept below as '#column removed' comments
        (lat, long, country_code, title_name). The generated rows are
        coalesced into a single Spark partition and saved under data_path
        with the 'overwrite' save mode, partitioned by the 'date' column.
        A short summary (partition path, row count and schema) is printed
        afterwards.

        Raises
        - ValueError: if num_rows is smaller than 1
        """

        # Fail early with a clear message, before any data is generated or
        # anything under data_path is overwritten.
        if num_rows < 1:
            raise ValueError(
                'num_rows must be at least 1, got {}'.format(num_rows))

        person = Person('en')
        address = Address('en')

        schema_street = StructType([
            StructField('street_name', StringType(), True)
            # StructField('lat', FloatType(), True), #column removed
            # StructField('long', FloatType(), True) #column removed
        ])

        schema_address_details = StructType([
            StructField('street', schema_street, True),
            StructField('number', IntegerType(), True)
        ])

        schema_address = StructType([
            StructField('address_details', schema_address_details, True),
            StructField('city', StringType(), True),
            StructField('country', StringType(), True),
            # StructField('country_code', StringType(), True), #column removed
            StructField('state', StringType(), True),
            StructField('postal_code', IntegerType(), True)
        ])

        schema_df = StructType([
            StructField('identifier', StringType(), True),
            StructField('first_name', StringType(), True),
            StructField('last_name', StringType(), True),
            StructField('occupation', StringType(), True),
            StructField('age', IntegerType(), True),
            StructField('address', schema_address, True),
            # StructField('title_name', StringType(), True), #column removed
            StructField('date', DateType(), True)
        ])

        # Generate all rows locally first. The nested lists mirror the
        # nesting of schema_df: address -> address_details -> street.
        rows = [
            [
                person.identifier(),
                person.first_name(),
                person.last_name(),
                person.occupation(),
                person.age(),
                [
                    [
                        [
                            address.street_name()
                            # lat / long values removed with their columns
                        ],
                        int(address.street_number())
                    ],
                    address.city(),
                    address.country(),
                    # country_code value removed with its column
                    address.state(),
                    int(address.postal_code())
                ],
                # title_name value removed with its column
                partition_date
            ]
            for _ in range(num_rows)
        ]

        # Build a single DataFrame instead of chaining one union per row.
        df = self.spark.createDataFrame(rows, schema_df)

        df.coalesce(1).write.partitionBy('date').mode('overwrite').format(
            file_format).save(data_path)

        print('Partition created: {data_path}/date={date}'.format(
            data_path=data_path, date=partition_date))
        print('# Rows:', df.count())
        print('Schema:')
        df.printSchema()
        print('\n')