def gen_data_simple_schema(self, data_path, partition_date, num_rows, file_format):
    """
    Input
      - data_path: path where the partition will be created (string)
      - partition_date: partition date to be created (date)
      - num_rows: number of rows to be generated (integer)
      - file_format: format of the file to be generated ('parquet' or 'avro')

    This function creates a data sample with a simple schema.
    """
    person = Person('en')

    # Create a simple schema.
    schema_df = StructType([
        StructField('identifier', StringType(), True),
        StructField('first_name', StringType(), True),
        StructField('last_name', StringType(), True),
        StructField('occupation', StringType(), True),
        StructField('age', IntegerType(), True),
        StructField('date', DateType(), True)
    ])

    # Generate data: seed df on the first iteration, then union each new row.
    df = None
    for _ in range(num_rows):
        df_temp = self.spark.createDataFrame([[
            person.identifier(),
            person.first_name(),
            person.last_name(),
            person.occupation(),
            person.age(),
            partition_date
        ]], schema_df)
        df = df_temp if df is None else df.union(df_temp)

    # Write a single file per partition, partitioned by the 'date' column.
    df.coalesce(1).write.partitionBy('date').mode('overwrite').format(
        file_format).save(data_path)

    print('Partition created: {data_path}/date={date}'.format(
        data_path=data_path, date=partition_date))
    print('# Rows:', df.count())
    print('Schema:')
    df.printSchema()
    print('\n')
    return
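
# --- Usage sketch (assumptions labeled) --------------------------------------
# A minimal wiring example for gen_data_simple_schema, assuming the function is
# attached as a method of a small generator class that owns a SparkSession. The
# 'DataGenerator' name and the '/tmp/simple_schema_table' path are hypothetical;
# only the method above comes from the source. The imports are the ones its
# body relies on (mimesis for fake data, pyspark for the schema and session).
import datetime

from mimesis import Person
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType,
                               IntegerType, DateType)


class DataGenerator:
    def __init__(self, spark):
        self.spark = spark


# Reuse the module-level function above as an instance method.
DataGenerator.gen_data_simple_schema = gen_data_simple_schema

if __name__ == '__main__':
    spark = SparkSession.builder.appName('fake-data-gen').getOrCreate()
    # Writes one 'date=2021-01-01' partition with 10 fake rows in Parquet.
    DataGenerator(spark).gen_data_simple_schema(
        '/tmp/simple_schema_table', datetime.date(2021, 1, 1), 10, 'parquet')
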
def getting_started_example():
    generic = Generic()  # generic = Generic(locales.EN)
    print('Month =', generic.datetime.month())
    print('Datetime =', generic.datetime.datetime(start=1900, end=2035, timezone=None))  # Type: datetime.datetime.
    print('IMEI =', generic.code.imei())
    print('Fruit =', generic.food.fruit())
    print('RNA =', generic.science.rna_sequence())

    print('Word =', generic.text.word())
    with generic.text.override_locale(locales.FR):
        print('Word =', generic.text.word())
    print('Word =', generic.text.word())

    generic = Generic('en')
    generic.add_provider(USASpecProvider)
    print('SSN =', generic.usa_provider.ssn())
    # print('CPF =', generic.usa_provider.cpf())  # AttributeError: 'USASpecProvider' object has no attribute 'cpf'.

    generic = Generic('pt-br')  # generic = Generic(locales.PT_BR)
    generic.add_provider(BrazilSpecProvider)
    # print('SSN =', generic.brazil_provider.ssn())  # AttributeError: 'BrazilSpecProvider' object has no attribute 'ssn'.
    print('CPF =', generic.brazil_provider.cpf())

    # --------------------
    numbers = Numbers()
    print('Numbers =', numbers.between())  # Type: int.
    print('Numbers =', numbers.between(10, 10000000000000000))  # Type: int.

    # --------------------
    person = Person(locales.KO)
    print('Full name =', person.full_name(gender=Gender.FEMALE))
    print('Full name =', person.full_name(gender=Gender.MALE, reverse=True))

    with person.override_locale(locales.RU):
        print('Full name =', person.full_name())
    print('Full name =', person.full_name())

    print('Telephone =', person.telephone())
    print('Telephone =', person.telephone(mask='(###)-###-####'))
    print('Identifier =', person.identifier())
    print('Identifier =', person.identifier(mask='######-#######'))

    # --------------------
    de = Address('de')
    ru = Address('ru')
    print('Region =', de.region())
    print('Federal subject =', ru.federal_subject())
    print('Address =', de.address())
    print('Address =', ru.address())

    ko = Address('ko')
    print('Address =', ko.province(), ko.city(), ko.address())
    print('Zip code =', ko.zip_code())

    # --------------------
    business = Business('ko')
    # print('Price =', business.price(minimum=1.0, maximum=1000000000.0))  # Type: str.
    # print('Price =', business.price(minimum=1.0, maximum=1000000000.0)[:-2])  # Type: str.
    print('Price =', business.price(minimum=1.0, maximum=1000000000.0)[:-5])  # Type: str.

    # --------------------
    payment = Payment()
    print('Credit card =', payment.credit_card_number(card_type=None))  # Type: str.
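
# --- Imports assumed by getting_started_example (version-sensitive) ----------
# The names below match mimesis 4.x; providers and import paths moved in later
# releases (for example, the Business provider was renamed in mimesis 5+), so
# treat the exact paths as an assumption rather than a fixed API.
from mimesis import Address, Business, Generic, Numbers, Payment, Person, locales
from mimesis.builtins import BrazilSpecProvider, USASpecProvider
from mimesis.enums import Gender

if __name__ == '__main__':
    getting_started_example()
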
def person(
    cls,
    *,
    locale=Locales.EN,
    qualification=None,
    age=None,
    blood_type=None,
    email=None,
    first_name=None,
    last_name=None,
    gender=None,
    height=None,
    id=None,
    language=None,
    nationality=None,
    occupation=None,
    phone=None,
    title=None,
    university=None,
    weight=None,
    work_experience=None,
):
    '''
    Create a Person Data Entity object.

    All individual fields are automatically randomly generated based on locale.
    If provided, the corresponding values are overridden.

    Note: All individual fields are randomly generated. Don't expect correct
    correlation, e.g. a correct postal code for the generated city.

    Keyword Arguments:
        locale: Appropriate Random.locale.<locale_name> object. Default is Random.locale.EN
        qualification: Educational qualification
        age: Age
        blood_type: Blood type
        email: Email address
        first_name: First name
        last_name: Last name
        gender: Gender
        height: Height
        id: Identifier
        language: Language
        nationality: Nationality
        occupation: Occupation
        phone: Phone number
        title: Title
        university: University
        weight: Weight
        work_experience: Work experience
    '''
    person = Person(locale=locale)
    from arjuna.engine.data.entity.person import Person as ArjPerson
    # Use the caller's value when given, otherwise generate one. The ternary
    # keeps falsy overrides such as age=0 or an empty string, which an
    # 'x is not None and x or y' idiom would silently drop.
    first_name = first_name if first_name is not None else person.first_name()
    last_name = last_name if last_name is not None else person.last_name()
    return ArjPerson(
        qualification=qualification if qualification is not None else person.academic_degree(),
        age=age if age is not None else person.age(),
        blood_type=blood_type if blood_type is not None else person.blood_type(),
        email=email if email is not None else person.email(),
        first_name=first_name,
        last_name=last_name,
        name=first_name + " " + last_name,
        gender=gender if gender is not None else person.gender(),
        height=height if height is not None else person.height(),
        id=id if id is not None else person.identifier(),
        language=language if language is not None else person.language(),
        nationality=nationality if nationality is not None else person.nationality(),
        occupation=occupation if occupation is not None else person.occupation(),
        phone=phone if phone is not None else person.telephone(),
        title=title if title is not None else person.title(),
        university=university if university is not None else person.university(),
        weight=weight if weight is not None else person.weight(),
        work_experience=work_experience if work_experience is not None else person.work_experience(),
    )
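
# --- Usage sketch (assumptions labeled) ---------------------------------------
# The signature suggests a classmethod on one of Arjuna's random-data facades.
# The 'Random' owner class in the lines below is an assumption made for
# illustration, not confirmed by this snippet. Explicitly passed keywords
# override the generated values; everything else is randomized for the locale.
#
#   person = Random.person(locale=Locales.EN, first_name="Ada", age=36)
#   print(person.name)        # "Ada <randomly generated last name>"
#   print(person.occupation)  # randomly generated, since not overridden
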
def gen_data_remove_column(self, data_path, partition_date, num_rows, file_format):
    """
    Input
      - data_path: path where the partition will be created (string)
      - partition_date: partition date to be created (date)
      - num_rows: number of rows to be generated (integer)
      - file_format: format of the file to be generated ('parquet' or 'avro')

    This function creates a data sample removing some columns.
    """
    person = Person('en')
    address = Address('en')

    schema_street = StructType([
        StructField('street_name', StringType(), True)
        # StructField('lat', FloatType(), True),   # column removed
        # StructField('long', FloatType(), True)   # column removed
    ])

    schema_address_details = StructType([
        StructField('street', schema_street, True),
        StructField('number', IntegerType(), True)
    ])

    schema_address = StructType([
        StructField('address_details', schema_address_details, True),
        StructField('city', StringType(), True),
        StructField('country', StringType(), True),
        # StructField('country_code', StringType(), True),  # column removed
        StructField('state', StringType(), True),
        StructField('postal_code', IntegerType(), True)
    ])

    schema_df = StructType([
        StructField('identifier', StringType(), True),
        StructField('first_name', StringType(), True),
        StructField('last_name', StringType(), True),
        StructField('occupation', StringType(), True),
        StructField('age', IntegerType(), True),
        StructField('address', schema_address, True),
        # StructField('title_name', StringType(), True),  # column removed
        StructField('date', DateType(), True)
    ])

    # Generate data: seed df on the first iteration, then union each new row.
    df = None
    for _ in range(num_rows):
        df_temp = self.spark.createDataFrame([[
            person.identifier(),
            person.first_name(),
            person.last_name(),
            person.occupation(),
            person.age(),
            [
                [
                    [
                        address.street_name()
                        # float(address.latitude()),   # column removed
                        # float(address.longitude())   # column removed
                    ],
                    int(address.street_number())
                ],
                address.city(),
                address.country(),
                # address.country_code(),  # column removed
                address.state(),
                int(address.postal_code())
            ],
            # person.title(),  # column removed
            partition_date
        ]], schema_df)
        df = df_temp if df is None else df.union(df_temp)

    df.coalesce(1).write.partitionBy('date').mode('overwrite').format(
        file_format).save(data_path)

    print('Partition created: {data_path}/date={date}'.format(
        data_path=data_path, date=partition_date))
    print('# Rows:', df.count())
    print('Schema:')
    df.printSchema()
    print('\n')
    return
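
# --- Performance note (sketch, not from the source) ---------------------------
# Calling union() inside the loop grows the Spark logical plan by one node per
# row, which slows down as num_rows grows. An alternative sketch: collect the
# fake rows in a plain Python list and call createDataFrame once. Shown for the
# simple flat schema; the nested variant works the same way with nested tuples.
# The 'gen_rows_single_pass' name is hypothetical, and the arguments mirror the
# objects the methods above already have in scope.
def gen_rows_single_pass(spark, schema_df, person, partition_date, num_rows):
    rows = [(
        person.identifier(),
        person.first_name(),
        person.last_name(),
        person.occupation(),
        person.age(),
        partition_date,
    ) for _ in range(num_rows)]
    # One createDataFrame call: a single plan node instead of num_rows unions.
    return spark.createDataFrame(rows, schema_df)
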