def initOptimus(cls, sparkURL=None):
    """Create the shared Optimus instance and store it on ``cls.optimus``.

    A default ``Optimus()`` is always created first. When ``sparkURL`` is
    given, that default instance is stopped and replaced by one built on a
    Spark session whose master is ``sparkURL``.

    Parameters
    ----------
    sparkURL : str, optional
        Spark master URL passed to ``SparkSession.builder.master``. When
        ``None`` the default Optimus instance is kept.
    """
    notice = colored("Optimus Should Be initalized only once", 'blue')
    print(notice)

    cls.optimus = Optimus()
    if sparkURL is None:
        return

    # Stop the default instance before asking for a session on the
    # requested master.
    cls.optimus.stop()
    builder = SparkSession.builder.master(sparkURL)
    session = builder.appName("DMXDeepInsightpy").getOrCreate()
    cls.optimus = Optimus(session, verbose=True)
def main():
    """Label a sample of words with Optimus, then spread the labels by KNN.

    Reads ``data/words.csv``, runs Optimus on a random sample of 1000
    descriptions, fits a ``KNeighborsClassifier`` on the word vectors of the
    sample against the Optimus labels, predicts labels for every word that is
    not in the sample, and writes the combined result to ``output.csv``.
    """
    # load in the data, sample from it and run optimus on the sample
    words = pd.read_csv('data/words.csv', header=None, names=('description',))['description']
    sample = words.sample(1000)
    optimus_model = Optimus(config_path='config.json')
    result = optimus_model(sample)
    df_sample = pd.DataFrame({'original': sample, 'label': result})

    # use a knn on vectors + assigned labels to apply these labels to the whole dataset
    model = ft.load_model('models/wiki.en.bin')
    # Materialise the sample once; it is reused for embedding and membership.
    sample_words = sample.tolist()
    embedSample = [model.get_word_vector(word) for word in sample_words]

    # train the model on the result and the embedded sample vectors
    classifier = KNeighborsClassifier()
    trained = classifier.fit(embedSample, result)

    # A set gives O(1) membership tests; the previous form rebuilt and scanned
    # the sample list for every word in the dataset.
    sampled = set(sample_words)
    nonsampled = [word for word in words.tolist() if word not in sampled]
    outvectors = [model.get_word_vector(word) for word in nonsampled]
    predictions = trained.predict(outvectors)
    df_unsampled = pd.DataFrame({'original': nonsampled, 'label': predictions})

    df = pd.concat([df_sample, df_unsampled])
    df.to_csv('output.csv', index=False)
def __init__(self, engine=False):
    """Wrap an Optimus instance and route its variable accessors through self.

    Parameters
    ----------
    engine : optional
        Engine identifier handed to ``Optimus``. Any falsy value selects
        ``Engine.DASK.value``.
    """
    if not engine:
        from optimus.optimus import Engine
        engine = Engine.DASK.value

    from optimus import Optimus
    self.op = Optimus(engine)

    # Replace the Optimus instance's variable accessors with this object's.
    for accessor in ("set_var", "get_var", "del_var", "list_vars", "update_vars"):
        setattr(self.op, accessor, getattr(self, accessor))

    # Register the Optimus loader and creator as variables.
    for alias, attribute in (("_load", "load"), ("_create", "create")):
        self.set_var(alias, getattr(self.op, attribute))
def setup():
    """
    Build the app and a persisted Optimus object.

    The Optimus object is created from ``config.json`` and the uploaded data
    file under ``path``, then pickled to the file ``path + 'o'``.

    Returns
    -------
    tuple
        (app from ``config_app()``, layout from ``retrieve_layout()``,
        the Optimus instance, the ``Optimus`` name itself)
    """
    # CREATE APP
    dash_app = config_app()

    # Create the persistent optimus object and write it to disk
    data_file = path + 'uploaded_data.csv'
    optimus_instance = Optimus(config_path='config.json', data=data_file)
    pickle_target = path + 'o'
    with open(pickle_target, 'wb+') as handle:
        pickle.dump(optimus_instance, handle)

    layout = retrieve_layout()
    return dash_app, layout, optimus_instance, Optimus
# -*- coding: utf-8 -*- # + {} from pyspark.sql.types import * import datetime import sys from optimus import Optimus # - op = Optimus() source_df = op.create.df([ ('names', StringType(), True), ('height(ft)', ShortType(), True), ('function', StringType(), True), ('rank', ByteType(), True), ('age', IntegerType(), True), ('weight(t)', FloatType(), True), ('japanese name', ArrayType(StringType(), True), True), ('last position seen', StringType(), True), ('date arrival', StringType(), True), ('last date seen', StringType(), True), ('attributes', ArrayType(FloatType(), True), True), ('DateType', DateType(), True), ('Tiemstamp', TimestampType(), True), ('Cybertronian', BooleanType(), True), ('function(binary)', BinaryType(), True), ('NullType', NullType(), True) ], [("Optim'us", 28, 'Leader', 10, 5000000, 4.300000190734863, ['Inochi', 'Convoy'], '19.442735,-99.201111', '1980/04/10', '2016/09/10', [8.53439998626709, 4300.0], datetime.date(2016, 9, 10), datetime.datetime(2014, 6, 24, 0, 0), True, bytearray(b'Leader'), None), ('bumbl#ebéé ', 17, 'Espionage', 7, 5000000, 2.0, ['Bumble', 'Goldback'], '10.642707,-71.612534', '1980/04/10', '2015/08/10', [ 5.334000110626221, 2000.0 ], datetime.date(2015, 8, 10), datetime.datetime(
f.close() logger.debug('start writing to database') #In Case of JSON report #Helper function call to store data to DB #write_to_Db(pathtosave) else: logger.debug( "format can be either json of html!! specify the right format") if __name__ == "__main__": #Initializing Spark Session as spark spark = SparkSession.builder.config( "spark.jars", "/home/vk/Downloads/postgresql-42.2.6.jar").appName( "DP").getOrCreate() #Spark Logging Set to Error(Noice reduction) spark.sparkContext.setLogLevel("ERROR") #Initializing Optimus as op op = Optimus(spark) #Config Pile path for user to specify the report to be generated configfilepath = "/home/vk/PycharmProjects/Reporting/config.json" #converting config to df for processing configDF = spark.read.json(configfilepath, multiLine=True) #calling hepler finction logger.debug("Generating Report") generate_report(op, configDF) logger.debug("REPORT GENERATED")
# version: 3.6.5 # nteract: # version: 0.11.6 # --- # %load_ext autoreload # %autoreload 2 import sys sys.path.append("..") # ### Now you can get extra information for the profiler if you activate pass verbose= True to optimus # Create optimus from optimus import Optimus op = Optimus(master="local[*]", app_name="optimus", checkpoint=True) df = op.load.csv("data/Meteorite_Landings.csv").h_repartition() df.table(10) # ### Profiler dump mode (Faster). It just handle the column data type as present in the dataframe op.profiler.run(df, "name", infer=False) # ### Profiler smart mode (Slower). It just try to infer the column data type and present extra data acordinly. From example datetype columns get extra histograms about minutes, day, week and month. Also can detect array types on data. op.profiler.run(df, "*", infer=True) # ### Plot profile for a specific column
def test_optimus_from_session():
    """Build Optimus from an existing Spark session and display a CSV frame.

    The test passes if no step raises; it makes no explicit assertions.
    """
    from pyspark.sql import SparkSession

    session_builder = SparkSession.builder.appName('abc')
    session = session_builder.getOrCreate()
    frame = session.read.csv('examples/data/foo.csv', header=True)

    # The instance is not used afterwards; it is kept bound as in the original.
    # NOTE(review): presumably constructing Optimus is what makes
    # ``display`` available on the frame -- confirm.
    _optimus = Optimus(session)

    frame.display()
"potatoes", "cleaning tools", "building materials", "frozen meat", "argon", "helium cylinders", "household products", "construction materials", "confectionary", "stome and paving", "tractors x 3", "animal products", "composite materials", "Plasterboard", "cardboard", ]) #### 2. from file #### to read from file 'words.csv' in the data folder - and convert to pandas Series text_1 = pd.read_csv("./data/words.csv", index_col=False, header=0) text_1 = pd.Series(text_1.iloc[:, 0]) #### Initialise Optimus object and run model o = Optimus(config_path="config.json", cutoff=6, stepsize=1) results = o(text_1, save_csv=True, full=True, verbose=True, runKNN=False) results.sort_values("current_labels")
# --- # # This notebook create the tests in python code. All this cells must be run to executed the tests # %load_ext autoreload # %autoreload 2 # + {"outputHidden": false, "inputHidden": false} import sys sys.path.append("../..") # - from optimus import Optimus from optimus.helpers.test import Test op = Optimus(master='local', verbose=True) # + import pandas as pd from pyspark.sql.types import * from datetime import date, datetime cols = [("names", "str"), ("height(ft)", ShortType()), ("function", "str"), ("rank", ByteType()), ("age", "int"), ("weight(t)", "float"), "japanese name", "last position seen", "date arrival", "last date seen", ("attributes", ArrayType(FloatType())), ("Date Type", DateType()), ("timestamp", TimestampType()), ("Cybertronian", BooleanType()), ("function(binary)", BinaryType()), ("NullType", NullType())] rows = [
print(text_1)

# Example input built in code: a pandas Series of free-text item descriptions.
# Note that the model below is run on `text_1` (read from file), not on `text`.
text = pd.Series([
    'chocolate', 'chocolate biscuits', 'frozen pizza', 'frozn pizza',
    'cleaning products', 'household goods', 'bicycle parts', 'mortorbike',
    'cleaning brush', '3 x cars', '4 x cars', 'Vauxhaul Astra',
    'Toyota Yaris', 'Yaris', 'mondeo', 'Compressed gas', 'Helium tanks',
    'frozen foods', 'foodstuff', 'groupage', 'bread and pasta', 'bread',
    'milk', 'powder', 'baby food', 'board', ' oxygen cylinders', 'methane',
    'argon', 'fertilizers', 'steel gates', 'metal', 'scrap metal',
    'steel posts', 'fences and steel Products', 'tomatoes', 'potatoes',
    'cleaning tools', 'building materials', 'frozen meat', 'argon',
    'helium cylinders', 'household products', 'construction materials',
    'confectionary', 'stome and paving', 'tractors x 3', 'animal products',
    'composite materials', 'Plasterboard', 'cardboard'
])

#### 2. From file
#### Read 'words.csv' from the data folder (first row is the header) and
#### keep its first column as a pandas Series.
text_1 = pd.read_csv("./data/words.csv", index_col=False, header=0)
text_1 = pd.Series(text_1.iloc[:, 0])

#### Initialise the Optimus object and run the model on the file data
o = Optimus(config_path='config.json', cutoff=6, stepsize=1)
results = o(text_1, save_csv=True, full=True, verbose=True, runKNN=False)

# The sorted result is not assigned to anything.
# NOTE(review): presumably this is the last expression of a notebook cell,
# shown as its output -- confirm.
results.sort_values("current_labels")
from optimus import Optimus
import pyspark
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
from sparkdl.image import imageIO
from pyspark import Row
from optimus.dl.models import DL

# Optimus instance created with the ``dl`` flag enabled.
op = Optimus(dl=True)

# rdd = op.sc.parallelize([Row(predicted_labels=['daisy', '0.8918145298957825']),
#                          Row(predicted_labels=['picket_fence', '0.14247830212116241']),
#                          Row(predicted_labels=['daisy', '0.9532104134559631'])])

# df_row = spark.createDataFrame(rdd)


def assert_spark_df(df):
    """Assert that ``df`` is a ``pyspark.sql.dataframe.DataFrame``.

    Raises ``AssertionError`` with the message "Not a Spark DF" otherwise.
    """
    assert isinstance(df, pyspark.sql.dataframe.DataFrame), "Not a Spark DF"


def assert_spark_model(model):
    """Assert that ``model`` is a ``pyspark.ml.PipelineModel``.

    Raises ``AssertionError`` with the message "Not a model" otherwise.
    """
    assert isinstance(model, pyspark.ml.PipelineModel), "Not a model"


# Images read from tests/testtulips/ get a constant "label" column of 1.
tulips_df = ImageSchema.readImages("tests/testtulips/").withColumn(
    "label", lit(1))
# Images read from tests/testdaisy/ (decoded with imageIO.PIL_decode) get a
# constant "label" column of 0.
daisy_df = imageIO.readImagesWithCustomFn(
    "tests/testdaisy/", decode_f=imageIO.PIL_decode).withColumn("label", lit(0))
# ## Install Optimus # # from command line: # # `pip install optimuspyspark` # # from a notebook you can use: # # `!pip install optimuspyspark` # ## Import Optimus and start it from optimus import Optimus op = Optimus(master="local") # ## Dataframe creation # # Create a dataframe to passing a list of values for columns and rows. Unlike pandas you need to specify the column names. # df = op.create.df( [ "names", "height(ft)", "function", "rank", "weight(t)", "japanese name", "last position",
# %load_ext autoreload # %autoreload 2 import sys sys.path.append("..") # ### Now you can get extra information for the profiler if you activate pass verbose= True to optimus # + {"scrolled": false} # Create optimus from optimus import Optimus op = Optimus( master="local[*]", app_name="optimus", checkpoint=True, queue_url= "amqp://*****:*****@chimpanzee.rmq.cloudamqp.com/eujwlcwg" ) # - df = op.load.csv("data/Meteorite_Landings.csv").h_repartition() # + {"scrolled": false} df.table(10) # - # ### Profiler dump mode (Faster). It just handle the column data type as present in the dataframe # + {"scrolled": false} op.profiler.run(df, "name", infer=False)
from pyspark.sql.types import * from optimus import Optimus from optimus.helpers.json import json_enconding from optimus.helpers.functions import deep_sort import unittest from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector import numpy as np nan = np.nan import datetime from pyspark.sql import functions as F from optimus.ml import keycollision as keyCol op = Optimus(master='local') source_df=op.create.df([('LOCNCODE', StringType(), True),('LOCNDSCR', StringType(), True),('ADDRESS1', StringType(), True),('ADDRESS2', StringType(), True),('ADDRESS3', StringType(), True),('CITY', StringType(), True),('STATE', StringType(), True),('ZIPCODE', StringType(), True),('COUNTRY', StringType(), True),('Location_Segment', StringType(), True),('PAQ', StringType(), True),('TIPUNI', StringType(), True),('Tipo_unidad', StringType(), True),('ITEMNMBR', StringType(), True),('ITMSHNAM', StringType(), True),('MZ', StringType(), True),('LT', StringType(), True),('EDIF', StringType(), True),('NIVEL', StringType(), True),('NOUNI', StringType(), True),('CONDO', StringType(), True),('REGIMEN', StringType(), True),('ETAPA', StringType(), True),('PROTO', StringType(), True),('ITEMDESC', StringType(), True),('NIVELES', StringType(), True),('COCHERA', StringType(), True),('RECAM', StringType(), True),('ALCOB', StringType(), True),('BANOS', StringType(), True),('Num_Balcon', StringType(), True),('SALA', StringType(), True),('COMEDOR', StringType(), True),('COCINA', StringType(), True),('Cuarto_Lavado', StringType(), True),('Cuarto_Servicio', StringType(), True),('OTROX', StringType(), True),('OTROX1', StringType(), True),('SupCons', StringType(), True),('PATIOSERV', StringType(), True),('TERRAZA', StringType(), True),('BALCON', StringType(), True),('AZOTEA', StringType(), True),('Otros', StringType(), True),('AREATOT', StringType(), True),('FRENTE', StringType(), True),('Sup_Terreno', StringType(), True),('EXCEDENTE', StringType(), True),('OTRO1', StringType(), True),('OTRO2', 
StringType(), True),('TAMANO', StringType(), True),('UBICAVER', StringType(), True),('UBICAHORI', StringType(), True),('QTYONHND_', StringType(), True),('QTYSOLD', StringType(), True),('INACTIVE', StringType(), True),('UOMPRICE', StringType(), True),('MONTOAPA', StringType(), True),('PAGINI', StringType(), True),('ENGANCHE', StringType(), True),('FECHESCRIPRO', StringType(), True),('FECHAENTREGA', StringType(), True),('FECHASALIDAVENTAS', StringType(), True),('LIBERADO_NOLIBERADO', StringType(), True),('ACTIVO_INACTIVO', StringType(), True),('Estatus1Vivienda', StringType(), True),('Estatus2Vivienda', StringType(), True),('CUSTNMBR', StringType(), True),('Nombre_Completo', StringType(), True),('cNombre', StringType(), True),('cApellidoPaterno', StringType(), True),('cApellidoMaterno', StringType(), True),('cRfc', StringType(), True),('cCurp', StringType(), True),('fkIdGradoInteres', StringType(), True),('cSexo', StringType(), True),('cEmail', StringType(), True),('cTelefonoCasa', StringType(), True),('cTelefonoCelular', StringType(), True),('cTelefonoTrabajo', StringType(), True),('cNumeroSeguroSocial', StringType(), True),('dFechaNacimiento', StringType(), True),('cEstadoCivil', StringType(), True),('cRegimenConyugal', StringType(), True),('cNacionalidad', StringType(), True),('cLugarNacimiento', StringType(), True),('cRecomendadoPor', StringType(), True),('fkIdMedio', StringType(), True),('cMedioContacto', StringType(), True),('cCalle', StringType(), True),('cNumeroExterior', StringType(), True),('cNumeroInterior', StringType(), True),('cColonia', StringType(), True),('cMunicipio', StringType(), True),('cEstado', StringType(), True),('cPais', StringType(), True),('cCodigoPostal', StringType(), True),('nTiempoResidencia', StringType(), True),('cComentario', StringType(), True),('cNumeroIdentificacion', StringType(), True),('cTipoIdentificación', StringType(), True),('REFERENCIA', StringType(), True),('FACTURA', StringType(), True),('NOTACR', StringType(), 
True),('Precio_cierre', StringType(), True),('Precio_cierre_Tot', StringType(), True),('Aumento_al_Contrato', StringType(), True),('Condonacón', StringType(), True),('Precio_Escritura_Total', StringType(), True),('Precio_Dev', StringType(), True),('Precio_Dev_Total', StringType(), True),('Notarios_Proyectados', StringType(), True),('Gatos_A_terceros', StringType(), True),('Depositos', StringType(), True),('Saldo', StringType(), True),('dFechaCreacion', StringType(), True),('dFechaModificacion', StringType(), True),('FECHA_Cotizado', StringType(), True),('FECHA_SolApartado', StringType(), True),('FECHA_AutApartado', StringType(), True),('Vigencia_Apartado', StringType(), True),('FechaVencimientoApartado', StringType(), True),('FECHA_SolDictamen', StringType(), True),('FECHA_ProcDictamen', StringType(), True),('FECHA_DictaminadoLlamada', StringType(), True),('FECHA_DictaminadoFirma', StringType(), True),('FECHA_Dictaminado', StringType(), True),('FECHA_Rechazado', StringType(), True),('FECHA_EscrituraAvaluo', StringType(), True),('FECHA_EscrituraFolio', StringType(), True),('FolioEscsritura', StringType(), True),('FECHA_EscrituraReal', StringType(), True),('FECHA_Cancelado', StringType(), True),('FECHA_Liberado', StringType(), True),('FECHA_Entregado', StringType(), True),('MotivoCancelacion', StringType(), True)], [('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV008', 'ALVCDEY0080', None, None, None, None, '008', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV021', 'ALVCDEY0690', None, None, None, None, '069', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV022', 'ALVCDEY0710', None, None, None, None, '071', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV027', 'ALVCDEY0810', None, None, None, None, '081', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV032', 'ALVCEEY0090', None, None, None, None, '009', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV035', 'ALVCEEY0150', None, None, None, None, '015', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV009', 'ALVCDEY0100', None, None, None, None, '010', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV012', 'ALVCDEY0160', None, None, None, None, '016', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV019', 'ALVCDEY0650', None, None, None, None, '065', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV044', 'ALVCUEY0340', None, None, None, None, '034', None, '0', '0', 'EST CUEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)]) class Test_df_keycollision(unittest.TestCase): maxDiff = None @staticmethod def test_fingerprint(): actual_df =keyCol.fingerprint(source_df,'STATE') expected_df = op.create.df([('LOCNCODE', StringType(), True),('LOCNDSCR', StringType(), True),('ADDRESS1', StringType(), True),('ADDRESS2', StringType(), True),('ADDRESS3', StringType(), True),('CITY', StringType(), True),('STATE', StringType(), True),('ZIPCODE', StringType(), True),('COUNTRY', StringType(), True),('Location_Segment', StringType(), True),('PAQ', StringType(), True),('TIPUNI', StringType(), True),('Tipo_unidad', StringType(), True),('ITEMNMBR', StringType(), True),('ITMSHNAM', StringType(), True),('MZ', StringType(), True),('LT', StringType(), True),('EDIF', StringType(), True),('NIVEL', StringType(), True),('NOUNI', StringType(), True),('CONDO', StringType(), True),('REGIMEN', StringType(), True),('ETAPA', StringType(), True),('PROTO', StringType(), True),('ITEMDESC', StringType(), True),('NIVELES', StringType(), True),('COCHERA', StringType(), True),('RECAM', 
StringType(), True),('ALCOB', StringType(), True),('BANOS', StringType(), True),('Num_Balcon', StringType(), True),('SALA', StringType(), True),('COMEDOR', StringType(), True),('COCINA', StringType(), True),('Cuarto_Lavado', StringType(), True),('Cuarto_Servicio', StringType(), True),('OTROX', StringType(), True),('OTROX1', StringType(), True),('SupCons', StringType(), True),('PATIOSERV', StringType(), True),('TERRAZA', StringType(), True),('BALCON', StringType(), True),('AZOTEA', StringType(), True),('Otros', StringType(), True),('AREATOT', StringType(), True),('FRENTE', StringType(), True),('Sup_Terreno', StringType(), True),('EXCEDENTE', StringType(), True),('OTRO1', StringType(), True),('OTRO2', StringType(), True),('TAMANO', StringType(), True),('UBICAVER', StringType(), True),('UBICAHORI', StringType(), True),('QTYONHND_', StringType(), True),('QTYSOLD', StringType(), True),('INACTIVE', StringType(), True),('UOMPRICE', StringType(), True),('MONTOAPA', StringType(), True),('PAGINI', StringType(), True),('ENGANCHE', StringType(), True),('FECHESCRIPRO', StringType(), True),('FECHAENTREGA', StringType(), True),('FECHASALIDAVENTAS', StringType(), True),('LIBERADO_NOLIBERADO', StringType(), True),('ACTIVO_INACTIVO', StringType(), True),('Estatus1Vivienda', StringType(), True),('Estatus2Vivienda', StringType(), True),('CUSTNMBR', StringType(), True),('Nombre_Completo', StringType(), True),('cNombre', StringType(), True),('cApellidoPaterno', StringType(), True),('cApellidoMaterno', StringType(), True),('cRfc', StringType(), True),('cCurp', StringType(), True),('fkIdGradoInteres', StringType(), True),('cSexo', StringType(), True),('cEmail', StringType(), True),('cTelefonoCasa', StringType(), True),('cTelefonoCelular', StringType(), True),('cTelefonoTrabajo', StringType(), True),('cNumeroSeguroSocial', StringType(), True),('dFechaNacimiento', StringType(), True),('cEstadoCivil', StringType(), True),('cRegimenConyugal', StringType(), True),('cNacionalidad', 
StringType(), True),('cLugarNacimiento', StringType(), True),('cRecomendadoPor', StringType(), True),('fkIdMedio', StringType(), True),('cMedioContacto', StringType(), True),('cCalle', StringType(), True),('cNumeroExterior', StringType(), True),('cNumeroInterior', StringType(), True),('cColonia', StringType(), True),('cMunicipio', StringType(), True),('cEstado', StringType(), True),('cPais', StringType(), True),('cCodigoPostal', StringType(), True),('nTiempoResidencia', StringType(), True),('cComentario', StringType(), True),('cNumeroIdentificacion', StringType(), True),('cTipoIdentificación', StringType(), True),('REFERENCIA', StringType(), True),('FACTURA', StringType(), True),('NOTACR', StringType(), True),('Precio_cierre', StringType(), True),('Precio_cierre_Tot', StringType(), True),('Aumento_al_Contrato', StringType(), True),('Condonacón', StringType(), True),('Precio_Escritura_Total', StringType(), True),('Precio_Dev', StringType(), True),('Precio_Dev_Total', StringType(), True),('Notarios_Proyectados', StringType(), True),('Gatos_A_terceros', StringType(), True),('Depositos', StringType(), True),('Saldo', StringType(), True),('dFechaCreacion', StringType(), True),('dFechaModificacion', StringType(), True),('FECHA_Cotizado', StringType(), True),('FECHA_SolApartado', StringType(), True),('FECHA_AutApartado', StringType(), True),('Vigencia_Apartado', StringType(), True),('FechaVencimientoApartado', StringType(), True),('FECHA_SolDictamen', StringType(), True),('FECHA_ProcDictamen', StringType(), True),('FECHA_DictaminadoLlamada', StringType(), True),('FECHA_DictaminadoFirma', StringType(), True),('FECHA_Dictaminado', StringType(), True),('FECHA_Rechazado', StringType(), True),('FECHA_EscrituraAvaluo', StringType(), True),('FECHA_EscrituraFolio', StringType(), True),('FolioEscsritura', StringType(), True),('FECHA_EscrituraReal', StringType(), True),('FECHA_Cancelado', StringType(), True),('FECHA_Liberado', StringType(), True),('FECHA_Entregado', StringType(), 
True),('MotivoCancelacion', StringType(), True),('STATE***FINGERPRINT', StringType(), True)], [('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV008', 'ALVCDEY0080', None, None, None, None, '008', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV021', 'ALVCDEY0690', None, None, None, None, '069', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV022', 'ALVCDEY0710', None, None, None, None, '071', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV027', 'ALVCDEY0810', None, None, None, None, '081', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV032', 'ALVCEEY0090', None, None, None, None, '009', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV035', 'ALVCEEY0150', None, None, None, None, '015', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV009', 'ALVCDEY0100', None, None, None, None, '010', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV012', 'ALVCDEY0160', None, None, None, None, '016', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV019', 'ALVCDEY0650', None, None, None, None, '065', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV044', 'ALVCUEY0340', None, None, None, None, '034', None, '0', '0', 'EST CUEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal')]) assert (expected_df.collect() == actual_df.collect()) @staticmethod def test_fingerprint_cluster(): actual_df =keyCol.fingerprint_cluster(source_df,'STATE') actual_df =json_enconding(actual_df) expected_value =json_enconding({'Distrito Federal': {'similar': {'Distrito Federal': 10}, 'count': 1, 'sum': 10}}) assert(expected_value == actual_df) @staticmethod def test_n_gram_fingerprint(): actual_df =keyCol.n_gram_fingerprint(source_df,'STATE') expected_df = op.create.df([('LOCNCODE', StringType(), True),('LOCNDSCR', StringType(), True),('ADDRESS1', StringType(), True),('ADDRESS2', StringType(), True),('ADDRESS3', StringType(), True),('CITY', StringType(), True),('STATE', StringType(), True),('ZIPCODE', StringType(), True),('COUNTRY', StringType(), True),('Location_Segment', StringType(), True),('PAQ', StringType(), True),('TIPUNI', StringType(), True),('Tipo_unidad', StringType(), True),('ITEMNMBR', StringType(), True),('ITMSHNAM', StringType(), True),('MZ', StringType(), True),('LT', StringType(), 
True),('EDIF', StringType(), True),('NIVEL', StringType(), True),('NOUNI', StringType(), True),('CONDO', StringType(), True),('REGIMEN', StringType(), True),('ETAPA', StringType(), True),('PROTO', StringType(), True),('ITEMDESC', StringType(), True),('NIVELES', StringType(), True),('COCHERA', StringType(), True),('RECAM', StringType(), True),('ALCOB', StringType(), True),('BANOS', StringType(), True),('Num_Balcon', StringType(), True),('SALA', StringType(), True),('COMEDOR', StringType(), True),('COCINA', StringType(), True),('Cuarto_Lavado', StringType(), True),('Cuarto_Servicio', StringType(), True),('OTROX', StringType(), True),('OTROX1', StringType(), True),('SupCons', StringType(), True),('PATIOSERV', StringType(), True),('TERRAZA', StringType(), True),('BALCON', StringType(), True),('AZOTEA', StringType(), True),('Otros', StringType(), True),('AREATOT', StringType(), True),('FRENTE', StringType(), True),('Sup_Terreno', StringType(), True),('EXCEDENTE', StringType(), True),('OTRO1', StringType(), True),('OTRO2', StringType(), True),('TAMANO', StringType(), True),('UBICAVER', StringType(), True),('UBICAHORI', StringType(), True),('QTYONHND_', StringType(), True),('QTYSOLD', StringType(), True),('INACTIVE', StringType(), True),('UOMPRICE', StringType(), True),('MONTOAPA', StringType(), True),('PAGINI', StringType(), True),('ENGANCHE', StringType(), True),('FECHESCRIPRO', StringType(), True),('FECHAENTREGA', StringType(), True),('FECHASALIDAVENTAS', StringType(), True),('LIBERADO_NOLIBERADO', StringType(), True),('ACTIVO_INACTIVO', StringType(), True),('Estatus1Vivienda', StringType(), True),('Estatus2Vivienda', StringType(), True),('CUSTNMBR', StringType(), True),('Nombre_Completo', StringType(), True),('cNombre', StringType(), True),('cApellidoPaterno', StringType(), True),('cApellidoMaterno', StringType(), True),('cRfc', StringType(), True),('cCurp', StringType(), True),('fkIdGradoInteres', StringType(), True),('cSexo', StringType(), True),('cEmail', 
StringType(), True),('cTelefonoCasa', StringType(), True),('cTelefonoCelular', StringType(), True),('cTelefonoTrabajo', StringType(), True),('cNumeroSeguroSocial', StringType(), True),('dFechaNacimiento', StringType(), True),('cEstadoCivil', StringType(), True),('cRegimenConyugal', StringType(), True),('cNacionalidad', StringType(), True),('cLugarNacimiento', StringType(), True),('cRecomendadoPor', StringType(), True),('fkIdMedio', StringType(), True),('cMedioContacto', StringType(), True),('cCalle', StringType(), True),('cNumeroExterior', StringType(), True),('cNumeroInterior', StringType(), True),('cColonia', StringType(), True),('cMunicipio', StringType(), True),('cEstado', StringType(), True),('cPais', StringType(), True),('cCodigoPostal', StringType(), True),('nTiempoResidencia', StringType(), True),('cComentario', StringType(), True),('cNumeroIdentificacion', StringType(), True),('cTipoIdentificación', StringType(), True),('REFERENCIA', StringType(), True),('FACTURA', StringType(), True),('NOTACR', StringType(), True),('Precio_cierre', StringType(), True),('Precio_cierre_Tot', StringType(), True),('Aumento_al_Contrato', StringType(), True),('Condonacón', StringType(), True),('Precio_Escritura_Total', StringType(), True),('Precio_Dev', StringType(), True),('Precio_Dev_Total', StringType(), True),('Notarios_Proyectados', StringType(), True),('Gatos_A_terceros', StringType(), True),('Depositos', StringType(), True),('Saldo', StringType(), True),('dFechaCreacion', StringType(), True),('dFechaModificacion', StringType(), True),('FECHA_Cotizado', StringType(), True),('FECHA_SolApartado', StringType(), True),('FECHA_AutApartado', StringType(), True),('Vigencia_Apartado', StringType(), True),('FechaVencimientoApartado', StringType(), True),('FECHA_SolDictamen', StringType(), True),('FECHA_ProcDictamen', StringType(), True),('FECHA_DictaminadoLlamada', StringType(), True),('FECHA_DictaminadoFirma', StringType(), True),('FECHA_Dictaminado', StringType(), 
True),('FECHA_Rechazado', StringType(), True),('FECHA_EscrituraAvaluo', StringType(), True),('FECHA_EscrituraFolio', StringType(), True),('FolioEscsritura', StringType(), True),('FECHA_EscrituraReal', StringType(), True),('FECHA_Cancelado', StringType(), True),('FECHA_Liberado', StringType(), True),('FECHA_Entregado', StringType(), True),('MotivoCancelacion', StringType(), True),('STATE***FINGERPRINT', StringType(), True)], [('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV008', 'ALVCDEY0080', None, None, None, None, '008', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV021', 'ALVCDEY0690', None, None, None, None, '069', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV022', 'ALVCDEY0710', None, None, None, None, '071', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV027', 'ALVCDEY0810', None, None, None, None, '081', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV032', 'ALVCEEY0090', None, None, None, None, '009', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV035', 'ALVCEEY0150', None, None, None, None, '015', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV009', 'ALVCDEY0100', None, None, None, None, '010', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV012', 'ALVCDEY0160', None, None, None, None, '016', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV019', 'ALVCDEY0650', None, None, None, None, '065', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV044', 'ALVCUEY0340', None, None, None, None, '034', None, '0', '0', 'EST CUEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'aldediederfeisitofraristtotr')]) assert (expected_df.collect() == actual_df.collect())
# # This notebook creates the tests in Python code. All these cells must be run to execute the tests # %load_ext autoreload # %autoreload 2 # + {"outputHidden": false, "inputHidden": false} import sys sys.path.append("../..") # - from optimus import Optimus from optimus.helpers.test import Test from optimus import Optimus op = Optimus("dask", n_workers=1, threads_per_worker=8, processes=False, memory_limit="3G", comm=True) import numpy as np import pandas as pd # + import pandas as pd from pyspark.sql.types import * from datetime import date, datetime cols = [ ("names", "str"), ("height(ft)", ShortType()), ("function", "str"), ("rank", ByteType()),
sys.path.append("..") # ## Install Optimus # # from command line: # # `pip install optimuspyspark` # # from a notebook you can use: # # `!pip install optimuspyspark` # ## Import optimus and start it from optimus import Optimus op= Optimus(master="local") # ## Dataframe creation # # Create a dataframe by passing a list of values for columns and rows. Unlike pandas, you need to specify the column names. # df = op.create.df( [ "names", "height(ft)", "function", "rank", "weight(t)", "japanese name", "last position",
# Script fragment: configure S3 access for the Spark context behind Optimus,
# then run the load -> transform -> predict -> store pipeline.
# NOTE(review): db_path, name_changer and the pipeline helpers used below are
# defined outside this fragment.

# Path of the model archive handed to predict_results, and the table name
# handed to send_to_db.
model_path = (
    "mlruns_artifacts/2/6f7291973ca345b3974e2346bbdb9261/artifacts/"
    "gbm_grid1_model_179.zip"
)
table_name = "predictions"
config_name = 'nrl_config.ini'

# Load the config file. ConfigParser.read() silently skips a file it cannot
# open, which would only surface later as a KeyError on the 'aws' section;
# opening the file explicitly and using read_file() fails right here instead.
config = configparser.ConfigParser()
with open(config_name) as config_file:
    config.read_file(config_file)

# Get the secret keys from the [aws] section of the config file.
access_key = config['aws']['aws_access_key_id']
secret_key = config['aws']['aws_secret_access_key']

# Initiate Spark: hand findspark the Spark installation path, then create
# the Optimus instance whose SparkContext is configured below.
findspark.init("/home/user/spark-2.4.3-bin-hadoop2.7")
op = Optimus()

# Register the s3n filesystem implementation and the credentials on the
# Hadoop configuration of the SparkContext (fetched once, set three times).
hadoop_conf = op.sc._jsc.hadoopConfiguration()
hadoop_conf.set(
    "fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoop_conf.set("fs.s3n.awsAccessKeyId", access_key)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", secret_key)

# Wrap name_changer as a UDF with a string return type.
name_changer_udf = udf(name_changer, StringType())

# Run the pipeline and print whatever send_to_db returns.
df = get_data_from_db(db_path)
pdf = transform_df(df)
predictions = predict_results(pdf, model_path)
print(send_to_db(predictions, table_name, db_path))
# Authors: Nahir Saddi and Guido Canevello
# Import the required libraries, connect to HDFS to retrieve the previously
# stored file, and run a select to look at the ISO column and search for
# inconsistent tuples.
# REPLACE THE ADDRESSES FOR EACH HADOOP ACCESS
import findspark
import pyspark
findspark.init()
from pyspark.sql import SparkSession
from operator import add
from optimus import Optimus
import matplotlib.pyplot as plt
import numpy

sparkSession = SparkSession.builder.appName("ejercicio-informe").getOrCreate()
op = Optimus(sparkSession)

# Both CSV reads below use the same reader options.
csv_options = {"header": True, "multiLine": True}

df = sparkSession.read.csv(
    'hdfs://*servidorHDFS*/*direccion del archivo*/fuente3.1.csv',
    **csv_options
)

# Pair each selected ISO row with 1 and add the ones up per key. The
# collected result is not assigned; presumably the notebook cell displayed it.
iso_pairs = df.cols.select(["ISO"]).rdd.map(lambda row: (row, 1))
iso_pairs.reduceByKey(add).collect()

# In[ ]:

# After looking at the query results, the inconsistent tuples were corrected
# by hand in the original .csv and the improved source was published to HDFS
# again through the Java program.
df = sparkSession.read.csv(
    'hdfs://*servidorHDFS*/*direccion del archivo*/fuente3.1_mejorada.csv',
    **csv_options
)

# Drop the columns "ID, PCODE, LINK".
df = df.drop("ID", "PCODE", "LINK")