def initOptimus(cls,  sparkURL=None):
    """Attach an Optimus instance to ``cls`` as ``cls.optimus``.

    A default ``Optimus()`` is always created first. When ``sparkURL`` is
    given, that default instance is stopped and replaced by one built on a
    Spark session whose master is ``sparkURL``.

    Parameters
    ----------
    cls
        Object that receives the ``optimus`` attribute.
    sparkURL : optional
        Spark master URL. ``None`` keeps the default instance.
    """
    banner = colored("Optimus Should Be initalized only once", 'blue')
    print(banner)

    cls.optimus = Optimus()
    if sparkURL is None:
        return

    # Swap the default instance for one bound to the requested master.
    cls.optimus.stop()
    builder = SparkSession.builder.master(sparkURL)
    session = builder.appName("DMXDeepInsightpy").getOrCreate()
    cls.optimus = Optimus(session, verbose=True)
def main():
    """Label a sample of words with Optimus and spread the labels with k-NN.

    Steps:

    1. Read ``data/words.csv`` and draw a random sample (at most 1000 rows).
    2. Run Optimus on the sample to obtain one label per sampled word.
    3. Embed the sampled words with the fastText model and fit a
       ``KNeighborsClassifier`` on (embedding, label) pairs.
    4. Predict labels for every word whose value is not in the sample.
    5. Write sampled and predicted rows to ``output.csv``.
    """
    # load in the data, sample from it and run optimus on the sample
    words = pd.read_csv('data/words.csv', header=None, names=('description',))['description']
    # Series.sample raises when asked for more rows than exist, so cap the
    # sample size at the size of the dataset.
    sample = words.sample(min(1000, len(words)))
    sampled_words = sample.tolist()

    optimus = Optimus(config_path='config.json')
    result = optimus(sample)

    df_sample = pd.DataFrame({'original': sample, 'label': result})

    # use a knn on vectors + assigned labels to apply these labels to the whole dataset
    model = ft.load_model('models/wiki.en.bin')
    embedSample = [model.get_word_vector(word) for word in sampled_words]

    # train the model on the result and the embedded sample vectors
    classifier = KNeighborsClassifier()
    trained = classifier.fit(embedSample, result)

    # Build the lookup once: membership tests against a set are O(1), whereas
    # the previous code rebuilt and scanned the sample list for every word.
    # Filtering is by value, so duplicates of a sampled word are left out too.
    sampled_lookup = set(sampled_words)
    nonsampled = [word for word in words.tolist() if word not in sampled_lookup]

    if nonsampled:
        outvectors = [model.get_word_vector(word) for word in nonsampled]
        predictions = trained.predict(outvectors)
    else:
        # Everything was sampled; there is nothing left to predict.
        predictions = []

    df_unsampled = pd.DataFrame({'original': nonsampled, 'label': predictions})

    df = pd.concat([df_sample, df_unsampled])
    df.to_csv('output.csv', index=False)
Exemplo n.º 3
0
    def __init__(self, engine=False):
        """Create the wrapped Optimus instance and wire the variable helpers onto it.

        Parameters
        ----------
        engine : optional
            Engine identifier handed to ``Optimus``. Any falsy value selects
            ``Engine.DASK.value``.
        """
        if not engine:
            # Only needed when falling back to the default engine.
            from optimus.optimus import Engine
            engine = Engine.DASK.value

        from optimus import Optimus
        self.op = Optimus(engine)

        # Expose this object's variable-management methods on the Optimus instance.
        for helper in ("set_var", "get_var", "del_var", "list_vars", "update_vars"):
            setattr(self.op, helper, getattr(self, helper))

        # Store the loader and creator under the names "_load" and "_create".
        for var_name, attr_name in (("_load", "load"), ("_create", "create")):
            self.set_var(var_name, getattr(self.op, attr_name))
Exemplo n.º 4
0
def setup():
    """
    Prepare everything the app needs before it starts.

    Builds the app, creates an Optimus object from ``config.json`` and the
    uploaded data file, and pickles that object to ``path + 'o'``.

    Returns
    -------
    tuple
        ``(app, layout, optimus instance, Optimus)`` where the last item is
        the ``Optimus`` name itself.

    """

    # Create the app
    dash_app = config_app()

    # Create the persistent Optimus object and store a pickled copy on disk
    data_file = path + 'uploaded_data.csv'
    optimus_obj = Optimus(config_path='config.json', data=data_file)

    pickle_target = path + 'o'
    with open(pickle_target, 'wb+') as handle:
        pickle.dump(optimus_obj, handle)

    layout = retrieve_layout()
    return dash_app, layout, optimus_obj, Optimus
Exemplo n.º 5
0
# -*- coding: utf-8 -*-
# + {}
from pyspark.sql.types import *
import datetime
import sys

from optimus import Optimus
# -

op = Optimus()

source_df = op.create.df([
    ('names', StringType(), True), ('height(ft)', ShortType(), True),
    ('function', StringType(), True), ('rank', ByteType(), True),
    ('age', IntegerType(), True), ('weight(t)', FloatType(), True),
    ('japanese name', ArrayType(StringType(), True), True),
    ('last position seen', StringType(), True),
    ('date arrival', StringType(), True),
    ('last date seen', StringType(), True),
    ('attributes', ArrayType(FloatType(), True), True),
    ('DateType', DateType(), True), ('Tiemstamp', TimestampType(), True),
    ('Cybertronian', BooleanType(), True),
    ('function(binary)', BinaryType(), True), ('NullType', NullType(), True)
], [("Optim'us", 28, 'Leader', 10, 5000000, 4.300000190734863,
     ['Inochi', 'Convoy'], '19.442735,-99.201111', '1980/04/10', '2016/09/10',
     [8.53439998626709, 4300.0], datetime.date(2016, 9, 10),
     datetime.datetime(2014, 6, 24, 0, 0), True, bytearray(b'Leader'), None),
    ('bumbl#ebéé  ', 17, 'Espionage', 7, 5000000, 2.0, ['Bumble', 'Goldback'],
     '10.642707,-71.612534', '1980/04/10', '2015/08/10', [
         5.334000110626221, 2000.0
     ], datetime.date(2015, 8, 10), datetime.datetime(
Exemplo n.º 6
0
        f.close()
        logger.debug('start writing to database')
        #In Case of JSON report
        #Helper function call to store data to DB
        #write_to_Db(pathtosave)
    else:
        logger.debug(
            "format can be either json of html!! specify the right format")


if __name__ == "__main__":
    # Build the Spark session with the PostgreSQL jar listed under spark.jars.
    spark = (
        SparkSession.builder
        .config("spark.jars", "/home/vk/Downloads/postgresql-42.2.6.jar")
        .appName("DP")
        .getOrCreate()
    )
    # Restrict Spark logging to errors (noise reduction).
    spark.sparkContext.setLogLevel("ERROR")

    # Wrap the session in Optimus.
    op = Optimus(spark)

    # Path of the config file where the user specifies the report to generate.
    configfilepath = "/home/vk/PycharmProjects/Reporting/config.json"
    # Load the multi-line JSON config into a DataFrame for processing.
    configDF = spark.read.json(configfilepath, multiLine=True)

    # Hand over to the report helper.
    logger.debug("Generating Report")
    generate_report(op, configDF)
    logger.debug("REPORT GENERATED")
Exemplo n.º 7
0
#     version: 3.6.5
#   nteract:
#     version: 0.11.6
# ---

# %load_ext autoreload
# %autoreload 2

import sys
sys.path.append("..")  # add the parent directory to the import search path

# ### You can get extra information from the profiler by passing verbose=True to Optimus

# Create the Optimus instance
from optimus import Optimus
op = Optimus(master="local[*]", app_name="optimus", checkpoint=True)

df = op.load.csv("data/Meteorite_Landings.csv").h_repartition()

df.table(10)

# ### Profiler dump mode (faster). It just handles each column's data type as it is present in the dataframe

op.profiler.run(df, "name", infer=False)

# ### Profiler smart mode (slower). It tries to infer each column's data type and presents extra data accordingly. For example, date-type columns get extra histograms for minutes, day, week and month. It can also detect array types in the data.

op.profiler.run(df, "*", infer=True)

# ### Plot profile for a specific column
Exemplo n.º 8
0
 def test_optimus_from_session():
     """Build Optimus on top of an existing Spark session and display a CSV-backed DataFrame."""
     from pyspark.sql import SparkSession

     session_builder = SparkSession.builder.appName('abc')
     session = session_builder.getOrCreate()
     frame = session.read.csv('examples/data/foo.csv', header=True)
     # The instance is not used afterwards; presumably constructing it from
     # the session is what this test exercises.
     _optimus = Optimus(session)
     frame.display()
Exemplo n.º 9
0
    "potatoes",
    "cleaning tools",
    "building materials",
    "frozen meat",
    "argon",
    "helium cylinders",
    "household products",
    "construction materials",
    "confectionary",
    "stome and paving",
    "tractors x 3",
    "animal products",
    "composite materials",
    "Plasterboard",
    "cardboard",
])

#### 2. From file
#### Read 'words.csv' from the data folder and keep its first column as a pandas Series

text_1 = pd.read_csv("./data/words.csv", index_col=False, header=0)  # row 0 is the header
text_1 = pd.Series(text_1.iloc[:, 0])  # first column only

#### Initialise the Optimus object and run the model

o = Optimus(config_path="config.json", cutoff=6, stepsize=1)

results = o(text_1, save_csv=True, full=True, verbose=True, runKNN=False)

# NOTE(review): the sorted result is not assigned; presumably it is meant to
# be shown as notebook cell output.
results.sort_values("current_labels")
Exemplo n.º 10
0
# ---

# # This notebook creates the tests as Python code. All of these cells must be run to execute the tests

# %load_ext autoreload
# %autoreload 2

# + {"outputHidden": false, "inputHidden": false}
import sys
sys.path.append("../..")
# -

from optimus import Optimus
from optimus.helpers.test import Test

op = Optimus(master='local', verbose=True)

# +
import pandas as pd
from pyspark.sql.types import *
from datetime import date, datetime

# Column specification. Each entry is either a (name, type) pair, where the
# type is a short string alias or a pyspark.sql.types instance, or a bare
# column name with no explicit type.
# NOTE(review): how bare names are typed is decided by whatever consumes
# `cols`, which is outside this view.
cols = [("names", "str"), ("height(ft)", ShortType()), ("function", "str"),
        ("rank", ByteType()), ("age", "int"), ("weight(t)", "float"),
        "japanese name", "last position seen", "date arrival",
        "last date seen", ("attributes", ArrayType(FloatType())),
        ("Date Type", DateType()), ("timestamp", TimestampType()),
        ("Cybertronian", BooleanType()), ("function(binary)", BinaryType()),
        ("NullType", NullType())]

rows = [
print(text_1)

# Inline example input: free-text item descriptions, including misspellings,
# quantities and near-duplicates.
text = pd.Series([
    'chocolate', 'chocolate biscuits', 'frozen pizza', 'frozn pizza',
    'cleaning products', 'household goods', 'bicycle parts', 'mortorbike',
    'cleaning brush', '3 x cars', '4 x cars', 'Vauxhaul Astra', 'Toyota Yaris',
    'Yaris', 'mondeo', 'Compressed gas', 'Helium tanks', 'frozen foods',
    'foodstuff', 'groupage', 'bread and pasta', 'bread', 'milk', 'powder',
    'baby food', 'board', ' oxygen cylinders', 'methane', 'argon',
    'fertilizers', 'steel gates', 'metal', 'scrap metal', 'steel posts',
    'fences and steel Products', 'tomatoes', 'potatoes', 'cleaning tools',
    'building materials', 'frozen meat', 'argon', 'helium cylinders',
    'household products', 'construction materials', 'confectionary',
    'stome and paving', 'tractors x 3', 'animal products',
    'composite materials', 'Plasterboard', 'cardboard'
])

#### 2. From file
#### Read 'words.csv' from the data folder and keep its first column as a pandas Series

text_1 = pd.read_csv("./data/words.csv", index_col=False, header=0)  # row 0 is the header
text_1 = pd.Series(text_1.iloc[:, 0])  # first column only

#### Initialise the Optimus object and run the model

o = Optimus(config_path='config.json', cutoff=6, stepsize=1)

results = o(text_1, save_csv=True, full=True, verbose=True, runKNN=False)

# NOTE(review): the sorted result is not assigned; presumably it is meant to
# be shown as notebook cell output.
results.sort_values("current_labels")
Exemplo n.º 12
0
from optimus import Optimus
import pyspark
from pyspark.ml.image import ImageSchema
from pyspark.sql.functions import lit
from sparkdl.image import imageIO
from pyspark import Row
from optimus.dl.models import DL

op = Optimus(dl=True)

# rdd = op.sc.parallelize([Row(predicted_labels=['daisy', '0.8918145298957825']),
#                         Row(predicted_labels=['picket_fence', '0.14247830212116241']),
#                         Row(predicted_labels=['daisy', '0.9532104134559631'])])

# df_row = spark.createDataFrame(rdd)


def assert_spark_df(df):
    """Assert that ``df`` is a Spark DataFrame, failing with "Not a Spark DF" otherwise."""
    expected_type = pyspark.sql.dataframe.DataFrame
    assert isinstance(df, expected_type), "Not a Spark DF"


def assert_spark_model(model):
    """Assert that ``model`` is a Spark ML ``PipelineModel``, failing with "Not a model" otherwise."""
    expected_type = pyspark.ml.PipelineModel
    assert isinstance(model, expected_type), "Not a model"


# Tulip images are read with pyspark's ImageSchema reader and given label 1.
tulips_df = ImageSchema.readImages("tests/testtulips/").withColumn(
    "label", lit(1))
# Daisy images are read through sparkdl's imageIO using its PIL decoder and
# given label 0.
daisy_df = imageIO.readImagesWithCustomFn(
    "tests/testdaisy/",
    decode_f=imageIO.PIL_decode).withColumn("label", lit(0))
Exemplo n.º 13
0
# ## Install Optimus 
#
# from command line:
#
# `pip install optimuspyspark`
#
# from a notebook you can use:
#
# `!pip install optimuspyspark`

# ## Import Optimus and start it

from optimus import Optimus

op = Optimus(master="local")

# ## Dataframe creation
#
# Create a dataframe by passing a list of values for columns and rows. Unlike pandas, you need to specify the column names.
#

df = op.create.df(
    [
        "names",
        "height(ft)",
        "function",
        "rank",
        "weight(t)",
        "japanese name",
        "last position",
Exemplo n.º 14
0
# %load_ext autoreload
# %autoreload 2

import sys
sys.path.append("..")  # add the parent directory to the import search path

# ### You can get extra information from the profiler by passing verbose=True to Optimus

# + {"scrolled": false}
# Create the Optimus instance
from optimus import Optimus
op = Optimus(
    master="local[*]",
    app_name="optimus",
    checkpoint=True,
    # NOTE(review): the queue URL, including its (masked) credentials, is
    # hard-coded here; consider loading it from configuration instead.
    queue_url=
    "amqp://*****:*****@chimpanzee.rmq.cloudamqp.com/eujwlcwg"
)
# -

df = op.load.csv("data/Meteorite_Landings.csv").h_repartition()

# + {"scrolled": false}
df.table(10)
# -

# ### Profiler dump mode (faster). It just handles each column's data type as it is present in the dataframe

# + {"scrolled": false}
op.profiler.run(df, "name", infer=False)
Exemplo n.º 15
0
from pyspark.sql.types import *
from optimus import Optimus
from optimus.helpers.json import json_enconding
from optimus.helpers.functions import deep_sort
import unittest
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
import numpy as np
nan = np.nan
import datetime
from pyspark.sql import functions as F
from optimus.ml import keycollision as keyCol
op = Optimus(master='local')
source_df=op.create.df([('LOCNCODE', StringType(), True),('LOCNDSCR', StringType(), True),('ADDRESS1', StringType(), True),('ADDRESS2', StringType(), True),('ADDRESS3', StringType(), True),('CITY', StringType(), True),('STATE', StringType(), True),('ZIPCODE', StringType(), True),('COUNTRY', StringType(), True),('Location_Segment', StringType(), True),('PAQ', StringType(), True),('TIPUNI', StringType(), True),('Tipo_unidad', StringType(), True),('ITEMNMBR', StringType(), True),('ITMSHNAM', StringType(), True),('MZ', StringType(), True),('LT', StringType(), True),('EDIF', StringType(), True),('NIVEL', StringType(), True),('NOUNI', StringType(), True),('CONDO', StringType(), True),('REGIMEN', StringType(), True),('ETAPA', StringType(), True),('PROTO', StringType(), True),('ITEMDESC', StringType(), True),('NIVELES', StringType(), True),('COCHERA', StringType(), True),('RECAM', StringType(), True),('ALCOB', StringType(), True),('BANOS', StringType(), True),('Num_Balcon', StringType(), True),('SALA', StringType(), True),('COMEDOR', StringType(), True),('COCINA', StringType(), True),('Cuarto_Lavado', StringType(), True),('Cuarto_Servicio', StringType(), True),('OTROX', StringType(), True),('OTROX1', StringType(), True),('SupCons', StringType(), True),('PATIOSERV', StringType(), True),('TERRAZA', StringType(), True),('BALCON', StringType(), True),('AZOTEA', StringType(), True),('Otros', StringType(), True),('AREATOT', StringType(), True),('FRENTE', StringType(), True),('Sup_Terreno', StringType(), True),('EXCEDENTE', StringType(), True),('OTRO1', StringType(), True),('OTRO2', StringType(), True),('TAMANO', StringType(), True),('UBICAVER', StringType(), True),('UBICAHORI', StringType(), True),('QTYONHND_', StringType(), True),('QTYSOLD', StringType(), True),('INACTIVE', StringType(), True),('UOMPRICE', StringType(), True),('MONTOAPA', StringType(), True),('PAGINI', StringType(), True),('ENGANCHE', StringType(), True),('FECHESCRIPRO', StringType(), True),('FECHAENTREGA', 
StringType(), True),('FECHASALIDAVENTAS', StringType(), True),('LIBERADO_NOLIBERADO', StringType(), True),('ACTIVO_INACTIVO', StringType(), True),('Estatus1Vivienda', StringType(), True),('Estatus2Vivienda', StringType(), True),('CUSTNMBR', StringType(), True),('Nombre_Completo', StringType(), True),('cNombre', StringType(), True),('cApellidoPaterno', StringType(), True),('cApellidoMaterno', StringType(), True),('cRfc', StringType(), True),('cCurp', StringType(), True),('fkIdGradoInteres', StringType(), True),('cSexo', StringType(), True),('cEmail', StringType(), True),('cTelefonoCasa', StringType(), True),('cTelefonoCelular', StringType(), True),('cTelefonoTrabajo', StringType(), True),('cNumeroSeguroSocial', StringType(), True),('dFechaNacimiento', StringType(), True),('cEstadoCivil', StringType(), True),('cRegimenConyugal', StringType(), True),('cNacionalidad', StringType(), True),('cLugarNacimiento', StringType(), True),('cRecomendadoPor', StringType(), True),('fkIdMedio', StringType(), True),('cMedioContacto', StringType(), True),('cCalle', StringType(), True),('cNumeroExterior', StringType(), True),('cNumeroInterior', StringType(), True),('cColonia', StringType(), True),('cMunicipio', StringType(), True),('cEstado', StringType(), True),('cPais', StringType(), True),('cCodigoPostal', StringType(), True),('nTiempoResidencia', StringType(), True),('cComentario', StringType(), True),('cNumeroIdentificacion', StringType(), True),('cTipoIdentificación', StringType(), True),('REFERENCIA', StringType(), True),('FACTURA', StringType(), True),('NOTACR', StringType(), True),('Precio_cierre', StringType(), True),('Precio_cierre_Tot', StringType(), True),('Aumento_al_Contrato', StringType(), True),('Condonacón', StringType(), True),('Precio_Escritura_Total', StringType(), True),('Precio_Dev', StringType(), True),('Precio_Dev_Total', StringType(), True),('Notarios_Proyectados', StringType(), True),('Gatos_A_terceros', StringType(), True),('Depositos', StringType(), 
True),('Saldo', StringType(), True),('dFechaCreacion', StringType(), True),('dFechaModificacion', StringType(), True),('FECHA_Cotizado', StringType(), True),('FECHA_SolApartado', StringType(), True),('FECHA_AutApartado', StringType(), True),('Vigencia_Apartado', StringType(), True),('FechaVencimientoApartado', StringType(), True),('FECHA_SolDictamen', StringType(), True),('FECHA_ProcDictamen', StringType(), True),('FECHA_DictaminadoLlamada', StringType(), True),('FECHA_DictaminadoFirma', StringType(), True),('FECHA_Dictaminado', StringType(), True),('FECHA_Rechazado', StringType(), True),('FECHA_EscrituraAvaluo', StringType(), True),('FECHA_EscrituraFolio', StringType(), True),('FolioEscsritura', StringType(), True),('FECHA_EscrituraReal', StringType(), True),('FECHA_Cancelado', StringType(), True),('FECHA_Liberado', StringType(), True),('FECHA_Entregado', StringType(), True),('MotivoCancelacion', StringType(), True)], [('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV008', 'ALVCDEY0080', None, None, None, None, '008', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV021', 'ALVCDEY0690', None, None, None, None, '069', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV022', 'ALVCDEY0710', None, None, None, None, '071', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV027', 'ALVCDEY0810', None, None, None, None, '081', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV032', 'ALVCEEY0090', None, None, None, None, '009', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV035', 'ALVCEEY0150', None, None, None, None, '015', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV009', 'ALVCDEY0100', None, None, None, None, '010', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV012', 'ALVCDEY0160', None, None, None, None, '016', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV019', 'ALVCDEY0650', None, None, None, None, '065', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. 
Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV044', 'ALVCUEY0340', None, None, None, None, '034', None, '0', '0', 'EST CUEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)])
class Test_df_keycollision(unittest.TestCase):
	"""Snapshot tests for the key-collision helpers applied to the 'STATE' column.

	``source_df``, ``op``, ``keyCol``, ``json_enconding`` and ``StringType`` are
	not defined in this class; they are expected to exist at module level.

	The expected dataframes of ``test_fingerprint`` and
	``test_n_gram_fingerprint`` share one schema (every column is a nullable
	``StringType``) and the same ten rows. Only the value of the appended
	``STATE***FINGERPRINT`` column differs, so the fixture is declared once
	below and assembled by ``_expected_df``.
	"""
	maxDiff = None

	# Names of the 136 source columns, in schema order.
	_SOURCE_COLUMNS = (
		'LOCNCODE', 'LOCNDSCR', 'ADDRESS1', 'ADDRESS2', 'ADDRESS3', 'CITY', 'STATE', 'ZIPCODE',
		'COUNTRY', 'Location_Segment', 'PAQ', 'TIPUNI', 'Tipo_unidad', 'ITEMNMBR', 'ITMSHNAM',
		'MZ', 'LT', 'EDIF', 'NIVEL', 'NOUNI', 'CONDO', 'REGIMEN', 'ETAPA', 'PROTO', 'ITEMDESC',
		'NIVELES', 'COCHERA', 'RECAM', 'ALCOB', 'BANOS', 'Num_Balcon', 'SALA', 'COMEDOR', 'COCINA',
		'Cuarto_Lavado', 'Cuarto_Servicio', 'OTROX', 'OTROX1', 'SupCons', 'PATIOSERV', 'TERRAZA',
		'BALCON', 'AZOTEA', 'Otros', 'AREATOT', 'FRENTE', 'Sup_Terreno', 'EXCEDENTE', 'OTRO1',
		'OTRO2', 'TAMANO', 'UBICAVER', 'UBICAHORI', 'QTYONHND_', 'QTYSOLD', 'INACTIVE', 'UOMPRICE',
		'MONTOAPA', 'PAGINI', 'ENGANCHE', 'FECHESCRIPRO', 'FECHAENTREGA', 'FECHASALIDAVENTAS',
		'LIBERADO_NOLIBERADO', 'ACTIVO_INACTIVO', 'Estatus1Vivienda', 'Estatus2Vivienda',
		'CUSTNMBR', 'Nombre_Completo', 'cNombre', 'cApellidoPaterno', 'cApellidoMaterno', 'cRfc',
		'cCurp', 'fkIdGradoInteres', 'cSexo', 'cEmail', 'cTelefonoCasa', 'cTelefonoCelular',
		'cTelefonoTrabajo', 'cNumeroSeguroSocial', 'dFechaNacimiento', 'cEstadoCivil',
		'cRegimenConyugal', 'cNacionalidad', 'cLugarNacimiento', 'cRecomendadoPor', 'fkIdMedio',
		'cMedioContacto', 'cCalle', 'cNumeroExterior', 'cNumeroInterior', 'cColonia', 'cMunicipio',
		'cEstado', 'cPais', 'cCodigoPostal', 'nTiempoResidencia', 'cComentario',
		'cNumeroIdentificacion', 'cTipoIdentificación', 'REFERENCIA', 'FACTURA', 'NOTACR',
		'Precio_cierre', 'Precio_cierre_Tot', 'Aumento_al_Contrato', 'Condonacón',
		'Precio_Escritura_Total', 'Precio_Dev', 'Precio_Dev_Total', 'Notarios_Proyectados',
		'Gatos_A_terceros', 'Depositos', 'Saldo', 'dFechaCreacion', 'dFechaModificacion',
		'FECHA_Cotizado', 'FECHA_SolApartado', 'FECHA_AutApartado', 'Vigencia_Apartado',
		'FechaVencimientoApartado', 'FECHA_SolDictamen', 'FECHA_ProcDictamen',
		'FECHA_DictaminadoLlamada', 'FECHA_DictaminadoFirma', 'FECHA_Dictaminado',
		'FECHA_Rechazado', 'FECHA_EscrituraAvaluo', 'FECHA_EscrituraFolio', 'FolioEscsritura',
		'FECHA_EscrituraReal', 'FECHA_Cancelado', 'FECHA_Liberado', 'FECHA_Entregado',
		'MotivoCancelacion',
	)

	# Name of the column the fingerprint functions append after the source columns.
	_FINGERPRINT_COLUMN = 'STATE***FINGERPRINT'

	# Non-null values that are identical in all ten rows, keyed by column name.
	# Every source column not listed here (and not set per row) is None.
	_SHARED_VALUES = {
		'LOCNCODE': 'ALV',
		'LOCNDSCR': 'Altos Lindavista',
		'ADDRESS1': 'Guanajuato # 85',
		'ADDRESS3': 'San Bartolo Atepehuacan',
		# Kept on one physical line: a quoted string literal cannot span a line break.
		'CITY': 'Gustavo A. Madero',
		'STATE': 'Distrito Federal',
		'ZIPCODE': '07730',
		'COUNTRY': 'Mexico',
		'Location_Segment': '0531',
		'TIPUNI': '2',
		'Tipo_unidad': 'ESTACIONAMIENTO',
		'REGIMEN': '0',
		'ETAPA': '0',
		'ITEMDESC': 'Cajon virtual',
		'SupCons': '2.2',
		'FRENTE': '2.4',
		'EXCEDENTE': '0',
		'TAMANO': 'Chico',
		'UBICAVER': 'Cajon virtual',
		'UBICAHORI': 'Cajon virtual',
		'QTYONHND_': '0',
		'QTYSOLD': '0',
		'INACTIVE': '1',
		'UOMPRICE': '0',
		'LIBERADO_NOLIBERADO': 'NO LIBERADO',
		'ACTIVO_INACTIVO': 'INACTIVO',
		'Estatus1Vivienda': 'DISPONIBLE',
		'Estatus2Vivienda': '000-DISPONIBLE',
		'Precio_cierre': '.00000',
		'Precio_cierre_Tot': '.00000',
		'Aumento_al_Contrato': '.00000',
		'Condonacón': '.00000',
		'Precio_Escritura_Total': '.00000',
		'Precio_Dev': '.00000',
		'Precio_Dev_Total': '.00000',
		'Notarios_Proyectados': '.00000',
		'Gatos_A_terceros': '.00000',
		'Depositos': '.00000',
		'Saldo': '.00000',
	}

	# The four values that differ between rows, in row order:
	# (ITEMNMBR, ITMSHNAM, NOUNI, PROTO).
	_UNITS = (
		('ALVV008', 'ALVCDEY0080', '008', 'EST CDEY'),
		('ALVV021', 'ALVCDEY0690', '069', 'EST CDEY'),
		('ALVV022', 'ALVCDEY0710', '071', 'EST CDEY'),
		('ALVV027', 'ALVCDEY0810', '081', 'EST CDEY'),
		('ALVV032', 'ALVCEEY0090', '009', 'EST CEEY'),
		('ALVV035', 'ALVCEEY0150', '015', 'EST CEEY'),
		('ALVV009', 'ALVCDEY0100', '010', 'EST CDEY'),
		('ALVV012', 'ALVCDEY0160', '016', 'EST CDEY'),
		('ALVV019', 'ALVCDEY0650', '065', 'EST CDEY'),
		('ALVV044', 'ALVCUEY0340', '034', 'EST CUEY'),
	)

	@classmethod
	def _expected_rows(cls, fingerprint):
		"""Return the ten expected rows as tuples, each ending with ``fingerprint``.

		:param fingerprint: value placed in the appended fingerprint column of every row.
		:return: list of 137-item tuples ordered like ``_SOURCE_COLUMNS`` plus the fingerprint.
		:raises KeyError: if a fixture value names a column that is not in ``_SOURCE_COLUMNS``.
		"""
		columns = cls._SOURCE_COLUMNS
		rows = []
		for itemnmbr, itmshnam, nouni, proto in cls._UNITS:
			# Start with every column set to None, then overlay the known values.
			values = dict.fromkeys(columns)
			values.update(cls._SHARED_VALUES)
			values.update(ITEMNMBR=itemnmbr, ITMSHNAM=itmshnam, NOUNI=nouni, PROTO=proto)
			# A misspelled key would add an extra entry instead of filling a column.
			if len(values) != len(columns):
				raise KeyError('fixture values reference a column missing from _SOURCE_COLUMNS')
			rows.append(tuple(values[name] for name in columns) + (fingerprint,))
		return rows

	@classmethod
	def _expected_df(cls, fingerprint):
		"""Build the expected dataframe: source columns plus the fingerprint column.

		:param fingerprint: expected value of the fingerprint column in every row.
		:return: whatever ``op.create.df`` returns for the assembled schema and rows.
		"""
		names = cls._SOURCE_COLUMNS + (cls._FINGERPRINT_COLUMN,)
		schema = [(name, StringType(), True) for name in names]
		return op.create.df(schema, cls._expected_rows(fingerprint))

	@staticmethod
	def test_fingerprint():
		"""``keyCol.fingerprint`` on 'STATE' must append 'distritofederal' to every row."""
		actual_df = keyCol.fingerprint(source_df, 'STATE')
		expected_df = Test_df_keycollision._expected_df('distritofederal')
		assert (expected_df.collect() == actual_df.collect())

	@staticmethod
	def test_fingerprint_cluster():
		"""``keyCol.fingerprint_cluster`` on 'STATE' must report one cluster of ten rows."""
		actual_df = keyCol.fingerprint_cluster(source_df, 'STATE')
		actual_df = json_enconding(actual_df)
		expected_value = json_enconding({'Distrito Federal': {'similar': {'Distrito Federal': 10}, 'count': 1, 'sum': 10}})
		assert (expected_value == actual_df)

	@staticmethod
	def test_n_gram_fingerprint():
		"""``keyCol.n_gram_fingerprint`` on 'STATE' must append the expected n-gram key to every row."""
		actual_df = keyCol.n_gram_fingerprint(source_df, 'STATE')
		expected_df = Test_df_keycollision._expected_df('aldediederfeisitofraristtotr')
		assert (expected_df.collect() == actual_df.collect())
Exemplo n.º 16
0
# # This notebook creates the tests in Python code. All of these cells must be run to execute the tests.

# %load_ext autoreload
# %autoreload 2

# + {"outputHidden": false, "inputHidden": false}
import sys
sys.path.append("../..")
# -

from optimus import Optimus
from optimus.helpers.test import Test

# Start Optimus with the "dask" engine.
# NOTE(review): the keyword arguments look like Dask cluster settings (one
# worker, eight threads per worker, no separate processes, 3G memory limit) —
# confirm against the Optimus constructor.
op = Optimus(
    "dask",
    n_workers=1,
    threads_per_worker=8,
    processes=False,
    memory_limit="3G",
    comm=True,
)

import numpy as np
import pandas as pd


# +
import pandas as pd
from pyspark.sql.types import *
from datetime import date, datetime

cols = [
    ("names", "str"),
    ("height(ft)", ShortType()),
    ("function", "str"),
    ("rank", ByteType()),
Exemplo n.º 17
0
sys.path.append("..")

# ## Install Optimus 
#
# from command line:
#
# `pip install optimuspyspark`
#
# from a notebook you can use:
#
# `!pip install optimuspyspark`

# ## Import optimus and start it

from optimus import Optimus

# Create the Optimus entry point, passing "local" as the master.
op = Optimus(master="local")

# ## Dataframe creation
#
# Create a dataframe by passing a list of values for columns and rows. Unlike pandas, you need to specify the column names.
#

df = op.create.df(
    [
        "names",
        "height(ft)",
        "function",
        "rank",
        "weight(t)",
        "japanese name",
        "last position",
        model_path = "mlruns_artifacts/2/6f7291973ca345b3974e2346bbdb9261/artifacts/" \
                     "gbm_grid1_model_179.zip"
        table_name = "predictions"
        config_name = 'nrl_config.ini'

    # load in config file
    config = configparser.ConfigParser()
    config.read(config_name)

    # get secret keys from config file
    access_key = config['aws']['aws_access_key_id']
    secret_key = config['aws']['aws_secret_access_key']

    # Initiate spark
    findspark.init("/home/user/spark-2.4.3-bin-hadoop2.7")
    op = Optimus()
    op.sc._jsc.hadoopConfiguration().set(
        "fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
    op.sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", access_key)
    op.sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey",
                                         secret_key)

    name_changer_udf = udf(name_changer, StringType())

    df = get_data_from_db(db_path)

    pdf = transform_df(df)

    predictions = predict_results(pdf, model_path)

    print(send_to_db(predictions, table_name, db_path))
Exemplo n.º 19
0
# Authors: Nahir Saddi and Guido Canevello

# Import the required libraries, connect to HDFS to retrieve the previously stored file,
# and run a Select operation to inspect the ISO column and look for inconsistent tuples.
# REPLACE THE ADDRESSES FOR EACH HADOOP ACCESS
import findspark
# findspark.init() has to run before pyspark is imported: it is the call that
# makes the pyspark package importable when Spark is only reachable through
# SPARK_HOME rather than installed as a regular package.
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from operator import add
from optimus import Optimus
import matplotlib.pyplot as plt
import numpy

sparkSession = SparkSession.builder.appName("ejercicio-informe").getOrCreate()
# NOTE(review): `op` is not used below; presumably initialising Optimus is what
# provides the `.cols` accessor on Spark dataframes — confirm.
op = Optimus(sparkSession)
df = sparkSession.read.csv('hdfs://*servidorHDFS*/*direccion del archivo*/fuente3.1.csv', header=True, multiLine=True)

# Pair every row of the ISO column with 1 and sum by key, i.e. count how many
# times each distinct ISO row occurs, then bring the counts back to the driver.
df.cols.select(["ISO"]).rdd.map(lambda x: (x,1)).reduceByKey(add).collect()


# In[ ]:


# After reviewing the query results, the inconsistent tuples were corrected by hand
# in the original .csv, and the improved source was published to HDFS again
# using the Java program.

df = sparkSession.read.csv(
    'hdfs://*servidorHDFS*/*direccion del archivo*/fuente3.1_mejorada.csv',
    header=True,
    multiLine=True,
)

# Drop the "ID", "PCODE" and "LINK" columns.
df = df.drop("ID", "PCODE", "LINK")