Example #1
def main(spark, data_file, model_file, user_file, track_file):
    df = spark.read.parquet(data_file)

    user_indexer = StringIndexer(inputCol="user_id",
                                 outputCol="user_idx",
                                 handleInvalid="keep")
    track_indexer = StringIndexer(inputCol="track_id",
                                  outputCol="track_idx",
                                  handleInvalid="keep")

    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    mapping = pipeline.fit(df)
    df = mapping.transform(df)

    #create + fit an ALS model
    als = ALS(maxIter=5,
              regParam=0.01,
              implicitPrefs=True,
              ratingCol="count",
              userCol="user_idx",
              itemCol="track_idx")
    als_model = als.fit(df)

    #save trained ALS model
    als_model.write().overwrite().save(model_file)
    print("Model sucessfully saved to HFS")

    #save string indexers
    user_indexer.write().overwrite().save(user_file)
    track_indexer.write().overwrite().save(track_file)
    print("String Indexers sucessfully saved to HFS")
Example #2
def encode_strings(df, cols, fname) -> pyspark.sql.DataFrame:
    """ """
    for col in cols:
        indexer = StringIndexer(inputCol=f"{col}",
                                outputCol=f"encoded_{col}",
                                stringOrderType="alphabetAsc")
        model = indexer.fit(df)
        df = model.transform(df)

        # We'll write the models to a TEMP directory and later move them
        # into our project's subdirectory.
        indexer.write().overwrite().save(
            f"{config.TEMP}/{fname}_stringindexer_{col}")
        model.write().overwrite().save(
            f"{config.TEMP}/{fname}_stringindexer_model_{col}")

        shutil.move(
            f"{config.TEMP}/{fname}_stringindexer_{col}",
            f"{config.SPARK_MODELS}/{fname}/stringindexer_{col}",
        )
        shutil.move(
            f"{config.TEMP}/{fname}_stringindexer_model_{col}",
            f"{config.SPARK_MODELS}/{fname}/stringindexer_model_{col}",
        )

        # indexer.save(f"{config.SPARK_MODELS}/stringindexer_{col}")
        # model.save(f"{config.SPARK_MODELS}/stringindexer_model_{col}")
    return df
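
# Hedged usage sketch for encode_strings (illustrative only: the column names, the
# fname value, and new_df are assumptions; the config.* paths mirror the snippet above).
from pyspark.ml.feature import StringIndexerModel

df = encode_strings(df, cols=["user_id", "track_id"], fname="listens")
# A fitted indexer saved above can later be reloaded and applied to new data:
user_id_model = StringIndexerModel.load(
    f"{config.SPARK_MODELS}/listens/stringindexer_model_user_id")
new_df = user_id_model.transform(new_df)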
Example #3
def main(spark,
         data_file,
         model_file,
         user_file,
         track_file,
         model_formulation=None):
    df = spark.read.parquet(data_file)

    if model_formulation == 'log':
        #log compression on training
        df = df.withColumn('count', F.log(F.col('count')))
        print("log")

    elif model_formulation == 'ct1':
        #subsetting all train counts greater than 1
        df.createOrReplaceTempView('df')
        df = spark.sql('SELECT * FROM df WHERE count > 1')
        print("ct1")

    elif model_formulation == 'ct2':
        #subsetting all train counts greater than 2
        df.createOrReplaceTempView('df')
        df = spark.sql('SELECT * FROM df WHERE count > 2')
        print("ct2")

    else:
        #If no model formulation is specified, pass
        print("default")
        pass

    user_indexer = StringIndexer(inputCol="user_id",
                                 outputCol="user_idx",
                                 handleInvalid="keep")
    track_indexer = StringIndexer(inputCol="track_id",
                                  outputCol="track_idx",
                                  handleInvalid="keep")

    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    mapping = pipeline.fit(df)
    df = mapping.transform(df)

    #create + fit an ALS model
    als = ALS(maxIter=5,
              regParam=0.01,
              implicitPrefs=True,
              ratingCol="count",
              userCol="user_idx",
              itemCol="track_idx")
    als_model = als.fit(df)

    #save trained ALS model
    als_model.write().overwrite().save(model_file)
    print("Model sucessfully saved to HFS")

    #save string indexers
    user_indexer.write().overwrite().save(user_file)
    track_indexer.write().overwrite().save(track_file)
    print("String Indexers sucessfully saved to HFS")
Example #4
def review_ids_to_number(dataframe):
    # build and save an indexer model for user_id
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_num").fit(dataframe)
    indexer_user_save = os.path.join('model', 'user_ind_model')
    indexer_user.write().overwrite().save(indexer_user_save)
    # build and save an indexer model for business_id
    indexer_business = StringIndexer(inputCol="business_id",
                                     outputCol="business_id_num",
                                     handleInvalid="skip").fit(dataframe)
    indexer_business_save = os.path.join('model', 'bus_ind_model')
    indexer_business.write().overwrite().save(indexer_business_save)
    # transform the string id columns to numeric index columns
    indexed = indexer_user.transform(dataframe)
    final_indexed = indexer_business.transform(indexed)
    final_indexed.show(20)
    # save the indexed dataframe for ALS training
    final_indexed_save = os.path.join('dataset', 'review_vegas_als.parquet')
    final_indexed.write.mode('overwrite').parquet(final_indexed_save)
    logger.error('Indexed dataframe for ALS training saved to review_vegas_als.parquet')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
Example #5
    def stringIndexer(infoData):
        colmToIndex = infoData.get(mc.COLMTOINDEX)
        dataset = infoData.get(mc.DATASET)
        indexedColm = infoData.get(mc.INDEXEDCOLM)
        storageLocation = infoData.get(mc.STORAGELOCATION)
        indexerName = colmToIndex + mc.INDEXER
        file = storageLocation + indexerName
        # check whether the column's datatype is already integer, float, or double;
        # if so, no indexing is needed. -- sahil
        '''For now, convert every datatype to string and then index it.'''
        dataset = dataset.withColumn(colmToIndex, dataset[colmToIndex].cast(StringType()))
        stringIndexer = StringIndexer(inputCol=colmToIndex, outputCol=indexedColm,
                                      handleInvalid="keep").fit(dataset)
        dataset = stringIndexer.transform(dataset)
        stringIndexer.write().overwrite().save(file)  # will update this later
        indexerPathMapping = infoData.get(mc.INDEXERPATHMAPPING)
        indexerPathMapping.update({colmToIndex: file})
        infoData.update({
            mc.INDEXERPATHMAPPING: indexerPathMapping,
            mc.DATASET: dataset
        })

        return infoData
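
# Hypothetical caller sketch for stringIndexer(): the mc.* keys are whatever the
# project's constants module defines, and the values below are illustrative only.
infoData = {
    mc.COLMTOINDEX: "gender",
    mc.INDEXEDCOLM: "gender_indexed",
    mc.DATASET: dataset,
    mc.STORAGELOCATION: "/tmp/models/",
    mc.INDEXERPATHMAPPING: {},
}
infoData = stringIndexer(infoData)
indexed_dataset = infoData.get(mc.DATASET)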
Example #6
def prepare_data_ml3(spark, jenkins_builds, sonar_issues, sonar_analyses,
                     spark_artefacts_dir, run_mode):

    # Change build result to only SUCCESS/FAIL for binary classification
    modify_result = udf(lambda x: "SUCCESS" if x == "SUCCESS" else "FAIL",
                        StringType())
    spark.udf.register("modify_result", modify_result)

    if jenkins_builds is not None:
        jenkins_builds = jenkins_builds.withColumn("result",
                                                   modify_result("result"))

    pipeline_path = Path(spark_artefacts_dir).joinpath("pipeline_3")
    label_idx_model_path = Path(spark_artefacts_dir).joinpath(
        "label_indexer_3")

    # Getting pipeline and label indexer models
    if run_mode == "first":

        pipeline_model = get_ml3_pipeline().fit(sonar_issues)
        pipeline_model.write().overwrite().save(str(pipeline_path.absolute()))

        label_idx_model = StringIndexer(
            inputCol="result", outputCol="label",
            handleInvalid="skip").fit(jenkins_builds)
        label_idx_model.write().overwrite().save(
            str(label_idx_model_path.absolute()))

    elif run_mode == "incremental":

        pipeline_model = PipelineModel.load(str(pipeline_path.absolute()))
        label_idx_model = StringIndexerModel.load(
            str(label_idx_model_path.absolute()))

    # Columns to return
    rules = pipeline_model.stages[0].labels
    columns = list(map(lambda x: "removed_" + x, rules)) + list(
        map(lambda x: "introduced_" + x, rules))

    # Preparing
    removed_rules_df = sonar_issues.filter(
        "status IN ('RESOLVED', 'CLOSED', 'REVIEWED')").select(
            "current_analysis_key", "rule")

    df1 = pipeline_model.transform(removed_rules_df)
    rdd1 = df1.rdd.map(lambda x: (x[0], x[3])) \
        .reduceByKey(sum_sparse_vectors) \
        .map(lambda x: Row(current_analysis_key=x[0], removed_rule_vec=x[1]))

    if rdd1.count() == 0:
        return None, columns
    removed_issues_rule_vec_df = spark.createDataFrame(rdd1)

    introduced_rules_df = sonar_issues.filter(
        "status IN ('OPEN', 'REOPENED', 'CONFIRMED', 'TO_REVIEW')").select(
            "creation_analysis_key", "rule")
    df2 = pipeline_model.transform(introduced_rules_df)
    rdd2 = df2.rdd.map(lambda x: (x[0], x[3])) \
        .reduceByKey(sum_sparse_vectors) \
        .map(lambda x: Row(creation_analysis_key=x[0], introduced_rule_vec=x[1]))

    if rdd2.count() == 0:
        return None, columns
    introduced_issues_rule_vec_df = spark.createDataFrame(rdd2)

    joined_sonar_rules_df = removed_issues_rule_vec_df.join(
        introduced_issues_rule_vec_df,
        removed_issues_rule_vec_df.current_analysis_key ==
        introduced_issues_rule_vec_df.creation_analysis_key,
        how="outer")

    joined_sonar_rules_df.createOrReplaceTempView("sonar_rules")
    joined_sonar_rules_df = spark.sql("""SELECT 
        coalesce(current_analysis_key, creation_analysis_key) AS analysis_key,
        introduced_rule_vec,
        removed_rule_vec
        FROM sonar_rules
    """)

    num_rules = len(pipeline_model.stages[0].labels)

    imputed_sonar_rules_rdd = joined_sonar_rules_df.rdd.map(
        lambda row: Row(analysis_key=row[0],
                        introduced_rule_vec=SparseVector(num_rules, {})
                        if row[1] is None else row[1],
                        removed_rule_vec=SparseVector(num_rules, {})
                        if row[2] is None else row[2]))

    imputed_sonar_rules_df = spark.createDataFrame(imputed_sonar_rules_rdd)

    v_assembler = VectorAssembler(
        inputCols=["removed_rule_vec", "introduced_rule_vec"],
        outputCol="features")
    sonar_issues_df = v_assembler.transform(imputed_sonar_rules_df).select(
        "analysis_key", "features")

    sonar_df = sonar_issues_df.join(
        sonar_analyses,
        sonar_issues_df.analysis_key == sonar_analyses.analysis_key,
        how="inner")
    df = sonar_df.join(jenkins_builds,
                       sonar_df.revision == jenkins_builds.revision_number,
                       how="inner").select("result", "features")
    ml_df = label_idx_model.transform(df).select("label", "features")

    return ml_df, columns
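
# The helper sum_sparse_vectors used above is assumed to be defined elsewhere in the
# project; a minimal sketch of what it is expected to do (element-wise sum of two
# equally sized SparseVectors) could look like this:
from pyspark.ml.linalg import SparseVector

def sum_sparse_vectors(v1, v2):
    # merge the index/value pairs of both vectors, adding overlapping entries
    values = dict(zip(v1.indices.tolist(), v1.values.tolist()))
    for idx, val in zip(v2.indices.tolist(), v2.values.tolist()):
        values[idx] = values.get(idx, 0.0) + val
    return SparseVector(v1.size, values)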
Example #7
    def dataTranform(self, dataInfo):
        featuresColm = dataInfo.get(
            PredictiveConstants.FEATURESCOLM)  # featureColmList -replaced
        labelColm = dataInfo.get(PredictiveConstants.LABELCOLM)
        modelSheetName = dataInfo.get(PredictiveConstants.MODELSHEETNAME)
        modelId = dataInfo.get(PredictiveConstants.MODELID)
        storageLocation = dataInfo.get(PredictiveConstants.LOCATIONADDRESS)

        indexerPathMapping = {}
        oneHotEncoderPathMapping = {}

        self.labelColm = labelColm
        self.featuresColm = featuresColm
        dataset = self.dataset
        vectorizedFeaturescolmName = modelSheetName + PredictiveConstants.DMXFEATURE
        dataset = dataset.drop(vectorizedFeaturescolmName)

        schemaData = dataset.schema

        if self.labelColm is not None:
            for labelName in self.labelColm:
                label = labelName
        else:
            label = self.labelColm

        nonNumericData = self.nonNumericToString(schemaData=schemaData,
                                                 dataset=dataset)
        categoricalFeatures = nonNumericData.get(
            PredictiveConstants.CATEGORICALFEATURES)
        numericalFeatures = nonNumericData.get(
            PredictiveConstants.NUMERICALFEATURES)
        dataset = nonNumericData.get(PredictiveConstants.DATASET)
        schemaData = dataset.schema

        # indexing of label column
        isLabelIndexed = "no"
        if self.labelColm is not None:
            labelIndexedInfo = self.isLabelIndexed(schemaData=schemaData,
                                                   label=label,
                                                   dataset=dataset)
            dataset = labelIndexedInfo.get(PredictiveConstants.DATASET)
            isLabelIndexed = labelIndexedInfo.get(
                PredictiveConstants.ISLABELINDEXED)
            labelIndexer = labelIndexedInfo.get(
                PredictiveConstants.LABELINDEXER)
            # store the label indexer here.
            if labelIndexer is not None:
                labelIndexerStoragepath = storageLocation + modelId.upper(
                ) + label.upper() + PredictiveConstants.INDEXER.upper(
                ) + PredictiveConstants.PARQUETEXTENSION
                labelIndexer.save(labelIndexerStoragepath)  #correct this
                indexerPathMapping.update({label: labelIndexerStoragepath})

        oneHotEncodedFeaturesList = []
        indexedFeatures = []
        nonOneHotEncoded = []
        for colm in categoricalFeatures:
            indexedColmName = PredictiveConstants.INDEXED_ + colm
            oneHotEncodedColName = PredictiveConstants.ONEHOTENCODED_ + colm
            indexer = StringIndexer(inputCol=colm,
                                    outputCol=indexedColmName,
                                    handleInvalid="skip").fit(dataset)
            dataset = indexer.transform(dataset)
            '''Store the indexer here; the saving convention should be modelId + colmName + indexer.parquet.
            The feature indexer is not used for now, but is kept for future use.'''
            featuresIndexerPath = storageLocation + modelId.upper(
            ) + colm.upper() + PredictiveConstants.INDEXER.upper(
            ) + PredictiveConstants.PARQUETEXTENSION
            indexer.write().overwrite().save(featuresIndexerPath)
            indexerPathMapping.update({colm: featuresIndexerPath})
            rowNo = dataset.select(indexedColmName).distinct().count()
            '''In case the column has only one category (or has more than one between training
            and prediction), name it with the one-hot-encoded column name to avoid that ambiguity.'''
            if rowNo == 1:
                nonOneHotEncoded.append(
                    oneHotEncodedColName
                )  # avoids problems when a single-valued column is passed at prediction time.
            else:
                indexedFeatures.append(indexedColmName)
                oneHotEncodedFeaturesList.append(oneHotEncodedColName)

        oneHotEncoder = OneHotEncoderEstimator(
            inputCols=indexedFeatures,
            outputCols=oneHotEncodedFeaturesList,
            handleInvalid="error")
        oneHotEncoderPath = storageLocation + modelId.upper(
        ) + PredictiveConstants.ONEHOTENCODED.upper(
        ) + PredictiveConstants.PARQUETEXTENSION
        oneHotEncoder.write().overwrite().save(oneHotEncoderPath)
        oneHotEncoderPathMapping.update(
            {PredictiveConstants.ONEHOTENCODED: oneHotEncoderPath})
        oneHotEncoderFit = oneHotEncoder.fit(dataset)
        dataset = oneHotEncoderFit.transform(dataset)

        combinedFeatures = oneHotEncodedFeaturesList + numericalFeatures + nonOneHotEncoded
        categoryColmListDict = {}
        countOfCategoricalColmList = []
        for value in categoricalFeatures:
            listValue = []
            categoryColm = dataset.groupby(value).count()
            countOfCategoricalColmList.append(categoryColm.count())
            categoryColmJson = categoryColm.toJSON()
            for row in categoryColmJson.collect():
                categoryColmSummary = json.loads(row)
                listValue.append(categoryColmSummary)
            categoryColmListDict[value] = listValue

        self.numericalFeatures = numericalFeatures
        self.categoricalFeatures = categoricalFeatures
        if not categoricalFeatures:
            maxCategories = 5
        else:
            maxCategories = max(countOfCategoricalColmList)
        dataset = dataset.drop(vectorizedFeaturescolmName)
        featureassembler = VectorAssembler(
            inputCols=combinedFeatures,
            outputCol=vectorizedFeaturescolmName,
            handleInvalid="skip")
        dataset = featureassembler.transform(dataset)

        # retrieve the features colm name after onehotencoding
        indexOfFeatures = dataset.schema.names.index(
            vectorizedFeaturescolmName)
        oneHotEncodedFeaturesDict = dataset.schema.fields[
            indexOfFeatures].metadata['ml_attr']['attrs']
        idNameFeatures = {}

        if not oneHotEncodedFeaturesDict:
            idNameFeaturesOrderedTemp = None
        else:
            for type, value in oneHotEncodedFeaturesDict.items():
                for subKey in value:
                    idNameFeatures[subKey.get("idx")] = subKey.get("name")
                    idNameFeaturesOrderedTemp = {}
                    for key in sorted(idNameFeatures):
                        idNameFeaturesOrderedTemp[key] = idNameFeatures[
                            key].replace(PredictiveConstants.ONEHOTENCODED_,
                                         "")

        idNameFeaturesOrdered = None if idNameFeaturesOrderedTemp == None else idNameFeaturesOrderedTemp

        # retrieve the label colm name only after label encoding
        indexedLabelNameDict = {}
        if isLabelIndexed == "yes":
            indexOfLabel = dataset.schema.names.index(label)
            indexedLabel = dataset.schema.fields[indexOfLabel].metadata[
                "ml_attr"]["vals"]

            for value in indexedLabel:
                indexedLabelNameDict[indexedLabel.index(value)] = value

        # this code was for vector indexer since it is not stable for now from spark end
        # so will use it in future if needed.
        '''
        vec_indexer = VectorIndexer(inputCol='features', outputCol='vec_indexed_features',
        maxCategories=maxCategories,
                                    handleInvalid="skip").fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features), ", ".join(str(k) for k in categorical_features.keys())))
        dataset= vec_indexer.transform(dataset)
        '''

        result = {
            PredictiveConstants.DATASET: dataset,
            PredictiveConstants.CATEGORICALFEATURES: categoricalFeatures,
            PredictiveConstants.NUMERICALFEATURES: numericalFeatures,
            PredictiveConstants.MAXCATEGORIES: maxCategories,
            PredictiveConstants.CATEGORYCOLMSTATS: categoryColmListDict,
            PredictiveConstants.INDEXEDFEATURES: indexedFeatures,
            PredictiveConstants.LABEL: label,
            PredictiveConstants.ONEHOTENCODEDFEATURESLIST:
            oneHotEncodedFeaturesList,
            PredictiveConstants.INDEXEDLABELNAMEDICT: indexedLabelNameDict,
            PredictiveConstants.ISLABELINDEXED: isLabelIndexed,
            PredictiveConstants.VECTORFEATURES: vectorizedFeaturescolmName,
            PredictiveConstants.IDNAMEFEATURESORDERED: idNameFeaturesOrdered,
            PredictiveConstants.INDEXERPATHMAPPING: indexerPathMapping,
            PredictiveConstants.ONEHOTENCODERPATHMAPPING:
            oneHotEncoderPathMapping
        }
        return result
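
# Hedged note: OneHotEncoderEstimator as used above is the Spark 2.3/2.4 API. In
# Spark 3.x the same estimator is exposed as pyspark.ml.feature.OneHotEncoder with
# the same multi-column parameters (inputCols/outputCols shown here are the lists
# built in the method above):
from pyspark.ml.feature import OneHotEncoder

oneHotEncoder = OneHotEncoder(inputCols=indexedFeatures,
                              outputCols=oneHotEncodedFeaturesList,
                              handleInvalid="error")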
Example #8
def business_ids_to_number(dataframe):
    indexer_business = StringIndexer(inputCol="business_id",
                                     outputCol="business_id_num").fit(dataframe)
    indexer_business_save = os.path.join('model', 'bus_ind_model')
    indexer_business.write().overwrite().save(indexer_business_save)
Example #9
def main():
    spark = SparkSession \
        .builder \
        .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0') \
        .getOrCreate()

    # use for only one file
    # filename = 'chicago_taxi_trips_2016_01.csv'

    # use for reading all files
    filename = '*'

    df = spark.read \
        .format('csv') \
        .options(header=True, inferSchema=True) \
        .load(os.path.join(etl_conf['s3_taxi_dir_path'], filename))
    # df.printSchema()

    # Take a look at the top rows
    # df.limit(5).toPandas()

    # Check initial number of records
    # df.count()

    df_with_hour = df.withColumn('year', year(df.trip_start_timestamp))\
                     .withColumn('month', month(df.trip_start_timestamp))\
                     .withColumn('day', dayofmonth(df.trip_start_timestamp))\
                     .withColumn('hour', hour(df.trip_start_timestamp))

    df_features = df_with_hour.select('year', 'month', 'day', 'hour',
                                      'pickup_community_area',
                                      'dropoff_community_area')

    df_no_nulls = df_features.dropna()

    # df_no_nulls.count()

    # Create StringIndexer and fit + transform pickup data
    pickup_indexer = StringIndexer(inputCol='pickup_community_area',
                                   outputCol='pickup_community_area_indexed')

    pickup_indexer_model = pickup_indexer.fit(df_no_nulls)
    df_pickup_indexed = pickup_indexer_model.transform(df_no_nulls)

    # Create StringIndexer and fit + transform dropoff data
    dropoff_indexer = StringIndexer(inputCol='dropoff_community_area',
                                    outputCol='dropoff_community_area_indexed')

    dropoff_indexer_model = dropoff_indexer.fit(df_pickup_indexed)
    df_dropoff_indexed = dropoff_indexer_model.transform(df_pickup_indexed)

    # Create OneHotEncoder and fit + transform pickup & dropoff data
    encoder = OneHotEncoderEstimator() \
        .setInputCols(['hour',
                       'pickup_community_area_indexed',
                       'dropoff_community_area_indexed']) \
        .setOutputCols(['hour_encoded',
                        'pickup_community_area_encoded',
                        'dropoff_community_area_encoded'])

    encoder_model = encoder.fit(df_dropoff_indexed)
    df_encoded = encoder_model.transform(df_dropoff_indexed)

    # df_encoded.printSchema()

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_model_key']

    # save the pickup StringIndexer and its fitted model
    pickup_indexer_name = 'pickup_indexer_name'
    pickup_indexer_path = os.path.join(bucket, key, pickup_indexer_name)
    pickup_indexer.write().overwrite().save(pickup_indexer_path)

    pickup_indexer_model_name = 'pickup_indexer_model_name'
    pickup_indexer_model_name_path = os.path.join(bucket, key,
                                                  pickup_indexer_model_name)
    pickup_indexer_model \
        .write() \
        .overwrite() \
        .save(pickup_indexer_model_name_path)

    # save the dropoff StringIndexer and its fitted model
    dropoff_indexer_name = 'dropoff_indexer_name'
    dropoff_indexer_path = os.path.join(bucket, key, dropoff_indexer_name)
    dropoff_indexer.write().overwrite().save(dropoff_indexer_path)

    dropoff_indexer_model_name = 'dropoff_indexer_model_name'
    dropoff_indexer_model_name_path = os.path.join(bucket, key,
                                                   dropoff_indexer_model_name)
    dropoff_indexer_model \
        .write() \
        .overwrite() \
        .save(dropoff_indexer_model_name_path)

    # save the one-hot encoder and model
    encoder_name = 'encoder_name'
    encoder_name_path = os.path.join(bucket, key, encoder_name)
    encoder.write().overwrite().save(encoder_name_path)

    encoder_model_name = 'encoder_model_name'
    encoder_model_name_path = os.path.join(bucket, key, encoder_model_name)
    encoder_model.write().overwrite().save(encoder_model_name_path)

    # make final dataframe and store back to S3
    df_final = df_encoded.select('year', 'month', 'day', 'hour_encoded',
                                 'pickup_community_area_encoded',
                                 'dropoff_community_area_encoded')

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_data_key']

    output_path = os.path.join(bucket, key)

    df_final.write.partitionBy('year', 'month', 'day') \
            .parquet(output_path, mode='overwrite')
    return ss


if __name__ == '__main__':

    start_time = time.time()
    ss = init_spark_session()  #initial spark session

    reviewDF_vegas_save = os.path.join('dataset', 'review_vegas.parquet')
    reviewDF_vegas = ss.read.parquet(reviewDF_vegas_save).cache()
    # convert user_id and business_id from string to int
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_int").fit(reviewDF_vegas)
    indexer_user_save = os.path.join('model', 'user_ind_model')
    indexer_user.write().overwrite().save(indexer_user_save)

    indexer_business = StringIndexer(
        inputCol="business_id",
        outputCol="business_id_int").fit(reviewDF_vegas)
    indexer_business_save = os.path.join('model', 'bus_ind_model')
    indexer_business.write().overwrite().save(indexer_business_save)
    # transform the string id columns to numeric index columns
    indexed = indexer_user.transform(reviewDF_vegas)
    final_indexed = indexer_business.transform(indexed)
    final_indexed.show(20)
    # save the indexed dataframe for ALS training
    final_indexed_save = os.path.join('dataset', 'review_vegas_als.parquet')
    final_indexed.write.mode('overwrite').parquet(final_indexed_save)
    logger.error('indexing completed; saved to file')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))