Example #1
    def ingest_data(self, filepath, dataset):
        """
        load data from a file to a dataframe and store it on the db

         Parameters
        ----------
        filepath : String
            file path of the .csv file for the dataset
        dataset: DataSet
            The DataSet object that holds the Session ID for HoloClean
        Returns
        -------
        No Return
        """
        # Spawn a new reader and load the data into a dataframe
        file_reader = Reader(self.holoEnv.spark_session)
        df = file_reader.read(filepath)

        # Store dataframe to DB table
        schema = df.schema.names
        name_table = self._add_info_to_meta('Init', schema, dataset)
        self._dataframe_to_table(name_table, df)
        table_attribute_string = self.get_schema(dataset, "Init")
        # Build a 1-based mapping between column position and attribute
        # name, skipping the internal index column
        count = 0
        map_schema = []
        attributes = table_attribute_string.split(',')
        for attribute in attributes:
            if attribute != "index":
                count += 1
                map_schema.append([count, attribute])

        # Persist the mapping as its own table (requires
        # `from pyspark.sql.types import StructType, StructField,
        # IntegerType, StringType` at module level)
        dataframe_map_schema = self.holoEnv.spark_session.createDataFrame(
            map_schema,
            StructType([
                StructField("index", IntegerType(), False),
                StructField("attribute", StringType(), True)
            ]))
        self.add_db_table('Map_schema', dataframe_map_schema, dataset)

        # Cache the attribute -> index mapping; avoid shadowing the
        # builtin name `tuple`
        for schema_tuple in map_schema:
            self.attribute_map[schema_tuple[1]] = schema_tuple[0]
        return
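
To make the schema-mapping step concrete, here is a minimal self-contained sketch of what the loop above builds; pyspark and the DB layer are left out, and the column names are invented purely for illustration.

# Standalone sketch of the schema-mapping loop; column names are invented.
table_attribute_string = "index,city,state,zip"

count = 0
map_schema = []
for attribute in table_attribute_string.split(','):
    if attribute != "index":
        count += 1
        map_schema.append([count, attribute])

# map_schema == [[1, 'city'], [2, 'state'], [3, 'zip']]
attribute_map = {pair[1]: pair[0] for pair in map_schema}
# attribute_map == {'city': 1, 'state': 2, 'zip': 3}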
Example #2
    def ingest_data(self, filepath, dataset):
        """
        Load data from a file to a dataframe and store it on the db

        filepath : String
            File path of the .csv file for the dataset
        dataset: DataSet
            The DataSet object that holds the Session ID for HoloClean

        """

        # Spawn new reader and load data into dataframe
        filereader = Reader(self.holo_env.spark_session)

        # Read the file, adding an index column
        df = filereader.read(filepath, 1)

        # Store dataframe to DB table
        schema = df.schema.names
        name_table = dataset.table_specific_name('Init')
        self.dataframe_to_table(name_table, df)
        dataset.attributes['Init'] = schema
        # Build a 1-based mapping between column position and attribute
        # name, skipping the internal index column
        count = 0
        map_schema = []
        attribute_map = {}
        for attribute in schema:
            if attribute != GlobalVariables.index_name:
                count += 1
                map_schema.append([count, attribute])
                attribute_map[attribute] = count

        dataframe_map_schema = self.holo_env.spark_session.createDataFrame(
            map_schema, dataset.attributes['Map_schema'])
        self.add_db_table('Map_schema', dataframe_map_schema, dataset)

        # Also cache the mapping on the engine instance
        for table_tuple in map_schema:
            self.attribute_map[table_tuple[1]] = table_tuple[0]

        return df, attribute_map
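
Example #2 reads the Map_schema schema from dataset.attributes['Map_schema'] instead of building it inline. Below is a minimal runnable pyspark sketch of that schema shape and the resulting dataframe; the SparkSession setup and the rows are illustrative assumptions, not part of the original code.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.master("local[1]").appName("map_schema_demo").getOrCreate()

# The same two-column shape Example #1 builds inline and Example #2
# fetches from dataset.attributes['Map_schema']
map_schema_type = StructType([
    StructField("index", IntegerType(), False),
    StructField("attribute", StringType(), True),
])

rows = [[1, "city"], [2, "state"], [3, "zip"]]  # invented rows
spark.createDataFrame(rows, map_schema_type).show()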
Example #3
    def get_correct_array(self, current_file_path):
        """Read the file at current_file_path and return the resulting array."""
        my_reader = Reader()
        init_array = my_reader.read(current_file_path)
        return init_array
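
Example #3 depends on a Reader class that is not shown. Here is a self-contained sketch with a stand-in CSV-backed Reader; the stand-in class and the file path are assumptions about what the real Reader does, made only so the snippet can run.

import csv

class Reader(object):
    # Stand-in for the real Reader, which is not shown in the example
    def read(self, path):
        with open(path) as f:
            return [row for row in csv.reader(f)]

my_reader = Reader()
init_array = my_reader.read("data/sample.csv")  # hypothetical path
print(init_array[:2])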