Exemplo n.º 1
0
    def query_to_df(self, query_string, header="HEADER"):
        """ Given a query, load the requested data into a DataFrame.

        The query is wrapped in a ``COPY ... TO STDOUT WITH CSV`` statement,
        streamed into an in-memory buffer and parsed with pandas. The result
        is indexed by ``entity_id`` and ``as_of_date`` and passed through
        ``downcast_matrix`` before being returned.

        :param query_string: query to send. It is interpolated verbatim into
                             the COPY statement, so it must come from a
                             trusted source.
        :param header: text appended to the COPY options indicating if a
                       header should be included in the output. The parsing
                       step looks up ``entity_id`` and ``as_of_date`` by
                       column name, so the output needs a header row.
        :type query_string: str
        :type header: str

        :return: the query result indexed by (entity_id, as_of_date), as
                 returned by ``downcast_matrix``
        :rtype: pandas.DataFrame
        """
        logging.debug("Copying to CSV query %s", query_string)
        copy_sql = "COPY ({query}) TO STDOUT WITH CSV {head}".format(
            query=query_string, head=header)
        out = io.StringIO()
        conn = self.db_engine.raw_connection()
        try:
            cur = conn.cursor()
            try:
                cur.copy_expert(copy_sql, out)
            finally:
                # Release the cursor even when the COPY fails.
                cur.close()
        finally:
            # Always hand the raw connection back instead of leaking it.
            conn.close()
        out.seek(0)
        df = pandas.read_csv(out, parse_dates=["as_of_date"])
        df.set_index(["entity_id", "as_of_date"], inplace=True)
        return downcast_matrix(df)
Exemplo n.º 2
0
def test_downcast_matrix():
    """Downcasting must keep every value and reduce the memory footprint."""
    original = matrix_creator()
    shrunk = downcast_matrix(original)

    # Every cell of the downcast frame has to compare equal to the original.
    cells_match = (shrunk == original).all().all()
    assert cells_match

    # Downcasting is only worthwhile if it actually saves memory.
    shrunk_bytes = shrunk.memory_usage().sum()
    original_bytes = original.memory_usage().sum()
    assert shrunk_bytes < original_bytes
Exemplo n.º 3
0
    def matrix(self):
        """The raw matrix. Will load from storage into memory if not already loaded.

        On first access the matrix is loaded with ``self._load()``, indexed by
        the columns listed in ``self.metadata['indices']`` when that index is
        not already in place, and passed through ``downcast_matrix``. Only the
        fully processed result is cached, so a failure while indexing or
        downcasting does not leave a half-processed matrix behind for later
        accesses to return.
        """
        if self.__matrix is None:
            loaded = self._load()
            # Is the index already in place?
            if loaded.index.names != self.metadata['indices']:
                loaded.set_index(self.metadata['indices'], inplace=True)

            # Cache only after every preprocessing step has succeeded.
            self.__matrix = downcast_matrix(loaded)
        return self.__matrix
Exemplo n.º 4
0
    def _preprocess_and_split_matrix(self, matrix_with_labels):
        """Apply the usual post-load preprocessing and separate out the labels.

        The index named in the metadata is set when it is not already in place
        (depending on the storage, the index may not be serializable), the
        matrix is passed through ``downcast_matrix``, and the label column is
        popped off the result.

        :param matrix_with_labels: matrix that still contains the label column
        :return: tuple of (design matrix without the label column, labels)
        """
        index_columns = self.metadata['indices']
        if matrix_with_labels.index.names != index_columns:
            matrix_with_labels.set_index(index_columns, inplace=True)

        design_matrix = downcast_matrix(matrix_with_labels)
        # pop() removes the label column from the design matrix in place.
        labels = design_matrix.pop(self.label_column_name)
        return design_matrix, labels
Exemplo n.º 5
0
    def _preprocess_and_split_matrix(self, matrix_with_labels):
        """Perform desired preprocessing that we generally want to do after loading a matrix

        This includes setting the index (depending on the storage, may not be serializable),
        checking that the ``as_of_date`` index level holds ``datetime64[ns]`` values,
        and downcasting. The index is set on ``matrix_with_labels`` in place.

        :param matrix_with_labels: matrix that still contains the label column
        :return: tuple of (design matrix without the label column, labels)
        :raises ValueError: if ``as_of_date`` is not one of the index levels, or
            if that level's dtype is not ``datetime64[ns]``
        """
        if matrix_with_labels.index.names != self.indices:
            matrix_with_labels.set_index(self.indices, inplace=True)

        index = matrix_with_labels.index
        index_of_date = index.names.index('as_of_date')
        # Only a MultiIndex exposes ``levels``; a single-level index is itself
        # the date level.
        if hasattr(index, "levels"):
            date_level = index.levels[index_of_date]
        else:
            date_level = index
        if date_level.dtype != "datetime64[ns]":
            raise ValueError(
                "The as_of_date index level must have dtype datetime64[ns], "
                f"but found {date_level.dtype}"
            )

        matrix_with_labels = downcast_matrix(matrix_with_labels)
        # pop() removes the label column, leaving the design matrix behind.
        labels = matrix_with_labels.pop(self.label_column_name)
        design_matrix = matrix_with_labels
        return design_matrix, labels