Example No. 1
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# import csv
_path = 'C:/Users/lxabc/Desktop/QAdata/'
t = pd.read_csv(_path+'ISKG.csv', sep=',', encoding='utf-8', usecols=['_id', '_start', '_end', '_type'], low_memory=False)
t['_id'] = t['_id'].astype(pd.Int32Dtype())
t['_start'] = t['_start'].astype(pd.Int32Dtype())
t['_end'] = t['_end'].astype(pd.Int32Dtype())
print('lines: %d' % t.shape[0])

# split into entities and relations
_idx = t['_start'].first_valid_index()
print('\nrel index start at : %d' % _idx)
entities = t['_id'][:_idx]
triples = t[['_start', '_end', '_type']][_idx:]
print('\nshow entities series: ')
print(entities.head(5))
print('\nshow rels dataframe: ')
print(triples.head(5))

# entities drop duplicates
entities = entities.drop_duplicates()
# print(type(entities))  # entities is a Series
entities = entities.sort_values(ascending=True)
print('\nshow entities series after drop duplicates: ')
print(entities.head(5))

# re-encode entities from 0-n, save as dict
entities.reset_index(drop=True, inplace=True)
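
# The excerpt stops here; a minimal sketch of the re-encoding step described in
# the comment above (an assumption about the intent, not the original code):
# map each original entity id to a new consecutive code 0..n-1
entity2code = {eid: code for code, eid in entities.items()}
print('\nshow first re-encoded pairs: ')
print(list(entity2code.items())[:5])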
Example No. 2
# Imports assumed for this snippet (not shown in the excerpt); convertDate is a
# project helper defined elsewhere.
import io
import logging

import pandas as pd
import requests


def downloadFundosFile():
    logger = logging.getLogger(name="download")

    download_url = "http://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv"

    dtypes = {
        'TP_FUNDO': str,
        'CNPJ_FUNDO': str,
        'DENOM_SOCIAL': str,
        # 'DT_REG': 'datetime'
        # 'DT_CONST': 'datetime'
        'CD_CVM': pd.Int32Dtype(),
        # 'DT_CANCEL': 'datetime'
        'SIT': str,
        # 'DT_INI_SIT': 'datetime'
        # 'DT_INI_ATIV': 'datetime'
        # 'DT_INI_EXERC': 'datetime'
        # 'DT_FIM_EXERC': 'datetime'
        'CLASSE': str,
        # 'DT_INI_CLASSE': 'datetime'
        'RENTAB_FUNDO': str,
        'CONDOM': str,
        'FUNDO_COTAS': str,
        'FUNDO_EXCLUSIVO': str,
        'TRIB_LPRAZO': str,
        'INVEST_QUALIF': str,
        'ENTID_INVEST': str,
        'TAXA_PERFM': float,
        'INF_TAXA_PERFM': str,
        'TAXA_ADM': float,
        'INF_TAXA_ADM': str,
        'VL_PATRIM_LIQ': float,
        # 'DT_PATRIM_LIQ': 'datetime'
        'DIRETOR': str,
        'CNPJ_ADMIN': str,
        'ADMIN': str,
        'PF_PJ_GESTOR': str,
        'CPF_CNPJ_GESTOR': str,
        'GESTOR': str,
        'CNPJ_AUDITOR': str,
        'AUDITOR': str,
        'CNPJ_CUSTODIANTE': str,
        'CUSTODIANTE': str,
        'CNPJ_CONTROLADOR': str,
        'CONTROLADOR': str
    }

    dates_columns = [
        'DT_REG', 'DT_CONST', 'DT_CANCEL', 'DT_INI_SIT', 'DT_INI_ATIV',
        'DT_INI_EXERC', 'DT_FIM_EXERC', 'DT_INI_CLASSE', 'DT_PATRIM_LIQ'
    ]
    dateparse_func = lambda x: convertDate(x)

    with requests.Session() as s:
        logger.info("start download")
        download_content = s.get(download_url)
        logger.info("finish download")
        logger.info("start reading")
        decoded_content = download_content.content.decode('latin-1')
        parsed_df = pd.read_csv(io.StringIO(decoded_content),
                                delimiter=';',
                                dtype=dtypes,
                                date_parser=dateparse_func,
                                parse_dates=dates_columns)
        logger.info("finish reading")
        return parsed_df

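
# convertDate is a project helper not shown in this excerpt; a hedged sketch of
# such a helper, assuming ISO-formatted date strings and blanks for missing values:
from datetime import datetime

def convertDate(value):
    if value is None or str(value).strip() == '':
        return pd.NaT
    try:
        return datetime.strptime(str(value), '%Y-%m-%d')
    except ValueError:
        return pd.NaT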
Example No. 3
    for key, value in df['county_name'].items():
        state = df['state'][key]
        if str(state) == "nan":
            state_fips.append(None)
        else:
            state_fips.append(af.get_state_fips(state))

        if str(value) == "nan":
            county_fips.append(None)
        else:
            county_fips.append(af.get_county_fips(value, state))
    df['us_state_fips'] = state_fips
    df['us_county_fips'] = county_fips

    # ensure locale has no decimals
    df['locale'] = df['locale'].astype(pd.Int32Dtype())
    df['grade_low'] = df['grade_low'].astype(str)
    df['grade_high'] = df['grade_high'].astype(str)
    df['grade_low'] = df['grade_low'].str.replace('nan', '')
    df['grade_high'] = df['grade_high'].str.replace('nan', '')

    # 0 pad any ids that do not match the NCES length
    df['school_id_nces'] = df['school_id_nces'].apply(
        lambda x: '{0:0>12}'.format(x))
    df['district_id_nces'] = df['district_id_nces'].apply(
        lambda x: '{0:0>7}'.format(x))
    df['zip'] = df['zip'].apply(lambda x: '{0:0>5}'.format(x))
    df['zip_4_digit'] = df['zip_4_digit'].apply(lambda x: '{0:0>4}'.format(x))
    df['grade_low'] = df['grade_low'].apply(lambda x: '{0:0>2}'.format(x))
    df['grade_high'] = df['grade_high'].apply(lambda x: '{0:0>2}'.format(x))
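
    # For reference, the format specs above left-pad with zeros, e.g.:
    assert '{0:0>12}'.format('10300000') == '000010300000'
    assert '{0:0>5}'.format(9021) == '09021'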
Example No. 4
    def _remove_tech_rep_negatives(self):
        """
        The samples that we have processed for the negative controls contains technical
        replicates. We want to get rid of these for the zenodo uploads.
        To do this we will use Stephanes table to identify those readsets that share
        a TARA id (these will be tech reps) and we will only keep the largest sample
        (i.e highest number of contigs). We can get the contig information from the
        abund_meta file in the post_med directory.
        We can then process the post_med and pre_med count files here.
        """

        # For each row in the post_med_df, find the mapping key that is a substring.
        # There should be only one; check this.
        # Then, once you have found the one, check all samples in the post_med df to see if it matches any others.
        # If you get multiple matches, keep only the one with the biggest number of contigs
        # and add all others to a drop list. Keep a checked list so that we don't have to check readsets twice.
        # Also update a dictionary as you go that maps the full readset to the sample-id it needs to become.
        # Once this has been done for the post-med, do it for the pre-med.
        # For the pre-med, use the dictionary we created while doing the post-med.

        # Get the post_med df. Read it in with index_col=False and set the index manually without dropping;
        # this way we can work with the index, but we must not write the index out later so as not
        # to disturb the column order.
        post_med_count_path = os.path.join(
            self.negative_output_dir_path, 'post_med_seqs', [
                _ for _ in os.listdir(
                    os.path.join(self.negative_output_dir_path,
                                 'post_med_seqs')) if 'abund' in _
            ][0])
        post_med_df = pd.read_csv(post_med_count_path, index_col=False)
        post_med_df = post_med_df.set_index('sample-id', drop=False)

        # Same for the pre_med
        pre_med_count_path = os.path.join(
            self.negative_output_dir_path, 'pre_med_seqs', [
                _ for _ in os.listdir(
                    os.path.join(self.negative_output_dir_path,
                                 'pre_med_seqs')) if 'abund' in _
            ][0])
        pre_med_df = pd.read_csv(pre_med_count_path, index_col=False)
        pre_med_df = pre_med_df.set_index('sample-id', drop=False)

        # First check to see if the sample-ids have already been fixed
        if 'TARA' in pre_med_df.index[0] and 'TARA' in post_med_df.index[0]:
            return
        if 'TARA' in pre_med_df.index[0] and 'TARA' not in post_med_df.index[0]:
            raise RuntimeError
        if 'TARA' not in pre_med_df.index[0] and 'TARA' in post_med_df.index[0]:
            raise RuntimeError

        # The dictionary df that Stephane produced
        mapping_df = pd.read_csv(self.negative_mapping_file_path, index_col=0)
        # Make the mapping dictionary from the Stephane df
        raw_mapping_dict = {}
        for df_ind in mapping_df.index:
            raw_mapping_dict[df_ind] = mapping_df.at[df_ind,
                                                     'sample-id_source']

        # This is the dictionary we are going to populate that has the full Genoscope readset
        # as the key and the equivalent TARA sample-id as the value
        curated_mapping_dict = {}

        # Check that the assumption holds that both of the indices are identical except for order.
        # NB the post med df has an annoying row at the end.
        assert (set(post_med_df.index[:-1]) == set(pre_med_df.index))
        contig_dict = {
            readset: contig
            for readset, contig in zip(post_med_df['sample-id'][:-1],
                                       post_med_df['raw_contigs'][:-1])
        }

        to_drop_list = []
        checked_list = []
        for pm_ind in post_med_df.index[:-1]:
            if pm_ind in checked_list:
                continue
            match = []
            for map_ind in mapping_df.index:
                if map_ind in pm_ind:
                    match.append(map_ind)
            if len(match) == 0:
                print(
                    f'pm_ind: {pm_ind} found 0 matches. This sample will be dropped.'
                )
                to_drop_list.append(pm_ind)
                continue
            elif len(match) > 1:
                raise RuntimeError

            # Now we have the mapping index that matches
            match = match[0]
            pm_matches = []
            for pm_ind_again in post_med_df.index[:-1]:
                if match in pm_ind_again:
                    pm_matches.append(pm_ind_again)
            assert (len(pm_matches) > 0)
            if len(pm_matches) > 1:
                # Then we have technical replicates and we only want to keep the largest
                contig_match_dict = {
                    pm_match: contig_dict[pm_match]
                    for pm_match in pm_matches
                }
                sorted_keys = sorted(contig_match_dict,
                                     key=contig_match_dict.get,
                                     reverse=True)
                # Add all of the matches to the check_list
                checked_list.extend(sorted_keys)
                curated_mapping_dict[sorted_keys[0]] = raw_mapping_dict[match]
                to_drop_list.extend(sorted_keys[1:])
            else:
                checked_list.append(pm_matches[0])
                curated_mapping_dict[pm_matches[0]] = raw_mapping_dict[match]

        # drop the rows
        post_med_df.drop(index=to_drop_list, inplace=True)
        # We now need to get rid of any sequence count columns that only have 0s after dropping the samples
        # The last meta column is post_med_unique
        cols = list(post_med_df)
        c_ind = cols.index('post_med_unique') + 1
        cols_to_check = cols[c_ind:]
        cols_to_drop = []
        for col in cols_to_check:
            if (post_med_df[col][:-1] == 0).all():
                cols_to_drop.append(col)

        # drop the cols
        post_med_df.drop(columns=cols_to_drop, inplace=True)

        # rename
        for ind in post_med_df.index[:-1]:
            current = post_med_df.at[ind, 'sample-id']
            post_med_df.at[ind, 'sample-id'] = curated_mapping_dict[current]

        # Here we have the curated mapping dict populated and we can now use this to
        # process the pre_med df
        pre_med_df.drop(index=to_drop_list, inplace=True)
        # We now need to get rid of any sequence count columns that only have 0s after dropping the samples
        # For the pre_med table the last meta column is 'sample-id'
        cols = list(pre_med_df)
        c_ind = cols.index('sample-id') + 1
        cols_to_check = cols[c_ind:]
        cols_to_drop = []
        for col in cols_to_check:
            if (pre_med_df[col][:-1] == 0).all():
                cols_to_drop.append(col)

        # drop the cols
        pre_med_df.drop(columns=cols_to_drop, inplace=True)

        # rename
        for ind in pre_med_df.index:
            current = pre_med_df.at[ind, 'sample-id']
            pre_med_df.at[ind, 'sample-id'] = curated_mapping_dict[current]

        # Now convert the columns to int32
        d_type_dict = {
            col_name: pd.Int32Dtype()
            for col_name in list(post_med_df)[2:]
        }
        post_med_df = post_med_df.astype(d_type_dict)
        d_type_dict = {
            col_name: pd.Int32Dtype()
            for col_name in list(pre_med_df)[2:]
        }
        pre_med_df = pre_med_df.astype(d_type_dict)

        # Important to write out with index as false
        post_med_df.to_csv(post_med_count_path, index=False, header=True)
        pre_med_df.to_csv(pre_med_count_path, index=False, header=True)
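
# A standalone toy illustrating the tech-rep rule described in the docstring above
# (hypothetical readset names and contig counts, not project data): among readsets
# sharing a TARA id, keep only the one with the highest number of raw contigs.
contig_counts = {'readset_a1': 120, 'readset_a2': 340, 'readset_b1': 55}
tara_groups = {'TARA_A': ['readset_a1', 'readset_a2'], 'TARA_B': ['readset_b1']}
keep = {max(group, key=contig_counts.get) for group in tara_groups.values()}
drop = set(contig_counts) - keep  # {'readset_a1'}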
Example No. 5
# Imports assumed for this snippet (not shown in the excerpt)
import geopandas as gpd
import pandas as pd


def read_gdf_bho(file_gdf_bho):
    """
    Read geopackage file with BHO drainage (cotrecho)
        adjust dtypes and returns as geodataframe

    Args:
        file_gdf_bho(str)  :: pathfile to BHO drainage in .gpkg/shp

    Returns:
        gdf_tble_bho (gpd.GeoDataFrame) :: table for BHO drainage (polyline)

    Notes:
        - see required columns in variable 'cols'

    """
    gdf_tble_bho = gpd.GeoDataFrame.from_file(file_gdf_bho)

    # save crs for later
    crs = gdf_tble_bho.crs

    # required cols
    cols = [
        'cotrecho',
        'cobacia',
        'nucomptrec',
        'nuareacont',
        'nuareamont',
        'nutrjus',
        'dedominial',
        'nustrahler',
        'nuordemcda',
        'cocursodag',
        'cocdadesag',
        'nudistbact',
        'nunivotto',
        'geometry',
    ]

    gdf_tble_bho = gdf_tble_bho[cols]

    # apply dtypes
    bho_dtypes = {
        'fid': pd.Int64Dtype(),
        'drn_pk': int,
        'cotrecho': int,
        'noorigem': int,
        'nodestino': int,
        'cocursodag': str,
        'cobacia': str,
        'nucomptrec': float,
        'nudistbact': float,
        'nudistcdag': float,
        'nuareacont': float,
        'nuareamont': float,
        'nogenerico': str,
        'noligacao': str,
        'noespecif': str,
        'noriocomp': str,
        'nooriginal': str,
        'cocdadesag': str,
        'nutrjus': pd.Int32Dtype(),
        'nudistbacc': float,
        'nuareabacc': float,
        'nuordemcda': pd.Int32Dtype(),
        'nucompcda': float,
        'nunivotto': int,
        'nunivotcda': pd.Int32Dtype(),
        'nustrahler': pd.Int32Dtype(),
        'dedominial': str,
        'dsversao': str,
        'cobacia_50k': str,
        'lat': float,
        'lon': float,
    }
    hmap = {k: v for k, v in bho_dtypes.items() if k in gdf_tble_bho.columns}
    gdf_tble_bho = gdf_tble_bho.astype(hmap)

    # recover crs
    gdf_tble_bho.crs = crs

    # build the spatial index (accessing the .sindex property triggers construction)
    gdf_tble_bho.sindex

    return gdf_tble_bho
Example No. 6
# Imports assumed for this dask test snippet (not shown in the excerpt)
import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq


def test_sum_intna():
    a = pd.Series([1, None, 2], dtype=pd.Int32Dtype())
    b = dd.from_pandas(a, 2)
    assert_eq(a.sum(), b.sum())
Example No. 7
import functools
import io
import zipfile
from collections import namedtuple
from datetime import date, datetime, timedelta
from typing import Iterator, Optional, Tuple

import pandas as pd
import requests

from . import cache

NULLABLE_INT = pd.Int32Dtype()
DATE_FORMAT = "%Y-%m-%d"

# dictionary containing team abbreviations and their first year in existence
first_season_map = {'ALT': 1884, 'ANA': 1997, 'ARI': 1998, 'ATH': 1871,
                    'ATL': 1966, 'BAL': 1872, 'BLA': 1901, 'BLN': 1892,
                    'BLU': 1884, 'BOS': 1871, 'BRA': 1872, 'BRG': 1890,
                    'BRO': 1884, 'BSN': 1876, 'BTT': 1914, 'BUF': 1879,
                    'BWW': 1890, 'CAL': 1965, 'CEN': 1875, 'CHC': 1876,
                    'CHI': 1871, 'CHW': 1901, 'CIN': 1876, 'CKK': 1891,
                    'CLE': 1871, 'CLV': 1879, 'COL': 1883, 'COR': 1884,
                    'CPI': 1884, 'DET': 1901, 'DTN': 1881, 'ECK': 1872,
                    'FLA': 1993, 'HAR': 1874, 'HOU': 1962, 'IND': 1878,
                    'KCA': 1955, 'KCC': 1884, 'KCN': 1886, 'KCP': 1914,
                    'KCR': 1969, 'KEK': 1871, 'LAA': 1961, 'LAD': 1958,
                    'LOU': 1876, 'MAN': 1872, 'MAR': 1873, 'MIA': 2012,
                    'MIL': 1884, 'MIN': 1961, 'MLA': 1901, 'MLG': 1878,
                    'MLN': 1953, 'MON': 1969, 'NAT': 1872, 'NEW': 1915,
                    'NHV': 1875, 'NYG': 1883, 'NYI': 1890, 'NYM': 1962,
Example No. 8
                                               dtype=data.dtype)
    assert_eq(cudf.Series(actual_column), cudf.Series(expected))


@pytest.mark.parametrize(
    "pd_dtype,expect_dtype",
    [
        # TODO: Nullable float is coming
        (pd.StringDtype(), np.dtype("O")),
        (pd.UInt8Dtype(), np.dtype("uint8")),
        (pd.UInt16Dtype(), np.dtype("uint16")),
        (pd.UInt32Dtype(), np.dtype("uint32")),
        (pd.UInt64Dtype(), np.dtype("uint64")),
        (pd.Int8Dtype(), np.dtype("int8")),
        (pd.Int16Dtype(), np.dtype("int16")),
        (pd.Int32Dtype(), np.dtype("int32")),
        (pd.Int64Dtype(), np.dtype("int64")),
        (pd.BooleanDtype(), np.dtype("bool")),
    ],
)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.DataFrame.from_dict({"a": data}, dtype=pd_dtype)
    gd_data = cudf.DataFrame.from_pandas(pd_data)
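
    # The excerpt ends before the assertions; a hedged guess at the check that
    # follows, based on the expect_dtype parameter above (not the verbatim cudf test):
    assert gd_data["a"].dtype == expect_dtype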
Example No. 9
class DataMapping:
    """
    Map primary data between different supported data frameworks, preserving equivalent data types.

    DataMapping is for primary data; to map metadata types and values use
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.TypeMapping>` and
    :py:class:`MetadataCodec <tracdap.rt.impl.type_system.MetadataCodec>`.
    """

    __log = _util.logger_for_namespace(_DataInternal.__module__ +
                                       ".DataMapping")

    # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

    __TRAC_DECIMAL_PRECISION = 38
    __TRAC_DECIMAL_SCALE = 12
    __TRAC_TIMESTAMP_UNIT = "ms"
    __TRAC_TIMESTAMP_ZONE = None

    __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
        _meta.BasicType.BOOLEAN: pa.bool_(),
        _meta.BasicType.INTEGER: pa.int64(),
        _meta.BasicType.FLOAT: pa.float64(),
        _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
        _meta.BasicType.STRING: pa.utf8(),
        _meta.BasicType.DATE: pa.date32(),
        _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
    }

    # Check the Pandas dtypes for handling floats are available before setting up the type mapping
    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype

    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
    __ARROW_TO_PANDAS_TYPE_MAPPING = {
        pa.bool_(): pd.BooleanDtype(),
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.float16(): pd.Float32Dtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.utf8(): pd.StringDtype()
    }

    @staticmethod
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:

        if pa.types.is_boolean(arrow_type):
            return bool

        if pa.types.is_integer(arrow_type):
            return int

        if pa.types.is_floating(arrow_type):
            return float

        if pa.types.is_decimal(arrow_type):
            return decimal.Decimal

        if pa.types.is_string(arrow_type):
            return str

        if pa.types.is_date(arrow_type):
            return dt.date

        if pa.types.is_timestamp(arrow_type):
            return dt.datetime

        raise _ex.ETracInternal(
            f"No Python type mapping available for Arrow type [{arrow_type}]")

    @classmethod
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                                 cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                                cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for Python type [{python_type}]")

    @classmethod
    def trac_to_arrow_type(cls,
                           trac_type: _meta.TypeDescriptor) -> pa.DataType:

        return cls.trac_to_arrow_basic_type(trac_type.basicType)

    @classmethod
    def trac_to_arrow_basic_type(
            cls, trac_basic_type: _meta.BasicType) -> pa.DataType:

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(
            trac_basic_type)

        if arrow_type is None:
            raise _ex.ETracInternal(
                f"No Arrow type mapping available for TRAC type [{trac_basic_type}]"
            )

        return arrow_type

    @classmethod
    def trac_to_arrow_schema(cls,
                             trac_schema: _meta.SchemaDefinition) -> pa.Schema:

        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(
                f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow"
            )

        arrow_fields = [(f.fieldName,
                         cls.trac_to_arrow_basic_type(f.fieldType))
                        for f in trac_schema.table.fields]

        return pa.schema(arrow_fields, metadata={})

    @classmethod
    def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:

        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                             cls.__TRAC_DECIMAL_SCALE)

    @classmethod
    def pandas_datetime_type(cls):
        return cls.__PANDAS_DATETIME_TYPE

    @classmethod
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal(f"Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(
                f"Data view for part [{part.opaque_key}] does not contain any items"
            )

        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        batches = {
            batch
            for delta in deltas for batch in (
                delta.batches if delta.batches else delta.table.to_batches())
        }

        table = pa.Table.from_batches(batches)  # noqa
        return table.to_pandas()

    @classmethod
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:

        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            table = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(table)

        raise _ex.ETracInternal(f"Data item does not contain any usable data")

    @classmethod
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:

        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)

    @classmethod
    def pandas_to_view(cls, df: pd.DataFrame, prior_view: DataView,
                       part: DataPartKey):

        item = cls.pandas_to_item(df, prior_view.arrow_schema)
        return cls.add_item_to_view(prior_view, part, item)

    @classmethod
    def pandas_to_item(cls, df: pd.DataFrame,
                       schema: tp.Optional[pa.Schema]) -> DataItem:

        table = cls.pandas_to_arrow(df, schema)
        return DataItem(table.schema, table)

    @classmethod
    def pandas_to_arrow(cls,
                        df: pd.DataFrame,
                        schema: tp.Optional[pa.Schema] = None) -> pa.Table:

        # Here we convert the whole Pandas df and then pass it to conformance
        # An optimization would be to filter columns before applying conformance
        # To do this, we'd need the case-insensitive field matching logic, including output of warnings

        # Also, note that schema is not applied in from_pandas
        # This is because the conformance logic allows for a wider range of conversions
        # Applying the schema directly would fail for some types where casting is possible

        if len(df) == 0:
            df_schema = pa.Schema.from_pandas(df, preserve_index=False)  # noqa
            table = pa.Table.from_batches(list(), df_schema)  # noqa
        else:
            table = pa.Table.from_pandas(df, preserve_index=False)  # noqa

        # If there is no explict schema, give back the table exactly as it was received from Pandas
        # There could be an option here to coerce types to the appropriate TRAC standard types
        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type

        if schema is None:
            return table
        else:
            return DataConformance.conform_to_schema(table, schema, df.dtypes)

    @classmethod
    def add_item_to_view(cls, view: DataView, part: DataPartKey,
                         item: DataItem) -> DataView:

        prior_deltas = view.parts.get(part) or list()
        deltas = [*prior_deltas, item]
        parts = {**view.parts, part: deltas}

        return DataView(view.trac_schema, view.arrow_schema, parts)
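
# A minimal round-trip sketch of the types_mapper idea used in arrow_to_pandas above,
# written with plain pandas/pyarrow and independent of the TRAC classes (the one-entry
# mapping dict is an assumption for illustration):
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"x": pd.array([1, None, 3], dtype=pd.Int64Dtype())})
table = pa.Table.from_pandas(df, preserve_index=False)
df_back = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
print(df_back.dtypes)  # x    Int64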
Example No. 10
File: utils.py  Project: rongou/cudf
import pandas as pd  # assumed; the excerpt starts mid-file
import pyorc

import cudf
from cudf.testing._utils import assert_eq
from cudf.utils.dtypes import (
    pandas_dtypes_to_np_dtypes,
    pyarrow_dtypes_to_pandas_dtypes,
)

ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES"

_PANDAS_TO_AVRO_SCHEMA_MAP = {
    cudf.dtype("int8"): "int",
    pd.Int8Dtype(): ["int", "null"],
    pd.Int16Dtype(): ["int", "null"],
    pd.Int32Dtype(): ["int", "null"],
    pd.Int64Dtype(): ["long", "null"],
    pd.BooleanDtype(): ["boolean", "null"],
    pd.StringDtype(): ["string", "null"],
    cudf.dtype("bool_"): "boolean",
    cudf.dtype("int16"): "int",
    cudf.dtype("int32"): "int",
    cudf.dtype("int64"): "long",
    cudf.dtype("O"): "string",
    cudf.dtype("str"): "string",
    cudf.dtype("float32"): "float",
    cudf.dtype("float64"): "double",
    cudf.dtype("<M8[ns]"): {
        "type": "long",
        "logicalType": "timestamp-millis"
    },
Example No. 11
#%%
# Imports assumed for this notebook cell (not shown in the excerpt);
# zarr_path and base_dir are defined elsewhere in the notebook.
import zarr
import numpy as np
import pandas as pd
import dask.array as da
from os import path
zarr_file = zarr.open(zarr_path, "a")

image = da.from_zarr(zarr_file["image"])
mask = da.from_zarr(zarr_file["mask"])[:, np.newaxis, ...]
sizeT = mask.shape[0]
mask[mask == -1] = 0  # mask.max().compute() + 1

segment_columns = ["segment_id", "bbox_y0", "bbox_y1", "bbox_x0", "bbox_x1"]
df_segments2 = pd.read_csv(
    path.join(base_dir, "df_segments2_updated.csv"),
    index_col=["frame", "label"],
    #    dtype = pd.Int64Dtype()
)[segment_columns]
df_segments2 = df_segments2.astype(
    dict(zip(segment_columns, [pd.Int64Dtype()] + [pd.Int32Dtype()] * 4)))

division_columns = [
    "segment_id_parent", "frame_child1", "label_child1", "frame_child2",
    "label_child2"
]
df_divisions2 = pd.read_csv(
    path.join(base_dir, "df_divisions2.csv"),
    index_col=0,
    #    dtype = pd.Int64Dtype()
)[division_columns]
df_divisions2 = df_divisions2.astype(pd.Int64Dtype())

#segment_labels = df_segments2.xs(
#    0, level="frame", drop_level=False
#).index.get_level_values("label")
Example No. 12
     pd.Series(
         [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype()
     ),
 ),
 (
     cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
     pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
 ),
 (
     cudf.Series([111, None, 222, None, 13], dtype="int16"),
     pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
 ),
 (
     cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
     pd.Series(
         [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype()
     ),
 ),
 (
     cudf.Series(
         [32431, None, None, 32322, 0, 10, -32324, None], dtype="int64"
     ),
     pd.Series(
         [32431, None, None, 32322, 0, 10, -32324, None],
         dtype=pd.Int64Dtype(),
     ),
 ),
 (
     cudf.Series(
         [True, None, False, None, False, True, True, False],
         dtype="bool_",
Example No. 13
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}
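
# A small sketch of how a map like cudf_dtypes_to_pandas_dtypes can be applied to
# promote plain numpy dtypes to their nullable pandas counterparts (illustrative
# only; cudf performs this conversion internally):
import numpy as np
import pandas as pd

frame = pd.DataFrame({"a": np.array([1, 2, 3], dtype="int32")})
nullable_map = {np.dtype("int32"): pd.Int32Dtype()}
frame = frame.astype({c: nullable_map.get(t, t) for c, t in frame.dtypes.items()})
print(frame.dtypes)  # a    Int32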

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
Example No. 14
def expand_signed(df: pd.DataFrame, sign_dict: Dict[str, int],
                  stmt_types: List[str], use_descendants: bool = True) \
        -> pd.DataFrame:
    """Expands out which statements should be added to the signed graph

    The statement types provided in 'stmt_types' will be added for both
    signs. To add more statement types with just one sign, add them to 'sign_dict'.

    Parameters
    ----------
    df : pd.DataFrame
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount are added as
        negative edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : List[str]
        The statement types to match to expand signs to. The rows matching
        these types will be duplicated and each copy gets a distinct sign.
    use_descendants : bool
        If True, also match descendants of the statements provided in
        'stmt_types' when adding the extended signs.

    Returns
    -------
    pd.DataFrame
    """
    if use_descendants:
        logger.info('Getting descendants to match for expanded signed graph')
        # Get name of descendants
        more_stmt_types = set(stmt_types)
        for s in stmt_types:
            more_stmt_types.update({
                s.__name__
                for s in get_all_descendants(get_statement_by_name(s))
            })
        stmt_types = list(more_stmt_types)

    # Add new sign column, set to None. Using 'initial_sign' allows usage of
    # IndraNet.to_signed_graph
    df['initial_sign'] = None

    # Locate relevant rows
    standard_sign = df.stmt_type.isin(sign_dict.keys())
    expand_sign = df.stmt_type.isin(stmt_types)
    assert sum(standard_sign) + sum(expand_sign) > 0, \
        'All rows filtered out from DataFrame. Check that statement types ' \
        'in sign_dict and stmt_types exist in the DataFrame.'
    if sum(expand_sign) == 0:
        logger.warning('No rows can be used for expanded signed edges. Check '
                       'that statement types in stmt_types exist in the '
                       'DataFrame.')

    # Add sign for signed statements
    logger.info('Setting initial sign for signed types')
    df.loc[standard_sign, 'initial_sign'] = \
        df.loc[standard_sign, 'stmt_type'].apply(lambda st: sign_dict.get(st))

    # Add positive sign to the rows with types in stmt_types
    df.loc[expand_sign, 'initial_sign'] = INT_PLUS

    # Copy rows for expand sign and switch sign
    logger.info('Setting initial sign for expanded signed types')
    add_rows = []
    for _, expand_row in df[expand_sign].iterrows():
        exp_row = [
            INT_MINUS if col == 'initial_sign' else val
            for col, val in expand_row.items()
        ]
        add_rows.append(exp_row)

    logger.info('Appending extended signed rows')
    extra_df = pd.DataFrame(add_rows, columns=df.columns.values)
    df = pd.concat([df, extra_df])  # DataFrame.append was removed in pandas 2.0

    # Remove all rows without an assigned sign
    logger.info('Removing rows without an assigned sign')
    df = df[~df.initial_sign.isna()]

    # Re-cast sign column as int
    try:
        df.initial_sign = df.initial_sign.astype(pd.Int32Dtype())
    except Exception as exc:
        link = 'https://pandas.pydata.org/pandas-docs/stable/user_guide' \
          '/integer_na.html'
        logger.warning(f'Could not set sign column as Nullable Integer Data '
                       f'Type. Make sure to use pandas v0.24+. See {link}')

    return df
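
# A self-contained toy of the sign-expansion idea described above (hypothetical
# column values, 0/1 standing in for INT_PLUS/INT_MINUS; not the INDRA implementation):
import pandas as pd

toy = pd.DataFrame({'stmt_type': ['Activation', 'Inhibition', 'Complex']})
sign_dict = {'Activation': 0, 'Inhibition': 1}
expand_types = ['Complex']  # rows duplicated with both signs
toy['initial_sign'] = toy.stmt_type.map(sign_dict)
dup = toy[toy.stmt_type.isin(expand_types)].assign(initial_sign=1)
toy.loc[toy.stmt_type.isin(expand_types), 'initial_sign'] = 0
toy = pd.concat([toy, dup]).dropna(subset=['initial_sign'])
toy['initial_sign'] = toy['initial_sign'].astype(pd.Int32Dtype())
print(toy)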
Example No. 15
__all__ = ('BatchRowsAsDataFrame', 'generate_proxy', 'UnbatchPandas',
           'element_type_from_dataframe')

T = TypeVar('T', bound=NamedTuple)

# Generate type map (presented visually in the docstring)
_BIDIRECTIONAL = [
    (bool, bool),
    (np.int8, np.int8),
    (np.int16, np.int16),
    (np.int32, np.int32),
    (np.int64, np.int64),
    (pd.Int8Dtype(), Optional[np.int8]),
    (pd.Int16Dtype(), Optional[np.int16]),
    (pd.Int32Dtype(), Optional[np.int32]),
    (pd.Int64Dtype(), Optional[np.int64]),
    (np.float32, Optional[np.float32]),
    (np.float64, Optional[np.float64]),
    (object, Any),
    (pd.StringDtype(), Optional[str]),
    (pd.BooleanDtype(), Optional[bool]),
]

PANDAS_TO_BEAM = {
    pd.Series([], dtype=dtype).dtype: fieldtype
    for dtype, fieldtype in _BIDIRECTIONAL
}
BEAM_TO_PANDAS = {fieldtype: dtype for dtype, fieldtype in _BIDIRECTIONAL}

# Shunt non-nullable Beam types to the same pandas types as their non-nullable
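
# A hedged note on the pd.Series([], dtype=dtype).dtype trick above: it appears to
# normalize whatever was listed (a builtin, a numpy scalar type, or an extension
# dtype instance) to the canonical dtype object pandas reports, e.g.:
import numpy as np
import pandas as pd

assert pd.Series([], dtype=np.int32).dtype == np.dtype("int32")
assert pd.Series([], dtype=pd.Int32Dtype()).dtype == pd.Int32Dtype()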
Example No. 16
import requests
import re
import sqlite3
import datetime
import pandas as pd
import numpy as np

# get NYT data
cases = pd.read_csv(
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
)

cases = cases.assign(date=pd.to_datetime(cases['date']),
                     fips=cases['fips'].astype(pd.Int32Dtype()))

# calculate new cases per day
cases['cases_shifted'] = (cases.groupby(
    ['county', 'state']).cases.shift(1).fillna(0).astype(int))

cases['cases_new'] = cases['cases'] - cases['cases_shifted']

# 2017 county census estimate from US Census Bureau.  Written locally to SQLite database
# to avoid having to re-download every time I run the notebook
# see census_etl.py for details

conn = sqlite3.connect('US_county_census.db')

# start with 2017 census and total population
# might as well resort to SQL yelling bc names are all in caps
pop17 = pd.read_sql(
    'SELECT STNAME, CTYNAME, COUNTY_KEY, TOT_POP FROM CENSUS WHERE YEAR = 10 AND AGEGRP = 0;',
Example No. 17
    df = pd.concat([train.assign(is_train=1), test.assign(is_train=0)])

    df['create_order_time'] = pd.to_datetime(df['create_order_time'])
    df['date'] = df['create_order_time'].dt.date
    df['day'] = df['create_order_time'].dt.day
    df['hour'] = df['create_order_time'].dt.hour

    df = pd.merge(df, item, how='left', on='item_id')

    memory = df.memory_usage().sum() / 1024**2
    print('Before memory usage of properties dataframe is :', memory, " MB")

    dtype_dict = {
        'buyer_admin_id': 'int32',
        'item_id': 'int32',
        'store_id': pd.Int32Dtype(),
        'irank': 'int16',
        'item_price': pd.Int16Dtype(),
        'cate_id': pd.Int16Dtype(),
        'is_train': 'int8',
        'day': 'int8',
        'hour': 'int8',
    }

    df = df.astype(dtype_dict)
    memory = df.memory_usage().sum() / 1024**2
    print('After memory usage of properties dataframe is :', memory, " MB")
    del train, test
    gc.collect()

    # Before memory usage of properties dataframe is : 1292.8728713989258  MB
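
    # A small self-contained illustration of why the narrower dtypes above save
    # memory (toy data, independent of the frames in this snippet):
    import numpy as np
    toy64 = pd.Series(np.random.randint(0, 100, size=1_000_000), dtype='int64')
    toy8 = toy64.astype('int8')  # values 0-99 fit comfortably in int8
    print(toy64.memory_usage(deep=True), toy8.memory_usage(deep=True))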
Example No. 18
        tobson = bson.dumps
    except:

        def unbson(x):
            raise ImportError("BSON not found")

        def tobson(x):
            raise ImportError("BSON not found")


DAYS_TO_MILLIS = 86400000000000
"""Number of millis in a day. Used to convert a Date to a date"""
nat = np.datetime64('NaT').view('int64')

simple = {
    parquet_thrift.Type.INT32: pd.Int32Dtype(),
    parquet_thrift.Type.INT64: pd.Int64Dtype(),
    parquet_thrift.Type.FLOAT: np.dtype('float32'),
    parquet_thrift.Type.DOUBLE: np.dtype('float64'),
    parquet_thrift.Type.BOOLEAN: pd.BooleanDtype(),
    parquet_thrift.Type.INT96: np.dtype('S12'),
    parquet_thrift.Type.BYTE_ARRAY: np.dtype("O"),
    parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY: np.dtype("O")
}
complex = {
    parquet_thrift.ConvertedType.UTF8: np.dtype("O"),
    parquet_thrift.ConvertedType.DECIMAL: np.dtype('float64'),
    parquet_thrift.ConvertedType.UINT_8: pd.UInt8Dtype(),
    parquet_thrift.ConvertedType.UINT_16: pd.UInt16Dtype(),
    parquet_thrift.ConvertedType.UINT_32: pd.UInt32Dtype(),
    parquet_thrift.ConvertedType.UINT_64: pd.UInt64Dtype(),
Example No. 19
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
    "datetime64[s]",
    "datetime64[ms]",
    "datetime64[us]",
Example No. 20
def general_data_from_search(team,
                             year,
                             playertype="pitcher",
                             date1="",
                             date2="",
                             addid=False):
    """Gets data from Baseball Savant's search function.  This function gets every pitch event in the given time frame. The playertype
    argument decides whether you get batter or pitcher data.

    If creating an exhaustive database, note that you can get overlapping data because one team's pitcher data will return the same events
    as another team's batter data, with the only change being the "player_name" field.  See 'addid' if interested in adding a unique event id
    for simple duplicate detection.

    Args:
        team (string): Team abbreviation in form 'XXX'
        year (int): Year number between 2012-2021 in form YYYY
        playertype (str, optional): Either 'pitcher' or 'batter'. Defaults to "pitcher".
        date1 (str, optional): The bottom date range to search for. Defaults to empty string. yyyy-mm-dd
        date2 (str, optional): The top date range to search for. Defaults to empty string. yyyy-mm-dd
        addid (bool, optional): Adds a custom ID.  Defaults False.
        clean (bool, optional): Clean data for DB. Defaults True.

    Returns:
        pandas dataframe: dataframe of every event from the search parameters
    """

    url = (
        "https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB="
        f"&hfGT=R%7CPO%7C&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea={year}%7C&hfSit="
        f"&player_type={playertype}&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={date1}"
        f"&game_date_lt={date2}&hfInfield=&team={team}&position=&hfOutfield=&hfRO=&home_road=&hfFlag=&hfBBT="
        "&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=api_p_release_speed"
        "&sort_order=desc&min_pas=0&type=details&")

    # Read in the csv and specify certain columns to have nullable int types
    scrapedData = pandas.read_csv(url,
                                  dtype={
                                      'zone': pandas.Int16Dtype(),
                                      'hit_location': pandas.Int16Dtype(),
                                      'on_1b': pandas.Int32Dtype(),
                                      'on_2b': pandas.Int32Dtype(),
                                      'on_3b': pandas.Int32Dtype(),
                                      'hit_distance_sc': pandas.Int16Dtype(),
                                      'launch_angle': pandas.Int16Dtype(),
                                      'release_spin_rate': pandas.Int32Dtype(),
                                      'launch_speed_angle':
                                      pandas.Int16Dtype(),
                                      'spin_axis': pandas.Int32Dtype()
                                  })

    # Data comes with columns that will always be empty **EVEN IN YEARS WHERE THIS DATA WAS NOT DEPRECATED, ITS ALL DELETED**
    # Dropping player name, pitcher.1 and fielder_2.1 because they are duplicates of other cols
    scrapedData.drop([
        'umpire', 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated',
        'break_length_deprecated', 'tfs_deprecated', 'tfs_zulu_deprecated',
        'player_name', 'pitcher.1', 'fielder_2.1'
    ],
                     axis=1,
                     inplace=True)

    scrapedData.rename(columns={"type": "type_"}, inplace=True)

    if (addid): addPitchIds(scrapedData)

    return scrapedData
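
# Hedged usage sketch (runs a live Baseball Savant query if executed; the team
# abbreviation and date range below are examples, not values from the original project):
if __name__ == '__main__':
    june_events = general_data_from_search('BOS', 2021, playertype='pitcher',
                                           date1='2021-06-01', date2='2021-06-30')
    print(june_events.shape)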
Example No. 21
    # Loop over the files within the folder
    for filename in os.listdir('./data/nces/raw'):
        if filename.endswith('.csv'):
            data = pd.read_csv(f"./data/nces/raw/{filename}")
            if df is None:
                print("Initializing: "+filename)
                df = data
            else:
                print("Concat: "+filename)
                df = pd.concat([df, data], axis=0)

    df = df.sort_values(by=['State School ID'])


    # ensure the 4 digit zip does not have decimals
    df['ZIP'] = df['ZIP'].astype(pd.Int32Dtype())
    df['ZIP'] = df['ZIP'].astype(str)
    df['ZIP 4-digit'] = df['ZIP 4-digit'].astype(pd.Int32Dtype())
    df['ZIP 4-digit'] = df['ZIP 4-digit'].astype(str)
    df['ZIP 4-digit'] = df['ZIP 4-digit'].str.replace('<NA>','')

    # 0 pad any ids that do not match the NCES length
    df['NCES School ID'] = df['NCES School ID'].apply(lambda x: '{0:0>12}'.format(x))
    df['NCES District ID'] = df['NCES District ID'].apply(lambda x: '{0:0>7}'.format(x))
    df['ZIP'] = df['ZIP'].apply(lambda x: '{0:0>5}'.format(x))
    df['ZIP 4-digit'] = df['ZIP 4-digit'].apply(lambda x: '{0:0>4}'.format(x))

    df.to_csv(f"./data/nces/clean/combined_raw.csv", header=True, index=False)

    # remove the special characters from the data
    # Read in the file
Example No. 22
     cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
     pd.Series([234, 2323, 23432, None, None, 224],
               dtype=pd.UInt64Dtype()),
 ),
 (
     cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
     pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
 ),
 (
     cudf.Series([111, None, 222, None, 13], dtype="int16"),
     pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
 ),
 (
     cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
     pd.Series([11, None, 22, 33, None, 2, None, 3],
               dtype=pd.Int32Dtype()),
 ),
 (
     cudf.Series([32431, None, None, 32322, 0, 10, -32324, None],
                 dtype="int64"),
     pd.Series(
         [32431, None, None, 32322, 0, 10, -32324, None],
         dtype=pd.Int64Dtype(),
     ),
 ),
 (
     cudf.Series(
         [True, None, False, None, False, True, True, False],
         dtype="bool_",
     ),
     pd.Series(
Example No. 23
    'cotrecho': int,
    'noorigem': int,
    'nodestino': int,
    'cocursodag': str,
    'cobacia': str,
    'nucomptrec': float,
    'nudistbact': float,
    'nudistcdag': float,
    'nuareacont': float,
    'nuareamont': float,
    'nogenerico': str,
    'noligacao': str,
    'noespecif': str,
    'noriocomp': str,
    'nooriginal': str,
    'cocdadesag': str,
    'nutrjus': pd.Int32Dtype(),
    'nudistbacc': float,
    'nuareabacc': float,
    'nuordemcda': pd.Int32Dtype(),
    'nucompcda': float,
    'nunivotto': int,
    'nunivotcda': pd.Int32Dtype(),
    'nustrahler': pd.Int32Dtype(),
    'dedominial': str,
    'dsversao': str,
    'cobacia_50k': str,
    'lat': float,
    'lon': float,
}
Example No. 24
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    cudf.dtype("uint8"): pd.UInt8Dtype(),
    cudf.dtype("uint16"): pd.UInt16Dtype(),
    cudf.dtype("uint32"): pd.UInt32Dtype(),
    cudf.dtype("uint64"): pd.UInt64Dtype(),
    cudf.dtype("int8"): pd.Int8Dtype(),
    cudf.dtype("int16"): pd.Int16Dtype(),
    cudf.dtype("int32"): pd.Int32Dtype(),
    cudf.dtype("int64"): pd.Int64Dtype(),
    cudf.dtype("bool_"): pd.BooleanDtype(),
    cudf.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
Example No. 25
# Import assumed for this snippet (not shown in the excerpt)
import pandas as pd


def read_tble_bho(file_tble_bho):
    """
    Read BHO table in MSExcel format
        adjust dtypes and returns as dataframe

    Args:
        file_tble_bho(str)  :: pathfile (in MSExcel format)


    Returns:
        df_tble_bho (pd.DataFrame) :: BHO drainage table

    Notes:
        - see required columns in variable 'cols'

    """
    df_tble_bho = pd.read_excel(file_tble_bho)

    # required cols
    cols = [
        'cotrecho',
        'cobacia',
        'nucomptrec',
        'nuareacont',
        'nuareamont',
        'nutrjus',
        'dedominial',
        'nustrahler',
        'nuordemcda',
        'cocursodag',
        'cocdadesag',
        'nudistbact',
        'nunivotto',
    ]

    df_tble_bho = df_tble_bho[cols]

    # apply dtypes
    bho_dtypes = {
        'fid': pd.Int64Dtype(),
        'drn_pk': int,
        'cotrecho': int,
        'noorigem': int,
        'nodestino': int,
        'cocursodag': str,
        'cobacia': str,
        'nucomptrec': float,
        'nudistbact': float,
        'nudistcdag': float,
        'nuareacont': float,
        'nuareamont': float,
        'nogenerico': str,
        'noligacao': str,
        'noespecif': str,
        'noriocomp': str,
        'nooriginal': str,
        'cocdadesag': str,
        'nutrjus': pd.Int32Dtype(),
        'nudistbacc': float,
        'nuareabacc': float,
        'nuordemcda': pd.Int32Dtype(),
        'nucompcda': float,
        'nunivotto': int,
        'nunivotcda': pd.Int32Dtype(),
        'nustrahler': pd.Int32Dtype(),
        'dedominial': str,
        'dsversao': str,
        'cobacia_50k': str,
        'lat': float,
        'lon': float,
    }
    hmap = {k: v for k, v in bho_dtypes.items() if k in df_tble_bho.columns}
    df_tble_bho = df_tble_bho.astype(hmap)

    return df_tble_bho
Example No. 26
import pickle
with open('nation.pkl', 'rb') as pkldfile:
    ndf=pickle.load(pkldfile)

pivot4years={}

for cat in ndf.TRANSACTION_DATE.cat.categories:
    temp=ndf[ndf.TRANSACTION_DATE == cat ]
    pivot4years[str(cat)]=temp.pivot_table(values='DOSAGE_UNIT',aggfunc=np.sum,index=temp.BUYER_COUNTY,columns=temp.TRANSACTION_DATE ,fill_value=0)


import pickle
pickle.dump(pivot4years, open( "pivot4years.pkl", "wb" ) )

deaths_dtypes = {'Year': int, 'County': str, 'Notes': str, 'County Code': int, 'Year Code': int,
                 'Deaths': pd.Int32Dtype(), 'Population': int, 'Crude Rate': float}
deaths = pd.read_csv('MCDNewEngland20062012.tsv', sep='\t', index_col=['County', 'Year'],
                     dtype=deaths_dtypes, na_values=['Suppressed', 'Unreliable'])

with open('pivot4years.pkl', 'rb') as pkldfile:
    pivotByYear=pickle.load(pkldfile)

import geopandas as gpd
smallcounties = gpd.read_file('cb_2018_us_county_500k/cb_2018_us_county_500k.shp')

state_dict={ '25':'Massachusetts', '09':'Connecticut' , '23':'Maine', '33':'New Hampshire', '50':'Vermont' ,'44':'Rhode Island'  }
necounties=smallcounties[smallcounties['STATEFP'].isin(state_dict.keys())].copy()

geoOPyear=dict()
for i in pivotByYear.keys():
    geoOPyear[i]=necounties.merge(pivotByYear[i],left_on='NAME',right_on='BUYER_COUNTY')
    geoOPyear[i] = geoOPyear[i].drop(columns=['ALAND', 'AWATER'])  # drop() returns a copy, so reassign
Example No. 27
class TestPandasConversions(object):
    def testActivate(self):
        #FIXME: is the following still making sense ?
        assert rpyp.py2rpy != robjects.conversion.py2rpy
        l = len(robjects.conversion.py2rpy.registry)
        k = set(robjects.conversion.py2rpy.registry.keys())
        rpyp.activate()
        assert len(conversion.py2rpy.registry) > l
        rpyp.deactivate()
        assert len(conversion.py2rpy.registry) == l
        assert set(conversion.py2rpy.registry.keys()) == k

    def testActivateTwice(self):
        #FIXME: is the following still making sense ?
        assert rpyp.py2rpy != robjects.conversion.py2rpy
        l = len(robjects.conversion.py2rpy.registry)
        k = set(robjects.conversion.py2rpy.registry.keys())
        rpyp.activate()
        rpyp.deactivate()
        rpyp.activate()
        assert len(conversion.py2rpy.registry) > l
        rpyp.deactivate()
        assert len(conversion.py2rpy.registry) == l
        assert set(conversion.py2rpy.registry.keys()) == k

    def test_dataframe(self):
        # Content for test data frame
        l = (
            ('b', numpy.array([True, False, True], dtype=numpy.bool_)),
            ('i', numpy.array([1, 2, 3], dtype='i')),
            ('f', numpy.array([1, 2, 3], dtype='f')),
            # ('s', numpy.array([b'b', b'c', b'd'], dtype='S1')),
            ('u', numpy.array([u'a', u'b', u'c'], dtype='U')),
            ('dates', [
                datetime(2012, 5, 2),
                datetime(2012, 6, 3),
                datetime(2012, 7, 1)
            ]))
        od = OrderedDict(l)
        # Pandas data frame
        pd_df = pandas.core.frame.DataFrame(od)
        # Convert to R
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_df = robjects.conversion.py2rpy(pd_df)
        assert pd_df.shape[0] == rp_df.nrow
        assert pd_df.shape[1] == rp_df.ncol
        # assert tuple(rp_df.rx2('s')) == (b'b', b'c', b'd')
        assert tuple(rp_df.rx2('u')) == ('a', 'b', 'c')

    def test_dataframe_columnnames(self):
        pd_df = pandas.DataFrame({'the one': [1, 2], 'the other': [3, 4]})
        # Convert to R
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_df = robjects.conversion.py2rpy(pd_df)
        assert tuple(rp_df.names) == ('the one', 'the other')

    def test_series(self):
        Series = pandas.core.series.Series
        s = Series(numpy.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.FloatSexpVector)

    @pytest.mark.parametrize('dtype',
                             ('i', numpy.int32 if has_pandas else None,
                              numpy.int8 if has_pandas else None,
                              numpy.int16 if has_pandas else None,
                              numpy.int32 if has_pandas else None,
                              numpy.int64 if has_pandas else None,
                              numpy.uint8 if has_pandas else None,
                              numpy.uint16 if has_pandas else None,
                              pandas.Int32Dtype if has_pandas else None,
                              pandas.Int64Dtype if has_pandas else None))
    def test_series_int(self, dtype):
        Series = pandas.core.series.Series
        s = Series(range(5), index=['a', 'b', 'c', 'd', 'e'], dtype=dtype)
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.IntSexpVector)

    @pytest.mark.parametrize('dtype',
                             (pandas.Int32Dtype() if has_pandas else None,
                              pandas.Int64Dtype() if has_pandas else None))
    def test_dataframe_int_nan(self, dtype):
        a = pandas.DataFrame([(numpy.NaN, )], dtype=dtype, columns=['z'])
        with localconverter(default_converter + rpyp.converter) as cv:
            b = robjects.conversion.py2rpy(a)
        assert b[0][0] is rinterface.na_values.NA_Integer
        with localconverter(default_converter + rpyp.converter) as cv:
            c = robjects.conversion.rpy2py(b)

    @pytest.mark.parametrize('dtype',
                             (pandas.Int32Dtype() if has_pandas else None,
                              pandas.Int64Dtype() if has_pandas else None))
    def test_series_int_nan(self, dtype):
        a = pandas.Series((numpy.NaN, ), dtype=dtype, index=['z'])
        with localconverter(default_converter + rpyp.converter) as _:
            b = robjects.conversion.py2rpy(a)
        assert b[0] is rinterface.na_values.NA_Integer
        with localconverter(default_converter + rpyp.converter) as _:
            c = robjects.conversion.rpy2py(b)

    @pytest.mark.skipif(not (has_numpy and has_pandas),
                        reason='Packages numpy and pandas must be installed.')
    @pytest.mark.parametrize('data',
                             (['x', 'y', 'z'], ['x', 'y', None],
                              ['x', 'y', numpy.nan], ['x', 'y', pandas.NA]))
    @pytest.mark.parametrize(
        'dtype', ['O', pandas.StringDtype() if has_pandas else None])
    def test_series_obj_str(self, data, dtype):
        Series = pandas.core.series.Series
        s = Series(data, index=['a', 'b', 'c'], dtype=dtype)
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.StrSexpVector)

    def test_series_obj_mixed(self):
        Series = pandas.core.series.Series
        s = Series(['x', 1, False], index=['a', 'b', 'c'])
        with localconverter(default_converter + rpyp.converter) as cv:
            with pytest.raises(ValueError):
                rp_s = robjects.conversion.py2rpy(s)

        s = Series(['x', 1, None], index=['a', 'b', 'c'])
        with localconverter(default_converter + rpyp.converter) as cv:
            with pytest.raises(ValueError):
                rp_s = robjects.conversion.py2rpy(s)

    def test_series_obj_bool(self):
        Series = pandas.core.series.Series
        s = Series([True, False, True], index=['a', 'b', 'c'], dtype='bool')
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.BoolSexpVector)

        s = Series([True, False, None], index=['a', 'b', 'c'], dtype='bool')
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.BoolSexpVector)

    def test_series_obj_allnone(self):
        Series = pandas.core.series.Series
        s = Series([None, None, None], index=['a', 'b', 'c'])
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.BoolSexpVector)

    def test_series_issue264(self):
        Series = pandas.core.series.Series
        s = Series(('a', 'b', 'c', 'd', 'e'),
                   index=pandas.Int64Index([0, 1, 2, 3, 4]))
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_s = robjects.conversion.py2rpy(s)
        # segfault before the fix
        str(rp_s)
        assert isinstance(rp_s, rinterface.StrSexpVector)

    def test_object2String(self):
        series = pandas.Series(["a", "b", "c", "a"], dtype="O")
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.py2rpy(series)
            assert isinstance(rp_c, rinterface.StrSexpVector)

    def test_object2String_with_None(self):
        series = pandas.Series([None, "a", "b", "c", "a"], dtype="O")
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.py2rpy(series)
            assert isinstance(rp_c, rinterface.StrSexpVector)

    def test_factor2Category(self):
        factor = robjects.vectors.FactorVector(('a', 'b', 'a'))
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.rpy2py(factor)
        assert isinstance(rp_c, pandas.Categorical)

    def test_factorwithNA2Category(self):
        factor = robjects.vectors.FactorVector(('a', 'b', 'a', None))
        assert factor[3] is rinterface.na_values.NA_Integer
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.rpy2py(factor)
        assert isinstance(rp_c, pandas.Categorical)
        assert math.isnan(rp_c[3])

    def test_orderedFactor2Category(self):
        factor = robjects.vectors.FactorVector(('a', 'b', 'a'), ordered=True)
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.rpy2py(factor)
        assert isinstance(rp_c, pandas.Categorical)

    def test_category2Factor(self):
        category = pandas.Series(["a", "b", "c", "a"], dtype="category")
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.py2rpy(category)
            assert isinstance(rp_c, robjects.vectors.FactorVector)

    def test_categorywithNA2Factor(self):
        category = pandas.Series(['a', 'b', 'c', numpy.nan], dtype='category')
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.py2rpy(category)
            assert isinstance(rp_c, robjects.vectors.FactorVector)
        assert rp_c[3] == rinterface.NA_Integer

    def test_orderedCategory2Factor(self):
        category = pandas.Series(
            pandas.Categorical(['a', 'b', 'c', 'a'],
                               categories=['a', 'b', 'c'],
                               ordered=True))
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.py2rpy(category)
            assert isinstance(rp_c, robjects.vectors.FactorVector)

    def test_datetime2posixct(self):
        datetime = pandas.Series(
            pandas.date_range('2017-01-01 00:00:00.234',
                              periods=20,
                              freq='ms',
                              tz='UTC'))
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.py2rpy(datetime)
            assert isinstance(rp_c, robjects.vectors.POSIXct)
            assert int(rp_c[0]) == 1483228800
            assert int(rp_c[1]) == 1483228800
            assert rp_c[0] != rp_c[1]

    def test_datetime2posixct_withNA(self):
        datetime = pandas.Series(
            pandas.date_range('2017-01-01 00:00:00.234',
                              periods=20,
                              freq='ms',
                              tz='UTC'))
        datetime[1] = pandas.NaT
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.py2rpy(datetime)
            assert isinstance(rp_c, robjects.vectors.POSIXct)
            assert int(rp_c[0]) == 1483228800
            assert math.isnan(rp_c[1])
            assert rp_c[0] != rp_c[1]

    def test_date2posixct(self):
        today = datetime.now().date()
        date = pandas.Series([today])
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_c = robjects.conversion.py2rpy(date)
            assert isinstance(rp_c, robjects.vectors.FloatSexpVector)
            assert tuple(int(x) for x in rp_c) == (today.toordinal(), )

    def test_timeR2Pandas(self):
        tzone = robjects.vectors.get_timezone()
        dt = [datetime(1960, 5, 2), datetime(1970, 6, 3), datetime(2012, 7, 1)]
        dt = [x.replace(tzinfo=tzone) for x in dt]
        # fix the time
        ts = [x.timestamp() for x in dt]
        # Create an R POSIXct vector.
        r_time = robjects.baseenv['as.POSIXct'](
            rinterface.FloatSexpVector(ts),
            origin=rinterface.StrSexpVector(('1970-01-01', )))

        # Convert R POSIXct vector to pandas-compatible vector
        with localconverter(default_converter + rpyp.converter) as cv:
            py_time = robjects.conversion.rpy2py(r_time)

        # Check that the round trip did not introduce changes
        for expected, obtained in zip(dt, py_time):
            assert expected == obtained.to_pydatetime()

        # Try with NA.
        r_time[1] = rinterface.na_values.NA_Real
        # Convert R POSIXct vector to pandas-compatible vector
        with localconverter(default_converter + rpyp.converter) as cv:
            py_time = robjects.conversion.rpy2py(r_time)

        assert py_time[1] is pandas.NaT

    def test_posixct_in_dataframe_to_pandas(self):
        tzone = robjects.vectors.get_timezone()
        dt = [datetime(1960, 5, 2), datetime(1970, 6, 3), datetime(2012, 7, 1)]
        dt = [x.replace(tzinfo=tzone) for x in dt]
        # fix the time
        ts = [x.timestamp() for x in dt]
        # Create an R data.frame with a posixct_vector.
        r_dataf = robjects.vectors.DataFrame({
            'mydate':
            robjects.baseenv['as.POSIXct'](rinterface.FloatSexpVector(ts),
                                           origin=rinterface.StrSexpVector(
                                               ('1970-01-01', )))
        })

        # Convert R POSIXct vector to pandas-compatible vector
        with localconverter(default_converter + rpyp.converter):
            py_dataf = robjects.conversion.rpy2py(r_dataf)
        assert pandas.core.dtypes.common.is_datetime64_any_dtype(
            py_dataf['mydate'])

    def test_repr(self):
        # this should go to testVector, with other tests for repr()
        l = (('b', numpy.array([True, False, True], dtype=numpy.bool_)),
             ('i', numpy.array([1, 2, 3], dtype="i")),
             ('f', numpy.array([1, 2, 3], dtype="f")),
             ('s', numpy.array(["a", "b", "c"], dtype="S")),
             ('u', numpy.array([u"a", u"b", u"c"], dtype="U")))
        od = OrderedDict(l)
        pd_df = pandas.core.frame.DataFrame(od)
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_df = robjects.conversion.py2rpy(pd_df)
        s = repr(rp_df)  # used to fail with a TypeError.
        s = s.split('\n')
        repr_str = ('[BoolSex..., IntSexp..., FloatSe..., '
                    'ByteSex..., StrSexp...]')
        assert repr_str == s[2].strip()

        # Try again with the conversion still active.
        with localconverter(default_converter + rpyp.converter) as cv:
            rp_df = robjects.conversion.py2rpy(pd_df)
            s = repr(rp_df)  # used to fail with a TypeError.
        s = s.split('\n')
        assert repr_str == s[2].strip()

    def test_ri2pandas(self):
        rdataf = robjects.r('data.frame(a=1:2, '
                            '           b=I(c("a", "b")), '
                            '           c=c("a", "b"))')
        with localconverter(default_converter + rpyp.converter) as cv:
            pandas_df = robjects.conversion.rpy2py(rdataf)

        assert isinstance(pandas_df, pandas.DataFrame)
        assert ('a', 'b', 'c') == tuple(pandas_df.keys())
        assert pandas_df['a'].dtype in (numpy.dtype('int32'),
                                        numpy.dtype('int64'))
        assert pandas_df['b'].dtype == numpy.dtype('O')
        assert isinstance(pandas_df['c'].dtype,
                          pandas.api.types.CategoricalDtype)

    def test_ri2pandas_rownames(self):
        rdataf = robjects.r('data.frame(a=1:2, '
                            '           row.names=c("a", "b"))')
        with localconverter(default_converter + rpyp.converter) as cv:
            pandas_df = cv.rpy2py(rdataf)
        assert all(x == y for x, y in zip(rdataf.rownames, pandas_df.index))

    def test_ri2pandas_issue207(self):
        d = robjects.DataFrame({'x': 1})
        with localconverter(default_converter + rpyp.converter) as cv:
            try:
                ok = True
                robjects.globalenv['d'] = d
            except ValueError:
                ok = False
            finally:
                if 'd' in robjects.globalenv:
                    del (robjects.globalenv['d'])
        assert ok
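
A minimal standalone sketch of the py2rpy conversion these tests exercise (assuming rpy2 with numpy/pandas support is installed; the import aliases mirror the ones used in the test module):

import pandas
import rpy2.rinterface as rinterface
import rpy2.robjects as robjects
from rpy2.robjects import default_converter
from rpy2.robjects import pandas2ri as rpyp
from rpy2.robjects.conversion import localconverter

# A nullable-integer Series: the missing value should become R's NA_integer_.
s = pandas.Series([1, None, 3], dtype=pandas.Int32Dtype(), index=['a', 'b', 'c'])
with localconverter(default_converter + rpyp.converter):
    r_vec = robjects.conversion.py2rpy(s)
assert r_vec[1] is rinterface.na_values.NA_Integer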
Exemplo n.º 28
0
import datetime

import numpy as np
import pandas as pd

from .librdata import Writer
from .custom_errors import PyreadrError

# configuration

int_types = {
    np.dtype('int32'), np.dtype('int16'), np.dtype('int8'),
    np.dtype('uint8'), np.dtype('uint16'),
    np.int32, np.int16, np.int8, np.uint8, np.uint16
}
int_mixed_types = {
    pd.Int8Dtype(),
    pd.Int16Dtype(),
    pd.Int32Dtype(),
    pd.UInt8Dtype(),
    pd.UInt16Dtype()
}
float_types = {
    np.dtype('int64'), np.dtype('uint64'), np.dtype('uint32'),
    np.dtype('float'),
    np.int64, np.uint64, np.uint32,
    float,  # np.float (an alias of the builtin) was removed in NumPy 1.24
    pd.Int64Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()
}
datetime_types = {datetime.datetime, np.datetime64}

pyreadr_to_librdata_types = {
Exemplo n.º 29
0
def _get_raw_column(column, dataset_id):
    df = pd.read_csv(f"{RootPath.get_dataset_path()}/{dataset_id}.csv.gz",
                     compression='gzip',
                     sep='\x01',  # assumed field separator; the original non-printable character was lost in this snippet
                     nrows=1)
    columns = len(df.columns)
    # 20 columns: the file has no engagement-timestamp columns.
    if columns == 20:
        return pd.read_csv(
            f"{RootPath.get_dataset_path()}/{dataset_id}.csv.gz",
            compression='gzip',
            sep='\x01',  # assumed separator (see above)
            names=[
                "raw_feature_tweet_text_token", "raw_feature_tweet_hashtags",
                "raw_feature_tweet_id", "raw_feature_tweet_media",
                "raw_feature_tweet_links", "raw_feature_tweet_domains",
                "raw_feature_tweet_type", "raw_feature_tweet_language",
                "raw_feature_tweet_timestamp", "raw_feature_creator_id",
                "raw_feature_creator_follower_count",
                "raw_feature_creator_following_count",
                "raw_feature_creator_is_verified",
                "raw_feature_creator_creation_timestamp",
                "raw_feature_engager_id", "raw_feature_engager_follower_count",
                "raw_feature_engager_following_count",
                "raw_feature_engager_is_verified",
                "raw_feature_engager_creation_timestamp",
                "raw_feature_engagement_creator_follows_engager"
            ],
            dtype={
                "raw_feature_tweet_text_token": pd.StringDtype(),
                "raw_feature_tweet_hashtags": pd.StringDtype(),
                "raw_feature_tweet_id": pd.StringDtype(),
                "raw_feature_tweet_media": pd.StringDtype(),
                "raw_feature_tweet_links": pd.StringDtype(),
                "raw_feature_tweet_domains": pd.StringDtype(),
                "raw_feature_tweet_type": pd.StringDtype(),
                "raw_feature_tweet_language": pd.StringDtype(),
                "raw_feature_tweet_timestamp": pd.Int32Dtype(),
                "raw_feature_creator_id": pd.StringDtype(),
                "raw_feature_creator_follower_count": pd.Int32Dtype(),
                "raw_feature_creator_following_count": pd.Int32Dtype(),
                "raw_feature_creator_is_verified": pd.BooleanDtype(),
                "raw_feature_creator_creation_timestamp": pd.Int32Dtype(),
                "raw_feature_engager_id": pd.StringDtype(),
                "raw_feature_engager_follower_count": pd.Int32Dtype(),
                "raw_feature_engager_following_count": pd.Int32Dtype(),
                "raw_feature_engager_is_verified": pd.BooleanDtype(),
                "raw_feature_engager_creation_timestamp": pd.Int32Dtype(),
                "raw_feature_engagement_creator_follows_engager":
                pd.BooleanDtype()
            },
            usecols=[column])
    # 24 columns: the file additionally carries the four engagement-timestamp columns.
    elif columns == 24:
        return pd.read_csv(
            f"{RootPath.get_dataset_path()}/{dataset_id}.csv.gz",
            compression='gzip',
            sep='\x01',  # assumed separator (see above)
            names=[
                "raw_feature_tweet_text_token", "raw_feature_tweet_hashtags",
                "raw_feature_tweet_id", "raw_feature_tweet_media",
                "raw_feature_tweet_links", "raw_feature_tweet_domains",
                "raw_feature_tweet_type", "raw_feature_tweet_language",
                "raw_feature_tweet_timestamp", "raw_feature_creator_id",
                "raw_feature_creator_follower_count",
                "raw_feature_creator_following_count",
                "raw_feature_creator_is_verified",
                "raw_feature_creator_creation_timestamp",
                "raw_feature_engager_id", "raw_feature_engager_follower_count",
                "raw_feature_engager_following_count",
                "raw_feature_engager_is_verified",
                "raw_feature_engager_creation_timestamp",
                "raw_feature_engagement_creator_follows_engager",
                "raw_feature_engagement_reply_timestamp",
                "raw_feature_engagement_retweet_timestamp",
                "raw_feature_engagement_comment_timestamp",
                "raw_feature_engagement_like_timestamp"
            ],
            dtype={
                "raw_feature_tweet_text_token": pd.StringDtype(),
                "raw_feature_tweet_hashtags": pd.StringDtype(),
                "raw_feature_tweet_id": pd.StringDtype(),
                "raw_feature_tweet_media": pd.StringDtype(),
                "raw_feature_tweet_links": pd.StringDtype(),
                "raw_feature_tweet_domains": pd.StringDtype(),
                "raw_feature_tweet_type": pd.StringDtype(),
                "raw_feature_tweet_language": pd.StringDtype(),
                "raw_feature_tweet_timestamp": pd.Int32Dtype(),
                "raw_feature_creator_id": pd.StringDtype(),
                "raw_feature_creator_follower_count": pd.Int32Dtype(),
                "raw_feature_creator_following_count": pd.Int32Dtype(),
                "raw_feature_creator_is_verified": pd.BooleanDtype(),
                "raw_feature_creator_creation_timestamp": pd.Int32Dtype(),
                "raw_feature_engager_id": pd.StringDtype(),
                "raw_feature_engager_follower_count": pd.Int32Dtype(),
                "raw_feature_engager_following_count": pd.Int32Dtype(),
                "raw_feature_engager_is_verified": pd.BooleanDtype(),
                "raw_feature_engager_creation_timestamp": pd.Int32Dtype(),
                "raw_feature_engagement_creator_follows_engager":
                pd.BooleanDtype(),
                "raw_feature_engagement_reply_timestamp": pd.Int32Dtype(),
                "raw_feature_engagement_retweet_timestamp": pd.Int32Dtype(),
                "raw_feature_engagement_comment_timestamp": pd.Int32Dtype(),
                "raw_feature_engagement_like_timestamp": pd.Int32Dtype()
            },
            usecols=[column])
    else:
        raise ValueError(f"Unexpected number of columns ({columns}) in dataset {dataset_id}")
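
The nullable extension dtypes declared above (pd.Int32Dtype, pd.BooleanDtype) are presumably used because some fields in these rows can be empty; a plain numpy integer dtype would fail to parse such rows. A small self-contained illustration, independent of the dataset paths assumed above:

import io
import pandas as pd

# Toy two-column file using the same '\x01' separator assumed above;
# the second row has an empty like_ts field.
csv_text = "id\x01like_ts\nA\x011612345678\nB\x01\n"
df = pd.read_csv(io.StringIO(csv_text), sep='\x01',
                 dtype={"id": pd.StringDtype(), "like_ts": pd.Int32Dtype()})
print(df["like_ts"])  # 1612345678, <NA> -- the missing value is preserved as pd.NA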
Exemplo n.º 30
0
DATABASE_USER = PG_USER
DATABASE_PASSWORD = PG_PASSWORD
DATABASE_HOST = PG_URL
DATABASE_PORT = "5432"
DATABASE_DB_NAME = "movielens"

conn = f"postgresql://{DATABASE_USER}:{DATABASE_PASSWORD}@{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_DB_NAME}"  # SQLAlchemy 1.4+ rejects the bare 'postgres://' scheme
movies = pd.read_sql_table('movies', conn)
ratings = pd.read_sql_table('ratings_2019_w_timestamp', conn)

movies_ratings = pd.merge(movies, ratings, "left", on="movie_id")
movies_ratings.rating_timestamp = pd.to_datetime(
    movies_ratings.rating_timestamp).copy()
movies_ratings["month"] = movies_ratings.rating_timestamp.dt.month
movies_ratings.user_id = movies_ratings.user_id.astype(pd.Int32Dtype())

avg_rating = (movies_ratings.groupby(
    ["movie_id", "month"])[["rating"]].mean().sort_values(by="rating",
                                                          ascending=False))
avg_rating.rename(columns={"rating": "avg_rating"}, inplace=True)
avg_rating.reset_index(inplace=True)
avg_rating.sort_values(by="month", inplace=True)

total_ratings = (movies_ratings.groupby(
    ["movie_id", "month"])[["rating"]].count().sort_values(by="rating",
                                                           ascending=False))
total_ratings.rename(columns={"rating": "total_ratings"}, inplace=True)
total_ratings.reset_index(inplace=True)
total_ratings.sort_values(by="month", inplace=True)
total_ratings["cumsum"] = (total_ratings["total_ratings"].groupby(