Example #1
File: do.py Project: cottrell/notebooks
def get_missing_dates(product, name):
    # get start date from raw_target() and update
    # TODO: actually do bdate range or whatever, currently making hard assumption of continuity in dates
    missing_back_dates = [] # TODO: backfill check biz days etc
    filename = os.path.join(raw_target(), 'product={}/name={}'.format(product, name))
    first_new_date = pd.read_parquet(filename).date.max()
    filename = os.path.join(download_update_target(), 'product={}/name={}'.format(product, name))
    if len(glob.glob(os.path.join(filename, '*.parquet'))) > 0:
        first_new_date = max(first_new_date, pd.read_parquet(filename).date.max())
    first_new_date = first_new_date.date() + datetime.timedelta(days=1)
    return missing_back_dates, first_new_date
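The TODO above refers to switching to a proper business-day range; a minimal sketch of that idea, assuming only pandas and the standard library (the helper name is hypothetical and not part of do.py):

import datetime

import pandas as pd


def missing_business_dates(last_stored_date, today=None):
    # Enumerate the business days strictly after last_stored_date, up to today.
    today = today or datetime.date.today()
    start = last_stored_date + datetime.timedelta(days=1)
    return list(pd.bdate_range(start, today).date)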
Example #2
File: SIA.py Project: fccoelho/PySUS
def download(state: str, year: int, month: int, cache: bool =True) -> object:
    """
    Download SIA records for a given state, year and month and return a dataframe
    :param month: 1 to 12
    :param state: 2 letter state code
    :param year: 4 digit integer
    """
    state = state.upper()
    year2 = str(year)[-2:]
    month = str(month).zfill(2)
    if year < 1994:
        raise ValueError("SIA does not contain data before 1994")
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    if 1994 <= year < 2008:
        ftype = 'DBC'
        ftp.cwd('/dissemin/publicos/SIASUS/199407_200712/Dados')
        fname = 'PA{}{}{}.dbc'.format(state, year2, month)
        fname2 = None
    if year >= 2008:
        ftype = 'DBC'
        ftp.cwd('/dissemin/publicos/SIASUS/200801_/Dados')
        fname = 'PA{}{}{}.dbc'.format(state, str(year2).zfill(2), month)
        fname2 = 'BI{}{}{}.dbc'.format(state, str(year2).zfill(2), month)
    # Check in Cache
    cachefile = os.path.join(CACHEPATH, 'SIA_' + fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
    else:
        df = _fetch_file(fname, ftp, ftype)
        if cache:
            df.to_parquet(cachefile)
    if fname2 is not None:
        cachefile2 = os.path.join(CACHEPATH, 'SIA_' + fname2.split('.')[0] + '_.parquet')
        if os.path.exists(cachefile2):  # reads from cache
            df2 = pd.read_parquet(cachefile2)
        else:  # fetches from DataSUS
            try:
                df2 = _fetch_file(fname2, ftp, ftype)
                if cache:  #saves to cache
                    df2.to_parquet(cachefile2)
            except Exception as e:
                df2 = None
                print(e)
    else:
        df2 = None

    return df, df2
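A hedged usage sketch for the function above; the import path assumes PySUS's pysus/online_data layout and may differ in your checkout:

from pysus.online_data.SIA import download

# Outpatient production (PA) and BI files for São Paulo, March 2019;
# the second frame is None for years before 2008.
pa_df, bi_df = download('SP', 2019, 3)
print(pa_df.shape, None if bi_df is None else bi_df.shape)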
Example #3
File: do.py Project: cottrell/notebooks
def get_sample_data():
    """
    reload(do); globals().update(do.get_sample_data())
    """
    df = pd.read_parquet('enriched/nrows=all/product=etfs/name=qqq')
    X_train, y_train, X_val, y_val = get_xy_data_plain(df)
    return locals()
Example #4
File: SIH.py Project: fccoelho/PySUS
def download(state: str, year: int, month: int, cache: bool=True) -> object:
    """
    Download SIH records for a given state, year and month and return a dataframe
    :param month: 1 to 12
    :param state: 2 letter state code
    :param year: 4 digit integer 
    """
    state = state.upper()
    year2 = int(str(year)[-2:])
    month = str(month).zfill(2)
    if year < 1992:
        raise ValueError("SIH does not contain data before 1992")
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    if year < 2008:
        ftype = 'DBC'
        ftp.cwd('/dissemin/publicos/SIHSUS/199201_200712/Dados')
        fname = 'RD{}{}{}.dbc'.format(state, str(year2).zfill(2), month)
    if year >= 2008:
        ftype = 'DBC'
        ftp.cwd('/dissemin/publicos/SIHSUS/200801_/Dados')
        fname = 'RD{}{}{}.dbc'.format(state, str(year2).zfill(2), month)
    cachefile = os.path.join(CACHEPATH, 'SIH_' + fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
        return df
    df = _fetch_file(fname, ftp, ftype)
    if cache:
        df.to_parquet(cachefile)
    return df
Example #5
    def test_local(self):
        with self.temp_dir() as tmp:
            data = pd.DataFrame({
                'i32': np.arange(1000, dtype=np.int32),
                'i64': np.arange(1000, dtype=np.int64),
                'f': np.arange(1000, dtype=np.float64),
                'bhello': np.random.choice(['hello', 'yo', 'people'], size=1000).astype("O")})
            data = data[['i32', 'i64', 'f', 'bhello']]
            self.spark.createDataFrame(data, 'i32 int, i64 long, f double, bhello string') \
                .coalesce(1).write.parquet(tmp, mode='overwrite')

            def check(columns, expected):
                if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                    expected = pd.read_parquet(tmp, columns=columns)
                actual = koalas.read_parquet(tmp, columns=columns)
                self.assertPandasEqual(expected, actual.toPandas())

            check(None, data)
            check(['i32', 'i64'], data[['i32', 'i64']])
            check(['i64', 'i32'], data[['i64', 'i32']])
            check(('i32', 'i64'), data[['i32', 'i64']])
            check(['a', 'b', 'i32', 'i64'], data[['i32', 'i64']])
            check([], pd.DataFrame([]))
            check(['a'], pd.DataFrame([]))
            check('i32', pd.DataFrame([]))
            check('float', data[['f']])

            # check with pyspark patch.
            if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
                expected = pd.read_parquet(tmp)
            else:
                expected = data
            actual = koalas.read_parquet(tmp)
            self.assertPandasEqual(expected, actual.toPandas())
Example #6
File: sinasc.py Project: fccoelho/PySUS
def download(state, year, cache=True):
    """
    Downloads data directly from Datasus ftp server
    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :return: pandas dataframe
    """
    state = state.upper()
    if year < 1994:
        raise ValueError("SINASC does not contain data before 1994")
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    if year >= 1996:
        ftp.cwd('/dissemin/publicos/SINASC/NOV/DNRES')
        fname = 'DN{}{}.DBC'.format(state, year)
    else:
        ftp.cwd('/dissemin/publicos/SINASC/ANT/DNRES')
        fname = 'DNR{}{}.DBC'.format(state, str(year)[-2:])
    cachefile = os.path.join(CACHEPATH, 'SINASC_'+fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
        return df

    ftp.retrbinary('RETR {}'.format(fname), open(fname, 'wb').write)
    df = read_dbc(fname, encoding='iso-8859-1')
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
Example #7
def plot_category_category(input_file, col1, col2, path):
    df = pd.read_parquet(input_file, columns=[col1, col2])
    if len(df[col1].unique()) < len(df[col2].unique()):
        col1, col2 = col2, col1
    file_name = os.path.join(path, f"{col1}-{col2}-bar-plot.png")
    bar_plot(df, col1, hue=col2, file_name=file_name)

    file_name = os.path.join(path, f"{col1}-{col2}-heatmap.png")
    heatmap(pd.crosstab(df[col1], df[col2]), file_name=file_name)
Example #8
def plot_single_category(input_file, col, path):
    df = pd.read_parquet(input_file, columns=[col])
    value_counts = df[col].value_counts(dropna=False)
    # if the categories are more than 50 then this should be ignored
    # TODO find a better way to visualize this
    if len(value_counts) > 50:
        ignore.add(col)
    else:
        file_name = os.path.join(path, col + "-bar-plot.png")
        bar_plot(df, col, file_name=file_name)
Example #9
    def check_round_trip(self, df, engine, expected=None, **kwargs):

        with tm.ensure_clean() as path:
            df.to_parquet(path, engine, **kwargs)
            result = read_parquet(path, engine)

            if expected is None:
                expected = df
            tm.assert_frame_equal(result, expected)

            # repeat
            to_parquet(df, path, engine, **kwargs)
            result = pd.read_parquet(path, engine)

            if expected is None:
                expected = df
            tm.assert_frame_equal(result, expected)
Example #10
def load_entity_data(metadata, root):
    '''Load an entity's data from disk.'''
    if metadata['data_files']['filetype'] == 'pickle':
        data = pd_read_pickle(os.path.join(root, metadata['data_files']['data_filename']))
        df = data['df']
    elif metadata['data_files']['filetype'] == 'parquet':
        df = pd.read_parquet(os.path.join(root,
                                          metadata['data_files']['df_filename']),
                             engine=metadata['data_files']['engine'])
        df.index = df[metadata['index']]
        to_join = metadata['data_files'].get('to_join', None)
        if to_join is not None:
            for cname, to_join_names in to_join.items():
                df[cname] = df[to_join_names].apply(tuple, axis=1)
                df.drop(to_join_names, axis=1, inplace=True)
    else:
        raise ValueError("Unknown entityset data filetype: {}".format(metadata['data_files']['filetype']))
    return df
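The `to_join` branch above folds previously split columns back into tuple-valued columns; a small self-contained illustration of just that step, with toy column names that are not part of the metadata format:

import pandas as pd

df = pd.DataFrame({'idx_0': [1, 2], 'idx_1': ['a', 'b'], 'value': [10, 20]})
to_join = {'idx': ['idx_0', 'idx_1']}
for cname, to_join_names in to_join.items():
    # Combine the flat columns into one tuple-valued column, then drop them.
    df[cname] = df[to_join_names].apply(tuple, axis=1)
    df.drop(to_join_names, axis=1, inplace=True)
print(df)  # columns: value, idx  with idx == (1, 'a'), (2, 'b')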
Example #11
    def check_round_trip(self, df, engine, expected=None,
                         write_kwargs=None, read_kwargs=None):
        if write_kwargs is None:
            write_kwargs = {}
        if read_kwargs is None:
            read_kwargs = {}
        with tm.ensure_clean() as path:
            df.to_parquet(path, engine, **write_kwargs)
            result = read_parquet(path, engine, **read_kwargs)

            if expected is None:
                expected = df
            tm.assert_frame_equal(result, expected)

            # repeat
            to_parquet(df, path, engine, **write_kwargs)
            result = pd.read_parquet(path, engine, **read_kwargs)

            if expected is None:
                expected = df
            tm.assert_frame_equal(result, expected)
Example #12
File: SIM.py Project: fccoelho/PySUS
def download(state, year, cache=True):
    """
    Downloads data directly from Datasus ftp server
    :param state: two-letter state identifier: MG == Minas Gerais
    :param year: 4 digit integer
    :return: pandas dataframe
    """
    year2 = str(year)[-2:].zfill(2)
    state = state.upper()
    if year < 1979:
        raise ValueError("SIM does not contain data before 1979")
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    if year >= 1996:
        ftp.cwd('/dissemin/publicos/SIM/CID10/DORES')
        fname = 'DO{}{}.DBC'.format(state, year)
    else:
        ftp.cwd('/dissemin/publicos/SIM/CID9/DORES')
        fname = 'DOR{}{}.DBC'.format(state, year2)
    cachefile = os.path.join(CACHEPATH, 'SIM_'+fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
        return df

    try:
        ftp.retrbinary('RETR {}'.format(fname), open(fname, 'wb').write)
    except:
        try:
            ftp.retrbinary('RETR {}'.format(fname.upper()), open(fname, 'wb').write)
        except:
            raise Exception("File {} not available".format(fname))

    df = read_dbc(fname, encoding='iso-8859-1')
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
Example #13
File: SIM.py Project: fccoelho/PySUS
def get_CID9_table(cache=True):
    """
    Fetch the CID9 table
    :param cache:
    :return:
    """
    ftp = FTP('ftp.datasus.gov.br')
    ftp.login()
    ftp.cwd('/dissemin/publicos/SIM/CID9/TABELAS')
    fname = 'CID9.DBF'
    cachefile = os.path.join(CACHEPATH, 'SIM_' + fname.split('.')[0] + '_.parquet')
    if os.path.exists(cachefile):
        df = pd.read_parquet(cachefile)
        return df
    try:
        ftp.retrbinary('RETR {}'.format(fname), open(fname, 'wb').write)
    except:
        raise Exception('Could not download {}'.format(fname))
    dbf = DBF(fname, encoding='iso-8859-1')
    df = pd.DataFrame(list(dbf))
    if cache:
        df.to_parquet(cachefile)
    os.unlink(fname)
    return df
Example #14
 parser = argparse.ArgumentParser()
 parser.add_argument('--snps', required=True, help='Whitespace-delimited file with SNPs to extract. Must include columns A1,A2 and either (1) SNP or (2) both CHR and BP')
 parser.add_argument('--out', required=True, help='Prefix of the name of the output file')
 parser.add_argument('--allow-missing', default=False, action='store_true', help='If specified, the script will not terminate if some SNPs are not found in the meta file')
 parser.add_argument('--q', type=int, default=100, help='The maximum ratio between the largest and smallest prior causal probabilities')
 args = parser.parse_args()
 
 #check package versions
 check_package_versions()
 
 #configure logger
 configure_logger(args.out)
 
 #read snps file
 try:
     df_snps = pd.read_parquet(args.snps)
 except ArrowIOError:
     df_snps = pd.read_table(args.snps, delim_whitespace=True)
 if 'A1' not in df_snps.columns:
     raise ValueError('missing column A1')
 if 'A2' not in df_snps.columns:
     raise ValueError('missing column A2')
 if 'SNP' not in df_snps.columns:
     if 'CHR' not in df_snps.columns:
         raise ValueError('You must provide either a SNP or a CHR column')
     if 'BP' not in df_snps.columns:
         raise ValueError('You must provide either a SNP or a BP column')
         
 #read df_meta        
 script_dir = os.path.dirname(os.path.realpath(__file__))
 df_meta1 = pd.read_parquet(os.path.join(script_dir, 'snpvar_meta.chr1_7.parquet'))
Example #15
 def pd_df_parquet_load(filename, **kwargs):
     return pd.read_parquet(filename, **kwargs)
Example #16
MODEL_TYPE = "rnn_updated"
LOG_DIR = f"../EiT/tmp/logs/{MODEL_TYPE}"
LOG_LEVEL = "ERROR"
TARGET_PATH = "../EiT/final_datasets"

model_24h_path_local = "../EiT/tmp/models/rnn_updated/rnn_updated_20210303_162009/model"
model_48h_path_local = "../EiT/tmp/models/rnn_updated/rnn_updated_20210303_172934/model"

model_24h = tf.keras.models.load_model(f"{model_24h_path_local}/saved_model")
model_48h = tf.keras.models.load_model(f"{model_48h_path_local}/saved_model")

# Load all .parquet files as dataframes
dfs = {}
for path in glob.glob(f"{TARGET_PATH}/**/*.parquet", recursive=True):
    df = pd.read_parquet(path)
    df_name = path.split(os.sep)[-1].split('.')[0]
    dfs[df_name] = df

# Convert dataframe to numpy arrays
train_df, val_df, test_df = dfs['train'], dfs['val'], dfs['test']

train_data = train_df.values.astype('float32')
train_targets = train_df['nox'].values.astype('float32')

val_data = val_df.values.astype('float32')
val_targets = val_df['nox'].values.astype('float32')

test_data = test_df.values.astype('float32')
test_targets = test_df['nox'].values.astype('float32')
Example #17
def sklearn_regressor(booster, params, num_round):
    reg = XGBRegressor(n_estimators=num_round,
                       missing=-999,
                       **_complete_params(params))
    reg._Booster = booster
    return reg


if __name__ == '__main__':
    import sys
    train_set_path = sys.argv[1]
    trials_db_path = sys.argv[2]
    output_path = sys.argv[3]

    train_set = pd.read_parquet(train_set_path)
    cv_splits = tscv.split(train_set['date_block_num'].values, n=1, window=16)
    dtrain = xgb.DMatrix(*df_to_X_y(train_set), missing=-999)
    del train_set

    trials_db = 'sqlite:///%s' % trials_db_path
    study = optuna.create_study(direction='minimize',
                                load_if_exists=True,
                                study_name=output_path,
                                storage=trials_db,
                                pruner=optuna.pruners.HyperbandPruner())

    n_trials = MAX_EVALS - len(study.trials)
    if n_trials > 0:
        objective = make_xgb_objective(make_xgb_loss(dtrain, cv_splits))
        try:
Example #18
def query(parquetFilePath,
          columnList: list = [],
          continuousQueries: list = [],
          discreteQueries: list = []) -> pd.DataFrame:
    """
	Performs multiple queries on a parquet dataset. If no queries or columns are passed, it returns the entire dataset as a pandas DataFrame; otherwise, it returns only the queried data over the requested columns as a pandas DataFrame.

	:type parquetFilePath: string
	:param parquetFilePath: filepath to a parquet file to be queried on

	:type columnList: list of strings
	:param columnList: list of column names that will be included in the data resulting from the queries

	:type continuousQueries: list of ContinuousQuery objects
	:param continuousQueries: list of objects representing queries on a column of continuous data
	
	:type discreteQueries: list of DiscreteQuery objects
	:param discreteQueries: list of objects representing queries on a column of discrete data

	:return: Requested columns with results of all queries 
	:rtype: Pandas dataframe
	"""
    if len(columnList) == 0 and len(continuousQueries) == 0 and len(
            discreteQueries) == 0:
        df = pd.read_parquet(parquetFilePath)
        df.set_index("Sample", drop=True, inplace=True)
        return df

    #extract all necessary columns in order to read them into pandas
    for query in continuousQueries:
        if query.columnName not in columnList:
            columnList.append(query.columnName)
    for query in discreteQueries:
        if query.columnName not in columnList:
            columnList.append(query.columnName)
    columnList.insert(0, "Sample")
    df = pd.read_parquet(parquetFilePath, columns=columnList)
    df.set_index("Sample", drop=True, inplace=True)
    del columnList[0]

    #perform continuous queries, adjusting for which operator is to be used
    for query in continuousQueries:
        if query.operator == OperatorEnum.Equals:
            df = df.loc[df[query.columnName] == query.value,
                        [col for col in columnList]]
        elif query.operator == OperatorEnum.GreaterThan:
            df = df.loc[df[query.columnName] > query.value,
                        [col for col in columnList]]
        elif query.operator == OperatorEnum.GreaterThanOrEqualTo:
            df = df.loc[df[query.columnName] >= query.value,
                        [col for col in columnList]]
        elif query.operator == OperatorEnum.LessThan:
            df = df.loc[df[query.columnName] < query.value,
                        [col for col in columnList]]
        elif query.operator == OperatorEnum.LessThanOrEqualTo:
            df = df.loc[df[query.columnName] <= query.value,
                        [col for col in columnList]]
    #perform discrete queries
    for query in discreteQueries:
        df = df.loc[df[query.columnName].isin(query.values),
                    [col for col in columnList]]

    return df
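A hedged usage sketch for query(); the parquet path is a placeholder (the file must contain a "Sample" column), and ContinuousQuery, DiscreteQuery, and OperatorEnum are the project's own classes, assumed to take the arguments shown below:

# Plain column selection, indexed by "Sample":
df_subset = query('expression.parquet', columnList=['Gene1', 'Gene2'])

# With filters (constructor signatures are assumptions, not confirmed by the source):
# age_query = ContinuousQuery('Age', OperatorEnum.GreaterThan, 50)
# sex_query = DiscreteQuery('Sex', ['F'])
# df_filtered = query('expression.parquet', columnList=['Gene1'],
#                     continuousQueries=[age_query], discreteQueries=[sex_query])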
Example #19
import logging

import pandas as pd
from sklearn import preprocessing


def get_logger():
    FORMAT = '[%(levelname)s]%(asctime)s:%(name)s:%(message)s'
    logging.basicConfig(format=FORMAT)
    logger = logging.getLogger('main')
    logger.setLevel(logging.INFO)
    return logger


logger = get_logger()

# Load different data sources.
train_df = pd.read_parquet('input/processed/train_static_features.parquet.gzip')
test_df = pd.read_parquet('input/processed/test_static_features.parquet.gzip')

# OHE Cols with small number of uniques (less than 20)
ohe_cols = ['trafficSource.campaign', 'channelGrouping',
            'trafficSource.adwordsClickInfo.page', 'trafficSource.medium',
            'geoNetwork.continent', 'trafficSource.keyword_groups',
            'device.deviceCategory', 'totals.bounces', 'totals.newVisits',
            'trafficSource.adwordsClickInfo.slot',
            'trafficSource.adwordsClickInfo.adNetworkType']

# Remove target col.
y_train = train_df['totals.transactionRevenue'].values
train_df = train_df.drop(['totals.transactionRevenue'], axis=1)

# Join datasets for row-wise feature engineering.
Example #20
def plot_numeric_numeric(input_file, col1, col2, path):
    df = pd.read_parquet(input_file, columns=[col1, col2])
    file_name = os.path.join(path, f"{col1}-{col2}-scatter-plot.png")
    scatter_plot(df, col1, col2, file_name=file_name)
Example #21
import os
import pandas as pd
_mydir = os.path.dirname(os.path.realpath(__file__))

# 8 sec
_filename = os.path.join(_mydir, 'raw/yahoo/2010-01-01_to_2018-11-29')

try:
    df_orig
except NameError as e:
    df_orig = pd.read_parquet(_filename)

Example #22
 def read_dfr_from_parquet(self, region_id):
     columns=['lon', 'lat', 'date', 'day_since'] + self.labels
     store_name = os.path.join(self.data_path, '{0}.parquet'.format(region_id))
     dfr = pd.read_parquet(store_name, columns=columns)
     return dfr
Example #23
def test_linear():
    old_uri = tracking.get_tracking_uri()
    with TempDir(chdr=False, remove_on_exit=True) as tmp:
        try:
            diamonds = tmp.path("diamonds")
            root_tracking_dir = tmp.path("root_tracking_dir")
            os.mkdir(diamonds)
            os.mkdir(root_tracking_dir)
            tracking.set_tracking_uri(root_tracking_dir)
            # Download the diamonds dataset via mlflow run
            run(".",
                entry_point="download-example-data",
                version=None,
                parameters={"dest-dir": diamonds},
                experiment_id=0,
                mode="local",
                cluster_spec=None,
                git_username=None,
                git_password=None,
                use_conda=True,
                use_temp_cwd=False,
                storage_dir=None)

            initial = os.path.join(root_tracking_dir, "0")
            dir_list = os.listdir(initial)

            # Run the main linear app via mlflow
            run("examples/linear-regression",
                entry_point="main",
                version=None,
                parameters={
                    "training-data-path":
                    os.path.join(diamonds, "train_diamonds.parquet"),
                    "test-data-path":
                    os.path.join(diamonds, "test_diamonds.parquet"),
                    "alpha":
                    .001,
                    "l1-ratio":
                    .5,
                    "label-col":
                    "price"
                },
                experiment_id=0,
                mode="local",
                cluster_spec=None,
                git_username=None,
                git_password=None,
                use_conda=True,
                use_temp_cwd=False,
                storage_dir=None)

            # Identifying the new run's folder
            main = None
            for item in os.listdir(initial):
                if item not in dir_list:
                    main = item

            pyfunc = load_pyfunc(
                os.path.join(initial, main, "artifacts/model/model.pkl"))

            df = pandas.read_parquet(
                os.path.join(diamonds, "test_diamonds.parquet"))

            # Removing the price column from the DataFrame so we can use the features to predict
            df = df.drop(columns="price")

            # Predicting from the saved pyfunc
            predict = pyfunc.predict(df)

            # Make sure the data is of the right type
            assert isinstance(predict[0], numpy.float64)
        finally:
            tracking.set_tracking_uri(old_uri)
Example #24
def test_from_parquet_partitioned_columns_with_columns(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, partitioned_columns=["col1"])

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
    df_equals(modin_df, pandas_df)
Example #25
def test_from_parquet_partition(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE, directory=True)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
Example #26
def test_from_parquet(make_parquet_file):
    make_parquet_file(SMALL_ROW_SIZE)

    pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME)
    modin_df = pd.read_parquet(TEST_PARQUET_FILENAME)
    df_equals(modin_df, pandas_df)
Example #27
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    maes = (y_true - y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()
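# A tiny hand-checked example of the metric above (illustrative only, not part of the
# original script): per-group MAEs are 0.25 and 0.5, so the score is
# mean(log(0.25), log(0.5)) ≈ -1.04.
# _y_true = pd.Series([1.0, 2.0, 3.0, 4.0])
# _y_pred = pd.Series([1.5, 2.0, 2.0, 4.0])
# _groups = pd.Series(['a', 'a', 'b', 'b'])
# group_mean_log_mae(_y_true, _y_pred, _groups)  # ≈ -1.0397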


#####################
# READ INPUT FILES
#####################
logger.info('Reading input files....')
path = 'data/'
train_df = pd.read_parquet(f'{path}/FE005-train.parquet')
test_df = pd.read_parquet(f'{path}/FE005-test.parquet')
ss = pd.read_csv('input/sample_submission.csv')
# structures = pd.read_csv('input/structures.csv')
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)
train_df['atom_0'] = train_df['atom_0'].astype('category')
train_df['atom_1'] = train_df['atom_1'].astype('category')
test_df['atom_0'] = test_df['atom_0'].astype('category')
test_df['atom_1'] = test_df['atom_1'].astype('category')

train_df['type_0'] = train_df['type_0'].astype('category')
test_df['type_0'] = test_df['type_0'].astype('category')

#####################
# FEATURE CREATION
Example #28
 def __init__(self):
     self._holiday_df = pd.read_parquet(constants.holidays_parquet_table)
Example #29
def pd_read_s3_parquet(key, bucket, s3_client=None, **args):
    if s3_client is None:
        s3_client = boto3.client('s3')
    obj = s3_client.get_object(Bucket=bucket, Key=key)
    return pd.read_parquet(io.BytesIO(obj['Body'].read()), **args)
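A hedged usage sketch for the helper above; bucket and key are placeholders, and AWS credentials are assumed to be configured for boto3 in the usual way:

import boto3

s3 = boto3.client('s3')
df = pd_read_s3_parquet('path/to/file.parquet', bucket='my-bucket', s3_client=s3)
print(df.head())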
Example #30
File: s3.py Project: RubiAguilar/etl
 def readParquetAndReturnDataFrame(bucket_name, key):
     dataS3 = S3Operations.readFile(bucket_name, key, True)
     return pd.read_parquet(dataS3)
Example #31
def get_data():
    filename = os.path.expanduser('~/projects/data/extractors/pdr/yahoo_price_volume/product=etfs')
    df = pd.read_parquet(filename)
    return df
Example #32
File: dicarlo.py Project: elijahc/vae
def get_dicarlo_su(proj_dir,fn='su_selectivity_dicarlo_hi_var.pqt'):
    return pd.read_parquet(os.path.join(proj_dir,'data',fn))
Example #33
def main(load_folder=f'../../data/xgb_split/',
         label='interventions',
         dir_name=DIR_NAME):
    # fixes:
    # - reduce n workers
    # - less features
    # - smaller dataset

    load_folder = Path(load_folder)

    overwrite = False
    if not load_folder.exists() or overwrite:
        print('Data path not found. Creating data.')
        n_docs = prepare_data()
        print('processing complete. Recommended to rerun the script.')
        quit()
        load_folder = (Path(
            '/'.join(load_folder.name.split('_')[:-1] + [n_docs]))).mkdir(
                exist_ok=True, parents=True)  # todo: doesn't work

    n_string = f"{load_folder.name.split('_')[-1]}"
    dir_name = f'{random.randint(0,999)}_{label}' if not dir_name else dir_name

    start_time = time.time()
    label = sys.argv[2] if len(sys.argv) > 2 else label

    # Load X and y
    print(
        f"Loading datasets {', '.join([str(p.name) for p in list(load_folder.glob('*.parquet'))])} from {load_folder}"
    )

    X_train = pd.read_parquet(load_folder / f'X_train.parquet')
    features = X_train.columns
    X_train = X_train.values

    y_train_df = pd.read_parquet(load_folder / f'y_train.parquet')
    X_test = pd.read_parquet(load_folder / f'X_test.parquet').values
    y_test_df = pd.read_parquet(load_folder / f'y_test.parquet')

    test_words = y_test_df.copy()['Word']
    train_index = pd.read_parquet(load_folder / f'X_train.parquet').index

    y_train_df['all'] = ~(y_train_df.sum(axis=1) == 0)
    y_test_df['all'] = ~(y_test_df.sum(axis=1) == 0)

    print('\ntrain docs:\n', train_index)
    print('\n Docs in validation set:\n', y_train_df.index.unique('doc'))

    n_est = 500
    params['n_estimators'] = n_est

    print(f"\nTraining a GBC model to predict label '{label}'.")
    print('input shapes: ',
          [df.shape for df in [X_train, X_test, y_train_df, y_test_df]])
    print('Starting SKLRF.')

    # PIO.insert(0, 'all')
    print(PIO)

    # Set the parameters by cross-validation
    # for n_est in n_estimators:
    # for i, label in enumerate(PIO):

    y_train = np.where(y_train_df[label].values, label, f'not_{label}')
    y_test = np.where(y_test_df[label].values, label, f'not_{label}')
    print(np.unique(y_train, return_counts=True))

    label_counts = np.unique(y_train, return_counts=True)[1]

    params['class_weight'] = {
        label: label_counts[1] / label_counts[0],
        f'not_{label}': 1.0
    }

    print(params['class_weight'])

    print(f'\nTesting n_estimators {n_est} for label {label}')

    clf = RandomForestClassifier(**params)

    clf.fit(X=X_train, y=y_train)

    y_pred = clf.predict(X_test)

    with open('dump1.pickle', 'wb') as f:
        pickle.dump(y_pred, f)

    print("\n******************************\n")
    report = pd.DataFrame(
        classification_report(y_test.flatten(),
                              y_pred.flatten(),
                              digits=3,
                              output_dict=True))
    print(report.head())
    print("\n******************************\n")

    print(report)
    time.sleep(3)

    print(f"Saving data...")

    exp_folder = EXPERIMENTS / dir_name / label / str(n_est)
    print('exp folder is here:', exp_folder.resolve())
    exp_folder.mkdir(exist_ok=True, parents=True)

    probs = clf.predict_proba(X_test)
    preds = {
        'token': test_words,
        f'not_{label}': probs[:, 0],
        f'{label}': probs[:, 1],
        f'pred': y_pred,
        'true': y_test
    }
    pd.DataFrame(preds,
                 index=test_words.index).to_csv(exp_folder / 'predict.csv')

    feat_imps = clf.feature_importances_
    pd.DataFrame(feat_imps,
                 index=features).to_csv(exp_folder / 'feature_importances.csv')

    report.to_csv(exp_folder / 'class_report.csv')
    # pd.to_csv(evals_result, exp_folder / 'evals_result.csv')
    # pd.DataFrame({'pred': prediction*1, 'true': sets['val'][1]*1}, index=val_index).to_csv(exp_folder / 'prediction.csv')

    if hasattr(clf, 'clf.best_score'):
        print(
            f"No score improvement detected over {params['early_stopping_rounds']} steps, terminating."
        )

    write_dict((exp_folder / '.params'), params)

    print(f'\n Total computation time: {time.time() - start_time:.2f}s')
    print(f"Saved data for run with target '{label}' in '{dir_name}'")
Example #34
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "regression_l2",
    "learning_rate": LEARNING_RATE,
    "num_leaves": 255,
    "sub_feature": 0.50,
    "sub_row": 0.75,
    "bagging_freq": 1,
    "metric": EVAL_METRIC,
    'random_state': RANDOM_STATE
}

folds = GroupKFold(n_splits=N_FOLDS)

# Setup arrays for storing results
train_df = pd.read_parquet(
    'data/FE008_train.parquet')  # only loading for skeleton not features
oof_df = train_df[['id', 'type', 'scalar_coupling_constant']].copy()
mol_group = train_df[['molecule_name', 'type']].copy()
del train_df
gc.collect()

oof_df['oof_preds'] = 0
test_df = pd.read_parquet(
    'data/FE008_test.parquet')  # only loading for skeleton not features
prediction = np.zeros(len(test_df))
feature_importance = pd.DataFrame()
test_pred_df = test_df[['id', 'type', 'molecule_name']].copy()
del test_df
gc.collect()
test_pred_df['prediction'] = 0
bond_count = 1
Example #35
def plot_single_numeric(input_file, col, path):
    df = pd.read_parquet(input_file, columns=[col])
    file_name = os.path.join(path, f"{col}-dist-plot.png")
    data = df[col].dropna()
    f, axes = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
    histogram_violin_plots(data, axes, file_name=file_name)
Example #36
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from plotnine import *
import gzip

data = pd.read_parquet("merged_data_pop2010.gzip")

'''
This code produces difference-in-difference plots for policy interventions in Florida, Texas, and Washington. 
Each state is compared to the rest of the United States in each plot

### Reading in data 

df = pd.read_parquet('merged_data.gzip')
df.head()

## Processing data. No longer needed with what we are using now.
#df['Deaths'] = df['Deaths'].astype('float')
#df['POPESTIMATE2010'] = df['POPESTIMATE2010'].astype('float')
#df.drop(['COUNTY', 'FIPS'], axis = 1)


### Calculating per capita Drugs

df['DrugsPerCapita'] = df['MME_Calculated']/df['population']
    
    
###Start by subsetting data by years. The analysis will be completed using all available data for that specific area. 
###The following function subsets data by years. 
Example #37
print('Load random Tweets:')

start_time = time.time()

paths_to_random = list(
    np.array_split(glob(os.path.join(path_to_data, '*.parquet')),
                   SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID])
print('#files:', len(paths_to_random))

tweets_random = pd.DataFrame()
for file in paths_to_random:
    print(file)
    tweets_random = pd.concat(
        [tweets_random,
         pd.read_parquet(file)[['tweet_id', 'text']]])
    print(tweets_random.shape)

print('load random sample:', str(time.time() - start_time), 'seconds')
print(tweets_random.shape)

print('dropping duplicates:')
# random contains 7.3G of data!!
start_time = time.time()
tweets_random = tweets_random.drop_duplicates('text')
print('drop duplicates:', str(time.time() - start_time), 'seconds')
print(tweets_random.shape)

start_time = time.time()
print('converting to list')
examples = tweets_random.text.values.tolist()
Example #38
def load_test_features(col):
    return pd.read_parquet(
        "filtered_test_features/filtered_test_feature{}.gzip".format(col))
Example #39
 def load_parquet(self):
     f = io.BytesIO()
     f.write(self.parquet_file.value)
     f.seek(0)
     self.df = pd.read_parquet(f)
Example #40
log_file = '../log.txt'
logging.basicConfig(filename=log_file, level=logging.INFO)
logger = logging.getLogger()
handler = logging.StreamHandler()
logger.addHandler(handler)
logging.info('\n<============================================================================>')
logging.info(f'\nCurrent working directory: {os.getcwd()}')
logging.info(f'\nApplication started ... ({time.ctime()})\n')

# Load calibration data

# y = (pd.read_csv('RevenewML/flatfiles/Final Preprocessed For MichaelSampled Latest.txt', sep='\t')
#      .loc[:, ['ProjectID', 'Report_Group_Flag', 'Y_Has_Claim', 'Partition']]
# )
# y.to_parquet('RevenewML/flatfiles/Y_Calibration.parquet')
y = pd.read_parquet('RevenewML/flatfiles/Y_Calibration.parquet')

# count_profiles = (dt.fread('datasets/Count_Profiles_Calibration.csv', show_progress=True)
count_profiles = (pd.read_parquet('RevenewML/flatfiles/Count_Profiles_Calibration.parquet')
                  # .to_pandas()
                  .merge(y, on=['ProjectID', 'Report_Group_Flag'])
                  .drop(columns=['Y_Has_Claim', 'Partition'])
                  )
# count_profiles.to_parquet('RevenewML/flatfiles/Count_Profiles_Calibration.parquet')

# duplicate_reports = (dt.fread('datasets/Duplicate_Reports_Calibration.csv', show_progress=True)
duplicate_reports = (pd.read_parquet('RevenewML/flatfiles/Duplicate_Reports_Calibration.parquet')
                     # .to_pandas()
                     .merge(y, on=['ProjectID', 'Report_Group_Flag'])
                     .drop(columns=['Y_Has_Claim', 'Partition'])
                     # ).sample(100000)
Example #41
def plot_category_numeric(input_file, category_col, numeric_col, path):
    df = pd.read_parquet(input_file, columns=[category_col, numeric_col])
    f, axes = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(8, 6))
    axes = list(chain.from_iterable(axes))
    file_name = os.path.join(path, f"{category_col}-{numeric_col}-plot.png")
    bar_box_violin_dot_plots(df, category_col, numeric_col, axes, file_name=file_name)
Example #42
 def read_parquet(file_widget):
     f = io.BytesIO()
     f.write(file_widget.value)
     f.seek(0)
     return pd.read_parquet(f)
Example #43
# Methods


if __name__ == "__main__":

    # Get subdirectories of gamedays
    subdirs = utils.get_gameday_subdirs(path=CONFIG.get('inpath'),
                                        window_start=CONFIG.get(
                                            'windowStartDate'),
                                        window_end=CONFIG.get(
                                            'windowEndDate'),
                                        left_incl=True, right_incl=True)

    # Append Files
    df_innings = pd.concat(
        objs=[pd.read_parquet(
            CONFIG.get('inpath') + subdir + '/innings.parquet')
              for subdir in subdirs if
              "innings.parquet" in os.listdir(
                  CONFIG.get('inpath') + subdir)],
        axis=0
    )

    # Get atbat level subset
    atbat_vars = [x for x in df_innings if x[:6] == 'atbat_']
    atbat_vars.append('game_id')
    df_atbats = df_innings.loc[:, atbat_vars]
    df_atbats.drop_duplicates(inplace=True)

    # Sort
    df_atbats.sort_values(
        by=['game_id', 'atbat_batter', 'atbat_num'],
Example #44
# sort by date
kb.sort_values(by='date', inplace=True)
kb.reset_index(drop=True, inplace=True)

# give unique alliance IDs
kb['alliance_id'] = kb.apply(lambda x: uuid.uuid4(), axis=1)
kb['alliance_id'] = kb['alliance_id'].astype('str')

# save as parquet file
kb.to_parquet('/Users/Jakob/Documents/financial_news_data/kb.parquet.gzip')

## News articles

# read news articles
reuters = pd.read_parquet(
    '/Users/Jakob/Documents/financial_news_data/reuters.parquet.gzip')
bloomberg = pd.read_parquet(
    '/Users/Jakob/Documents/financial_news_data/bloomberg.parquet.gzip')

news = reuters.append(bloomberg)
news = news[['Date', 'Link', 'Article', 'Headline']]
news.columns = [x.lower() for x in news.columns]  # lowercase all column names

# sort out dates
news['date'] = pd.to_datetime(news['date'],
                              utc=True,
                              infer_datetime_format=True)
news.sort_values(by='date', inplace=True)
news['date'] = news['date'].dt.date  # keep only date not time

# remove city and source tag (only Reuters articles have them)
Example #45
File: do.py Project: cottrell/notebooks
 def read_transform_write(infile, outfile):
     print('{} -> {}'.format(infile, outfile))
     df = pd.read_parquet(infile)
     enrich_pandas_single(df, inplace=True)
     table = pa.Table.from_pandas(df, preserve_index=False)
     pq.write_to_dataset(table, root_path=outfile, preserve_index=False)
    parser.add_argument("--inference_folder", type=str)
    parser.add_argument("--new_iteration_folder", type=str)

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = get_args_from_command_line()
    data_path = '/scratch/mt4493/twitter_labor/twitter-labor-data/data'
    already_labelled_ids_path = os.path.join(data_path, 'active_learning',
                                             'sampling_top_lift',
                                             args.country_code,
                                             args.inference_folder,
                                             'already_labelled_ids.parquet')
    already_labelled_ids_df = pd.read_parquet(already_labelled_ids_path)
    already_labelled_ids_df['tweet_id'] = already_labelled_ids_df[
        'tweet_id'].astype(str)
    already_labelled_labels_path = os.path.join(data_path, 'qualtrics',
                                                args.country_code, 'old_iters',
                                                'labeling')
    already_labelled_labels_df = pd.concat([
        pd.read_parquet(path)
        for path in Path(already_labelled_labels_path).glob('*.parquet')
    ])
    already_labelled_labels_df['tweet_id'] = already_labelled_labels_df[
        'tweet_id'].astype(str)
    labels_df = already_labelled_ids_df.merge(already_labelled_labels_df,
                                              on=['tweet_id'
                                                  ]).reset_index(drop=True)
Example #47
    def _load(self) -> pd.DataFrame:
        load_path = PurePosixPath(self._get_load_path())

        with self._s3.open(str(load_path), mode="rb") as s3_file:
            return pd.read_parquet(s3_file, **self._load_args)
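The method above reads through an s3fs-style filesystem object; a standalone sketch of the same pattern, with a placeholder bucket and key:

import pandas as pd
import s3fs

fs = s3fs.S3FileSystem()
with fs.open('my-bucket/path/to/data.parquet', mode='rb') as f:
    df = pd.read_parquet(f)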
Example #48
def read_parquet(parq_file: pathlib.Path) -> pd.DataFrame:
    return pd.read_parquet(parq_file)
Example #49
import boto3
import pandas
import time

FILE_NAME = 'some-file.parquet'

pandas.set_option('display.expand_frame_repr', False)

session = boto3.Session(profile_name='sbx')
s3 = session.client('s3')
file_object = s3.download_file(
    Bucket='test-bucket',
    Key='location/to/folder/partition='+time.strftime('%Y%m%d')+'/'+FILE_NAME,
    Filename=FILE_NAME)
df = pandas.read_parquet(FILE_NAME)

print(df.count())
print("=======================")
print(df.head(20))

Example #50
 def pd_series_parquet_load(filename, **kwargs):
     series = pd.read_parquet(filename, **kwargs).iloc[:, 0]
     if series.name == '_series':
         series.name = None
     return series
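The matching save side is not shown; a hypothetical counterpart illustrating the convention the loader assumes (a one-column frame, with an unnamed Series stored under the placeholder name '_series'):

 def pd_series_parquet_save(series, filename, **kwargs):
     # Hypothetical counterpart to pd_series_parquet_load (not from the original project).
     name = series.name if series.name is not None else '_series'
     series.to_frame(name=name).to_parquet(filename, **kwargs)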
Example #51
 def check(columns, expected):
     if LooseVersion("0.21.1") <= LooseVersion(pd.__version__):
         expected = pd.read_parquet(tmp, columns=columns)
     actual = koalas.read_parquet(tmp, columns=columns)
     self.assertPandasEqual(expected, actual.toPandas())
Example #52
from bigdl.optim.optimizer import *
from bigdl.dataset.transformer import *

#from matplotlib.pyplot import imshow
#import matplotlib.pyplot as plt

# create sparkcontext with bigdl configuration
sc = SparkContext.getOrCreate(conf=create_spark_conf().setMaster("local[*]"))
init_engine()  # prepare the bigdl environment
bigdl.version.__version__  # Get the current BigDL version

# using SQLContext to read parquet file
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

pdf = pd.read_parquet("/home/matt/Bengalia/train_image_data_0.parquet").iloc[
    0:50, :]
pdf_sml = pdf.iloc[0:25, :]
sdf = sqlContext.createDataFrame(pdf_sml)
rdd_train_images = sdf.drop('image_id').rdd
rdd_train_labels = sc.parallelize(
    pd.read_csv("/home/matt/Bengalia/train.csv")["grapheme_root"].iloc[0:25])
rdd_train_sample = rdd_train_images.zip(rdd_train_labels).map(
    lambda features_label: common.Sample.from_ndarray(
        np.asarray([x / 255
                    for x in features_label[0]]), features_label[1] + 1))

pdf_sml = pdf.iloc[26:50, :]
sdf = sqlContext.createDataFrame(pdf_sml)
rdd_test_images = sdf.drop('image_id').rdd
rdd_test_labels = sc.parallelize(
    pd.read_csv("/home/matt/Bengalia/train.csv")["grapheme_root"].iloc[26:50])