Example #1
def worker(inpt):
    print(inpt)
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path, row_offset=offset, row_limit=chunksize)
    # df, meta = pyreadstat.read_file_in_chunks(pyreadstat.read_sav, path, offset=offset, chunksize=chunksize,
    #                                           multiprocess=True, num_processes=10)
    return df
Example #2
def worker(inpt):
    import pyreadstat
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path,
                                   row_offset=offset,
                                   row_limit=chunksize)
    return df
Example #3
def extract_sav_data(sav_file,
                     ioLocale='en_US.UTF-8',
                     ioUtf8=True,
                     engine='savReaderWriter'):
    """ see parse_sav_file doc """
    if engine == 'savReaderWriter':
        with sr.SavReader(sav_file,
                          returnHeader=True,
                          ioLocale=ioLocale,
                          ioUtf8=ioUtf8) as reader:
            thedata = [x for x in reader]
            header = thedata[0]
            dataframe = pd.DataFrame.from_records(thedata[1:],
                                                  coerce_float=False)
            dataframe.columns = header
            for column in header:
                if dataframe[column].dtype == object:
                    # Replace None with NaN because SRW returns None if casting dates fails (dates come back as object)
                    values = dataframe[column].dropna().values
                    if len(values) > 0:
                        if isinstance(values[0], str):
                            # savReaderWriter casts dates to str, so
                            # creating DATETIME objects should happen here
                            dataframe[column] = dataframe[column].dropna().map(
                                str.strip)
            return dataframe
    elif engine == 'readstat':
        df, meta = pyreadstat.read_sav(sav_file)
        return df
Example #4
def get_data(filename, survey_name, variables):
    variables = get_lit_variables(survey_name, variables)
    df, meta = pyreadstat.read_sav(filename,
                                   apply_value_formats=True,
                                   usecols=variables)
    df['survey'] = survey_name
    df.columns = df.columns.str.upper()
    return [df, meta]
Example #5
def spss_to_csv(source_filename, dest_fileobj):
    df, meta = pyreadstat.read_sav(source_filename)
    df.to_csv(
        dest_fileobj,
        float_format='%g',
        index=False,
        line_terminator='\r\n',
    )
Example #6
File: featherize.py Project: drmrd/scdb
def scdb_sav_to_dataframe(scdb_sav_path):
    try:
        dataset = pd.read_spss(scdb_sav_path)
    except (PyreadstatError, ReadstatError):
        dataset, _ = pyreadstat.read_sav(scdb_sav_path,
                                         apply_value_formats=True,
                                         encoding='iso-8859-1')
    return dataset
Example #7
def read_exportCSV(filepath: str):
    '''
    Only .sav files are handled here.
    '''

    # ----sav
    if filepath.endswith('sav'):
        try:
            dt, meta = pyreadstat.read_sav(filepath)
        except Exception:
            try:
                dt, meta = pyreadstat.read_sav(filepath, encoding='Big5-HKSCS')
            except Exception as e:
                return {filepath: e}

        # label
        col = dt.columns
        value_lab = []  # collect (and fill in missing) value labels
        for i in col:
            try:
                target = meta.variable_value_labels[i]
                target = {str(key): target[key] for key in target}
                str1 = str(target)
                #str1.replace(',',', \n')
            except KeyError:
                str1 = ''
            value_lab.append(str1)  # one label string per column

        labelDF = pd.DataFrame({
            'col_name': col,
            'col_lab': meta.column_labels,
            'val_lab': value_lab
        })

        labelDF.to_csv(re.sub(r"(?<=\.).*", 'label.csv', filepath),
                       encoding='utf_8_sig',
                       index=False)

        dt.to_csv(re.sub(r"(?<=\.).*", 'csv', filepath),
                  encoding='utf_8_sig',
                  index=False)
Example #8
    def feed(self):
        dataframe, meta = pyreadstat.read_sav(self.data_file)
        data = dataframe.drop(dataframe.columns[269:525],
                              axis=1).drop(dataframe.columns[526:], axis=1)
        weights = dataframe.drop(dataframe.columns[1:268], axis=1)
        # df(dataframe).to_sql("raw_data", self.engine, if_exists='replace', index=False)
        DataFrame(data).to_sql("questionnaire_data",
                               self.engine,
                               if_exists='replace',
                               index=False)
        DataFrame(weights).to_sql("weights_data",
                                  self.engine,
                                  if_exists='replace',
                                  index=False)
Example #9
File: outlier.py Project: Ralireza/DMTM
def iso_farest(test_data, max_samples):
    df, meta = pyreadstat.read_sav(
        "/Users/alireza/project/DMTM/flask/er0827t.sav")

    for column in df:
        if df[column].isnull().values.all():
            df.drop(columns=column, axis=1, inplace=True)
        else:
            labels = imp.imputation(df[column], "mean")
            df[column] = pd.DataFrame(labels)
    rng = np.random.RandomState(42)
    outlier_df = []
    for column in df:
        if df[column].isnull().values.all():
            df.drop(columns=column, axis=1, inplace=True)
        else:
            labels = imp.imputation(df[column], "mean")
            df[column] = pd.DataFrame(labels)
            mean = df[column].mean()
            std = df[column].std()
            tmp = rng.uniform(low=mean + 5 * std, high=10 * std, size=(100, 1))
            outlier_col = []
            for data in tmp:
                outlier_col.append(data[0])
            outlier_df.append(outlier_col)
    outlier_df = pd.DataFrame(outlier_df).transpose()

    # df_all=pd.DataFrame( np.concatenate( (df.values, outlier_df.values), axis=0 ) )
    # df_all.columns=df.columns
    train, test = train_test_split(df, test_size=0.2)

    # train=df[:9999]
    # test=df[10000:]
    # anomaly = outlier_df

    # training the model
    clf = IsolationForest(max_samples=max_samples, random_state=rng)
    clf.fit(train)
    y_pred_test = clf.predict(test_data)
    # y_pred_outliers = clf.predict(anomaly)

    # new, 'normal' observations ----
    # normal_accuracy = (list(y_pred_test).count(1) / y_pred_test.shape[0])
    # outliers ----
    # outlier_accuracy = (list(y_pred_outliers).count(-1) / y_pred_outliers.shape[0])

    return y_pred_test
Example #10
def get_data_with_filename(file):
    """Get metadata and data from the .sav file.

    Args:
        file (str): .sav filename.

    Returns:
        tuple: (data, meta), or (None, None) if the file cannot be found.
    """
    try:
        data, meta = pyreadstat.read_sav(file,
                                         apply_value_formats=True,
                                         encoding="ISO-8859-1")
        return data, meta
    except FileNotFoundError as FE:
        print("Need to investigate ", file)
        return None, None
Example #11
def main():
    print('Loading data...')
    # load raw data
    df, meta = pyreadstat.read_sav(raw_file)
    raw_df = df[['caseid'] + lib_qs + con_qs + demographics]

    # give each text response its own row
    df = get_unique_text(raw_df)

    # load coded data
    data = pd.read_csv(coded_file, index_col=0, sep='\t')
    data['userid'] = [str(uid) for uid in data['userid']]

    # keep just the columns we need
    data = data[['userid', 'topic', 'position', 'authentic']]

    # merge data
    all_data = data.merge(df, on=['userid', 'topic', 'position'], how='inner')
    print('%s observations loaded.' % len(all_data))

    features, networks = get_features(all_data)

    # merge features into existing df
    merged_df = all_data.merge(features, on='rid', how='inner')

    # reindex to make life easier
    df, networks = reindex(merged_df, networks)

    # Write data to file
    print('Writing features and survey data to file.')
    df.to_csv('yougov_data.txt', sep='\t')

    # calculate distances
    valid_pairs = get_pairs(df)
    distance_df = get_distances(valid_pairs, networks)

    distance_df.to_csv('distances.txt', sep='\t')

    print('Distance calculations saved to file.')
Example #12
def parse_file(filename):
    if "sav" in filename.lower():
        df, meta = pyreadstat.read_sav(filename,
                                       apply_value_formats=True,
                                       metadataonly=True)
        return df, meta, False
    elif "por" in filename.lower():
        df, meta = pyreadstat.read_por(filename,
                                       apply_value_formats=True,
                                       metadataonly=True)
        return df, meta, False
    elif "sas7bdat" in filename.lower():
        df, meta = pyreadstat.read_sas7bdat(filename, metadataonly=True)
        return df, meta, False
    elif "xpt" in filename.lower():
        df, meta = pyreadstat.read_xport(filename, metadataonly=True)
        return df, meta, False
    elif "dta" in filename.lower():
        df, meta = pyreadstat.read_dta(filename,
                                       apply_value_formats=True,
                                       metadataonly=True)
        return df, meta, False
    else:
        return None, None, True
Example #13
File: Komstat-1.py Project: knipsaz/proj
#!/usr/bin/env python
# coding: utf-8

# In[54]:

import pandas as pd
import numpy as np
import pyreadstat as ps

# In[11]:

df, meta = ps.read_sav("D:/KOMSTAT/Pert2/komstat1.sav")

# In[193]:

df.head(10)

# In[15]:

df.shape

# In[70]:

df.columns

# In[117]:

df.dtypes

# In[118]:
Example #14
                linewidth=1)

    if threshold is not None:
        ax.plot(x_grid,
                threshold * np.ones_like(stability_selection.lambda_grid),
                'b--',
                linewidth=0.5)

    ax.set_ylabel('Stability score')
    ax.set_xlabel('Lambda / max(Lambda)')

    fig.tight_layout()
    return fig, ax


df, meta = pyreadstat.read_sav("Final Data, May 3.sav")
imp = KNNImputer(missing_values=np.nan)
df = df.select_dtypes(include=['float32', 'float64', 'int'])
# df.insert(3, "num2", num2)
targetIndex = -1
df = df.iloc[pd.isna(df.iloc[:, targetIndex]).values == False, :]
# df = df.drop(columns=["Num1"])
imp.fit(df)
vars = df.columns[range(len(df.columns) - 1)]
df = imp.transform(df)
# df = pd.DataFrame(vals, columns=vars)
# df = df[["WBC0", "Plt0", "Mg0", "Age", "Ca0", "BMI", "Na0", "P0", "HB0", "AST0", "PH0", "ALT0", "CRP0_Quantitative", "HeartFailure0", "Nausea0", "WeaknessFatigue0", "Cough0", "K0", "PR0", "Cr0",
#          "COVID19_outcome"]]
X = np.round(df[:, range(0, df.shape[1] - 1)])
Y = np.round(df[:, targetIndex])
Example #15
                         args=np.append(main_args,
                                        [(length * 7 // 8, length), res_8]))
        procs.append(proc_8)

    for proc in procs:
        proc.start()
    for proc in procs:
        proc.join()

    estimates = np.concatenate(
        (res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8))
    return estimates


if __name__ == '__main__':
    df, meta = pyreadstat.read_sav('conjoint_host_sim_dummy.sav')
    for n in range(1, 8):
        df[f'T{n}_select'] = df[f'T{n}_select'].astype(int)
    features = np.delete(
        np.unique(list(map(lambda x: x[x.rindex('_') + 1:], df.columns[2:]))),
        -1)
    df_diff = diff_model(df, features)
    with pm.Model() as logistic_model:
        pm.glm.GLM.from_formula('target ~ {0}'.format(' '.join(
            list(map(lambda x: str(x) + ' ' + '+',
                     df_diff.columns[:-1])))[:-2]),
                                data=df_diff,
                                family=pm.glm.families.Binomial())
        trace_logistic_model = pm.sample(2000,
                                         step=pm.NUTS(),
                                         chains=1,
Example #16
# df_run_sps=df_run.loc[df_run.Extension==".sps"]
# df_run_xls=df_run.loc[df_run.Extension==".xls"]
# df_run_xlsx=df_run.loc[df_run.Extension==".xlsx"]
# df_run_xml=df_run.loc[df_run.Extension==".xml"]
# df_run_csv=df_run.loc[df_run.Extension==".csv"]

############################################################################
# Reading the .sav files

import pyreadstat

#df1, meta1 = pyreadstat.read_sav(r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2011\Decembre11\Données_FSMS_nov11_26_12.sav")
#df2, meta2 = pyreadstat.read_sav(r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2011\Janvier11\Données\FSMS_2011_4-2-11_URBAN.sav")
#df3, meta3 = pyreadstat.read_sav(r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2011\Janvier11\Données\FSMS_2011_RURAL_FINAL.sav")
df4, meta4 = pyreadstat.read_sav(
    r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\Mauritania FSMS data\2011\Juin11"
)
#df5, meta5 = pyreadstat.read_sav(r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2012\Analyse croise SA_NUT\RIM_FSMS_SMART_juil2012.sav")
#df6, meta6 = pyreadstat.read_sav(r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2012\Decembre\Donnes_FSMSdec12_HH_commun.sav")
df7, meta7 = pyreadstat.read_sav(
    r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2012\Juin\Données_FSMS_juil_12.sav"
)
#df8, meta8 = pyreadstat.read_sav(r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2013\Decembre\Données FSMS 13Dec_20_01_14.sav")
df9, meta9 = pyreadstat.read_sav(
    r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2013\Juin\FSMS_HH_juil13b_1.sav"
)
#df10, meta10 = pyreadstat.read_sav(r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2014\Decembre\Données_FSMS_24_06_15.sav")
df11, meta11 = pyreadstat.read_sav(
    r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2014\Juin\Données_FSMS_juin_2014.sav"
)
#df12, meta12 = pyreadstat.read_sav(r"C:\Users\simon\Documents\Simon\Data4Good\9_GeoWatch\2015\Decembre\Données FSMS Jan16_18_02.sav")
Example #17
        print(df[feature_name].min())
        result[feature_name] = (df[feature_name] - min_value) / (max_value -
                                                                 min_value)
    return result


if __name__ == "__main__":
    if (len(sys.argv) != 3):
        sys.stderr.write('Usage: "{0}" $csvFileName $IndexOfX1\n'.format(
            sys.argv[0]))
        os._exit(1)

    if (".csv" in sys.argv[1]):
        data = pd.read_csv(sys.argv[1])
    else:
        data, meta = pyreadstat.read_sav(sys.argv[1])
    # Replace All space in column headers
    data.rename(columns=lambda name: name.replace(" ", "_"), inplace=True)

    # get column headers such as [keyX1, keyX2, keyX3, ..., keyY]
    keyList = data.columns.values[int(sys.argv[2]):]
    print("data readed table \n", data)

    #x = data.values #returns a numpy array
    #min_max_scaler = preprocessing.MinMaxScaler()
    #x_scaled = min_max_scaler.fit_transform(x)
    #data = pd.DataFrame(x_scaled)

    data = normalize(data[keyList[1:]])
    print(data)
    data = data.iloc[:, 1:6].values
Example #18
# -*- coding: utf-8 -*-
"""
Created on Sat Jun  8 23:21:28 2019

@author: garci
"""
'CONVERT SPSS FILES (.SAV) TO PYTHON DATAFRAME (LINE 12) PASS AS .CSV FILE (LINE 14)'

import pyreadstat
import pandas as pd

'FILE'
filename = 'experim.sav'
folder = 'data/'

df, meta = pyreadstat.read_sav(folder + filename)

'prints dataframe df'
print(df)

'converts .sav file to .csv and places it in script path'
df.to_csv(filename[:-4] + '.csv')
Example #19
    # list all files in folder
    list_all_files = []
    for path, subdirs, files in os.walk(root_folder):
        for name in files:
            list_all_files.append(os.path.join(path, name))

    # list data files
    list_data_file = [f for f in list_all_files if re.search('.sav$', f)]

    return (list_data_file)


fsms_file = list_fsms_file()

data, meta = pyreadstat.read_sav(fsms_file[5],
                                 apply_value_formats=True,
                                 encoding="ISO-8859-1")


def clean_gps_coord(string):
    string = string.replace('&lt;Point&gt;&lt;coordinates&gt;', '')
    string = string.replace('&lt;/coordinates&gt;&lt;/Point&gt;', '')
    return (string)


def extract_latitude(string):
    lon = string.split(',', 2)[0]
    lat = string.split(',', 2)[1]
    return lat, lon

Example #20
records_ = df.to_dict(orient='records')
r = json.dumps(records_)
loaded_r = json.loads(r)
who_say.insert_many(loaded_r)

#--------------World Opinion Survey-------------------------------------------------------

import pandas as pd
import pyreadstat
import pymongo
from pymongo import MongoClient
client = MongoClient('localhost', 27017)

#--------- Import db from sav ---------------------------------------

df_WV2, meta_WV2 = pyreadstat.read_sav("WV2.sav", apply_value_formats=True)
df_WV3, meta_WV3 = pyreadstat.read_sav("WV3.sav", apply_value_formats=True)
df_WV4, meta_WV4 = pyreadstat.read_sav("WV4.sav", apply_value_formats=True)
df_WV5, meta_WV5 = pyreadstat.read_sav("WV5.sav", apply_value_formats=True)
df_WV6, meta_WV6 = pyreadstat.read_sav("WV6.sav", apply_value_formats=True)

#--------- call local host and create new data-----------------------

db = client.new_data
spss = db.spss

#--------- labels -----------------------------------------------------
df_WV2.columns = meta_WV2.column_labels
df_WV3.columns = meta_WV3.column_labels
df_WV4.columns = meta_WV4.column_labels
df_WV5.columns = meta_WV5.column_labels
Example #21
'NT1010',
'NT2010',
'NT1058',
'NT2058',
'NT1137',
'NT2137',
'NT1097',
'NT2097',
'NT99999999',
'NT9999'
]


name_csv = 'GFC_SelfReport.sav'
os.chdir('/Users/paulsharp/Documents/Dissertation_studies/data')
df_full, meta = pyreadstat.read_sav(name_csv, user_missing=True, apply_value_formats=False)
GFC_Subs = []
for sub in df_full.NT_ID:
    sub = sub[-3:]
    GFC_Subs.append(sub)




#create conversion dictionary called power_bb from Power 264 to Big Brain 300 parcellation 
os.chdir(path_to_subs)
with open('convert_Power_to_bigbrain.csv', 'r') as f:
    r = reader(f)
    lines = [l for l in r]

for row in lines:
Example #22
import math
import threading
from time import time

import pyreadstat


def worker(inpt):
    print(inpt)
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path, row_offset=offset, row_limit=chunksize)
    # df, meta = pyreadstat.read_file_in_chunks(pyreadstat.read_sav, path, offset=offset, chunksize=chunksize,
    #                                           multiprocess=True, num_processes=10)
    return df


start_ts = time()

# calculate the number of rows in the file
_, meta = pyreadstat.read_sav("Surgery.sav", metadataonly=True)
numrows = meta.number_rows
# set the chunksize manually (it could also be derived from the number of cores) and calculate the offsets
chunksize = 200
offsets = [indx * chunksize for indx in range(math.ceil(numrows / chunksize))]
# pack the data for the jobs
jobs = [(x, chunksize, "Surgery.sav") for x in offsets]

threads = []
max_threads = 30
while threads or jobs:
    for thread in threads:
        if not thread.is_alive():
            threads.remove(thread)
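# --- a hedged sketch, not the author's original continuation ---
# The dispatch loop above is cut off, so this restates it in full (in a
# runnable script it would replace the truncated loop): finished threads are
# pruned, new threads are started while there is spare capacity, and each
# chunk lands in a shared `results` list because threads cannot return
# values. `results`, `run_job` and `final_df` are illustrative names added
# for this sketch only.
import pandas as pd

results = []


def run_job(job):
    results.append(worker(job))


while threads or jobs:
    for thread in list(threads):
        if not thread.is_alive():
            threads.remove(thread)
    while jobs and len(threads) < max_threads:
        t = threading.Thread(target=run_job, args=(jobs.pop(),))
        t.start()
        threads.append(t)
    if threads:
        threads[0].join(timeout=0.1)  # brief wait instead of busy-spinning

final_df = pd.concat(results, ignore_index=True)
print("elapsed seconds:", time() - start_ts)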
Example #23
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import pyreadstat

df1, meta = pyreadstat.read_sav(
    '/Users/lazarus/galvanize/datasets/osfstorage-archive/CCAM SPSS Data.sav')
df1.to_csv('/Users/lazarus/galvanize/capstone_1/data/climate_survey_data.csv')
'''
end product: bar graph of each static demographic's % change in belief
static demographics:
'gender'__________2
'generation'______6
'income_category'_3
'race'____________4
'party_x_ideo'____6
SPECIAL CASE ***'religion'_15***
SPECIAL CASE ***'evangelical'_4*** # look up evangelical environmental movement
'''


class DemographicAnalysis:
    def __init__(self, df):
        self.df = df1[df1.year != 1]  # get rid of 2008 because it's weird

    def split_demographics(self, df):
        # columns_to_split = input('Which categories do you want to split? ') <----interactive for future users
        columns_to_split = [
            'gender', 'generation', 'income_category', 'race', 'party_x_ideo'
        ]  # choose which demographics to separate from their column categories
Example #24
# Read Excel File
Data = pd.read_excel("PATH_TO_FILE(or)URL_FOR_THE_FILE/example.xls", sheet_name="Name")
#  Read SAS File
Data = pd.read_sas('example.sas7bdat')
# Read Stata File
Data = pd.read_stata('example.dta')
# Read R Data File
result = pyreadr.read_r('C:/Users/sampledata.RData')
print(result.keys()) # let's check what objects we got
Data = result["df1"] # extract the pandas data frame for object df1
# Read SQL Table from Sqlite3 with .db exension
conn = sqlite3.connect('C:/Users/Deepanshu/Downloads/flight.db')
query = "SELECT * FROM flight"
Data = pd.read_sql(query, con=conn)
# Read Data from SPSS File
Data, meta = pyreadstat.read_sav("file.sav", apply_value_formats=True)

"""Modules to be used based on size of the data

- Pandas - small datasets up to 1 GB
- Dask - medium datasets up to XX GB
- Vaex - large datasets up to TBs
"""

# Example

data = pd.read_csv("https://raw.githubusercontent.com/Mineria/Titanic/master/csv/train.csv")
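
# A hedged counterpart for larger-than-memory files, assuming dask is
# installed and a local "train.csv" exists (both assumptions, not part of
# the original): dask.dataframe reads the CSV lazily in partitions instead
# of loading it all at once, matching the size guideline quoted above.
import dask.dataframe as dd

ddf = dd.read_csv("train.csv")   # lazy, partitioned read
print(ddf.head())                # computes only the first partition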

"""# Preprocessing

Basic preprocessing Steps:
Example #25
File: savToCsv.py Project: ssadata/rucas
#######################################################################

##### This script converts the SPSS (`.sav`) files to CSV ######

# 1) To do that, we read from the minio directory that holds the `.sav` files
#[/home/ubuntu/Rucas/data/sav/]

# 2) After that, add the name of the file to be converted [replace `<filename>`]

# 3) Finally, give a name to the file that will be saved as `.csv` [replace `<name>`]

import pyreadstat as prst
# 1) #################################################### 2)  #########
df0, meta = prst.read_sav(
    '/home/ubuntu/Rucas/data/sav/W1 BdM BBDD HH 20 04 24.sav')

############# 3)  #####################################################
df0.to_csv(
    '/home/ubuntu/Rucas/data/dir_path/csv/tab/W1 BdM BBDD HH 20 04 24.csv',
    sep=',',
    float_format='%g',
    encoding='utf-8',
    index=False)
################################################ sep=',', float_format strips the decimals, '12.0' to '12' ####

# If this script cannot be run: http://ezcsv2sav.com/about/
Example #26
import multiprocessing as mp

import pandas as pd
import pyreadstat


def worker(inpt):
    import pyreadstat
    offset, chunksize, path = inpt
    df, meta = pyreadstat.read_sav(path,
                                   row_offset=offset,
                                   row_limit=chunksize)
    return df


# calculate the number of rows in the file
_, meta = pyreadstat.read_sav("big.sav", metadataonly=True)
numrows = meta.number_rows
# calculate number of cores in the machine; this could also be set manually to some number, e.g. 8
numcores = mp.cpu_count()
# calculate the chunksize and offsets
divs = [
    numrows // numcores + (1 if x < numrows % numcores else 0)
    for x in range(numcores)
]
chunksize = divs[0]
offsets = [indx * chunksize for indx in range(numcores)]
# pack the data for the jobs
jobs = [(x, chunksize, "big.sav") for x in offsets]

pool = mp.Pool(processes=numcores)
# let's go!
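# --- hedged sketch: the snippet is cut off right after creating the pool ---
# The usual continuation maps the worker over the prepared job list (one
# chunk per process) and stitches the returned DataFrames back together.
# `chunks` and `final_df` are illustrative names added for this sketch; on
# platforms that spawn processes (Windows/macOS) this would normally sit
# under an `if __name__ == "__main__":` guard.
chunks = pool.map(worker, jobs)
final_df = pd.concat(chunks, ignore_index=True)
pool.close()
pool.join()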
Example #27
import pyreadstat
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from scipy.spatial import distance, distance_matrix
from sklearn.impute import SimpleImputer
import pandas as pd
from skbio import DistanceMatrix
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.utils import class_weight
from xgboost import XGBClassifier
from imblearn import pipeline as pl

df, meta = pyreadstat.read_sav("CMS.sav")
cols = df.columns
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df = df.select_dtypes(include=['float32', 'float64', 'int'])
imp.fit(df.values)
df = imp.transform(df.values)

CMS = np.round(df[:, -1])
# df = df.select_dtypes(include=['float32', 'float64', 'int'])
X = df[:, 0:df.shape[1] - 1:1]
CMS = CMS - 1
CMS[CMS == -1] = 1

Xd = pd.DataFrame(X)
duplicatedItem = Xd.duplicated(keep='first')
X = X[duplicatedItem == False, :]
Example #28
Created on Fri Jun 19 14:45:18 2020
@author: hsuwei
"""

import os
import re
from pathlib import Path
import pandas as pd
import pyreadstat

os.chdir('C:/Users/user/Desktop/LifeHistory')

# input

df96, meta96 = pyreadstat.read_sav('./data/1996/tscs1996q2.sav',
                                   apply_value_formats=False,
                                   formats_as_category=False
                                   )













Example #29
# import pandas, numpy, and pyreadstat
import pandas as pd
import numpy as np
import pyreadstat
pd.set_option('display.max_columns', 5)
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.width', 75)

# retrieve spss data, along with the meta data
nls97spss, metaspss = pyreadstat.read_sav('data/nls97.sav')
nls97spss.dtypes
nls97spss.head()

nls97spss['R0536300'].value_counts(normalize=True)

# use column labels and value labels
metaspss.variable_value_labels['R0536300']
nls97spss['R0536300'].\
  map(metaspss.variable_value_labels['R0536300']).\
  value_counts(normalize=True)
nls97spss = pyreadstat.set_value_labels(nls97spss, metaspss, formats_as_category=True)
nls97spss.columns = metaspss.column_labels
nls97spss['KEY!SEX (SYMBOL) 1997'].value_counts(normalize=True)
nls97spss.dtypes
nls97spss.columns = nls97spss.columns.\
    str.lower().\
    str.replace(' ','_').\
    str.replace('[^a-z0-9_]', '', regex=True)
nls97spss.set_index('pubid__yth_id_code_1997', inplace=True)

# apply the formats from the beginning
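# a hedged sketch of that step: pyreadstat can apply the value labels while
# reading instead of calling set_value_labels afterwards; the path and the
# column-label handling below simply mirror the calls made above
nls97spss, metaspss = pyreadstat.read_sav('data/nls97.sav',
                                          apply_value_formats=True,
                                          formats_as_category=True)
nls97spss.columns = metaspss.column_labels
nls97spss.dtypes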
Example #30
import matplotlib.pyplot as plt
import pandas as pd
import pyreadstat

from pew_crosstab import CrossTab
from pew_bars import BarNums
from cramer_chi import ContTabs

w68spss, metaspss = pyreadstat.read_sav('../data/W68.sav',
                                        apply_value_formats=True,
                                        formats_as_category=True)

fields = [
    'COVIDFOL_W68', 'COVIDCOVER1_W68', 'COVIDFACTS_b_W68',
    'COVIDNEWSCHNG_a_W68', 'COVIDNEWSCHNG_c_W68', 'COVIDNEWSCHNG_d_W68',
    'COVIDNEWSCHNG_e_W68', 'COVIDINFODIFF_W68', 'COVIDLOCINFO_W68',
    'COVIDDEAL_W68', 'COVIDPLANHRD_W68', 'COVIDPLANTRUE_W68',
    'COVIDPLANWATCH_W68', 'F_METRO', 'F_EDUCCAT', 'F_PARTY_FINAL'
]

facets = ['F_METRO', 'F_EDUCCAT', 'F_PARTY_FINAL']

answers = [
    'COVIDFOL_W68', 'COVIDCOVER1_W68', 'COVIDFACTS_b_W68',
    'COVIDNEWSCHNG_a_W68', 'COVIDNEWSCHNG_c_W68', 'COVIDNEWSCHNG_d_W68',
    'COVIDNEWSCHNG_e_W68', 'COVIDINFODIFF_W68', 'COVIDLOCINFO_W68',
    'COVIDDEAL_W68', 'COVIDPLANHRD_W68', 'COVIDPLANTRUE_W68',
    'COVIDPLANWATCH_W68'
]

df68 = pd.DataFrame(w68spss)