Example #1
    def __init__(self,
                 tep_file_fault_free,
                 tep_file_faulty,
                 is_test=False,
                 normalize=True):
        """
        Args:
            csv_file (string): path to csv file
            normalize (bool): whether to normalize the data in [-1,1]
        """
        if "sampled" in tep_file_fault_free:
            df = pd.read_pickle(tep_file_fault_free)
        else:
            fault_free = py.read_r(tep_file_fault_free)
            faulty = py.read_r(tep_file_faulty)
            if is_test:
                df = pd.concat([
                    fault_free['fault_free_testing'], faulty['faulty_testing']
                ])
            else:
                df = pd.concat([
                    fault_free['fault_free_training'],
                    faulty['faulty_training']
                ])

        # TODO: add conditioning on the fault number; for now only the normal (fault-free) condition is kept
        df = df[(df.faultNumber == 0)]
        work_with_columns = [
            'faultNumber', 'simulationRun', 'sample', 'xmeas_1'
        ]
        raw_data = torch.from_numpy(
            np.expand_dims(
                np.array([
                    g[1]["xmeas_1"] for g in df[work_with_columns].groupby(
                        ['faultNumber', 'simulationRun'])
                ]), -1)).float()
        # for checking if logic above is working properly
        assert np.allclose(
            raw_data.squeeze()[0, :].numpy(),
            df[(df.simulationRun == 1) & (df.faultNumber == 0)].xmeas_1.values)
        self.data = self.normalize(raw_data) if normalize else raw_data
        self.seq_len = raw_data.size(1)

        # Deltas between the last and the first time step of each run;
        # the Gaussian parameters below are estimated from the (possibly normalized) deltas
        original_deltas = raw_data[:, -1] - raw_data[:, 0]
        self.original_deltas = original_deltas
        self.or_delta_max, self.or_delta_min = original_deltas.max(), original_deltas.min()
        deltas = self.data[:, -1] - self.data[:, 0]
        self.deltas = deltas
        self.delta_mean, self.delta_std = deltas.mean(), deltas.std()
        self.delta_max, self.delta_min = deltas.max(), deltas.min()
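
A minimal usage sketch; the class name TEPGenerationDataset and the file paths are placeholders, since only __init__ is shown above:

dataset = TEPGenerationDataset(
    tep_file_fault_free="data/TEP_FaultFree_Training.RData",   # assumed path
    tep_file_faulty="data/TEP_Faulty_Training.RData",          # assumed path
    is_test=False,
    normalize=True)
print(dataset.seq_len, dataset.delta_mean, dataset.delta_std)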
Example #2
def load_1ddata(filename0, filename1):
    """Join the 1-minute indicator table with the 1-minute data table for the same date."""
    assert filename0[:10] == filename1[:10], 'Different dates in the two datasets!'
    # indicator table: strip the 2-character prefix from the symbol codes
    df0 = pyreadr.read_r('%s%s' % (input_dir0, filename0))['indicator_1m']
    df0['Symbol'] = [i[2:] for i in df0['Symbol']]
    # data table: drop the last four columns and rename 'minute' to 'Time'
    df1 = pyreadr.read_r('%s%s' % (input_dir1, filename1))['data_1m'].iloc[:, :-4]
    df1['Time'] = df1['minute']
    del df1['minute']
    # inner join on date, time and symbol
    tdf = df1.merge(df0, on=['Date', 'Time', 'Symbol'], how='inner')
    return tdf
Example #3
    def load_metadata(self, filename=None, alt_url=None, out_dir=None):
        """ Load the AURN metadata from file or URL.
            If filename is None, will use the metadata stored at the URL: alt_url
            Otherwise, will load from URL: alt_url

            Dependencies:
                Path

            Args:
                filename: (string) Valid file name of existing AURN metadata R file, or None
                alt_url: (string) Valid URL pointing to AURN metadata downloadable source, or None
                out_dir: (string) Directory to store .RData file if alt_url used.

            Returns:
                Dataframe containing the downloaded data

        """
        assert filename is None or Path(filename).is_file(), \
            'Invalid filename: {}'.format(filename)

        # Has a filename been entered and does the file exist?
        if filename is not None:
            print("Metadata file {} exists so will use this".format(filename))
            filename = Path(filename)

            # Read the RData file into a Pandas dataframe
            try:
                print('Reading filename {} into dataframe.'.format(
                    filename.name))
                return pyreadr.read_r(str(filename))
            except Exception as err:
                raise ValueError(
                    'Error reading into dataframe from R file: {} . {}'.format(
                        filename, err))

        # No filename so try URL if one exists, or raise error
        assert alt_url is not None, 'No filename was given and no alternative URL was provided'

        # Does the URL alternative exist and does it work
        print("\nDownloading data file using url {}".format(alt_url))
        try:
            print('\nLoading metadata file from url')
            filename = Path(wget.download(alt_url, out_dir))
            return pyreadr.read_r(str(filename))
        except Exception as err:
            raise ValueError(
                'Error obtaining metadata file from url: {}. {}'.format(
                    alt_url, err))
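
A hedged usage sketch for load_metadata; the owning object, the AURN URL and the output directory are assumptions, not taken from the original:

downloader = AURNDownloader()  # hypothetical class exposing load_metadata
metadata = downloader.load_metadata(
    filename=None,
    alt_url="https://uk-air.defra.gov.uk/openair/R_data/AURN_metadata.RData",  # assumed URL
    out_dir="cached_data")
print(list(metadata.keys()))  # pyreadr.read_r returns a dict of dataframes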
    def __init__(self,
                 tep_file_fault_free,
                 tep_file_faulty,
                 window_size=10,
                 is_test=False,
                 transform=None):
        self.window_size = window_size
        self.is_test = is_test
        self.transform = transform

        if "sampled" in tep_file_fault_free:
            self.df = pd.read_pickle(tep_file_fault_free)
        else:
            fault_free = py.read_r(tep_file_fault_free)
            faulty = py.read_r(tep_file_faulty)
            if is_test:
                self.df = pd.concat([
                    fault_free['fault_free_testing'], faulty['faulty_testing']
                ])
            else:
                self.df = pd.concat([
                    fault_free['fault_free_training'],
                    faulty['faulty_training']
                ])

        # the dataset comes with a broken index, so sort and rebuild it
        self.df = self.df \
            .sort_values(by=["faultNumber", "simulationRun", "sample"], ascending=True) \
            .reset_index(drop=True)

        self.class_count = len(self.df.faultNumber.value_counts())

        self.runs_count = (self.df.faultNumber.unique().shape[0] *
                           self.df.simulationRun.unique().shape[0])
        self.sample_count = 960 if is_test else 500
        self.shots_count = self.sample_count - self.window_size + 1

        # build labels; in TEP the fault is introduced after sample 20 (training) / 160 (testing),
        # so the early samples of faulty runs are relabelled as normal (0)
        self.labels = self.df.loc[:, ["faultNumber", "sample"]]
        self.labels.loc[:, "label"] = self.labels.loc[:, "faultNumber"].astype('long')
        if is_test:
            self.labels.loc[(self.labels.label != 0) &
                            (self.labels["sample"] <= 160), "label"] = 0
        else:
            self.labels.loc[(self.labels["label"] != 0) &
                            (self.labels["sample"] <= 20), "label"] = 0

        self.features_count = self.df.shape[1] - 3
    def rdata_data(self, filename, is_int=False, is_float=False):
        """Read an R data file and return, for each value column, a dict keyed by the first column."""
        to_return = []
        data = pyreadr.read_r(filename)

        for k in data.keys():
            for r in range(0, len(data[k].index)):
                row = data[k].iloc[r, :]
                # on the first row of each table, create one dict per value column
                if r == 0:
                    for i in range(0, len(row) - 1):
                        to_return.append({})

                for j in range(1, len(row)):
                    if is_int or is_float:
                        try:
                            # numeric columns: skip NaNs (both int and float requests are stored as floats here)
                            if not math.isnan(float(row[j])):
                                to_return[j - 1][int(row[0])] = float(row[j])
                        except (ValueError, TypeError):
                            # non-numeric strings: drop the leading character and parse the remainder
                            if len(row[j]) != 0:
                                to_return[j - 1][int(row[0])] = float(row[j][1:])
                    else:
                        to_return[j - 1][int(row[0])] = row[j]

        return to_return
Example #6
def import_external_file_as_dataframe(public_file_url,
                                      data_name,
                                      extension,
                                      import_method="feather"):

    absolute_dir = pathlib.Path().resolve()
    file_name = data_name + extension
    destination_path = absolute_dir.parent.joinpath("data", "public_data",
                                                    file_name)
    download_a_file_from_url(public_file_url, destination_path)

    if (import_method == "rdata"):
        return import_rda_file_by_rdata(import_path=destination_path,
                                        dataframe_name=data_name)

    # note: feather.read_dataframe raises "ArrowInvalid: Not a Feather V1 or Arrow IPC file" when the file is not in Feather format
    elif (import_method == "feather"):
        return feather.read_dataframe(destination_path)[data_name]

    elif (import_method == "pyreadr"):
        return pyreadr.read_r(str(destination_path))[data_name]

    else:
        print("Unrecognized import_method '{}'; exiting.".format(import_method))
        sys.exit(1)
Example #7
def load_sites_and_obtain_their_grid_locations(wrf_in,sensors_file,sensor_file_type):

    # open the sensor dataset
    if(sensor_file_type == 'CSV'):
        sens = pd.read_csv(sensors_file,usecols=['long','lat','sensor_name'])
        # rename so the grid transform below can use the same column names as the RDATA branch
        sens = sens.rename(columns={'long':'longitude','lat':'latitude'})
    elif(sensor_file_type == 'RDATA'):
        metadata = pyreadr.read_r(sensors_file.as_posix())
        sens = metadata['AURN_metadata'][['site_id','latitude','longitude']].drop_duplicates()
    else:
        print('Sensor file type not recognised: {}'.format(sensor_file_type))
        print('Should be CSV or RDATA, please amend as appropriate.')
        quit()

    # get the indexes from the wrf file
    with salem.open_wrf_dataset(wrf_in) as ds:
        sens['iarray'],sens['jarray'] = ds.salem.grid.transform(sens['longitude'],sens['latitude'],nearest=True)

    #%% check to make sure that all stations are within our model domain - drop those that aren't
    if any(sens['iarray']>ds.dims['west_east']) or any(sens['jarray']>ds.dims['south_north']):
        print('dropping these stations outside of the model domain:')
        print(sens[sens['jarray']>ds.dims['south_north']])
        print(sens[sens['iarray']>ds.dims['west_east']])
        sens = sens[sens['jarray']<=ds.dims['south_north']]
        sens = sens[sens['iarray']<=ds.dims['west_east']]

    return(sens)
Example #8
def rds_to_html(file_path):

    app_rds = pyreadr.read_r(file_path)
    app_df = app_rds[None]
    app_html = app_df.iloc[0, 0]

    return app_html
Example #9
def generateTrainPredictFromRDS(inFile, negProtein=None):
    """
    This function reads the rds file and generates training/predict set using the given symbols and labels.
    RDS files are generated using R code.
    """
    logging.info('Loading data from RDS file to create a dictionary')
    rdsdata = pyreadr.read_r(inFile)
    df = rdsdata[None]
    trainData[True] = set(df.loc[(df['Y'] == 'pos')
                                 & (df['subset'] == 'train')]["id1"])
    trainData[False] = set(df.loc[(df['Y'] == 'neg')
                                  & (df['subset'] == 'train')]["id1"])
    # trainData[True] = set(np.where(rdsdata[None]['Y']=='pos')[0])
    # trainData[False] = set(np.where(rdsdata[None]['Y']=='neg')[0])

    # if extra negative protein ids were supplied, add them to the negative training set
    if negProtein is not None:
        trainData[False].update(negProtein)

    # determine train and predict set
    predictProteinSet = allProteinIds.difference(trainData[True])
    predictProteinSet = predictProteinSet.difference(trainData[False])
    predictData['unknown'] = predictProteinSet
    logging.info(
        'Count of positive labels: {0}, count of negative labels: {1}'.format(
            len(trainData[True]), len(trainData[False])))
    if len(trainData[True]) == 0 or len(trainData[False]) == 0:
        logging.error('ML codes cannot be run with one class')
        exit()
    else:
        return trainData, predictData
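
A rough usage sketch; trainData, predictData and allProteinIds are module-level globals in the original code, so they are stubbed here with illustrative values:

trainData, predictData = {}, {}
allProteinIds = {"P00533", "P04637", "Q9Y6K9"}   # hypothetical protein ids
train, predict = generateTrainPredictFromRDS("data/training_labels.rds")  # hypothetical path
print(len(train[True]), len(train[False]), len(predict['unknown']))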
Example #10
 def readremoteRDSdata(url=''):
     """
     Read an R RDS file from a remote Internet repository by URL
 
     Parameters
     ----------
     url : string
         The raw-formatted RDS file to load in. The default is ''.
 
     Raises
     ------
     Exception
         Throws an exception if scratch disk is unavailable or data is inaccessible.
 
     Returns
     -------
     pandas dataframe
          A pandas dataframe containing the uncompressed data from the URL.
 
     """
     scratch = ''
     try:
         scratch = rdshandling.getfilename(url) + '.rda'
     except:
         raise Exception("Filename issue")
     local = None
     result = None
     try:
         local = pyreadr.download_file(url, scratch)
         result = pyreadr.read_r(local)
     except Exception as e:
         print(e)
     return result[None]
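
A minimal usage sketch, assuming the URL points at a raw RDS file (the URL is illustrative):

df = readremoteRDSdata("https://example.org/data/measurements.rds")
print(df.head())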
Example #11
 def test_plot_mse_epoch_small(self):
     '''
     test plot&dimension of data
     '''
     try:
         newres=pyreadr.read_r(projresdir+plotdata_r)
         oldres=pyreadr.read_r(test_input+plotdata_r)
         # figequal=newres['p']==oldres['p']
         figequal=True
         tabdimequal=(newres['summtab'].shape[0]==oldres['summtab'].shape[0] and newres['msetablong'].shape==oldres['msetablong'].shape)
         print("summtab_size %s msetablong_size %s\n" % (newres['summtab'].shape,newres['msetablong'].shape,))
         self.assertTrue(figequal and tabdimequal)
     except Exception:
         self.fail("could not read or compare the plot data files")
Example #12
def read_rds(filepath):
    """Read an RDS-format matrix into a Pandas dataframe.
    Location can be data, scratch, or results.
    Index is populated from first column"""
    raw_df = pyreadr.read_r(filepath)[None]
    if raw_df.isnull().values.any():
        raise ValueError("NaN's were found in the data matrix.")
    return raw_df.set_index(raw_df.columns[0], drop=True)
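
A short usage sketch with an illustrative path; the first column of the RDS matrix becomes the index:

expr = read_rds("results/expression_matrix.rds")  # hypothetical file
print(expr.shape, expr.index[:5])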
Example #13
def rds_to_html(file_path):

    f_name = str(Path(file_path))
    app_rds = pyreadr.read_r(f_name)
    app_df = app_rds[None]
    app_html = app_df.iloc[0, 0]

    return app_html
Example #14
def rds_to_html(file):

    f_name = os.path.join(rds_directory_full_path, file)
    app_rds = pyreadr.read_r(f_name)
    app_df = app_rds[None]
    app_html = app_df.iloc[0, 0]

    return app_html
Example #15
def read_projects_data():
    frames = []
    for week in weeks:
        projects_path, _ = week_paths[week]
        p_df = pyreadr.read_r(projects_path)[None]
        frames.append(p_df[['project_slug', 'project_name', 'creator_name']])
    # DataFrame.append was removed in pandas 2.0, so concatenate the weekly frames instead
    result = pd.concat(frames)
    result.drop_duplicates(inplace=True)
    return result
Example #16
File: slicer.py  Project: Xzh0u/IDS-LSTM
def load_RData():
    # Reading train data in .R format
    train_data_0 = py.read_r("data/RData/TEP_FaultFree_Training.RData")
    train_data_1 = py.read_r("data/RData/TEP_Faulty_Training.RData")
    # Reading test data in .R format
    test_data_0 = py.read_r("data/RData/TEP_FaultFree_Testing.RData")
    test_data_1 = py.read_r("data/RData/TEP_Faulty_Testing.RData")
    print("Finish reading data.")

    # Concatenating the train and test datasets
    tr = [train_data_0['fault_free_training'], train_data_1['faulty_training']]
    train = pd.concat(tr)  # Train dataframe
    ts = [test_data_0['fault_free_testing'], test_data_1['faulty_testing']]
    test = pd.concat(ts)  # Test dataframe

    # Save the datasets into csv file
    train.to_csv("data/train.csv")
    test.to_csv("data/test.csv")
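
A brief usage sketch, assuming the .RData files exist under data/RData/ as in the snippet above:

load_RData()                                        # writes data/train.csv and data/test.csv
train = pd.read_csv("data/train.csv", index_col=0)
print(train.faultNumber.value_counts())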
Example #17
def load_population(rds_file: str):
    """Loads population data from RDS file.

    :param rds_file: an RDS file containing a data.frame with (at least) columns 'LA.code', 'name', 'Area.name.2', 'age', 'n'
    """
    raw = pyr.read_r(rds_file)
    df = list(raw.values())[0]
    df = df.sort_values(by=['LA.code', 'age'])
    return df[['LA.code', 'name', 'Area.name.2', 'age', 'n']]
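
A hedged usage sketch (the file path is a placeholder):

pop = load_population("data/population_by_la.rds")   # hypothetical path
print(pop.groupby("LA.code")["n"].sum().head())      # total population per local authority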
Example #18
def read_data(root):

    #read dataset
    result = pyreadr.read_r(root+'TNBC_data/TCGA_TNBC112.RData')

    print(result.keys()) # let's check what objects we got
    gene_df = result["TCGA"] # extract the pandas data frame for the 'TCGA' object

    return gene_df.T
Example #19
def load_age_mixing(rds_file: str):
    """Loads age mixing matrix from R.

    :param rds_file: a .rds file containing an R data.frame with mixing matrix
    """
    raw = pyr.read_r(rds_file)
    K = list(raw.values())[0]
    age_groups = K.columns
    return K.to_numpy(dtype=np.float32), age_groups
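
A brief usage sketch (the path is illustrative):

K, age_groups = load_age_mixing("data/age_mixing_matrix.rds")  # hypothetical path
print(K.shape, list(age_groups))   # square matrix with one row/column per age group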
Example #20
def to_pandas(filename):
    print(filename)
    df = pyreadr.read_r(filename)[None]
    print(df)
    df = df.set_index(df.columns[0])
    old_index = df.index.tolist()
    new_index = list(map(lambda x: x.split('_')[-1], old_index))
    rename_dict = dict(zip(old_index, new_index))
    df = df.rename(index=rename_dict)
    return df.T  # (samples, genes)
def read_Rda(path_to_Rda, verbose=False):
    Rdata = pyreadr.read_r(path_to_Rda) # also works for Rds
    while len(Rdata.keys()) == 1:
        keys = np.array(list(Rdata.keys()))
        Rdata = Rdata[keys[0]]
    npdata = np.array(Rdata, dtype=np.double)
    data_size = npdata.shape
    if verbose:
        print("Sample size is ", npdata.shape)
    return npdata, data_size
Example #22
def schedule(year: int) -> pd.DataFrame:
    """
    Get the schedule for a given year
    """
    schedule_data_file: str = "/".join(
        [NFLFASTR_DATA_DIR, "schedules", f"sched_{year}.rds"])
    r_data: OrderedDict = pyreadr.read_r(schedule_data_file)
    assert set(r_data.keys()) == set([None]), "Unexpected keys"
    data: pd.DataFrame = r_data[None]
    return data
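
A short usage sketch; NFLFASTR_DATA_DIR must already point at a copy of the nflfastR data, and the printed column names are assumptions about the schedule table:

sched = schedule(2020)
print(sched[["game_id", "home_team", "away_team"]].head())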
Example #23
 def __init__(self, filename, target = "target", ignore = []):
     
     #read data file
     data = pyreadr.read_r(filename)[None]
     
     #convert to torch tensors
     self.target     = torch.tensor(data[target].values, dtype = torch.float32).unsqueeze(1)
     self.predictors = torch.tensor(np.array(data.drop(columns = target)), dtype = torch.float32)
     
     print("Data read successfully!")
Example #24
def download_schedule_data(year):
    try:
        rds_file = pyreadr.download_file(
            f'https://github.com/nflverse/nflfastR-data/blob/master/schedules/sched_{str(year)}.rds?raw=True',
            f'data/sched_{str(year)}.rds')
        rds_df = pyreadr.read_r(rds_file)
        df = rds_df[None]
        df.to_csv(f'data/sched_{year}.csv.gz', compression='gzip')
        # delete the rds file
        os.remove(rds_file)
    except Exception as err:
        print(f"couldn't read schedule data for {year}: {err}")
def read_file(cluster_dir, test):
    cd = os.listdir(cluster_dir)
    clusters = []
    for cluster in cd:
        if cluster.endswith("_filter_" + test + ".rds"):
            pyr = pyreadr.read_r(os.path.join(cluster_dir, cluster))
            df1 = pyr[None]
            cluster_list = list(df1['gene'])
            cnum = str(''.join(filter(str.isdigit, cluster)))
            f = ["Cluster" + cnum, "CLUSTER" + cnum] + cluster_list
            clusters.append(f)
    return (clusters)
def loadData():
    # OriginalRData
    result = pyreadr.read_r("jobvite_1_2_merged_anonymized.RData")  # output: odict_keys(['anon'])
    merged_anonymized = result["anon"]
    #print(merged_anonymized.shape)

    # Remove Duplicates, if someone was hired for one job and rejected for another keep only hired:
    merged_anonymized.sort_values(by=['Jobvite.ID', 'Hired'], ascending=[True, False], inplace=True)
    merged_anonymized.drop_duplicates(subset='Jobvite.ID', keep='first', inplace=True)
    #print(merged_anonymized.shape)

    return merged_anonymized
def convert(filename):
    # take the file name as input
    input_rds = filename

    # Read the R-database file into an Ordered Dictionary
    result = pyreadr.read_r(input_rds)

    # put the data into a pandas dataframe
    df = result[None]

    # output the csv
    df.to_csv(f"{input_rds[:-4]}.csv")
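
A minimal usage sketch with an illustrative filename:

convert("sched_2020.rds")    # hypothetical file; writes sched_2020.csv alongside it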
Example #28
def load_UNSC():
    """
    Function to load raw UNSC RData

    Returns:
        ids (list[str]): list of unique IDs for speeches
        flat_text (list[str]): list of raw speech text
    """
    data = pyreadr.read_r("./data/UNSC/docs.RData")["raw_docs"]
    ids = data["doc_id"].tolist()
    flat_text = data["text"].tolist()
    return ids, flat_text
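
A minimal usage sketch, assuming ./data/UNSC/docs.RData is present:

ids, flat_text = load_UNSC()
print(len(ids), ids[0])           # one id per speech
print(flat_text[0][:200])         # first 200 characters of the first speech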
Example #29
File: italk.py  Project: rfour92/pypath
def italk_raw():
    """
    Returns a ``pandas.DataFrame`` with the iTalk database contents.
    """

    url = urls.urls['italk']['url']
    c = curl.Curl(url, silent=False, large=True)
    rdata_path = c.fileobj.name
    c.fileobj.close()

    rdata = pyreadr.read_r(rdata_path)['database']

    return rdata
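
A short usage sketch; the call relies only on the pypath url/curl machinery already imported by the module, and no specific column names are assumed:

ligrec = italk_raw()
print(ligrec.shape)
print(ligrec.columns.tolist())    # inspect the ligand/receptor columns before filtering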
Example #30
def gen_map():
    mapdata_d = pyreadr.read_r(
        '../datasets/mapdata_copyright_openstreetmap_contributors.Rds')
    mapdata = np.reshape(to_dec(list(mapdata_d.values())[0].to_numpy()),
                         (-1, 1311)).astype(float)

    aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
    lon_lat_box = (-88, -87.5, 41.6, 42.1)

    plt.figure(figsize=(10, 14))
    plt.imshow(mapdata,
               cmap=plt.get_cmap('gray'),
               extent=lon_lat_box,
               aspect=aspect)