def __call__(self, data):  # data is a pandas DataFrame
    # map species labels to integers and keep only the two-class subset
    data = data.replace('Iris-setosa', 0)
    data = data.replace('Iris-virginica', 1)
    data = data.replace('Iris-versicolor', 2)
    data = data[data.iloc[:, -1] != 2]
    data = data.drop(columns="Id")
    # as_matrix() was removed in pandas 1.0; use .values instead
    data = pd.DataFrame(data.values,
                        columns=[
                            'SepalLengthCm', 'SepalWidthCm',
                            'PetalLengthCm', 'PetalWidthCm', 'Species'
                        ])
    return data
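# Usage sketch: the method above is assumed to live on a preprocessor class
# (here called IrisPreprocessor, a hypothetical name) and to receive the
# Kaggle-style Iris.csv, which has an "Id" column and string species labels.
import pandas as pd

raw = pd.read_csv('Iris.csv')        # assumed input path
binary = IrisPreprocessor()(raw)     # setosa -> 0, virginica -> 1; versicolor dropped
print(binary['Species'].value_counts())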
Example #2
    def get_pos(self, index):

        # assume that the extraction is based on one sentence
        tree = self.tree
        pos = ""

        if index == 0:
            pos = tree['words'][0][1]['PartOfSpeech']
        else:
            # strip non-ASCII characters and brackets from the parse tree,
            # then collect the PartOfSpeech=... tokens; the decode is needed
            # because bytes.replace(str, str) fails on Python 3
            data = tree['parsetree'].encode('ascii', 'ignore').decode('ascii')
            data = data.replace('[', '').replace(']', '').split()
            pos = []
            for x in data:
                if 'PartOfSpeech' in x:
                    pos.append(x)
            if index - 1 < len(pos):
                info = pos[index - 1]
                pos = info.split("=")[1]
            else:
                pos = '.'

        if pos in self.lookuptable['pos']:
            return self.lookuptable['pos'].index(pos)
        else:
            return len(self.lookuptable['pos']) + 1
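# A sketch of the input this method assumes, inferred from how it is parsed:
# a Stanford-CoreNLP-style result where tree['words'] pairs each token with an
# annotation dict and tree['parsetree'] embeds PartOfSpeech=... attributes.
# The concrete values below are illustrative only.
tree = {
    'words': [['The', {'PartOfSpeech': 'DT'}],
              ['dog', {'PartOfSpeech': 'NN'}]],
    'parsetree': '[Text=The PartOfSpeech=DT] [Text=dog PartOfSpeech=NN]',
}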
Example #3
    def preprocess(csv_path, domain_id):
        data = pd.read_csv(csv_path, names=COLUMNS)

        # strip stray spaces from string columns, then drop rows with
        # missing values (encoded as "?")
        for column in COLUMNS:
            if data[column].dtype.name == "object":
                data[column] = data[column].str.replace(" ", "")
        data = data[data["workclass"] != "?"]
        data = data[data["occupation"] != "?"]
        data = data[data["native-country"] != "?"]

        # reduce the marital-status categories to married / not married
        data.replace(
            ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
             'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'],
            ['not married', 'married', 'married', 'married', 'not married',
             'not married', 'not married'],
            inplace=True)

        category_col = [
            'workclass', 'race', 'education', 'marital-status', 'occupation',
            'relationship', 'gender', 'native-country', 'income'
        ]

        # integer-encode the categorical variables (np.unique returns the
        # sorted categories b and their integer codes c)
        for col in category_col:
            b, c = np.unique(data[col], return_inverse=True)
            data[col] = c

        # bucket age into five domains: <30, 30s, 40s, 50s, >=60
        data.loc[data["age"] < 30, "age"] = 0
        data.loc[(data["age"] >= 30) & (data["age"] < 40), "age"] = 1
        data.loc[(data["age"] >= 40) & (data["age"] < 50), "age"] = 2
        data.loc[(data["age"] >= 50) & (data["age"] < 60), "age"] = 3
        data.loc[data["age"] >= 60, "age"] = 4

        # keep only the rows belonging to the requested age domain
        data = data[data["age"] == domain_id]
        predictors = [
            'workclass', 'education', 'educational-num', 'marital-status',
            'occupation', 'relationship', 'race', 'gender', 'capital-gain',
            'capital-loss', 'hours-per-week', 'native-country'
        ]
        y = data["income"].values
        X = data[predictors].values
        return X, y
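# Hedged usage sketch, assuming preprocess is reachable as a plain function:
# COLUMNS is assumed to hold the Adult/census-income headers referenced above
# ('workclass', 'occupation', 'income', ...), and 'adult.csv' is a
# hypothetical path. domain_id picks one of the five age buckets.
X, y = preprocess('adult.csv', domain_id=2)   # age in [40, 50)
print(X.shape, y.shape)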
Example #4
def preProcess(filepath):
    # whitespace-separated file; read everything as strings first
    # (np.object and np.float were removed from NumPy; use the builtins)
    Data = pd.read_csv(filepath, header=None, sep=r'\s+', dtype=object)
    Data = Data.replace('?', '-1')    # mark missing values
    Data = Data[Data[22] != '-1']     # drop rows where column 22 is missing
    Data = Data.reset_index(drop=True)
    Data = pd.DataFrame(Data, dtype=float)
    # rescale the wide-ranging columns to comparable magnitudes
    Data[2] /= 1000000
    Data[3] /= 100
    Data[4] /= 100
    Data[5] /= 100
    Data[18] /= 100
    Data[19] /= 100
    Data[24] /= 10000
    return Data
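# Minimal usage sketch ('dataset.txt' is a hypothetical path; the file is
# assumed whitespace-separated with '?' marking missing values):
Data = preProcess('dataset.txt')
print(Data.describe())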
Example #5
def df_bold_min(data):
    '''
    Highlight the minimum in a Series or DataFrame.

    Usage:
        `df.style.apply(df_bold_min)`
    '''
    attr = 'font-weight: bold'
    # remove % and cast to float
    data = data.replace('%', '', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_min = data == data.min()
        return [attr if v else '' for v in is_min]
    else:  # from .apply(axis=None)
        is_min = data == data.min().min()
        return pd.DataFrame(np.where(is_min, attr, ''),
                            index=data.index, columns=data.columns)
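# A small self-contained demo of df_bold_min (the values are made up):
import numpy as np
import pandas as pd

scores = pd.DataFrame({'precision': ['91%', '88%'],
                       'recall': ['84%', '90%']},
                      index=['model_a', 'model_b'])
html = scores.style.apply(df_bold_min).to_html()   # pandas >= 1.3
# each column's minimum ('88%' and '84%') is rendered in bold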
Example #6
def read_file_list(filename):
    """
    Reads a trajectory from a text file.

    File format:
    The file format is "stamp d1 d2 d3 ...", where stamp denotes the time
    stamp (to be matched) and "d1 d2 d3.." is arbitrary data (e.g., a 3D
    position and 3D orientation) associated to this timestamp.

    Input:
    filename -- File name

    Output:
    dict -- dictionary of (stamp, data) tuples

    """
    # use a context manager so the file is closed, and avoid shadowing
    # the builtin `list`
    with open(filename) as f:
        data = f.read()
    lines = data.replace(",", " ").replace("\t", " ").split("\n")
    rows = [[v.strip() for v in line.split(" ") if v.strip() != ""]
            for line in lines if len(line) > 0 and line[0] != "#"]
    pairs = [(float(l[0]), l[1:]) for l in rows if len(l) > 1]
    return dict(pairs)
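# Self-contained demo of read_file_list: write a tiny trajectory file and
# read it back (file name and values are illustrative):
with open('trajectory.txt', 'w') as f:
    f.write("# stamp tx ty tz\n")
    f.write("1311868164.363 0.1 0.2 0.3\n")
    f.write("1311868164.399 0.4 0.5 0.6\n")

traj = read_file_list('trajectory.txt')
print(traj[1311868164.363])   # -> ['0.1', '0.2', '0.3']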
Example #7
    def get_ner(self, index):

        # assume that the extraction is based on one sentence
        tree = self.tree
        ner = ""
        if index == 0:
            ner = tree['words'][0][1]['NamedEntityTag']
        else:
            # decode back to str so the replace/split below work on Python 3,
            # then collect the NamedEntityTag=... tokens
            data = tree['parsetree'].encode('ascii', 'ignore').decode('ascii')
            data = data.replace('[', '').replace(']', '').split()
            ner = []
            for x in data:
                if 'NamedEntityTag' in x and 'NormalizedNamedEntityTag' not in x:
                    ner.append(x)
            if index - 1 < len(ner):
                info = ner[index - 1]
                ner = info.split("=")[1]
            else:
                ner = 'O'   # default: outside any named entity

        if ner in self.lookuptable['ner']:
            return self.lookuptable['ner'].index(ner)
        else:
            return len(self.lookuptable['ner']) + 1
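# get_pos and get_ner both index into a shared lookup table; a minimal sketch
# of its assumed shape (the tag inventories below are illustrative, not taken
# from the source):
lookuptable = {
    'pos': ['NN', 'NNS', 'VB', 'VBD', 'JJ', 'DT', '.'],
    'ner': ['O', 'PERSON', 'LOCATION', 'ORGANIZATION', 'DATE'],
}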
Example #8
import os

import numpy as np
import pandas as pd
import torch.autograd as autograd
import torch.utils.data
from torch.nn import functional as F
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from torch.autograd import Variable

data = pd.read_csv('Tab.delimited.Cleaned.dataset.WITH.variable.labels.csv',
                   sep='\t', engine='python')
# read the list of columns to keep
labels = []
with open('mixture.txt') as f:
    for line in f:
        labels.append(line.rstrip('\n'))

data = data.loc[:, labels]   # keep only the selected columns
data = data.replace(' ', np.nan)
#print(data.info(verbose=True))
data = data.dropna()
#print(data.info(verbose=True))
#17 rows and 49 cols
categories = ["diseaseframinga", "reciprocityothera", "reciprocityusa",
              "allowedforbiddena", "flagtimeestimate1", "flagtimeestimate2",
              "flagtimeestimate3", "flagtimeestimate4", "sex", "citizenship"]
df = pd.get_dummies(data, columns=categories)
#print(df.info(verbose=True))

# map Likert-style and party labels onto a numeric scale (kept as strings,
# as in the original)
df = df.replace("Very much", "11")
#print(df.loc[:, "flagsupplement1"])
df = df.replace("Not at all", "1")
df = df.replace("Republican", "7")
df = df.replace("Democrat", "1")
Example #9
import pandas as pd
from os import path

time_step = 100
hidden_size = 30
num_lstm_layers = 2
"""
SECTION
Name: Preprocessing
Description: Pre-process the data and load the information
"""

# Remove semicolons from the raw file and write a cleaned copy (only once)

if not path.exists(r"data/WISDM_ar_v1.1_prep.csv"):
    with open(r'data/WISDM_ar_v1.1_raw.txt', 'r') as infile, \
         open(r'data/WISDM_ar_v1.1_prep.csv', 'w') as outfile:
        data = infile.read()
        data = data.replace(";", "")
        outfile.write(data)

thead = pd.read_csv(
    "data/WISDM_ar_v1.1_prep.csv",
    nrows=5)  # just read in a few lines to get the column headers
dtypes = dict(
    zip(thead.columns.values,
        ['int32', 'float32', 'float64', 'float32', 'float32', 'bool'
         ]))  # datatypes as given by the data page
# NOTE: Limited as my training device doesn't have high amounts of memory
# column_names is assumed to be defined earlier in the script (e.g. the six
# WISDM columns: user, activity, timestamp, x-axis, y-axis, z-axis)
data = pd.read_csv("data/WISDM_ar_v1.1_prep.csv",
                   header=None,
                   skiprows=0,
                   dtype=dtypes,
                   names=column_names).dropna()
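# Hedged sketch of the step the hyperparameters above suggest comes next:
# slice the accelerometer columns into fixed-length windows of `time_step`
# samples for an LSTM. The column names follow the assumption noted above.
import numpy as np

def make_windows(df, cols=('x-axis', 'y-axis', 'z-axis'), step=time_step):
    values = df.loc[:, list(cols)].to_numpy(dtype='float32')
    n = (len(values) // step) * step          # drop the ragged tail
    return values[:n].reshape(-1, step, len(cols))

windows = make_windows(data)                  # shape: (num_windows, 100, 3)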
Example #10
        img, heatmap, probs = gradcam_resnet(file_name,
                                             net,
                                             tensor_transform_test,
                                             use_cuda=USE_CUDA)
        pred = probs.argmax()
        plt.figure(figsize=(7, 7))
        img = np.array(img)
        plt.imshow(img[1, :, :], cmap=plt.cm.Greys_r)
        plt.imshow(heatmap, cmap=plt.cm.jet, alpha=0.3)
        plt.xticks([])
        plt.yticks([])
        plt.title('{} KLG: {}; Prediction: {}'.format(side, klg, pred))
        # save under correct_pred/<klg>/<klg>/ when the prediction matches,
        # otherwise under wrong_pred/<klg>/<pred>/
        if pred == klg:
            subfolder, leaf = 'correct_pred', str(klg)
        else:
            subfolder, leaf = 'wrong_pred', str(pred)
        out_dir = os.path.join(save_dir, subfolder, str(klg), leaf)
        os.makedirs(out_dir, exist_ok=True)
        plt.savefig(os.path.join(out_dir, data.replace('.hdf5', '.png')),
                    dpi=300)
Example #11
# function to encode a DNA sequence string as an ordinal vector
# returns a numpy vector with a=25, c=100, g=150, t=250, anything else=0.00
def ordinal_encoder(my_array):
    integer_encoded = label_encoder.transform(my_array)
    float_encoded = integer_encoded.astype(float)
    float_encoded[float_encoded == 0] = 25    # A
    float_encoded[float_encoded == 1] = 100   # C
    float_encoded[float_encoded == 2] = 150   # G
    float_encoded[float_encoded == 3] = 250   # T
    float_encoded[float_encoded == 4] = 0.00  # anything else, z
    return float_encoded
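# ordinal_encoder relies on a `string_to_array` helper and a fitted
# `label_encoder` that the excerpt does not show; a minimal sketch of both,
# inferred from how they are used (an assumption, not the original code):
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder

def string_to_array(seq_string):
    # lower-case, map any non-ACGT character to 'z', return a char array
    seq_string = seq_string.lower()
    seq_string = re.sub('[^acgt]', 'z', seq_string)
    return np.array(list(seq_string))

label_encoder = LabelEncoder()
label_encoder.fit(np.array(['a', 'c', 'g', 't', 'z']))  # a=0, c=1, g=2, t=3, z=4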


with open('/content/drive/My Drive/Colab Notebooks/bits-back/torch_vae/data/dna/dnafrgmnt00.txt', 'r') as file:
    data = file.read().replace('\n', '')   # '/n' was a typo for the newline escape
    data = data.replace(' ', '')

# split the cleaned sequence into 784-character chunks (784 = 28 * 28)
split_strings = []
n = 784
for index in range(0, len(data), n):
    split_strings.append(data[index:index + n])

dna_arrays = []
for i in range(len(split_strings)):
    yy = ordinal_encoder(string_to_array(split_strings[i]))
    if yy.shape != (784,):
        # zero-pad the final, shorter chunk up to length 784
        zeronp = np.zeros(784 - yy.shape[0])
        yy = np.append(yy, zeronp)

    dna_arrays.append(yy)
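# The chunk arrays can then be stacked into a single (num_chunks, 784) matrix,
# e.g. as VAE input (a hedged follow-up, not part of the original excerpt):
dna_matrix = np.vstack(dna_arrays)
print(dna_matrix.shape)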