def __call__(self, data):
    # data is a pandas DataFrame holding the raw Iris table
    data = data.replace('Iris-setosa', 0)
    data = data.replace('Iris-virginica', 1)
    data = data.replace('Iris-versicolor', 2)
    # keep the binary problem: drop the Iris-versicolor rows
    data = data[data.iloc[:, -1] != 2]
    data = data.drop(columns="Id")
    # as_matrix() was removed from pandas; to_numpy() is the current equivalent
    data = pd.DataFrame(data.to_numpy(), columns=[
        'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
        'Species'
    ])
    return data
def get_pos(self, index):
    # assume that the extraction is based on one sentence
    tree = self.tree
    pos = ""
    if index == 0:
        pos = tree['words'][0][1]['PartOfSpeech']
    else:
        # decode back to str after dropping non-ASCII characters,
        # otherwise str.replace fails on the bytes object in Python 3
        data = tree['parsetree'].encode('ascii', 'ignore').decode('ascii')
        data = data.replace('[', '')
        data = data.replace(']', '').split()
        pos = []
        for x in data:
            if 'PartOfSpeech' in x:
                pos.append(x)
        if index - 1 < len(pos):
            info = pos[index - 1]
            pos = info.split("=")[1]
        else:
            pos = '.'
    if pos in self.lookuptable['pos']:
        return self.lookuptable['pos'].index(pos)
    else:
        return len(self.lookuptable['pos']) + 1
def preprocess(csv_path, domain_id):
    data = pd.read_csv(csv_path, names=COLUMNS)

    # drop rows including missing values
    for column in COLUMNS:
        if data[column].dtype.name == "object":
            data[column] = data[column].str.replace(" ", "")
    data = data[data["workclass"] != "?"]
    data = data[data["occupation"] != "?"]
    data = data[data["native-country"] != "?"]

    # reduce category within marital-status
    data.replace([
        'Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
        'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'
    ], [
        'not married', 'married', 'married', 'married', 'not married',
        'not married', 'not married'
    ], inplace=True)

    category_col = [
        'workclass', 'race', 'education', 'marital-status', 'occupation',
        'relationship', 'gender', 'native-country', 'income'
    ]

    # prepare categorical variables
    for col in category_col:
        b, c = np.unique(data[col], return_inverse=True)
        data[col] = c

    data.loc[data["age"] < 30, "age"] = 0
    data.loc[(data["age"] >= 30) & (data["age"] < 40), "age"] = 1
    data.loc[(data["age"] >= 40) & (data["age"] < 50), "age"] = 2
    data.loc[(data["age"] >= 50) & (data["age"] < 60), "age"] = 3
    data.loc[data["age"] >= 60, "age"] = 4

    # prepare particular domain
    data = data[data["age"] == domain_id]

    predictors = [
        'workclass', 'education', 'educational-num', 'marital-status',
        'occupation', 'relationship', 'race', 'gender', 'capital-gain',
        'capital-loss', 'hours-per-week', 'native-country'
    ]
    y = data["income"].values
    X = data[predictors].values
    return X, y
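# Hedged usage sketch for preprocess() above. COLUMNS is not defined in the
# snippet; the list below is an assumption matching the UCI Adult / census
# income layout implied by the column names the function uses, and "adult.csv"
# is a hypothetical path.
import numpy as np
import pandas as pd

COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'educational-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income'
]

X, y = preprocess("adult.csv", domain_id=2)  # domain 2 = bucketed age in [40, 50)
print(X.shape, y.shape)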
def preProcess(filepath):
    # np.object / np.float aliases were removed from NumPy; use the builtins
    Data = pd.read_csv(filepath, header=None, sep=r'\s+', dtype=object)
    Data = Data.replace('?', '-1')
    Data = Data[Data[22] != '-1']
    Data = Data.reset_index(drop=True)
    Data = pd.DataFrame(Data, dtype=float)
    # rescale selected columns
    Data[2] /= 1000000
    Data[3] /= 100
    Data[4] /= 100
    Data[5] /= 100
    Data[18] /= 100
    Data[19] /= 100
    Data[24] /= 10000
    return Data
def df_bold_min(data):
    '''
    highlight the minimum in a Series or DataFrame
    Usage: `df.style.apply(df_bold_min)`
    '''
    attr = 'font-weight: bold'
    # remove % and cast to float
    data = data.replace('%', '', regex=True).astype(float)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_min = data == data.min()
        return [attr if v else '' for v in is_min]
    else:  # from .apply(axis=None)
        is_min = data == data.min().min()
        return pd.DataFrame(np.where(is_min, attr, ''),
                            index=data.index, columns=data.columns)
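# Hedged usage sketch for df_bold_min() above: a small DataFrame of percentage
# strings with the per-column minimum rendered in bold. The column names and
# values are illustrative only.
import numpy as np
import pandas as pd

scores = pd.DataFrame({'precision': ['91%', '88%', '95%'],
                       'recall':    ['84%', '90%', '87%']})
styled = scores.style.apply(df_bold_min)              # column-wise (axis=0)
# styled = scores.style.apply(df_bold_min, axis=None) # global minimum instead
styled.to_html('scores.html')                         # or display `styled` in a notebook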
def read_file_list(filename):
    """
    Reads a trajectory from a text file.

    File format:
    The file format is "stamp d1 d2 d3 ...", where stamp denotes the time stamp
    (to be matched) and "d1 d2 d3.." is arbitrary data (e.g., a 3D position and
    3D orientation) associated to this timestamp.

    Input:
    filename -- File name

    Output:
    dict -- dictionary of (stamp,data) tuples
    """
    file = open(filename)
    data = file.read()
    file.close()
    lines = data.replace(",", " ").replace("\t", " ").split("\n")
    list = [[v.strip() for v in line.split(" ") if v.strip() != ""]
            for line in lines if len(line) > 0 and line[0] != "#"]
    list = [(float(l[0]), l[1:]) for l in list if len(l) > 1]
    return dict(list)
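# Hedged usage sketch for read_file_list() above, using a throwaway file in the
# "stamp d1 d2 ..." format the docstring describes (TUM-style trajectory data).
# The file name and the sample poses are illustrative only.
sample = """# timestamp tx ty tz qx qy qz qw
1305031102.175304 1.3405 0.6266 1.6575 0.6574 0.6126 -0.2949 -0.3248
1305031102.211214 1.3303 0.6256 1.6464 0.6579 0.6161 -0.2932 -0.3189
"""
with open("groundtruth_sample.txt", "w") as f:
    f.write(sample)

trajectory = read_file_list("groundtruth_sample.txt")
for stamp, pose in sorted(trajectory.items()):
    print(stamp, pose)  # stamp is a float, pose is a list of strings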
def get_ner(self, index):
    # assume that the extraction is based on one sentence
    tree = self.tree
    ner = ""
    if index == 0:
        ner = tree['words'][0][1]['NamedEntityTag']
    else:
        # decode back to str after dropping non-ASCII characters,
        # otherwise str.replace fails on the bytes object in Python 3
        data = tree['parsetree'].encode('ascii', 'ignore').decode('ascii')
        data = data.replace('[', '')
        data = data.replace(']', '').split()
        ner = []
        for x in data:
            if 'NamedEntityTag' in x and 'NormalizedNamedEntityTag' not in x:
                ner.append(x)
        if index - 1 < len(ner):
            info = ner[index - 1]
            ner = info.split("=")[1]
        else:
            ner = 'O'
    if ner in self.lookuptable['ner']:
        return self.lookuptable['ner'].index(ner)
    else:
        return len(self.lookuptable['ner']) + 1
import os

import numpy as np
import pandas as pd
import torch.autograd as autograd
import torch.utils.data
from torch.nn import functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

data = pd.read_csv('Tab.delimited.Cleaned.dataset.WITH.variable.labels.csv',
                   sep='\t', engine='python')

file = open('mixture.txt')
labels = []
for line in file:
    word = line.rstrip('\n')
    labels.append(word)

data = data.loc[:, labels]       # the selected prediction columns
data = data.replace(' ', np.nan)
# print(data.info(verbose=True))
data = data.dropna()
# print(data.info(verbose=True))  # 17 rows and 49 cols

file = open('mixture.txt')
categories = ["diseaseframinga", "reciprocityothera", "reciprocityusa",
              "allowedforbiddena", "flagtimeestimate1", "flagtimeestimate2",
              "flagtimeestimate3", "flagtimeestimate4", "sex", "citizenship"]
df = pd.get_dummies(data, columns=categories)
# print(df.info(verbose=True))

# map the remaining ordinal answers to numeric strings
df = df.replace("Very much", "11")
# print(train.loc[:, "flagsupplement1"])
df = df.replace("Not at all", "1")
df = df.replace("Republican", "7")
df = df.replace("Democrat", "1")
time_step = 100
hidden_size = 30
num_lstm_layers = 2

"""
SECTION Name: Preprocessing
Description: Pre-process the data and load the information
"""

# Remove semicolons from the file and make a new file
with open(r'data/WISDM_ar_v1.1_raw.txt', 'r') as infile:
    if not path.exists(r"data/WISDM_ar_v1.1_prep.csv"):
        with open(r'data/WISDM_ar_v1.1_prep.csv', 'w') as outfile:
            data = infile.read()
            data = data.replace(";", "")
            outfile.write(data)

thead = pd.read_csv("data/WISDM_ar_v1.1_prep.csv",
                    nrows=5)  # just read in a few lines to get the column headers
dtypes = dict(zip(thead.columns.values,
                  ['int32', 'float32', 'float64', 'float32', 'float32',
                   'bool']))  # datatypes as given by the data page

# NOTE: Limited as my training device doesn't have high amounts of memory
data = pd.read_csv("data/WISDM_ar_v1.1_prep.csv", header=None, skiprows=0,
                   dtype=dtypes, names=column_names).dropna()
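# The excerpt above relies on names defined earlier in its script. A hedged
# sketch of plausible definitions (an assumption, not the original author's
# code): the usual imports plus a column list following the documented
# WISDM_ar_v1.1 raw format (user, activity, timestamp, x/y/z acceleration).
from os import path

import pandas as pd

column_names = ['user', 'activity', 'timestamp',
                'x-acceleration', 'y-acceleration', 'z-acceleration']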
img, heatmap, probs = gradcam_resnet(file_name, net, tensor_transform_test,
                                     use_cuda=USE_CUDA)
pred = probs.argmax()

plt.figure(figsize=(7, 7))
img = np.array(img)
plt.imshow(img[1, :, :], cmap=plt.cm.Greys_r)
plt.imshow(heatmap, cmap=plt.cm.jet, alpha=0.3)
plt.xticks([])
plt.yticks([])
plt.title('{} KLG: {}; Prediction: {}'.format(side, klg, pred))

if pred == klg:
    subfolder = 'correct_pred'
    klg = str(klg)
    if not os.path.exists(os.path.join(save_dir, subfolder, klg, klg)):
        os.makedirs(os.path.join(save_dir, subfolder, klg, klg))
    plt.savefig(os.path.join(save_dir, subfolder, klg, klg,
                             data.replace('.hdf5', '.png')), dpi=300)
else:
    klg = str(klg)
    pred = str(pred)
    subfolder = 'wrong_pred'
    if not os.path.exists(os.path.join(save_dir, subfolder, klg, pred)):
        os.makedirs(os.path.join(save_dir, subfolder, klg, pred))
    plt.savefig(os.path.join(save_dir, subfolder, klg, pred,
                             data.replace('.hdf5', '.png')), dpi=300)
# function to encode a DNA sequence string as an ordinal vector
# returns a numpy vector with a=25, c=100, g=150, t=250 and 0.0 for anything else
def ordinal_encoder(my_array):
    integer_encoded = label_encoder.transform(my_array)
    float_encoded = integer_encoded.astype(float)
    float_encoded[float_encoded == 0] = 25    # A
    float_encoded[float_encoded == 1] = 100   # C
    float_encoded[float_encoded == 2] = 150   # G
    float_encoded[float_encoded == 3] = 250   # T
    float_encoded[float_encoded == 4] = 0.00  # anything else, z
    return float_encoded

# read the raw DNA fragment file and strip newlines ('\n', not '/n') and spaces
with open('/content/drive/My Drive/Colab Notebooks/bits-back/torch_vae/data/dna/dnafrgmnt00.txt', 'r') as file:
    data = file.read().replace('\n', '')
    data = data.replace(' ', '')

# cut the sequence into 784-character chunks
split_strings = []
n = 784
for index in range(0, len(data), n):
    split_strings.append(data[index:index + n])

# encode each chunk and zero-pad the last one to length 784
dna_arrays = []
for i in range(len(split_strings)):
    yy = ordinal_encoder(string_to_array(split_strings[i]))
    if yy.shape != (784,):
        zeronp = np.zeros(784 - (yy.shape[0]))
        yy = np.append(yy, (zeronp))
    dna_arrays.append(yy)
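# The snippet above relies on string_to_array and label_encoder being defined
# earlier. A hedged sketch of the usual definitions in this kind of DNA-encoding
# pipeline (an assumption, not the original author's code): a LabelEncoder fit
# on the alphabet a/c/g/t/z, and a helper that lowercases the sequence and maps
# every non-ACGT character to 'z'.
import re

import numpy as np
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(np.array(['a', 'c', 'g', 't', 'z']))

def string_to_array(my_string):
    my_string = my_string.lower()
    my_string = re.sub('[^acgt]', 'z', my_string)  # mask ambiguous bases as 'z'
    return np.array(list(my_string))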