def ReadSimpleFile(self, path):
    """Read a tab-separated review file and binarise its rating column.

    The file lives under ``<project>/DATA/``.  Its first line holds the
    space-separated result-label names (stored in ``self.ResultLabels``);
    every following line is ``text<TAB>...<TAB>rating``.  Ratings above
    3.0 become label 1, ratings below 3.0 become label 0, and ratings
    exactly equal to 3.0 are skipped (neutral samples carry no signal).

    :param path: str, file name ending in ``.txt``.
    :return: None.  On success ``self.DataSet``/``self.Labels`` are set;
             on failure a message is printed and nothing is changed.
    """
    assert isinstance(path, str)
    if path[-4:] != '.txt':
        print('Read file only support txt format')
        return None
    full_path = Util().getDirectory() + "/DATA/" + path
    if not os.path.exists(full_path):
        print('File does not exist: %s' % (path))
        return None
    try:
        # 'with' guarantees the handle is closed even when parsing fails
        # (the original left the file open on every path).
        with open(full_path, 'r') as file:
            lines = file.readlines()
        Contents, Labels = list(), list()
        self.ResultLabels = lines.pop(0).replace("\n", "").split(" ")
        for line in lines:
            line = line.replace("\n", "").split("\t")
            if float(line[-1]) > 3.0:
                Labels.append(1)
                Contents.append(line[0])
            elif float(line[-1]) < 3.0:
                Labels.append(0)
                Contents.append(line[0])
    # BUG FIX: ``except IndexError and ValueError and KeyError`` only
    # caught KeyError (an ``and`` chain evaluates to its last operand);
    # a tuple is the correct way to catch several exception types.
    except (IndexError, ValueError, KeyError):
        print('invalid file arrangement')
        return None
    except Exception:
        print('unknown error')
        return None
    else:
        print('Read file successful')
        self.DataSet = Contents
        self.Labels = Labels
def ReadSimpleFile(self, path):
    """Read a tab-separated numeric file whose last column is the label.

    Each line is ``feature<TAB>...<TAB>label``.  New label values are
    registered in ``self.LabelName`` in order of first appearance.

    :param path: str, file name ending in ``.txt`` under ``<project>/DATA/``.
    :return: None.  Fills ``self.DataSet`` (float ndarray) and
             ``self.Labels`` (str ndarray); prints and returns None on error.
    """
    assert isinstance(path, str)
    if path[-4:] != '.txt':
        print('Read file only support txt format')
        return None
    full_path = Util().getDirectory() + "/DATA/" + path
    if not os.path.exists(full_path):
        print('File does not exist: %s' % (path))
        return None
    try:
        # 'with' closes the handle on every path (the original leaked it).
        with open(full_path, 'r') as file:
            lines = file.readlines()
        DataSet, Labels = list(), list()
        for line in lines:
            RawData = line.strip().split('\t')
            newLabel = RawData.pop()  # last column is the class label
            if newLabel not in self.LabelName:
                self.LabelName.append(newLabel)
            Labels.append(newLabel)
            DataSet.append([float(item) for item in RawData])
    # BUG FIX: ``except A and B and C`` only caught the last exception
    # type; a tuple catches all three.
    except (IndexError, ValueError, KeyError):
        print('invalid file arrangement')
        return None
    except Exception:
        print('unknown error')
        return None
    else:
        print('Read file successful')
        self.DataSet = np.array(DataSet)
        # BUG FIX: np.str was deprecated in NumPy 1.20 and removed in
        # 1.24; the builtin str is the documented replacement.
        self.Labels = np.array(Labels, dtype=str).transpose()
def saveGraph(self, name=None, path=None):
    """Save the current matplotlib figure under ``<project>/DATA/save/``.

    :param name: optional str; validated for interface compatibility but
                 currently unused by the implementation.
    :param path: optional str, file name (with extension) to save as.
    :return: None; prints the outcome either way.
    """
    if name is not None:
        assert isinstance(name, str)
    if path is not None:
        assert isinstance(path, str)
    # BUG FIX: '%' binds tighter than '+', so the original formatted only
    # the directory into the message and appended the rest afterwards;
    # build the full target path once and format that.
    target = Util().getDirectory() + "/DATA/save/" + path
    try:
        plt.savefig(target)
    except Exception:
        print("Invalid Directory: %s" % target)
    else:
        print("file saved to: %s" % target)
def ReadSimpleFile(self, path):
    """Read a space-separated integer table into ``self.DataSet``.

    :param path: str, file name ending in ``.txt`` under ``<project>/DATA/``.
    :return: None.  ``self.DataSet`` becomes a list of int lists;
             prints and returns None when the file is unsuitable.
    """
    assert isinstance(path, str)
    if path[-4:] != ".txt":
        print("Read file only support txt format")
        return None
    full_path = Util().getDirectory() + "/DATA/" + path
    if not os.path.exists(full_path):
        print('File does not exist: %s' % (path))
        return None
    # 'with' closes the handle (the original never closed the file).
    with open(full_path, 'r') as file:
        lines = file.readlines()
    self.DataSet = [[int(item) for item in line.strip().split(" ")]
                    for line in lines]
def separateDataSet(self, testSize=0.2, pattern=None):
    """Split the stored data set into train and test partitions.

    The training portion replaces ``self.DataSet``/``self.Labels``; the
    test portion is returned.

    :param testSize: float in (0.0, 1.0), the portion of test data.
                     Default 0.2.  (The original docstring said 10.0,
                     contradicting the assertion below.)
    :param pattern: optional 0/1 split flags, usually produced by
                    ``Util().splitDataSet``; ``None`` splits randomly.
    :return: ``(testData, testLabel)`` as ndarrays; when an explicit
             ``pattern`` was supplied, also the list of test indices.
    :raises ValueError: if a flag outside {0, 1} appears in the table.
    """
    assert 0.0 < testSize < 1.0
    if pattern is None:
        # get 0/1 split flags, one per sample
        lookup_table = Util().splitDataSet(self.DataSet.shape[0], testSize)
    else:
        lookup_table = pattern
    trainData, trainLabel, testData, testLabel = list(), list(), list(), list()
    testIndex = list()
    for i, flag in enumerate(lookup_table):
        if flag == 0:
            trainData.append(self.DataSet[i])
            trainLabel.append(self.Labels[i])
        elif flag == 1:
            testData.append(self.DataSet[i])
            testLabel.append(self.Labels[i])
            testIndex.append(i)
        else:
            raise ValueError("index out of range [0, 1]")
    self.DataSet = np.array(trainData)
    self.Labels = np.array(trainLabel)
    if pattern is None:
        return np.array(testData), np.array(testLabel)
    return np.array(testData), np.array(testLabel), testIndex
def Train(i): nonlocal constant i_error = CalculateError(i) if (LabelMat[i] * i_error < -tolerance and betas[i] < constrain) or \ (LabelMat[i] * i_error > tolerance and betas[i] > 0): #计算偏移初始条件 j, j_error = SelectBestPair(i, i_error) i_old = betas[i].copy() j_old = betas[j].copy() if LabelMat[j] != LabelMat[i]: min_difference = max(0, betas[j] - betas[i]) max_difference = min(constrain, constrain + betas[j] - betas[i]) else: min_difference = max(0, betas[i] + betas[j] - constrain) max_difference = min(constrain, betas[i] + betas[j]) if max_difference == min_difference: #第一个运算结束条件判断 print("BC1: equal difference, i: %d, j: %d" % (i, j)) return 0 if self.KernelValues is not None: #如果是在高次svm里调用执行这个 eta = 2.0 * self.KernelValues[i, j] - self.KernelValues[ i, i] - self.KernelValues[j, j] else: eta = 2.0 * (DataMat[i] * DataMat[j].T) - ( DataMat[i] * DataMat[i].T) - (DataMat[j] * DataMat[j].T) if eta >= 0: #第二个运算结束条件判断 print("BC2: eta >= 0, i: %d, j: %d" % (i, j)) return 0 betas[j] -= LabelMat[j] * (i_error - j_error) / eta betas[j] = Util().clipStepSize(max_difference, betas[j], min_difference) if capacity == "ENABLED": CalculateError(j, True) if np.abs(betas[j] - j_old) < pow(10, -5): #第三个运算结束条件判断 print("BC3: j_offset is less than 10^-5, i: %d, j: %d" % (i, j)) return 0 betas[i] += LabelMat[i] * LabelMat[j] * (j_old - betas[j] ) #反方向移动 if capacity == "ENABLED": CalculateError(i, True) if self.KernelValues is not None: #如果是在高次svm里调用执行这个 i_constant = constant - i_error - LabelMat[i] * (betas[i] - i_old) * self.KernelValues[i, i] - \ LabelMat[j] * (betas[j] - j_old) * self.KernelValues[i, j] j_constant = constant - i_error - LabelMat[i] * (betas[i] - i_old) * self.KernelValues[i, j] - \ LabelMat[j] * (betas[j] - j_old) * self.KernelValues[j, j] else: i_constant = constant - i_error - LabelMat[i] * (betas[i] - i_old) * (DataMat[i] * DataMat[i].T) - \ LabelMat[j] * (betas[j] - j_old) * (DataMat[i] * DataMat[j].T) j_constant = constant - j_error - LabelMat[i] * (betas[i] - 
i_old) * (DataMat[i] * DataMat[j].T) - \ LabelMat[j] * (betas[j] - j_old) * (DataMat[j] * DataMat[j].T) if 0 < betas[i] and constrain > betas[i]: constant = i_constant elif 0 < betas[j] and constrain > betas[j]: constant = j_constant else: constant = (i_constant - j_constant) / 2 + j_constant return 1 else: return 0
def SmartTest(self, TestData, TestLabel):
    """Predict every test sample and plot the resulting ROC curve.

    :param TestData: sequence of samples; columns 1 and 2 of each sample
                     are fed to the model together with the true label.
    :param TestLabel: sequence of true labels, same length as TestData.
    :return: None; prints each prediction and draws the ROC curve.
    """
    assert len(TestLabel) == len(TestData)
    predictions = list()
    for sample, truth in zip(TestData, TestLabel):
        outcome = self.Predict([sample[1], sample[2], truth])
        print(outcome)
        predictions.append(outcome[1])
    assert len(predictions) == len(TestLabel)
    Util().plotCurveROC(TestLabel, predictions)
def readSimpleFile(self, path):
    """Load a tab-separated numeric table into ``self.DataSet``.

    :param path: str, file name under ``<project>/DATA/``.
    :return: None; ``self.DataSet`` becomes a float ndarray.
    """
    assert isinstance(path, str)
    # 'with' closes the handle (the original leaked it); the dead
    # ``tempLine = list()`` pre-assignment and trailing ``pass`` are gone.
    with open(Util().getDirectory() + "/DATA/" + path, 'r') as fr:
        lines = fr.readlines()
    data = [[float(item) for item in line.strip().split('\t')]
            for line in lines]
    self.DataSet = np.array(data)
def SelectBestPair(i, error_i):
    # Pick the second SMO multiplier j to pair with i.
    # Heuristic: among entries with a valid cached error, choose the one
    # whose error differs most from error_i (largest expected step);
    # otherwise fall back to a random partner.
    # NOTE(review): relies on closure state from the enclosing trainer —
    # capacity, vertical, self.ErrorsStorage, CalculateError.
    if capacity != "ENABLED":
        # error caching disabled: random partner, freshly computed error
        ret = Util().selectRandomItem(i, vertical)
        return ret, CalculateError(ret)
    else:
        assert self.ErrorsStorage.shape
        best, modification_best, error_best = -1, 0, 0
        # mark i's cache entry valid with its current error
        self.ErrorsStorage[i] = [1, error_i]
        # rows whose validity flag (column 0) is non-zero
        # (.A converts the matrix column to a plain ndarray)
        ValidList = np.nonzero(self.ErrorsStorage[:, 0].A)[0]
        if len(ValidList) > 1:
            for item in ValidList:
                if item == i:
                    continue  # never pair i with itself
                error_item = CalculateError(item)
                modification_item = np.abs(error_i - error_item)
                # keep the candidate with the largest error gap
                if modification_item > modification_best:
                    best = item
                    modification_best = modification_item
                    error_best = error_item
            return best, error_best
        else:
            # only i itself is cached: fall back to a random partner
            ret = Util().selectRandomItem(i, vertical)
            return ret, CalculateError(ret)
def ReadSimpleFile(self, path):
    """Read a whitespace-separated file with an integer label column.

    Each row becomes ``[1.0, feature...]`` — the leading 1.0 is the bias
    term for linear models; the last column is the integer label.

    :param path: str, file name under ``<project>/DATA/``.
    :return: None; fills ``self.DataSet`` and ``self.Labels`` ndarrays.
    """
    dataMat = []
    labelMat = []
    # 'with' closes the handle (the original never closed the file).
    with open(Util().getDirectory() + "/DATA/" + path, 'r') as fr:
        for line in fr.readlines():
            lineArr = line.strip().split()
            tempLine = [1.0]  # bias term
            for i in range(len(lineArr) - 1):
                tempLine.append(float(lineArr[i]))
            dataMat.append(tempLine)
            labelMat.append(int(lineArr[-1]))
    self.DataSet = np.array(dataMat)
    self.Labels = np.array(labelMat)
def SeparateDataSet(self, TestSize=0.2, mode="DEFAULT"):
    """Split the stored matrix data: keep train rows, return test rows.

    :param TestSize: portion of samples flagged as test data.
    :param mode: one of "LOAD", "SAVE", "DEFAULT"; forwarded to
                 ``Util().splitDataSet``.
    :return: ``(TestData, TestLabel)`` as plain Python lists.
    """
    assert mode in ("LOAD", "SAVE", "DEFAULT")
    flags = Util().splitDataSet(len(self.DataSet), TestSize, mode)
    TrainData, TrainLabel = list(), list()
    TestData, TestLabel = list(), list()
    for idx, flag in enumerate(flags):
        if flag == 0:
            TrainData.append(self.DataSet[idx])
            TrainLabel.append(self.Labels[idx])
        elif flag == 1:
            # matrix row -> flat python list of its values
            TestData.append(self.DataSet[idx].tolist()[0])
            TestLabel.append(int(self.Labels[idx]))
    self.DataSet = np.mat(np.array(TrainData))
    self.Labels = np.mat(np.array(TrainLabel)).transpose()
    return TestData, TestLabel
def SeparateDataSet(self, TestSize=0.2, mode="DEFAULT"):
    """Split the stored data set, keeping string labels.

    The train portion replaces ``self.DataSet``/``self.Labels`` and
    ``self.USING_SAMPLE_NUM`` is set to half the remaining sample count.

    :param TestSize: portion of samples flagged as test data.
    :param mode: one of "LOAD", "SAVE", "DEFAULT"; forwarded to
                 ``Util().splitDataSet``.
    :return: ``(TestData, TestLabel)`` ndarrays (labels as str).
    """
    assert mode in ("LOAD", "SAVE", "DEFAULT")
    TrainData, TrainLabel, TestData, TestLabel = list(), list(), list(), list()
    Lookup_Table = Util().splitDataSet(len(self.DataSet), TestSize, mode)
    for i in range(len(Lookup_Table)):
        if Lookup_Table[i] == 0:
            TrainData.append(self.DataSet[i])
            TrainLabel.append(self.Labels[i])
        elif Lookup_Table[i] == 1:
            TestData.append(self.DataSet[i])
            TestLabel.append(self.Labels[i])
    self.DataSet = np.array(TrainData)
    self.USING_SAMPLE_NUM = self.DataSet.shape[0] // 2
    # BUG FIX: np.str was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin str is the documented replacement.
    self.Labels = np.array(TrainLabel, dtype=str).transpose()
    return np.array(TestData), np.array(TestLabel, dtype=str)
def SeparateDataSet(self, TestSize=0.2, mode="DEFAULT", if_return=False):
    """Split the stored data set; optionally also return test indices.

    :param TestSize: portion of samples flagged as test data.
    :param mode: one of "LOAD", "SAVE", "DEFAULT"; forwarded to
                 ``Util().splitDataSet``.
    :param if_return: when True, additionally return the ndarray of
                      indices that were placed in the test partition.
    :return: ``(TestData, TestLabel)`` lists, plus the index array when
             ``if_return`` is True.
    """
    assert mode in ("LOAD", "SAVE", "DEFAULT")
    flags = Util().splitDataSet(len(self.DataSet), TestSize, mode)
    TrainData, TrainLabel = list(), list()
    TestData, TestLabel = list(), list()
    for idx, flag in enumerate(flags):
        if flag == 0:
            TrainData.append(self.DataSet[idx])
            TrainLabel.append(self.Labels[idx])
        elif flag == 1:
            TestData.append(self.DataSet[idx])
            TestLabel.append(int(self.Labels[idx]))
    self.DataSet = np.array(TrainData)
    self.Labels = np.array(TrainLabel)
    if if_return:
        return TestData, TestLabel, np.nonzero(np.array(flags) == 1)[0]
    return TestData, TestLabel
def SeparateDataSet(self, TestSize=0.2, Pattern=None):
    """Split the stored data set using an optional pre-computed pattern.

    :param TestSize: portion of test data; only used when Pattern is None.
    :param Pattern: optional 0/1 split flags; None splits randomly via
                    ``Util().splitDataSet``.
    :return: ``(TestData, TestLabel)`` ndarrays; when an explicit
             ``Pattern`` was given, also the list of test indices.
    """
    if Pattern is not None:
        lookup = Pattern
    else:
        lookup = Util().splitDataSet(len(self.DataSet), TestSize)
    TrainData, TrainLabel = list(), list()
    TestData, TestLabel, test_index = list(), list(), list()
    for idx, flag in enumerate(lookup):
        if flag == 0:
            TrainData.append(list(self.DataSet[idx]))
            TrainLabel.append(int(self.Labels[idx]))
        elif flag == 1:
            TestData.append(list(self.DataSet[idx]))
            TestLabel.append(int(self.Labels[idx]))
            test_index.append(idx)
    self.DataSet = np.array(TrainData)
    self.Labels = np.array(TrainLabel)
    if Pattern is None:
        return np.array(TestData), np.array(TestLabel)
    return np.array(TestData), np.array(TestLabel), test_index
def SeparateDataSet(self, testSize=0.2, pattern=None):
    """Split the unlabeled data set; keep train rows, return test rows.

    :param testSize: float in (0.0, 1.0), the portion of test data;
                     only used when ``pattern`` is None.
    :param pattern: optional 0/1 split flags; None splits randomly.
    :return: testData list; when an explicit ``pattern`` was supplied,
             also the list of test indices.
    :raises ValueError: if a flag outside {0, 1} appears in the table.
    """
    # Consistency fix: the sibling splitter validates both bounds of
    # testSize; the original only checked the upper bound.
    assert 0.0 < testSize < 1.0
    if pattern is not None:
        lookup_table = pattern
    else:
        lookup_table = Util().splitDataSet(len(self.DataSet), testSize)
    trainData, testData = list(), list()
    testIndex = list()
    for i, flag in enumerate(lookup_table):
        if flag == 0:
            trainData.append(self.DataSet[i])
        elif flag == 1:
            testData.append(self.DataSet[i])
            testIndex.append(i)
        else:
            raise ValueError("index out of range [0, 1]")
    self.DataSet = trainData
    if pattern is None:
        return testData
    return testData, testIndex
def ReadSimpleFile(self, path):
    """Read a typed, space-separated table into ``self.DataSet``.

    File layout: line 1 holds space-separated column names (stored in
    ``self.Labels``); line 2 holds per-column type ids ("int", "bool",
    "str"); every remaining line is one record, converted column-wise.

    :param path: str, file name under ``<project>/DATA/``.
    :return: None; ``self.DataSet`` becomes a list of typed rows.
    """
    # BUG FIX: the original called isinstance() and discarded the result;
    # the check only has effect inside an assert.
    assert isinstance(path, str)
    dataMat = list()
    # 'with' closes the handle (the original leaked it).
    with open(Util().getDirectory() + "/DATA/" + path, "r") as fr:
        lines = fr.readlines()
    self.Labels = lines.pop(0).strip().split()
    typeKind = {"int": int, "bool": bool, "str": str}
    typeList = [typeKind[item] for item in lines.pop(0).strip().split()]
    for line in lines:
        splitLine = line.strip().split(" ")
        assert len(splitLine) == len(typeList)
        tempLine = list()
        for converter, token in zip(typeList, splitLine):
            if converter is bool:
                # bool("0") would be True; route through int first
                tempLine.append(bool(int(token)))
            else:
                tempLine.append(converter(token))
        dataMat.append(tempLine)
    self.DataSet = dataMat
def readSimpleFile(self, path):
    """Read a regression data file from ``<project>/DATA/``.

    Expected format: the first line holds space-separated column titles
    (stored in ``self.Title``); each following line is tab-separated
    with feature values first and the target value last.  Rows whose
    feature count does not match the title count are skipped.

    :param path: str, the FILE NAME of the target file.
    :return: None; fills ``self.DataSet`` and ``self.Labels`` ndarrays.
    """
    assert isinstance(path, str)
    dataMat, labelMat = list(), list()
    # 'with' closes the handle (the original leaked it); the docstring
    # above completes the truncated original ("data sepa").
    with open(Util().getDirectory() + "/DATA/" + path, 'r') as fr:
        lines = fr.readlines()
    self.Title = lines.pop(0).strip().split(" ")
    for line in lines:
        splitLine = line.strip().split("\t")
        length = len(splitLine) - 1
        if length != len(self.Title):
            # malformed row: feature count disagrees with the header
            continue
        dataMat.append([float(splitLine[i]) for i in range(length)])
        labelMat.append(float(splitLine[-1]))
    self.DataSet = np.array(dataMat)
    self.Labels = np.array(labelMat)
        # --- tail of estimate() (its def line is outside this view) ---
        # accumulate similarity-weighted ratings over co-rated items
        similarity = simMethod(dataSet[overlap, item], dataSet[overlap, i])
        if toggle_print:
            print("similarity between (%d) and (%d) is %.6f%%" % (item, i, similarity * 100))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    # no similar item found -> no basis for an estimate
    if simTotal == 0:
        return 0
    else:
        # weighted average rating over all compared items
        return ratSimTotal / simTotal


def recommend(dataSet, user, simMethod, prediction_num=3, toggle_print=False):
    # Rank the user's unrated items by estimated score, best first.
    # NOTE(review): the [1] index assumes dataSet rows behave like
    # np.mat (2-D) — confirm against DataPreprocessing.SETTYPE_NDMAT.
    unratedItems = np.nonzero(np.array(dataSet[user, :]) == 0)[1]
    if len(unratedItems) == 0:
        # NOTE(review): prints but does not return early; execution falls
        # through and the empty loop below yields an empty result list.
        print("no recommendation available")
    itemScores = list()
    for item in unratedItems:
        estimateScore = estimate(dataSet, user, simMethod, item, toggle_print)
        itemScores.append((item, estimateScore))
    # highest estimated score first, truncated to prediction_num entries
    return sorted(itemScores, key=lambda combination: combination[-1], reverse=True)[:prediction_num]


if __name__ == '__main__':
    # demo: load the ratings matrix and recommend items for user 2
    demo = DataPreprocessing()
    demo.readSimpleDataSet("recommendation.txt", demo.SETTYPE_NDMAT, demo.DATATYPE_INT, ", ")
    result = recommend(demo.DataSet, 2, Util().SIM_COSINE, toggle_print=True)
    print(result)
import numpy as np

from learning.Helper import Util
from learning.machinelearning.classify import SupportVectorMachine as svm


def main():
    """Demo: train a Gaussian-kernel SVM on testSet2.txt, predict the
    held-out samples, and plot the decision boundary and ROC curve."""
    demo = svm()
    demo.ReadSimpleFile("testSet2.txt")  # read data from file
    test_data, test_label = demo.SeparateDataSet()  # get test data and test label
    demo.RadicalBias_Gaussian(20, 0.01, 1000, "DISABLED")
    # demo.Platt_SMO(20, 0.0001, 1000, "DISABLED")  # linear classification
    demo.GetLine(True)  # calculate the decision boundary
    predicted = list()  # store the predict values
    for sample, truth in zip(test_data, test_label):
        res = demo.Predict(np.array(sample).transpose(), truth)  # predict
        print(res)
        predicted.append(res[1])  # append the predicted value
    demo.GraphPoints()  # graph
    Util().plotCurveROC(test_label, predicted)  # graph roc curve


if __name__ == "__main__":
    # Guard so importing this module no longer triggers the full demo
    # run as a side effect; `python <file>.py` behaves as before.
    main()