예제 #1
0
 def ReadSimpleFile(self, path):
     """
     Load a scored text corpus from ``<Util directory>/DATA/<path>``.

     The first line is split on single spaces and stored in
     ``self.ResultLabels``. Every following line is split on tabs: the first
     field is kept as the sample content and the last field is parsed as a
     float score. Scores above 3.0 get label 1, scores below 3.0 get label 0
     and scores equal to 3.0 are skipped.

     On success the collected contents and labels are stored in
     ``self.DataSet`` and ``self.Labels``. On failure a message is printed,
     ``None`` is returned and those two attributes are not assigned
     (``self.ResultLabels`` may already have been set).

     :param path: str, file name inside the DATA folder; must end in ``.txt``.
     :return: None
     """
     assert isinstance(path, str)
     if not path.endswith('.txt'):
         print('Read file only support txt format')
         return None
     full_path = Util().getDirectory() + "/DATA/" + path
     if not os.path.exists(full_path):
         print('File does not exist: %s' % (path))
         return None
     # The context manager closes the handle on every exit path.
     with open(full_path, 'r') as file:
         try:
             Contents, Labels = list(), list()
             lines = file.readlines()
             self.ResultLabels = lines.pop(0).replace("\n", "").split(" ")
             for line in lines:
                 fields = line.replace("\n", "").split("\t")
                 score = float(fields[-1])
                 if score > 3.0:
                     Labels.append(1)
                     Contents.append(fields[0])
                 elif score < 3.0:
                     Labels.append(0)
                     Contents.append(fields[0])
         # A tuple is required here: "A and B and C" evaluates to C alone,
         # so the former clause only ever caught KeyError.
         except (IndexError, ValueError, KeyError):
             print('invalid file arrangement')
             return None
         except Exception:
             print('unknown error')
             return None
         else:
             print('Read file successful')
     self.DataSet = Contents
     self.Labels = Labels
예제 #2
0
 def ReadSimpleFile(self, path):
     """
     Load a numeric data set with string labels from
     ``<Util directory>/DATA/<path>``.

     Every line is stripped and split on tabs. The last field is the label;
     the remaining fields are converted to float and form one sample.
     Labels not yet present in ``self.LabelName`` are appended to it in
     order of first appearance.

     On success ``self.DataSet`` becomes an ndarray of the samples and
     ``self.Labels`` an ndarray of the labels with string dtype. On failure
     a message is printed, ``None`` is returned and those two attributes
     are not assigned (``self.LabelName`` may already have been extended).

     :param path: str, file name inside the DATA folder; must end in ``.txt``.
     :return: None
     """
     assert isinstance(path, str)
     if not path.endswith('.txt'):
         print('Read file only support txt format')
         return None
     full_path = Util().getDirectory() + "/DATA/" + path
     if not os.path.exists(full_path):
         print('File does not exist: %s' % (path))
         return None
     # The context manager closes the handle on every exit path.
     with open(full_path, 'r') as file:
         try:
             lines = file.readlines()
             DataSet, Labels = list(), list()
             for line in lines:
                 RawData = line.strip().split('\t')
                 newLabel = RawData.pop()
                 if newLabel not in self.LabelName:
                     self.LabelName.append(newLabel)
                 Labels.append(newLabel)
                 DataSet.append([float(item) for item in RawData])
         # A tuple is required here: "A and B and C" evaluates to C alone,
         # so the former clause only ever caught KeyError.
         except (IndexError, ValueError, KeyError):
             print('invalid file arrangement')
             return None
         except Exception:
             print('unknown error')
             return None
         else:
             print('Read file successful')
     self.DataSet = np.array(DataSet)
     # np.str was only an alias of the builtin str and no longer exists in
     # current NumPy; the builtin gives the same dtype.
     self.Labels = np.array(Labels, dtype=str).transpose()
예제 #3
0
 def saveGraph(self, name = None, path = None):
     """
     Save the current figure via ``plt.savefig`` into
     ``<Util directory>/DATA/save/<path>`` and print the outcome.

     :param name: optional str; only type-checked, not otherwise used here.
     :param path: str, file name appended to the save folder. When it is
                  None nothing is saved and the failure message is printed.
     :return: None
     """
     if name is not None:
         assert isinstance(name, str)
     if path is not None:
         assert isinstance(path, str)
     save_dir = Util().getDirectory() + "/DATA/save/"
     if path is None:
         # Without this guard the string concatenation with None failed
         # again inside the except handler and escaped as a TypeError.
         print("Invalid Directory: %s" % save_dir)
         return
     target = save_dir + path
     try:
         plt.savefig(target)
     except Exception:
         print("Invalid Directory: %s" % target)
     else:
         print("file saved to: %s" % target)
예제 #4
0
 def ReadSimpleFile(self, path):
     """
     Read rows of space-separated integers from
     ``<Util directory>/DATA/<path>`` into ``self.DataSet``.

     Each line is stripped, split on single spaces and every field is
     converted with ``int``; the result is a list of integer lists.

     :param path: str, file name inside the DATA folder; must end in ``.txt``.
     :return: None
     :raises ValueError: if a field cannot be parsed as an integer.
     """
     assert isinstance(path, str)
     if not path.endswith(".txt"):
         print("Read file only support txt format")
         return None
     full_path = Util().getDirectory() + "/DATA/" + path
     if not os.path.exists(full_path):
         print('File does not exist: %s' % (path))
         return None
     # Close the handle as soon as the lines are in memory.
     with open(full_path, 'r') as file:
         lines = file.readlines()
     self.DataSet = list()
     for line in lines:
         self.DataSet.append([int(item) for item in line.strip().split(" ")])
예제 #5
0
 def separateDataSet(self, testSize = 0.2, pattern = None):
     """
     Split the stored samples into a training part, which replaces
     self.DataSet / self.Labels, and a test part, which is returned.

     :param testSize: float strictly between 0.0 and 1.0, the portion of test data. default value is 0.2.
     :param pattern: sequence of flags, one per sample (0 = train, 1 = test).
                     None asks Util().splitDataSet for the flags instead.
     :return: testData (ndarray), testLabel (ndarray) when pattern is None;
              testData, testLabel, testIndex (list of int) when a pattern is given.
     :raises ValueError: if a flag is neither 0 nor 1.
     """
     assert 0.0 < testSize < 1.0
     randomSplit = pattern is None
     if randomSplit:
         # flags telling which side of the split each sample belongs to
         flags = Util().splitDataSet(self.DataSet.shape[0], testSize)
     else:
         flags = pattern
     keptData, keptLabel = [], []
     heldData, heldLabel, heldIndex = [], [], []
     for position in range(len(flags)):
         flag = flags[position]
         if flag == 0:
             keptData.append(self.DataSet[position])
             keptLabel.append(self.Labels[position])
         elif flag == 1:
             heldData.append(self.DataSet[position])
             heldLabel.append(self.Labels[position])
             heldIndex.append(position)
         else:
             raise ValueError("index out of range [0, 1]")
     self.DataSet = np.array(keptData)
     self.Labels = np.array(keptLabel)
     if randomSplit:
         return np.array(heldData), np.array(heldLabel)
     return np.array(heldData), np.array(heldLabel), heldIndex
예제 #6
0
 def Train(i):
     nonlocal constant
     i_error = CalculateError(i)
     if (LabelMat[i] * i_error < -tolerance and betas[i] < constrain) or \
             (LabelMat[i] * i_error > tolerance and betas[i] > 0):  #计算偏移初始条件
         j, j_error = SelectBestPair(i, i_error)
         i_old = betas[i].copy()
         j_old = betas[j].copy()
         if LabelMat[j] != LabelMat[i]:
             min_difference = max(0, betas[j] - betas[i])
             max_difference = min(constrain,
                                  constrain + betas[j] - betas[i])
         else:
             min_difference = max(0, betas[i] + betas[j] - constrain)
             max_difference = min(constrain, betas[i] + betas[j])
         if max_difference == min_difference:  #第一个运算结束条件判断
             print("BC1: equal difference, i: %d, j: %d" % (i, j))
             return 0
         if self.KernelValues is not None:  #如果是在高次svm里调用执行这个
             eta = 2.0 * self.KernelValues[i, j] - self.KernelValues[
                 i, i] - self.KernelValues[j, j]
         else:
             eta = 2.0 * (DataMat[i] * DataMat[j].T) - (
                 DataMat[i] * DataMat[i].T) - (DataMat[j] *
                                               DataMat[j].T)
         if eta >= 0:  #第二个运算结束条件判断
             print("BC2: eta >= 0, i: %d, j: %d" % (i, j))
             return 0
         betas[j] -= LabelMat[j] * (i_error - j_error) / eta
         betas[j] = Util().clipStepSize(max_difference, betas[j],
                                        min_difference)
         if capacity == "ENABLED":
             CalculateError(j, True)
         if np.abs(betas[j] - j_old) < pow(10, -5):  #第三个运算结束条件判断
             print("BC3: j_offset is less than 10^-5, i: %d, j: %d" %
                   (i, j))
             return 0
         betas[i] += LabelMat[i] * LabelMat[j] * (j_old - betas[j]
                                                  )  #反方向移动
         if capacity == "ENABLED":
             CalculateError(i, True)
         if self.KernelValues is not None:  #如果是在高次svm里调用执行这个
             i_constant = constant - i_error - LabelMat[i] * (betas[i] - i_old) * self.KernelValues[i, i] - \
                          LabelMat[j] * (betas[j] - j_old) * self.KernelValues[i, j]
             j_constant = constant - i_error - LabelMat[i] * (betas[i] - i_old) * self.KernelValues[i, j] - \
                          LabelMat[j] * (betas[j] - j_old) * self.KernelValues[j, j]
         else:
             i_constant = constant - i_error - LabelMat[i] * (betas[i] - i_old) * (DataMat[i] * DataMat[i].T) - \
                          LabelMat[j] * (betas[j] - j_old) * (DataMat[i] * DataMat[j].T)
             j_constant = constant - j_error - LabelMat[i] * (betas[i] - i_old) * (DataMat[i] * DataMat[j].T) - \
                          LabelMat[j] * (betas[j] - j_old) * (DataMat[j] * DataMat[j].T)
         if 0 < betas[i] and constrain > betas[i]: constant = i_constant
         elif 0 < betas[j] and constrain > betas[j]:
             constant = j_constant
         else:
             constant = (i_constant - j_constant) / 2 + j_constant
         return 1
     else:
         return 0
예제 #7
0
 def SmartTest(self, TestData, TestLabel):
     """
     Run self.Predict on every test sample, print each result and hand the
     collected second result components to Util().plotCurveROC.

     :param TestData: indexable samples; elements 1 and 2 of each sample are used.
     :param TestLabel: labels, same length as TestData.
     :return: None
     """
     sampleCount = len(TestData)
     assert len(TestLabel) == sampleCount
     scores = []
     for idx in range(sampleCount):
         sample = TestData[idx]
         outcome = self.Predict([sample[1], sample[2], TestLabel[idx]])
         print(outcome)
         scores.append(outcome[1])
     assert len(scores) == len(TestLabel)
     Util().plotCurveROC(TestLabel, scores)
예제 #8
0
 def readSimpleFile(self, path):
     """
     Read tab-separated float rows from ``<Util directory>/DATA/<path>``
     and store them in ``self.DataSet`` as an ndarray.

     :param path: str, file name inside the DATA folder.
     :return: None
     :raises ValueError: if a field cannot be parsed as a float.
     """
     assert isinstance(path, str)
     # The context manager closes the handle even if parsing fails.
     with open(Util().getDirectory() + "/DATA/" + path, 'r') as fr:
         data = [[float(item) for item in line.strip().split('\t')]
                 for line in fr]
     self.DataSet = np.array(data)
예제 #9
0
 def SelectBestPair(i, error_i):
     """
     Choose the partner index for sample ``i`` and return it with its error.

     When ``capacity`` is not "ENABLED" the partner comes from
     ``Util().selectRandomItem``. Otherwise ``error_i`` is recorded in
     ``self.ErrorsStorage`` and, among the entries flagged as valid, the
     one whose error differs most from ``error_i`` is chosen. If no such
     candidate exists the random choice is used instead.

     :param i: index of the first sample of the pair.
     :param error_i: error already computed for sample ``i``.
     :return: (partner index, partner error)
     """
     if capacity != "ENABLED":
         ret = Util().selectRandomItem(i, vertical)
         return ret, CalculateError(ret)
     assert self.ErrorsStorage.shape
     best, modification_best, error_best = -1, 0, 0
     self.ErrorsStorage[i] = [1, error_i]
     ValidList = np.nonzero(self.ErrorsStorage[:, 0].A)[0]
     if len(ValidList) > 1:
         for item in ValidList:
             if item == i:
                 continue
             error_item = CalculateError(item)
             modification_item = np.abs(error_i - error_item)
             if modification_item > modification_best:
                 best = item
                 modification_best = modification_item
                 error_best = error_item
         if best != -1:
             return best, error_best
         # Every candidate had the same error as sample i, so nothing beat
         # the initial gap of 0. Returning the sentinel (-1, 0) would pair
         # i with the last sample and an error that is not its own.
     ret = Util().selectRandomItem(i, vertical)
     return ret, CalculateError(ret)
예제 #10
0
 def ReadSimpleFile(self, path):
     """
     Read whitespace-separated samples from ``<Util directory>/DATA/<path>``.

     On each line the last field is the integer label and all preceding
     fields are float features. A constant 1.0 is placed in front of the
     features of every sample. The results are stored as ndarrays in
     ``self.DataSet`` and ``self.Labels``.

     :param path: file name inside the DATA folder.
     :return: None
     """
     dataMat = []
     labelMat = []
     # The context manager closes the handle even if parsing fails.
     with open(Util().getDirectory() + "/DATA/" + path, 'r') as fr:
         for line in fr:
             lineArr = line.strip().split()
             dataMat.append([1.0] + [float(value) for value in lineArr[:-1]])
             labelMat.append(int(lineArr[-1]))
     self.DataSet = np.array(dataMat)
     self.Labels = np.array(labelMat)
예제 #11
0
 def SeparateDataSet(self, TestSize=0.2, mode="DEFAULT"):
     """
     Split the stored samples using flags from ``Util().splitDataSet``.

     Samples flagged 0 stay on the object: ``self.DataSet`` becomes a
     matrix of them and ``self.Labels`` a transposed matrix of their labels.
     Samples flagged 1 are returned as plain Python lists. Any other flag
     value is ignored.

     :param TestSize: forwarded to ``Util().splitDataSet``; default 0.2.
     :param mode: one of "LOAD", "SAVE", "DEFAULT"; forwarded as well.
     :return: (TestData, TestLabel) — list of row lists, list of int labels.
     """
     assert mode in ("LOAD", "SAVE", "DEFAULT")
     TrainData, TrainLabel, TestData, TestLabel = [], [], [], []
     Lookup_Table = Util().splitDataSet(len(self.DataSet), TestSize, mode)
     for i in range(len(Lookup_Table)):
         flag = Lookup_Table[i]
         if flag == 0:
             TrainData.append(self.DataSet[i])
             TrainLabel.append(self.Labels[i])
         elif flag == 1:
             # tolist() of a single row yields a nested list; keep the inner one
             TestData.append(self.DataSet[i].tolist()[0])
             TestLabel.append(int(self.Labels[i]))
     # np.mat was removed in NumPy 2.0; np.asmatrix is the same function
     # under its remaining name.
     self.DataSet = np.asmatrix(np.array(TrainData))
     self.Labels = np.asmatrix(np.array(TrainLabel)).transpose()
     return TestData, TestLabel
예제 #12
0
 def SeparateDataSet(self, TestSize=0.2, mode="DEFAULT"):
     """
     Split the stored samples using flags from ``Util().splitDataSet``.

     Samples flagged 0 replace ``self.DataSet`` / ``self.Labels`` (labels
     stored with string dtype) and ``self.USING_SAMPLE_NUM`` is set to half
     the number of remaining samples, rounded down. Samples flagged 1 are
     returned. Any other flag value is ignored.

     :param TestSize: forwarded to ``Util().splitDataSet``; default 0.2.
     :param mode: one of "LOAD", "SAVE", "DEFAULT"; forwarded as well.
     :return: (test data ndarray, test label ndarray with string dtype)
     """
     assert mode in ("LOAD", "SAVE", "DEFAULT")
     TrainData, TrainLabel, TestData, TestLabel = [], [], [], []
     Lookup_Table = Util().splitDataSet(len(self.DataSet), TestSize, mode)
     for i in range(len(Lookup_Table)):
         flag = Lookup_Table[i]
         if flag == 0:
             TrainData.append(self.DataSet[i])
             TrainLabel.append(self.Labels[i])
         elif flag == 1:
             TestData.append(self.DataSet[i])
             TestLabel.append(self.Labels[i])
     self.DataSet = np.array(TrainData)
     self.USING_SAMPLE_NUM = self.DataSet.shape[0] // 2
     # np.str was only an alias of the builtin str and no longer exists in
     # current NumPy; the builtin gives the same dtype.
     self.Labels = np.array(TrainLabel, dtype=str).transpose()
     return np.array(TestData), np.array(TestLabel, dtype=str)
예제 #13
0
 def SeparateDataSet(self, TestSize=0.2, mode="DEFAULT", if_return=False):
     """
     Split the stored samples using flags from Util().splitDataSet.

     Samples flagged 0 replace self.DataSet / self.Labels as ndarrays;
     samples flagged 1 are returned as lists (labels converted to int).
     Other flag values are ignored.

     :param TestSize: forwarded to Util().splitDataSet; default 0.2.
     :param mode: one of "LOAD", "SAVE", "DEFAULT"; forwarded as well.
     :param if_return: when True, also return the positions flagged 1.
     :return: (TestData, TestLabel) or (TestData, TestLabel, positions ndarray)
     """
     assert mode in ("LOAD", "SAVE", "DEFAULT")
     keptData, keptLabel = [], []
     heldData, heldLabel = [], []
     flags = Util().splitDataSet(len(self.DataSet), TestSize, mode)
     for position in range(len(flags)):
         flag = flags[position]
         if flag == 0:
             keptData.append(self.DataSet[position])
             keptLabel.append(self.Labels[position])
         elif flag == 1:
             heldData.append(self.DataSet[position])
             heldLabel.append(int(self.Labels[position]))
     self.DataSet = np.array(keptData)
     self.Labels = np.array(keptLabel)
     if not if_return:
         return heldData, heldLabel
     heldPositions = np.nonzero(np.array(flags) == 1)[0]
     return heldData, heldLabel, heldPositions
예제 #14
0
 def SeparateDataSet(self, TestSize=0.2, Pattern=None):
     """
     Split the stored samples into train (kept on the object) and test
     (returned) parts.

     Rows are copied with list() and labels converted with int(). Flags
     other than 0 and 1 are ignored.

     :param TestSize: forwarded to Util().splitDataSet when Pattern is None.
     :param Pattern: optional sequence of flags (0 = train, 1 = test).
     :return: (test data ndarray, test label ndarray) when Pattern is None,
              otherwise additionally the list of test positions.
     """
     noPattern = Pattern is None
     flags = Util().splitDataSet(len(self.DataSet), TestSize) if noPattern else Pattern
     keptData, keptLabel = [], []
     heldData, heldLabel, heldPositions = [], [], []
     for position in range(len(flags)):
         flag = flags[position]
         if flag == 0:
             keptData.append(list(self.DataSet[position]))
             keptLabel.append(int(self.Labels[position]))
         elif flag == 1:
             heldData.append(list(self.DataSet[position]))
             heldLabel.append(int(self.Labels[position]))
             heldPositions.append(position)
     self.DataSet = np.array(keptData)
     self.Labels = np.array(keptLabel)
     if noPattern:
         return np.array(heldData), np.array(heldLabel)
     return np.array(heldData), np.array(heldLabel), heldPositions
예제 #15
0
 def SeparateDataSet(self, testSize=0.2, pattern=None):
     """
     Split self.DataSet into a training list (kept on the object) and a
     test list (returned).

     :param testSize: must be below 1.0; forwarded to Util().splitDataSet
                      when no pattern is given.
     :param pattern: optional sequence of flags (0 = train, 1 = test).
     :return: the test samples when pattern is None, otherwise a tuple of
              (test samples, list of their positions).
     :raises ValueError: if a flag is neither 0 nor 1.
     """
     assert testSize < 1.0
     usePattern = pattern is not None
     if usePattern:
         flags = pattern
     else:
         flags = Util().splitDataSet(len(self.DataSet), testSize)
     kept, held, heldPositions = [], [], []
     for position in range(len(flags)):
         flag = flags[position]
         if flag == 0:
             kept.append(self.DataSet[position])
         elif flag == 1:
             held.append(self.DataSet[position])
             heldPositions.append(position)
         else:
             raise ValueError("index out of range [0, 1]")
     self.DataSet = kept
     if usePattern:
         return held, heldPositions
     return held
예제 #16
0
 def ReadSimpleFile(self, path):
     """
     Read a typed table from ``<Util directory>/DATA/<path>``.

     Line 1 holds whitespace-separated column names and is stored in
     ``self.Labels``. Line 2 holds one type name per column, each one of
     "int", "bool" or "str". Every later line is split on single spaces and
     each field is converted with the type of its column; the converted
     rows are stored as a list of lists in ``self.DataSet``.

     :param path: str, file name inside the DATA folder.
     :return: None
     :raises KeyError: if line 2 contains an unknown type name.
     :raises AssertionError: if path is not a str or a row has a different
                             number of fields than there are type names.
     """
     # The isinstance result used to be discarded; enforce it.
     assert isinstance(path, str)
     typeKind = {"int": int, "bool": bool, "str": str}
     # The context manager closes the handle once the lines are read.
     with open(Util().getDirectory() + "/DATA/" + path, "r") as fr:
         lines = fr.readlines()
     self.Labels = lines.pop(0).strip().split()
     typeList = [typeKind[item] for item in lines.pop(0).strip().split()]
     dataMat = list()
     for line in lines:
         splitLine = line.strip().split(" ")
         assert len(splitLine) == len(typeList)
         tempLine = list()
         for convert, raw in zip(typeList, splitLine):
             if convert is bool:
                 # bool("0") is True, so go through int first
                 tempLine.append(bool(int(raw)))
             else:
                 tempLine.append(convert(raw))
         dataMat.append(tempLine)
     self.DataSet = dataMat
예제 #17
0
 def readSimpleFile(self, path):
     """
     the function reads data from a .txt file. The file should follow the format:
         1) the first line holds the column titles separated by single spaces
            and is stored in self.Title
         2) every following line holds tab-separated floats; the last field
            is the label and the fields before it are the features
         3) a line whose number of features differs from the number of
            titles is skipped
     The features are stored in self.DataSet and the labels in self.Labels,
     both as ndarrays.
     :param path: str, the FILE NAME of the target file.
     :return: None
     """
     assert isinstance(path, str)
     dataMat, labelMat = list(), list()
     # The context manager closes the handle once the lines are read.
     with open(Util().getDirectory() + "/DATA/" + path, 'r') as fr:
         lines = fr.readlines()
     self.Title = lines.pop(0).strip().split(" ")
     for line in lines:
         splitLine = line.strip().split("\t")
         if len(splitLine) - 1 != len(self.Title):
             continue
         dataMat.append([float(value) for value in splitLine[:-1]])
         labelMat.append(float(splitLine[-1]))
     self.DataSet = np.array(dataMat)
     self.Labels = np.array(labelMat)
예제 #18
0
            similarity = simMethod(dataSet[overlap, item], dataSet[overlap, i])
            if toggle_print:
                print("similarity between (%d) and (%d) is %.6f%%" %
                      (item, i, similarity * 100))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        return ratSimTotal / simTotal


def recommend(dataSet, user, simMethod, prediction_num=3, toggle_print=False):
    """
    Rank the items that row ``user`` of ``dataSet`` has rated 0.

    Each such item is scored with ``estimate`` and the highest-scoring
    ``prediction_num`` (item, score) pairs are returned, best first. A
    notice is printed when the user has no item rated 0; the result is then
    an empty list.

    :param dataSet: rating table indexed as dataSet[user, :]; the row must
                    convert to a 2-D array, since column indices are taken
                    from the second axis.
    :param user: row index of the user.
    :param simMethod: similarity function forwarded to ``estimate``.
    :param prediction_num: maximum number of pairs to return.
    :param toggle_print: forwarded to ``estimate``.
    :return: list of (item, estimated score) tuples.
    """
    userRow = np.array(dataSet[user, :])
    unratedItems = np.nonzero(userRow == 0)[1]
    if len(unratedItems) == 0:
        print("no recommendation available")
    itemScores = [
        (item, estimate(dataSet, user, simMethod, item, toggle_print))
        for item in unratedItems
    ]
    itemScores.sort(key=lambda pair: pair[-1], reverse=True)
    return itemScores[:prediction_num]


if __name__ == '__main__':
    # Demo: load the rating table and print the recommendations for row 2
    # using the cosine similarity constant from Util.
    demo = DataPreprocessing()
    demo.readSimpleDataSet(
        "recommendation.txt", demo.SETTYPE_NDMAT, demo.DATATYPE_INT, ", ")
    target_user = 2
    result = recommend(
        demo.DataSet, target_user, Util().SIM_COSINE, toggle_print=True)
    print(result)
예제 #19
0
import numpy as np

from learning.Helper import Util
from learning.machinelearning.classify import SupportVectorMachine as svm

# Demo script: train the SVM on testSet2.txt, predict the held-out samples
# and plot the points together with the ROC curve.
demo = svm()
demo.ReadSimpleFile("testSet2.txt")  # load the samples from file
test_data, test_label = demo.SeparateDataSet()  # hold out the test split
demo.RadicalBias_Gaussian(20, 0.01, 1000, "DISABLED")
#demo.Platt_SMO(20, 0.0001, 1000, "DISABLED")       #linear classification
demo.GetLine(True)  # compute the decision boundary
predicted = []  # second component of every prediction result
for index in range(len(test_data)):
    sample = np.array(test_data[index]).transpose()
    outcome = demo.Predict(sample, test_label[index])
    print(outcome)
    predicted.append(outcome[1])
demo.GraphPoints()  # draw the points
Util().plotCurveROC(test_label, predicted)  # draw the ROC curve