def __init__(self, type, rootDir):
    """Configure a single-limit data-creation run.

    type    -- 1 runs the full limit schedule, 2 a single small limit
    rootDir -- non-empty directory name under which models are written
    """
    # Each run type maps onto its document-limit schedule; both types
    # group by "GENERAL" only.
    limitsByType = {
        1: [1000, 2500, 5000, 7500, 10000, 20000],
        2: [1000],
    }
    if type not in limitsByType:
        sys.exit("Wrong 'type' parameter in createData.__init__")
    self.GROUPTYPE = ["GENERAL"]
    self.percentageList = limitsByType[type]
    # Project helper objects: database access, text processing,
    # filesystem utilities and vector-model creation.
    self.shevaDB = ShevaDB()
    self.shevaTPF = ShevaTPF()
    self.shevaUtils = ShevaUtils()
    self.shevaVect = ShevaVect()
    # Guard clause: an empty root directory is a configuration error.
    if rootDir == "":
        sys.exit("Wrong 'rootDir' parameter in createData.__init__")
    self.shevaUtils.createDirOne(str(rootDir))
    self.rootDir = str(rootDir)
def __init__(self, testSize, category, type=1): """ INPUT: type = full data (1) or training data (2) testSize = % of model size (nr of documents in model) to test with """ #percentage of data to be used for model build if type == 1: self.GROUPTYPE = ["CATID", "FATHERID", "GENERAL"] self.limitList = [1000, 2500, 5000, 7500, 10000, 20000] elif type == 2: self.GROUPTYPE = ["GENERAL", "FATHERID", "GENERAL"] self.limitList = [1000] else: sys.exit("Wrong 'type' parameter in createData.__init__") print "SimilarityLevel created" #Sheva Objects self.shevaDB = ShevaDB() self.shevaTPF = ShevaTPF() self.shevaSimilarity = ShevaSimilarity() self.shevaCSV = ShevaCSV() self.shevaClassificationMetrics = ShevaClassificationMetrics() self.shevaUtils = ShevaUtils() #SimilarityLevel Variables self.rootDir = "LimitModels/" self.testSize = testSize self.category = category self.maxDepth = self.shevaDB.getCategorymaxDepth(self.category)
def __init__(self, testSize): """ INPUT: type = full data (1) or training data (2) testSize = % of model size (nr of documents in model) to test with """ """ #percentage of data to be used for model build if type == 1: self.GROUPTYPE = ["CATID", "FATHERID", "GENERAL"] #self.percentageList = [25, 50, 75, 100] elif type == 2: self.GROUPTYPE = ["GENERAL"] #self.percentageList = [25, 50, 75, 100] else: sys.exit("Wrong 'type' parameter in createData.__init__") """ print "SimilarityLevel created" #Sheva Objects self.shevaDB = ShevaDB() self.shevaTPF = ShevaTPF() self.shevaSimilarity = ShevaSimilarity() self.shevaCSV = ShevaCSV() self.shevaClassificationMetrics = ShevaClassificationMetrics() self.shevaUtils = ShevaUtils() #SimilarityLevel Variables self.rootDir = "LevelModels/" self.testSize = testSize
class createDataSingleLimit: def __init__(self,type,rootDir): """ type: 1 -> full data 2 -> limited data """ #percentage of data to be used for model build if type == 1: self.GROUPTYPE = ["GENERAL"] self.percentageList = [1000, 2500, 5000, 7500, 10000, 20000] elif type == 2: self.GROUPTYPE = ["GENERAL"] self.percentageList = [1000] else: sys.exit("Wrong 'type' parameter in createData.__init__") self.shevaDB = ShevaDB() self.shevaTPF = ShevaTPF() self.shevaUtils = ShevaUtils() self.shevaVect = ShevaVect() if rootDir != "": self.shevaUtils.createDirOne(str(rootDir)) self.rootDir = str(rootDir) else: sys.exit("Wrong 'rootDir' parameter in createData.__init__") def createData(self, category): """ 1. get root categories to be used and iterate through main categories 3. get max depth for individual category 4. from 1 to max till 1 to 1 get all catid for iterated category get all pages for selected categories call createCorpusAndVectorModel from selected documents """ for group in self.GROUPTYPE: #gruping dependent queries if group != "FATHERID": sqlCategory = "select Description, catid from dmoz_combined where mainCategory = '%s' limit 20000" %(category) else: sqlCategory = "select Description, fatherid from dmoz_combined where mainCategory = '%s' limit 20000" %(category) sqlQueryResults = self.shevaDB.dbQuery(sqlCategory) if sqlQueryResults == 0: sys.exit("SQL code error in level: \t", category,"\t",indeks,"\t",sqlCategoryLevel) for percentageItem in self.percentageList: sqlQueryResultsLimit = [x for x in sqlQueryResults[:percentageItem]] #data for % model, range data dataCategoryLevelAll = [] dataCategoryLabelAll = [] originalCatIDAll = [] dataCategorySingleAll = [] path = "%s/%s/%s/" %(self.rootDir,group,percentageItem) self.shevaUtils.createDir(self.rootDir,group,percentageItem) #for indeks in ranger: #level list variables dataCategoryLevel = [] dataCategoryLabel = [] originalCatID = [] originalFatherID = [] finalContent = [] #get unique values if group == "GENERAL": 
finalContent = [[item for item in row[0].split()] for row in sqlQueryResultsLimit] originalCatID = [row[1] for row in sqlQueryResultsLimit] dataCategoryLevel.extend(self.shevaTPF.returnClean(finalContent,1)) else: unique = [] for row in sqlQueryResultsLimit: if row[1] not in unique: unique.append(row[1]) #prepare rows with uniq for document in model for uniq in unique: tempUnique = [] tempUnique = [row[0].split() for row in sqlQueryResultsLimit if row[1] == uniq] mergedContent = [i for i in itertools.chain.from_iterable(tempUnique)] finalContent.append(mergedContent) originalCatID.append(uniq) dataCategoryLevel.extend(self.shevaTPF.returnClean(finalContent,1)) self.shevaUtils.createDir(self.rootDir, group, percentageItem) #create file names #fileNameAll = "%s_%s_1_%s" %(str(percentageItem),category,str(indeks)) fileNameLevel = "%s_%s" %(str(percentageItem),category) fileNameSingleAll = "%s_%s" %(str(percentageItem),category) ########## ORIGINAL DESCRIPTION AND VECTORIZATION ################# #create corpus models #self.shevaVect.createCorpusAndVectorModel(dataCategoryLevel,fileNameLevel,path) dataCategoryLevelAll.extend(dataCategoryLevel) #self.shevaVect.createCorpusAndVectorModel(dataCategoryLevelAll, fileNameAll,path) #single model for all documents dataCategorySingleAll.append([x for sublist in dataCategoryLevelAll for x in sublist]) #print dataCategorySingleAll #print len(Counter(dataCategorySingleAll[0])) print len(dataCategorySingleAll), len(dataCategorySingleAll[0]) self.shevaVect.createCorpusAndVectorModel(dataCategorySingleAll, fileNameSingleAll,path) ########## ORIGINAL CATEGORIES ID ################# #self.shevaUtils.getCategoryListLevel(originalCatID,fileNameLevel,path) #originalCatIDAll.extend(originalCatID) #self.shevaUtils.getCategoryListLevel(originalCatIDAll,fileNameAll,path) #print out number of documents for (cat,level,model) print "Done with:\t",group,"\t",category,"\t","\t",percentageItem
class SimilarityLimit:
    """Compute similarity-based classification metrics for limit models
    and store the results in the analysis_results_limit table."""

    #@profile  (disabled profiler decorator)
    def __init__(self, testSize, category, type=1):
        """Set up a limit-based similarity run for one category.

        type     -- 1 = full data, 2 = training data
        testSize -- % of model size (nr of documents in model) to test with
        category -- main category name used for queries and file names
        """
        #percentage of data to be used for model build
        if type == 1:
            self.GROUPTYPE = ["CATID", "FATHERID", "GENERAL"]
            self.limitList = [1000, 2500, 5000, 7500, 10000, 20000]
        elif type == 2:
            # NOTE(review): "GENERAL" appears twice — calculateLimitSimilarity
            # would process that grouping twice per limit; confirm intended.
            self.GROUPTYPE = ["GENERAL", "FATHERID", "GENERAL"]
            self.limitList = [1000]
        else:
            sys.exit("Wrong 'type' parameter in createData.__init__")
        print "SimilarityLevel created"
        #Sheva Objects (DB, text processing, similarity, CSV I/O,
        #classification metrics, filesystem utilities)
        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaSimilarity = ShevaSimilarity()
        self.shevaCSV = ShevaCSV()
        self.shevaClassificationMetrics = ShevaClassificationMetrics()
        self.shevaUtils = ShevaUtils()
        #SimilarityLevel Variables
        self.rootDir = "LimitModels/"
        self.testSize = testSize
        self.category = category
        # Maximum depth of the category, looked up once from the DB.
        self.maxDepth = self.shevaDB.getCategorymaxDepth(self.category)

    def __del__(self):
        # Destructor only announces teardown; no resources are released here.
        print 'SimilarityLevel destroyed'

    #@profile
    def calculateLimitSimilarity(self):
        """For every (limit, grouping) pair: load the pre-built similarity
        index, sample test documents from the DB, compute similarities and
        three classification metric variants, and INSERT each result row
        into analysis_results_limit."""
        for limit in self.limitList:
            for group in self.GROUPTYPE:
                print "####################################################################"
                #print category, group, percentage, debth
                sim = []
                vec_bow = []
                allCategoryDataOID = []
                categoryDataOID = []
                categoryData = []
                print "created variables"
                #path & csv file for this (group, limit) model
                path = "%s%s/%s/" % (self.rootDir, group, limit)
                fileName = "%s_%s" % (limit, self.category)
                IODfilePath = "%soriginalID/%s.csv" % (path, fileName)
                print "Setup paths"
                #get data from original ID csv; unique ID
                allCategoryDataOID = self.shevaCSV.getModelCSV(IODfilePath)
                #categoryDataOID = self.shevaCSV.getIDfromModel(IODfilePath)
                print "Got all modelRow->originalID mappings"
                #get sim index, model, dict (and corpus size) for the model
                indexDir = "%sindexFiles/" % (path)
                self.shevaUtils.createDirOne(indexDir)
                index, tfidfModel, dictionary, corpusSize = self.shevaSimilarity.getSimilarityIndex(path, fileName, group)
                #return sample from original data (test documents + their IDs)
                categoryDataOID, categoryData = self.shevaDB.getSample(limit, self.testSize, self.category, self.maxDepth, group)
                #calculate similarites: clean text -> bag-of-words -> VSM
                cleanText = self.shevaTPF.returnClean(categoryData, 1)
                cleanTextBoW = [dictionary.doc2bow(cleanText[i]) for i in range(0, len(cleanText))]
                print "Done with bow representation"
                vec_bow = self.shevaSimilarity.convert2VSM(cleanTextBoW, tfidfModel)
                print len(vec_bow)
                # 0.1 is the similarity threshold passed to the calculator.
                simCalculation = self.shevaSimilarity.calculateSimilarity(index, vec_bow, 0.1)
                #calcualte IR measures (three variants, each stored in the DB)
                # NOTE(review): all three INSERTs build SQL via string
                # interpolation — injection-prone if inputs ever become
                # untrusted; parameterized queries would be safer.
                cPrecision, cRecall, cF1 = self.shevaClassificationMetrics.computeClassificationMetrics(categoryDataOID, allCategoryDataOID, simCalculation)
                print "All data measures :\t\t\t\tPrecision:\t", cPrecision, "\t\tRecall\t", cRecall, "\t\tF1:\t", cF1
                sqlClassic = "INSERT INTO analysis_results_limit (category, groupingType, limitValue, levelDepth, testSize, measureType, P, R, F1) VALUES ('%s', '%s', '%s', '%i', '%i', '%s','%f','%f','%f')" % (self.category, group, limit, self.maxDepth, self.testSize, "computeClassificationMetrics", cPrecision, cRecall, cF1)
                self.shevaDB.dbQuery(sqlClassic)
                cPrecisionR, cRecallR, cF1R = self.shevaClassificationMetrics.computeClassificationMetricsRelative(categoryDataOID, allCategoryDataOID, simCalculation)
                print "Relative (with or) data measures :\t\tPrecision:\t", cPrecisionR, "\t\tRecall\t", cRecallR, "\t\tF1:\t", cF1R
                sqlRelative = "INSERT INTO analysis_results_limit (category, groupingType, limitValue, levelDepth, testSize,measureType, P, R, F1) VALUES ('%s', '%s', '%s', '%i', '%i', '%s','%f','%f','%f')" % (self.category, group, limit, self.maxDepth, self.testSize, "computeClassificationMetricsRelative", cPrecisionR, cRecallR, cF1R)
                self.shevaDB.dbQuery(sqlRelative)
                cPrecisionE, cRecallE, cF1E = self.shevaClassificationMetrics.computeClassificationMetricsExclusive(categoryDataOID, allCategoryDataOID, simCalculation)
                print "Exclusive (with and) data measures :\t\tPrecision:\t", cPrecisionE, "\t\tRecall\t", cRecallE, "\t\tF1:\t", cF1E
                sqlExclusive = "INSERT INTO analysis_results_limit (category, groupingType, limitValue, levelDepth,testSize, measureType, P, R, F1) VALUES ('%s', '%s', '%s', '%i', '%i', '%s','%f','%f','%f')" % (self.category, group, limit, self.maxDepth, self.testSize, "computeClassificationMetricsExclusive", cPrecisionE, cRecallE, cF1E)
                self.shevaDB.dbQuery(sqlExclusive)
                #trying to figure out the memory thing. needs speed-up in
                #performance otherwise... (explicit rebinding + del + gc to
                #release the large index/model objects before next iteration)
                dbData = []
                simCalculation = []
                cleanText = []
                cleanTextBoW = []
                vec_bow = []
                del index
                del tfidfModel
                del dictionary
                del corpusSize
                del simCalculation
                del vec_bow
                del allCategoryDataOID
                del categoryDataOID
                del categoryData
                del dbData
                gc.collect()
class createDataSingleLevel: def __init__(self,type,rootDir): """ type: 1 -> full data 2 -> limited data """ #percentage of data to be used for model build if type == 1: self.GROUPTYPE = ["CATID","FATHERID","GENERAL"] self.percentageList = [25, 50, 75, 100] elif type == 2: self.GROUPTYPE = ["GENERAL"] self.percentageList = [5] else: sys.exit("Wrong 'type' parameter in createData.__init__") self.shevaDB = ShevaDB() self.shevaTPF = ShevaTPF() self.shevaUtils = ShevaUtils() self.shevaVect = ShevaVect() self.shevaCSV = ShevaCSV() if rootDir != "": self.shevaUtils.createDirOne(str(rootDir)) self.rootDir = str(rootDir) else: sys.exit("Wrong 'rootDir' parameter in createData.__init__") def createData(self, category): """ 1. get root categories to be used and iterate through main categories 3. get max depth for individual category 4. from 1 to max till 1 to 1 get all catid for iterated category get all pages for selected categories call createCorpusAndVectorModel fro selected documents """ ranger = self.shevaDB.getCategoryDepth(category) for group in self.GROUPTYPE: sqlQueryResults = [] #gruping dependent queries if group == "FATHERID": sqlCategory = "select Description, fatherid, categoryDepth from dmoz_combined where mainCategory = '%s'" %(category) else: sqlCategory = "select Description, catid, categoryDepth from dmoz_combined where mainCategory = '%s'" %(category) sqlQueryResults = self.shevaDB.dbQuery(sqlCategory) if sqlQueryResults == 0: sys.exit("SQL code error in level: \t", category,"\t",indeks,"\t",sqlCategoryLevel) for percentageItem in self.percentageList: #data for % model, range data dataCategoryLevelAll = [] dataCategoryLabelAll = [] originalCatIDAll = [] dataCategorySingleAll = [[]] path = "%s/%s/%s/" %(self.rootDir,group,percentageItem) self.shevaUtils.createDir(self.rootDir,group,percentageItem) #var = "" for indeks in ranger: #var += "Level:\t%s\n" %(indeks) """ #gruping dependent queries if group == "FATHERID": sqlCategoryLevel = "select Description, 
fatherid from dmoz_combined where mainCategory = '%s' and categoryDepth = '%s'" %(category,indeks) else: sqlCategoryLevel = "select Description, catid from dmoz_combined where mainCategory = '%s' and categoryDepth = '%s'" %(category,indeks) sqlQueryResultsLevel = self.shevaDB.dbQuery(sqlCategoryLevel) """ sqlQueryResultsLevel = [x for x in sqlQueryResults if x[2] == indeks] #level list variables finalContent = [] dataCategoryLevel = [] dataCategoryLabel = [] originalCatID = [] #originalFatherID = [] #get unique values if group == "GENERAL": #finalContent = [] percentageLevel = self.shevaUtils.setLimit(percentageItem,sqlQueryResultsLevel) finalContent = [[item for item in row[0].split()] for row in sqlQueryResultsLevel[:percentageLevel]] #var += "Original words:\t%s\n" %(finalContent) originalCatID = [row[1] for row in sqlQueryResultsLevel[:percentageLevel]] #var += "Original IDs:\t%s\n" %(originalCatID) dataCategoryLevel.extend(self.shevaTPF.returnClean(finalContent,1)) #var += "Processed words:\t%s\n" %(dataCategoryLevel) else: unique = [] for row in sqlQueryResultsLevel: if row[1] not in unique: unique.append(row[1]) for uniq in unique: #var += "ID:\t%s\n" %(uniq) tempUnique = [] tempUnique = [row[0] for row in sqlQueryResultsLevel if row[1] == uniq] percentageLevel = self.shevaUtils.setLimit(percentageItem,tempUnique) tempUnique = [item.split() for item in tempUnique[:percentageLevel]] mergedContent = [i for i in itertools.chain.from_iterable(tempUnique)] #var += "Original words:\t%s\n" %(mergedContent) finalContent.append(mergedContent) originalCatID.append(uniq) dataCategoryLevel.extend(self.shevaTPF.returnClean(finalContent,1)) self.shevaUtils.createDir(self.rootDir, group, percentageItem) ########## FILE NAMES ################# fileNameAll = "%s_%s_1_%s" %(str(percentageItem),category,str(indeks)) fileNameLevel = "%s_%s_%s" %(str(percentageItem),category,str(indeks)) fileNameSingleAll = "%s_%s_%s" %(str(percentageItem),category,str(indeks)) ########## PRINT 
OUT ORIGINAL AND PROCESSED DATA ################# """ print originalCatID print finalContent print dataCategoryLevel """ ########## ORIGINAL DESCRIPTION AND VECTORIZATION ################# #self.shevaVect.createCorpusAndVectorModel(dataCategoryLevel,fileNameLevel,path) dataCategoryLevelAll.extend(dataCategoryLevel) #self.shevaVect.createCorpusAndVectorModel(dataCategoryLevelAll, fileNameAll,path) #single model for all documents dataCategorySingleAll[0].extend([x for sublist in dataCategoryLevelAll for x in sublist]) self.shevaVect.createCorpusAndVectorModel(dataCategorySingleAll, fileNameSingleAll, path) ########## ORIGINAL CATEGORIES ID ################# #self.shevaUtils.getCategoryListLevel(originalCatID,fileNameLevel,path) originalCatIDAll.extend(originalCatID) #self.shevaCSV.getCategoryListLevel(originalCatIDAll,fileNameAll,path) #print out number of documents for (cat,level,model) print "Done with:\t",group,"\t",category,"\t",indeks,"\t",percentageItem ####################### GC ################# del dataCategoryLevel del originalCatID gc.collect() del dataCategoryLevelAll del dataCategoryLabelAll del originalCatIDAll del dataCategorySingleAll del sqlQueryResults gc.collect()