def __init__(self, type, rootDir):
    """
    Configure the data-creation run.

    type: 1 -> full data (every document-limit size)
          2 -> limited data (smallest limit only)
    rootDir: non-empty directory name where generated models are written
    """
    # Map each mode to its document-limit schedule; both modes use the
    # GENERAL grouping only.
    limits_by_mode = {
        1: [1000, 2500, 5000, 7500, 10000, 20000],
        2: [1000],
    }
    if type not in limits_by_mode:
        sys.exit("Wrong 'type' parameter in createData.__init__")
    self.GROUPTYPE = ["GENERAL"]
    self.percentageList = limits_by_mode[type]

    # Helper objects: DB access, text processing, filesystem, vectorization.
    self.shevaDB = ShevaDB()
    self.shevaTPF = ShevaTPF()
    self.shevaUtils = ShevaUtils()
    self.shevaVect = ShevaVect()

    # Root directory must be non-empty; create it up front.
    if rootDir == "":
        sys.exit("Wrong 'rootDir' parameter in createData.__init__")
    self.shevaUtils.createDirOne(str(rootDir))
    self.rootDir = str(rootDir)
# --- 示例 (example) #2: snippet separator left over from the original paste ---
    def __init__(self, testSize, category, type=1):
        """
        Configure a similarity-evaluation run over the limit models.

        INPUT:
            type = full data (1) or training data (2)
            testSize = % of model size (nr of documents in model) to test with
            category = main category; its max depth is read from the DB below
        """

        #percentage of data to be used for model build
        if type == 1:
            self.GROUPTYPE = ["CATID", "FATHERID", "GENERAL"]
            self.limitList = [1000, 2500, 5000, 7500, 10000, 20000]
        elif type == 2:
            # NOTE(review): "GENERAL" appears twice in this list, so that
            # grouping would be processed twice per limit — confirm whether
            # the first entry was meant to be "CATID" (as in the type == 1
            # branch).
            self.GROUPTYPE = ["GENERAL", "FATHERID", "GENERAL"]
            self.limitList = [1000]
        else:
            sys.exit("Wrong 'type' parameter in createData.__init__")

        print "SimilarityLevel created"
        #Sheva Objects: DB access, text processing, similarity, CSV IO,
        #classification metrics, filesystem helpers
        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaSimilarity =  ShevaSimilarity()
        self.shevaCSV =  ShevaCSV()
        self.shevaClassificationMetrics =  ShevaClassificationMetrics()
        self.shevaUtils =  ShevaUtils()
        
        #SimilarityLevel Variables        
        self.rootDir = "LimitModels/"
        self.testSize = testSize
        self.category = category
        # deepest category level for this category, used to scope the runs
        self.maxDepth = self.shevaDB.getCategorymaxDepth(self.category)
# --- 示例 (example) #3: snippet separator left over from the original paste ---
 def __init__(self, testSize):
     """
     Configure a similarity-evaluation run over the level models.

     INPUT:
         testSize = % of model size (nr of documents in model) to test with

     (The old `type` parameter no longer exists; its handling survives only
     as the disabled block below.)
     """
     # Dead code deliberately neutralized as a bare string literal: the old
     # type-dependent GROUPTYPE selection. It has no runtime effect.
     """
     #percentage of data to be used for model build
     if type == 1:
         self.GROUPTYPE = ["CATID", "FATHERID", "GENERAL"]
         #self.percentageList = [25, 50, 75, 100]
     elif type == 2:
         self.GROUPTYPE = ["GENERAL"]
         #self.percentageList = [25, 50, 75, 100]
     else:
         sys.exit("Wrong 'type' parameter in createData.__init__")
     """
     
     print "SimilarityLevel created"
     #Sheva Objects: DB access, text processing, similarity, CSV IO,
     #classification metrics, filesystem helpers
     self.shevaDB = ShevaDB()
     self.shevaTPF = ShevaTPF()
     self.shevaSimilarity =  ShevaSimilarity()
     self.shevaCSV =  ShevaCSV()
     self.shevaClassificationMetrics =  ShevaClassificationMetrics()
     self.shevaUtils =  ShevaUtils()
     
     #SimilarityLevel Variables        
     self.rootDir = "LevelModels/"
     self.testSize = testSize
class createDataSingleLimit:
    
    def __init__(self,type,rootDir):
        """
        type: 1 -> full data
              2 -> limited data
        """

        #percentage of data to be used for model build
        if type == 1:
            self.GROUPTYPE = ["GENERAL"]
            self.percentageList = [1000, 2500, 5000, 7500, 10000, 20000]
        elif type == 2:
            self.GROUPTYPE = ["GENERAL"]
            self.percentageList = [1000]
        else:
            sys.exit("Wrong 'type' parameter in createData.__init__")
        
        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaUtils = ShevaUtils()
        self.shevaVect = ShevaVect()
        
        if rootDir != "":
            self.shevaUtils.createDirOne(str(rootDir))
            self.rootDir = str(rootDir)
        else:
            sys.exit("Wrong 'rootDir' parameter in createData.__init__")
            
            
    
    def createData(self, category):
        """
        1. get root categories to be used and iterate through main categories
        3. get max depth for individual category
        4. from 1 to max till 1 to 1
            get all catid for iterated category
            get all pages for selected categories
            call createCorpusAndVectorModel from selected documents
        """

        for group in self.GROUPTYPE:
            
            #gruping dependent queries
            if group != "FATHERID":
                sqlCategory = "select Description, catid from dmoz_combined where mainCategory = '%s' limit 20000" %(category)
            else:
                sqlCategory = "select Description, fatherid from dmoz_combined where mainCategory = '%s' limit 20000" %(category)
            
            sqlQueryResults = self.shevaDB.dbQuery(sqlCategory)
            
            if sqlQueryResults == 0:
                sys.exit("SQL code error in level: \t", category,"\t",indeks,"\t",sqlCategoryLevel)

            for percentageItem in self.percentageList:
                sqlQueryResultsLimit = [x for x in sqlQueryResults[:percentageItem]]
                #data for % model, range data
                dataCategoryLevelAll = []
                dataCategoryLabelAll = []
                originalCatIDAll = []
                dataCategorySingleAll = []
                
                path = "%s/%s/%s/" %(self.rootDir,group,percentageItem)
                self.shevaUtils.createDir(self.rootDir,group,percentageItem)
                
                #for indeks in ranger:
                #level list variables
                dataCategoryLevel = []
                dataCategoryLabel = []
                originalCatID = []
                originalFatherID = []
                finalContent = []
                    
                #get unique values
                if group == "GENERAL":
                    finalContent = [[item for item in row[0].split()] for row in sqlQueryResultsLimit]
                    originalCatID = [row[1] for row in sqlQueryResultsLimit]
                    dataCategoryLevel.extend(self.shevaTPF.returnClean(finalContent,1))
                else:
                    unique = []
                    for row in sqlQueryResultsLimit:
                        if row[1] not in unique:
                            unique.append(row[1])

                    #prepare rows with uniq for document in model
                    for uniq in unique:
                        tempUnique = []
                        tempUnique = [row[0].split() for row in sqlQueryResultsLimit if row[1] == uniq]
                        mergedContent = [i for i in itertools.chain.from_iterable(tempUnique)]
                        finalContent.append(mergedContent)
                        originalCatID.append(uniq)
                    dataCategoryLevel.extend(self.shevaTPF.returnClean(finalContent,1))

                self.shevaUtils.createDir(self.rootDir, group, percentageItem)

                #create file names
                #fileNameAll = "%s_%s_1_%s" %(str(percentageItem),category,str(indeks))
                fileNameLevel = "%s_%s" %(str(percentageItem),category)
                fileNameSingleAll = "%s_%s" %(str(percentageItem),category)
    
                ##########   ORIGINAL DESCRIPTION AND VECTORIZATION  #################
                #create corpus models
                #self.shevaVect.createCorpusAndVectorModel(dataCategoryLevel,fileNameLevel,path)
                dataCategoryLevelAll.extend(dataCategoryLevel)
                #self.shevaVect.createCorpusAndVectorModel(dataCategoryLevelAll, fileNameAll,path)

                #single model for all documents
                dataCategorySingleAll.append([x for sublist in dataCategoryLevelAll for x in sublist])
                #print dataCategorySingleAll
                #print len(Counter(dataCategorySingleAll[0]))
                print len(dataCategorySingleAll), len(dataCategorySingleAll[0])
                self.shevaVect.createCorpusAndVectorModel(dataCategorySingleAll, fileNameSingleAll,path)
    
                ##########   ORIGINAL CATEGORIES ID   #################
                #self.shevaUtils.getCategoryListLevel(originalCatID,fileNameLevel,path)
                #originalCatIDAll.extend(originalCatID)
                #self.shevaUtils.getCategoryListLevel(originalCatIDAll,fileNameAll,path)
                
                #print out number of documents for (cat,level,model)
                print "Done with:\t",group,"\t",category,"\t","\t",percentageItem
# --- 示例 (example) #5: snippet separator left over from the original paste ---
class SimilarityLimit:   
    """
    Evaluate similarity-based classification against the pre-built limit
    models under LimitModels/: for every document limit and grouping type,
    load the similarity index, sample test documents from the DB, compute
    similarities, and store precision/recall/F1 into the
    analysis_results_limit table.
    """
    ##@profile w
    def __init__(self, testSize, category, type=1):
        """
        INPUT:
            type = full data (1) or training data (2)
            testSize = % of model size (nr of documents in model) to test with
            category = main category under evaluation
        """

        #percentage of data to be used for model build
        if type == 1:
            self.GROUPTYPE = ["CATID", "FATHERID", "GENERAL"]
            self.limitList = [1000, 2500, 5000, 7500, 10000, 20000]
        elif type == 2:
            # NOTE(review): "GENERAL" appears twice in this list, so that
            # grouping is processed twice per limit — confirm intent.
            self.GROUPTYPE = ["GENERAL", "FATHERID", "GENERAL"]
            self.limitList = [1000]
        else:
            sys.exit("Wrong 'type' parameter in createData.__init__")

        print "SimilarityLevel created"
        #Sheva Objects
        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaSimilarity =  ShevaSimilarity()
        self.shevaCSV =  ShevaCSV()
        self.shevaClassificationMetrics =  ShevaClassificationMetrics()
        self.shevaUtils =  ShevaUtils()
        
        #SimilarityLevel Variables        
        self.rootDir = "LimitModels/"
        self.testSize = testSize
        self.category = category
        # deepest category level for this category, used in the result rows
        self.maxDepth = self.shevaDB.getCategorymaxDepth(self.category)

    def __del__(self):
        # Destructor only logs; no resources are released here.
        print 'SimilarityLevel destroyed'                  
            
    #@profile
    def calculateLimitSimilarity(self):
        # For every (limit, group) pair: load the matching model artifacts,
        # score a DB sample against the index, and persist three metric
        # variants (classic / relative / exclusive) to the DB.

        for limit in self.limitList:            
            for group in self.GROUPTYPE:
                print "####################################################################"
                #print category, group, percentage, debth

                sim = []
                vec_bow = []
                allCategoryDataOID = []
                categoryDataOID = []
                categoryData = []
                print "created variables"
                 
                #path & csv file for this (group, limit) model
                path = "%s%s/%s/" %(self.rootDir,group,limit)
                fileName = "%s_%s" %(limit,self.category)
                IODfilePath = "%soriginalID/%s.csv" %(path,fileName)
                print "Setup paths"
                
                #get data from original ID csv; unique ID
                allCategoryDataOID = self.shevaCSV.getModelCSV(IODfilePath)
                #categoryDataOID = self.shevaCSV.getIDfromModel(IODfilePath)
                print "Got all modelRow->originalID mappings"
        
                #get sim index, model, dict (index files are cached on disk)
                indexDir = "%sindexFiles/" %(path)
                self.shevaUtils.createDirOne(indexDir)
                index, tfidfModel, dictionary, corpusSize = self.shevaSimilarity.getSimilarityIndex(path, fileName, group)
                #return sample from original data
                categoryDataOID, categoryData = self.shevaDB.getSample(limit,self.testSize,self.category,self.maxDepth, group)
        
                #calculate similarites: clean text -> BoW -> TF-IDF vectors
                cleanText = self.shevaTPF.returnClean(categoryData, 1)
                cleanTextBoW = [dictionary.doc2bow(cleanText[i]) for i in range(0, len(cleanText))]
                print "Done with bow representation"
                vec_bow = self.shevaSimilarity.convert2VSM(cleanTextBoW, tfidfModel)
                print len(vec_bow)
                
                # 0.1 is the similarity threshold passed to the scorer
                simCalculation = self.shevaSimilarity.calculateSimilarity(index, vec_bow, 0.1)
        
                #calcualte IR measures
                # NOTE(review): the three INSERTs below build SQL with %-
                # formatting from internal values; not safe for untrusted
                # input.
                cPrecision, cRecall, cF1 = self.shevaClassificationMetrics.computeClassificationMetrics(categoryDataOID, allCategoryDataOID, simCalculation)
                print "All data measures :\t\t\t\tPrecision:\t", cPrecision, "\t\tRecall\t", cRecall, "\t\tF1:\t", cF1
                sqlClassic = "INSERT INTO analysis_results_limit (category, groupingType, limitValue, levelDepth, testSize, measureType, P, R, F1) VALUES ('%s', '%s', '%s', '%i', '%i', '%s','%f','%f','%f')" % (self.category,group,limit,self.maxDepth,self.testSize, "computeClassificationMetrics",cPrecision, cRecall, cF1)
                self.shevaDB.dbQuery(sqlClassic)
        
                cPrecisionR, cRecallR, cF1R = self.shevaClassificationMetrics.computeClassificationMetricsRelative(categoryDataOID, allCategoryDataOID, simCalculation)
                print "Relative (with or) data measures :\t\tPrecision:\t", cPrecisionR, "\t\tRecall\t", cRecallR, "\t\tF1:\t", cF1R
                sqlRelative = "INSERT INTO analysis_results_limit (category, groupingType, limitValue, levelDepth, testSize,measureType, P, R, F1) VALUES ('%s', '%s', '%s', '%i', '%i', '%s','%f','%f','%f')" % (self.category,group,limit,self.maxDepth,self.testSize, "computeClassificationMetricsRelative",cPrecisionR, cRecallR, cF1R)
                self.shevaDB.dbQuery(sqlRelative)
                
                cPrecisionE, cRecallE, cF1E = self.shevaClassificationMetrics.computeClassificationMetricsExclusive(categoryDataOID, allCategoryDataOID, simCalculation)
                print "Exclusive (with and) data measures :\t\tPrecision:\t", cPrecisionE, "\t\tRecall\t", cRecallE, "\t\tF1:\t", cF1E
                sqlExclusive = "INSERT INTO analysis_results_limit (category, groupingType, limitValue, levelDepth,testSize, measureType, P, R, F1) VALUES ('%s', '%s', '%s', '%i', '%i', '%s','%f','%f','%f')" % (self.category,group,limit,self.maxDepth,self.testSize, "computeClassificationMetricsExclusive",cPrecisionE, cRecallE, cF1E)
                self.shevaDB.dbQuery(sqlExclusive)
        
                #trying to figure out the memory thing. needs speed-up in performance otherwise... 
                dbData = []
                simCalculation = []
                cleanText = []
                cleanTextBoW = []
                vec_bow = []
                
                # explicit teardown of the large model objects before the
                # next (limit, group) iteration
                del index
                del tfidfModel
                del dictionary
                del corpusSize
                del simCalculation
                del vec_bow
                del allCategoryDataOID
                del categoryDataOID
                del categoryData
                del dbData
                gc.collect()
class createDataSingleLevel:
    
    def __init__(self,type,rootDir):
        """
        type: 1 -> full data
              2 -> limited data
        """

        #percentage of data to be used for model build
        if type == 1:
            self.GROUPTYPE = ["CATID","FATHERID","GENERAL"]
            self.percentageList = [25, 50, 75, 100]
        elif type == 2:
            self.GROUPTYPE = ["GENERAL"]
            self.percentageList = [5]
        else:
            sys.exit("Wrong 'type' parameter in createData.__init__")
        
        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaUtils = ShevaUtils()
        self.shevaVect = ShevaVect()
        self.shevaCSV = ShevaCSV()
        
        if rootDir != "":
            self.shevaUtils.createDirOne(str(rootDir))
            self.rootDir = str(rootDir)
        else:
            sys.exit("Wrong 'rootDir' parameter in createData.__init__")
    
    def createData(self, category):
        """
        1. get root categories to be used and iterate through main categories
        3. get max depth for individual category
        4. from 1 to max till 1 to 1
            get all catid for iterated category
            get all pages for selected categories
            call createCorpusAndVectorModel fro selected documents
        """
        ranger = self.shevaDB.getCategoryDepth(category)
        
        for group in self.GROUPTYPE:
            sqlQueryResults = []
            #gruping dependent queries
            if group == "FATHERID":
                sqlCategory = "select Description, fatherid, categoryDepth from dmoz_combined where mainCategory = '%s'" %(category)
            else:
                sqlCategory = "select Description, catid, categoryDepth from dmoz_combined where mainCategory = '%s'" %(category)
            
            sqlQueryResults = self.shevaDB.dbQuery(sqlCategory)
            
            if sqlQueryResults == 0:
                sys.exit("SQL code error in level: \t", category,"\t",indeks,"\t",sqlCategoryLevel)            
            
            for percentageItem in self.percentageList:
                #data for % model, range data
                dataCategoryLevelAll = []
                dataCategoryLabelAll = []
                originalCatIDAll = []
                dataCategorySingleAll = [[]]
                
                path = "%s/%s/%s/" %(self.rootDir,group,percentageItem)
                self.shevaUtils.createDir(self.rootDir,group,percentageItem)
                
                #var = ""
                
                for indeks in ranger:
                    #var += "Level:\t%s\n" %(indeks)
                    """
                    #gruping dependent queries
                    if group == "FATHERID":
                        sqlCategoryLevel = "select Description, fatherid from dmoz_combined where mainCategory = '%s' and categoryDepth = '%s'" %(category,indeks)
                    else:
                        sqlCategoryLevel = "select Description, catid from dmoz_combined where mainCategory = '%s' and categoryDepth = '%s'" %(category,indeks)
                    
                    sqlQueryResultsLevel = self.shevaDB.dbQuery(sqlCategoryLevel)
                    """
                    
                    sqlQueryResultsLevel = [x for x in sqlQueryResults if x[2] == indeks]
                    
                    #level list variables
                    finalContent = []
                    dataCategoryLevel = []
                    dataCategoryLabel = []
                    originalCatID = []
                    #originalFatherID = []
                        
                    #get unique values
                    if group == "GENERAL":
                        #finalContent = []
                        percentageLevel = self.shevaUtils.setLimit(percentageItem,sqlQueryResultsLevel)
                        finalContent = [[item for item in row[0].split()] for row in sqlQueryResultsLevel[:percentageLevel]]
                        #var += "Original words:\t%s\n" %(finalContent)
                        originalCatID = [row[1] for row in sqlQueryResultsLevel[:percentageLevel]]
                        #var += "Original IDs:\t%s\n" %(originalCatID)
                        dataCategoryLevel.extend(self.shevaTPF.returnClean(finalContent,1))
                        #var += "Processed words:\t%s\n" %(dataCategoryLevel)
                    else:
                        unique = []
                        for row in sqlQueryResultsLevel:
                            if row[1] not in unique:
                                unique.append(row[1])

                        for uniq in unique:
                            #var += "ID:\t%s\n" %(uniq)
                            tempUnique = []
                            tempUnique = [row[0] for row in sqlQueryResultsLevel if row[1] == uniq]
                            percentageLevel = self.shevaUtils.setLimit(percentageItem,tempUnique)
                            tempUnique = [item.split() for item in tempUnique[:percentageLevel]]
                            mergedContent = [i for i in itertools.chain.from_iterable(tempUnique)]
                            #var += "Original words:\t%s\n" %(mergedContent)
                            finalContent.append(mergedContent)
                            originalCatID.append(uniq)
                        dataCategoryLevel.extend(self.shevaTPF.returnClean(finalContent,1))

                    self.shevaUtils.createDir(self.rootDir, group, percentageItem)

                    ##########            FILE NAMES             #################
                    fileNameAll = "%s_%s_1_%s" %(str(percentageItem),category,str(indeks))
                    fileNameLevel = "%s_%s_%s" %(str(percentageItem),category,str(indeks))
                    fileNameSingleAll = "%s_%s_%s" %(str(percentageItem),category,str(indeks))

                    ##########    PRINT OUT ORIGINAL AND PROCESSED DATA  #################
                    """
                    print originalCatID
                    print finalContent
                    print dataCategoryLevel
                    """
                    
                    ##########   ORIGINAL DESCRIPTION AND VECTORIZATION  #################
                    #self.shevaVect.createCorpusAndVectorModel(dataCategoryLevel,fileNameLevel,path)
                    dataCategoryLevelAll.extend(dataCategoryLevel)
                    #self.shevaVect.createCorpusAndVectorModel(dataCategoryLevelAll, fileNameAll,path)
                    
                    #single model for all documents
                    dataCategorySingleAll[0].extend([x for sublist in dataCategoryLevelAll for x in sublist])
                    self.shevaVect.createCorpusAndVectorModel(dataCategorySingleAll, fileNameSingleAll, path)
                    
                    ##########   ORIGINAL CATEGORIES ID   #################
                    #self.shevaUtils.getCategoryListLevel(originalCatID,fileNameLevel,path)
                    originalCatIDAll.extend(originalCatID)
                    #self.shevaCSV.getCategoryListLevel(originalCatIDAll,fileNameAll,path)
                    
                    #print out number of documents for (cat,level,model)
                    print "Done with:\t",group,"\t",category,"\t",indeks,"\t",percentageItem

                    #######################    GC    #################
                    del dataCategoryLevel
                    del originalCatID
                    gc.collect()
                    
                del dataCategoryLevelAll
                del dataCategoryLabelAll
                del originalCatIDAll
                del dataCategorySingleAll
            del sqlQueryResults
            gc.collect()