def __init__(self,type,rootDir):
        """
        Configure the builder and prepare its output root directory.

        type:    1 -> full data
                 2 -> limited data
        rootDir: output root directory; the empty string is rejected

        Terminates through sys.exit when 'type' is neither 1 nor 2, or
        when 'rootDir' is the empty string.
        """

        # reject unknown modes before anything is stored on the instance
        if type not in (1, 2):
            sys.exit("Wrong 'type' parameter in createData.__init__")

        # both modes use the GENERAL grouping; only the amounts of data
        # used for the model builds differ
        self.GROUPTYPE = ["GENERAL"]
        if type == 1:
            self.percentageList = [1000, 2500, 5000, 7500, 10000, 20000]
        else:
            self.percentageList = [1000]

        # project helper objects (defined outside this block)
        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaUtils = ShevaUtils()
        self.shevaVect = ShevaVect()

        # the helpers above are created before rootDir is validated
        if rootDir == "":
            sys.exit("Wrong 'rootDir' parameter in createData.__init__")

        rootDirText = str(rootDir)
        self.shevaUtils.createDirOne(rootDirText)
        self.rootDir = rootDirText
class createDataSingleLimit:
    """
    Build one merged-document vector model per row limit for a main category.

    For every grouping in GROUPTYPE and every entry of percentageList (used
    here as a row count, not a percentage) the first rows returned for the
    category are tokenised on whitespace, cleaned with ShevaTPF.returnClean,
    flattened into one document and passed to
    ShevaVect.createCorpusAndVectorModel.
    """

    def __init__(self,type,rootDir):
        """
        type:    1 -> full data
                 2 -> limited data
        rootDir: output root directory; the empty string is rejected

        Terminates through sys.exit when 'type' is neither 1 nor 2, or
        when 'rootDir' is the empty string.
        """

        #row limits to be used for model build
        if type == 1:
            self.GROUPTYPE = ["GENERAL"]
            self.percentageList = [1000, 2500, 5000, 7500, 10000, 20000]
        elif type == 2:
            self.GROUPTYPE = ["GENERAL"]
            self.percentageList = [1000]
        else:
            sys.exit("Wrong 'type' parameter in createData.__init__")

        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaUtils = ShevaUtils()
        self.shevaVect = ShevaVect()

        if rootDir != "":
            self.shevaUtils.createDirOne(str(rootDir))
            self.rootDir = str(rootDir)
        else:
            sys.exit("Wrong 'rootDir' parameter in createData.__init__")

    def _mergeByIdentifier(self, rows):
        """
        Merge the tokenised descriptions of all rows sharing the same row[1].

        Returns one word list per distinct identifier, in the order in which
        the identifiers first appear in 'rows'. Single pass; identifiers
        must be hashable.
        """
        order = []
        grouped = {}
        for row in rows:
            identifier = row[1]
            if identifier not in grouped:
                grouped[identifier] = []
                order.append(identifier)
            grouped[identifier].extend(row[0].split())
        return [grouped[identifier] for identifier in order]

    def createData(self, category):
        """
        Create a single-document model of 'category' for every row limit.

        1. query descriptions (and catid / fatherid) of the main category,
           at most 20000 rows
        2. for every row limit in percentageList take the first rows
        3. tokenise, clean and flatten them into one document
        4. call createCorpusAndVectorModel with that one-document corpus,
           file name "<limit>_<category>" and path
           "<rootDir>/<group>/<limit>/"

        Terminates through sys.exit when dbQuery returns 0.
        """

        for group in self.GROUPTYPE:

            #grouping dependent queries
            idColumn = "fatherid" if group == "FATHERID" else "catid"
            # NOTE(review): 'category' is interpolated straight into the SQL
            # text; safe only for trusted category names -- switch to a
            # parameterised query if ShevaDB.dbQuery supports one.
            sqlCategory = "select Description, %s from dmoz_combined where mainCategory = '%s' limit 20000" %(idColumn, category)

            sqlQueryResults = self.shevaDB.dbQuery(sqlCategory)

            if sqlQueryResults == 0:
                # sys.exit accepts a single argument, so the details are
                # formatted into one message (only names defined here)
                sys.exit("SQL code error in level: \t%s\t%s" %(category, sqlCategory))

            for percentageItem in self.percentageList:
                limitedRows = sqlQueryResults[:percentageItem]

                path = "%s/%s/%s/" %(self.rootDir,group,percentageItem)
                self.shevaUtils.createDir(self.rootDir,group,percentageItem)

                #one document per row (GENERAL) or per distinct identifier
                if group == "GENERAL":
                    documents = [row[0].split() for row in limitedRows]
                else:
                    documents = self._mergeByIdentifier(limitedRows)
                cleanedDocuments = self.shevaTPF.returnClean(documents,1)

                #single model for all documents
                singleDocument = [word for document in cleanedDocuments for word in document]
                corpus = [singleDocument]
                fileNameSingleAll = "%s_%s" %(percentageItem,category)

                # single-argument print(...) gives the same output under the
                # Python 2 print statement and the Python 3 print function
                print("%s %s" %(len(corpus), len(singleDocument)))
                self.shevaVect.createCorpusAndVectorModel(corpus, fileNameSingleAll, path)

                #progress line; spacing mirrors the former print statement
                print("Done with:\t%s \t%s \t\t%s" %(group, category, percentageItem))
class createDataSingleLevel:
    """
    Build single-document vector models per category depth level.

    For every grouping in GROUPTYPE, every entry of percentageList and every
    level yielded by ShevaDB.getCategoryDepth, the rows of that level are
    tokenised, cleaned with ShevaTPF.returnClean and accumulated into one
    document that is passed to ShevaVect.createCorpusAndVectorModel.
    """

    def __init__(self,type,rootDir):
        """
        type:    1 -> full data
                 2 -> limited data
        rootDir: output root directory; the empty string is rejected

        Terminates through sys.exit when 'type' is neither 1 nor 2, or
        when 'rootDir' is the empty string.
        """

        #percentage of data to be used for model build
        if type == 1:
            self.GROUPTYPE = ["CATID","FATHERID","GENERAL"]
            self.percentageList = [25, 50, 75, 100]
        elif type == 2:
            self.GROUPTYPE = ["GENERAL"]
            self.percentageList = [5]
        else:
            sys.exit("Wrong 'type' parameter in createData.__init__")

        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaUtils = ShevaUtils()
        self.shevaVect = ShevaVect()
        self.shevaCSV = ShevaCSV()

        if rootDir != "":
            self.shevaUtils.createDirOne(str(rootDir))
            self.rootDir = str(rootDir)
        else:
            sys.exit("Wrong 'rootDir' parameter in createData.__init__")

    def _mergeByIdentifier(self, rows, percentageItem):
        """
        Merge the descriptions of all rows sharing the same row[1].

        Descriptions are grouped in one pass (identifiers must be hashable),
        keeping the order in which identifiers first appear. For each group
        ShevaUtils.setLimit(percentageItem, descriptions) gives the number of
        leading descriptions to keep; those are split on whitespace and
        concatenated into one word list. Returns the list of word lists.
        """
        order = []
        grouped = {}
        for row in rows:
            identifier = row[1]
            if identifier not in grouped:
                grouped[identifier] = []
                order.append(identifier)
            grouped[identifier].append(row[0])

        merged = []
        for identifier in order:
            descriptions = grouped[identifier]
            limit = self.shevaUtils.setLimit(percentageItem,descriptions)
            words = []
            for description in descriptions[:limit]:
                words.extend(description.split())
            merged.append(words)
        return merged

    def createData(self, category):
        """
        Create a single-document model of 'category' for every depth level.

        1. get the depth levels of the category (getCategoryDepth)
        2. per grouping, query description, catid / fatherid and
           categoryDepth of all rows of the main category
        3. per percentage and per level, select the rows of that level,
           limit them through setLimit, tokenise and clean them
        4. call createCorpusAndVectorModel with the accumulated one-document
           corpus, file name "<percentage>_<category>_<level>" and path
           "<rootDir>/<group>/<percentage>/"

        Terminates through sys.exit when dbQuery returns 0.
        """
        ranger = self.shevaDB.getCategoryDepth(category)

        for group in self.GROUPTYPE:
            #grouping dependent queries
            idColumn = "fatherid" if group == "FATHERID" else "catid"
            # NOTE(review): 'category' is interpolated straight into the SQL
            # text; safe only for trusted category names -- switch to a
            # parameterised query if ShevaDB.dbQuery supports one.
            sqlCategory = "select Description, %s, categoryDepth from dmoz_combined where mainCategory = '%s'" %(idColumn, category)

            sqlQueryResults = self.shevaDB.dbQuery(sqlCategory)

            if sqlQueryResults == 0:
                # sys.exit accepts a single argument, so the details are
                # formatted into one message (only names defined here)
                sys.exit("SQL code error in level: \t%s\t%s" %(category, sqlCategory))

            for percentageItem in self.percentageList:
                #documents of all levels processed so far
                dataCategoryLevelAll = []
                #one-document corpus, mutated in place level after level
                dataCategorySingleAll = [[]]

                path = "%s/%s/%s/" %(self.rootDir,group,percentageItem)
                self.shevaUtils.createDir(self.rootDir,group,percentageItem)

                for indeks in ranger:
                    #rows whose categoryDepth column equals the current level
                    sqlQueryResultsLevel = [row for row in sqlQueryResults if row[2] == indeks]

                    #one document per row (GENERAL) or per distinct identifier
                    if group == "GENERAL":
                        percentageLevel = self.shevaUtils.setLimit(percentageItem,sqlQueryResultsLevel)
                        finalContent = [row[0].split() for row in sqlQueryResultsLevel[:percentageLevel]]
                    else:
                        finalContent = self._mergeByIdentifier(sqlQueryResultsLevel, percentageItem)
                    dataCategoryLevel = self.shevaTPF.returnClean(finalContent,1)

                    fileNameSingleAll = "%s_%s_%s" %(percentageItem,category,indeks)

                    dataCategoryLevelAll.extend(dataCategoryLevel)

                    #single model for all documents
                    # NOTE(review): dataCategoryLevelAll is already cumulative,
                    # so extending with it on every level repeats the words of
                    # earlier levels; behaviour kept as is -- confirm whether
                    # this weighting is intended.
                    dataCategorySingleAll[0].extend([x for sublist in dataCategoryLevelAll for x in sublist])
                    self.shevaVect.createCorpusAndVectorModel(dataCategorySingleAll, fileNameSingleAll, path)

                    #progress line; single-argument print(...) gives the same
                    #output under Python 2 and Python 3, spacing mirrors the
                    #former print statement
                    print("Done with:\t%s \t%s \t%s \t%s" %(group, category, indeks, percentageItem))

                    #release per-level data before the next level is built
                    del dataCategoryLevel
                    del finalContent
                    del sqlQueryResultsLevel
                    gc.collect()

                del dataCategoryLevelAll
                del dataCategorySingleAll
            del sqlQueryResults
            gc.collect()