class TestCreatedData:
    """Report on the dictionaries and corpora previously written under a root directory.

    For every grouping type, percentage and category depth level the saved
    dictionary (``.dict``) and corpus (``.mm``) files are loaded and their
    string representations are collected into a plain-text report.
    """

    # NOTE(review): the loop nesting below was reconstructed from a source whose
    # indentation had been lost; in particular the separator line is assumed to
    # be emitted once per depth level -- confirm against the original layout.

    def __init__(self,rootDir):
        """
        rootDir: directory name (relative to the parent of the working
                 directory) that holds the generated model files
        """
        self.root = rootDir
        self.GROUPTYPE = ["CATID","FATHERID","GENERAL"]
        self.percentageList = [25, 50, 75, 100]
        self.shevaDB = ShevaDB()

    def checkCreatedModels(self,category):
        """
        Load the level and range dictionary/corpus for every
        (group, percentage, depth level) of ``category`` and return one
        string describing all of them.

        category: main category name, used in the file names and to look up
                  the depth levels via ShevaDB.getCategoryDepth
        returns:  the report text ("" when there are no depth levels)
        """
        ranger = self.shevaDB.getCategoryDepth(category)
        report = []

        for group in self.GROUPTYPE:
            for percentageItem in self.percentageList:
                # every file of this (group, percentage) pair lives below this directory
                baseDir = "../%s/%s/%s" % (self.root, group, percentageItem)

                for indeks in ranger:
                    # "range" files carry the extra "_1_" marker, "level" files do not
                    fileNameAll = "%s_%s_1_%s" % (percentageItem, category, indeks)
                    fileNameLevel = "%s_%s_%s" % (percentageItem, category, indeks)

                    # presumably gensim's ``corpora`` module -- it is imported outside this block
                    dictionaryLevel = corpora.Dictionary.load("%s/dict/%s.dict" % (baseDir, fileNameLevel))
                    dictionaryRange = corpora.Dictionary.load("%s/dict/%s.dict" % (baseDir, fileNameAll))
                    report.append("Level dict %s: %s\n" % (fileNameLevel, dictionaryLevel))
                    report.append("Range dict %s: %s\n" % (fileNameAll, dictionaryRange))

                    corpusLevel = corpora.MmCorpus("%s/corpus/%s.mm" % (baseDir, fileNameLevel))
                    corpusRange = corpora.MmCorpus("%s/corpus/%s.mm" % (baseDir, fileNameAll))
                    report.append("Level corpus %s: %s\n" % (fileNameLevel, corpusLevel))
                    report.append("Range corpus %s: %s\n" % (fileNameAll, corpusRange))

                    report.append("###################################################################\n")

        return "".join(report)

    def runCheck(self):
        """
        Run checkCreatedModels for every main category returned by
        ShevaDB.getMainCat and write the combined report to
        'analizaDoc2Dict.txt' in the current working directory.
        """
        inputs = self.shevaDB.getMainCat()
        # build the whole report first so a failing load never leaves a partial file
        report = "".join([self.checkCreatedModels(index) for index in inputs])

        # the context manager closes the file even when write() raises
        with open('analizaDoc2Dict.txt', 'w') as f:
            f.write(report)
class createDataSingleLevel:
    """Build, per category depth level, one vector model over a single merged document.

    For every grouping type and percentage the descriptions of a main
    category are read from the ``dmoz_combined`` table, cleaned level by
    level and handed to ShevaVect.createCorpusAndVectorModel as a corpus
    that contains exactly one document.
    """

    # NOTE(review): the loop nesting in createData was reconstructed from a
    # source whose indentation had been lost -- confirm against the original.

    def __init__(self,type,rootDir):
        """
        type:
            1 -> full data
            2 -> limited data
        rootDir: non-empty name of the output root directory; it is created
                 through ShevaUtils.createDirOne

        Exits the interpreter (SystemExit) on any other ``type`` value or on
        an empty ``rootDir``.
        """
        #percentage of data to be used for model build
        if type == 1:
            self.GROUPTYPE = ["CATID","FATHERID","GENERAL"]
            self.percentageList = [25, 50, 75, 100]
        elif type == 2:
            self.GROUPTYPE = ["GENERAL"]
            self.percentageList = [5]
        else:
            sys.exit("Wrong 'type' parameter in createData.__init__")

        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaUtils = ShevaUtils()
        self.shevaVect = ShevaVect()
        self.shevaCSV = ShevaCSV()

        if rootDir != "":
            self.shevaUtils.createDirOne(str(rootDir))
            self.rootDir = str(rootDir)
        else:
            sys.exit("Wrong 'rootDir' parameter in createData.__init__")

    def _cleanLevelDocuments(self, group, percentageItem, levelRows):
        """
        Turn the rows of one depth level into cleaned token lists.

        group:          grouping type; "GENERAL" keeps one document per row,
                        any other value merges the rows that share an id
        percentageItem: percentage passed on to ShevaUtils.setLimit
        levelRows:      list of (description, id, depth) rows of one level
        returns:        list with the items produced by ShevaTPF.returnClean
        """
        if group == "GENERAL":
            percentageLevel = self.shevaUtils.setLimit(percentageItem, levelRows)
            documents = [row[0].split() for row in levelRows[:percentageLevel]]
        else:
            # one pass, first-seen id order: avoids rescanning the rows per id
            descriptionsById = {}
            idOrder = []
            for row in levelRows:
                if row[1] not in descriptionsById:
                    descriptionsById[row[1]] = []
                    idOrder.append(row[1])
                descriptionsById[row[1]].append(row[0])

            documents = []
            for uniq in idOrder:
                descriptions = descriptionsById[uniq]
                percentageLevel = self.shevaUtils.setLimit(percentageItem, descriptions)
                # all kept descriptions of one id become a single document
                tokenLists = [item.split() for item in descriptions[:percentageLevel]]
                documents.append(list(itertools.chain.from_iterable(tokenLists)))

        return list(self.shevaTPF.returnClean(documents, 1))

    def createData(self, category):
        """
        1. get the depth levels of the category
        2. for every grouping type query all descriptions of the category
        3. for every percentage and every depth level:
            select the rows of that level
            clean them (see _cleanLevelDocuments)
            call createCorpusAndVectorModel for the merged single document

        category: main category name, interpolated into the SQL query and
                  into the output file names

        Exits the interpreter (SystemExit) when ShevaDB.dbQuery returns 0.
        """
        ranger = self.shevaDB.getCategoryDepth(category)

        for group in self.GROUPTYPE:
            #grouping dependent queries
            if group == "FATHERID":
                idColumn = "fatherid"
            else:
                idColumn = "catid"
            # NOTE(review): the query is built by string interpolation; if
            # `category` can come from untrusted input this needs escaping or
            # a parameterized dbQuery -- confirm what ShevaDB supports.
            sqlCategory = "select Description, %s, categoryDepth from dmoz_combined where mainCategory = '%s'" % (idColumn, category)

            sqlQueryResults = self.shevaDB.dbQuery(sqlCategory)
            if sqlQueryResults == 0:
                # sys.exit accepts a single argument, so format one message
                sys.exit("SQL code error in level: \t%s\t%s" % (category, sqlCategory))

            for percentageItem in self.percentageList:
                #data for % model, range data
                dataCategoryLevelAll = []
                dataCategorySingleAll = [[]]

                path = "%s/%s/%s/" % (self.rootDir, group, percentageItem)
                # created once per (group, percentage); the arguments do not
                # depend on the depth level
                self.shevaUtils.createDir(self.rootDir, group, percentageItem)

                for indeks in ranger:
                    levelRows = [row for row in sqlQueryResults if row[2] == indeks]
                    dataCategoryLevel = self._cleanLevelDocuments(group, percentageItem, levelRows)

                    fileNameSingleAll = "%s_%s_%s" % (percentageItem, category, indeks)

                    dataCategoryLevelAll.extend(dataCategoryLevel)

                    #single model for all documents
                    # NOTE(review): dataCategoryLevelAll is already cumulative,
                    # so every level appends the tokens of all earlier levels
                    # again. Behaviour kept as is -- confirm whether only the
                    # current level's tokens were meant to be appended.
                    dataCategorySingleAll[0].extend(itertools.chain.from_iterable(dataCategoryLevelAll))
                    self.shevaVect.createCorpusAndVectorModel(dataCategorySingleAll, fileNameSingleAll, path)

                    #progress line; single-argument call works on Python 2 and 3
                    print("Done with:\t%s \t%s \t%s \t%s" % (group, category, indeks, percentageItem))

                    ####################### GC    #################
                    del dataCategoryLevel
                    gc.collect()

                del dataCategoryLevelAll
                del dataCategorySingleAll

            # release the query result before the next group's query runs
            del sqlQueryResults
            gc.collect()