def __init__(self):
    """Initialise test-set containers, load training lookups, run the pipeline.

    Steps, in order:

    1. Build ``diction(146860)`` and keep the first 10000 entries of its
       ``_wordIdfList`` as the eligible vocabulary.
    2. Create the empty containers that the later steps fill in.
    3. Build ``diction(146860, 1)`` and copy its per-feature document maps.
    4. Call ``setDicts``, ``retrieveFeatures``, ``writejobTimetoCSV`` and
       ``calculateTfIdf``.
    """
    idf_source = diction(146860)

    # Only the first 10000 entries of the IDF list are considered.
    top_entries = idf_source._wordIdfList[:10000]
    self._prevIdfList = top_entries
    # Each entry is read as entry[1] -> key and entry[0] -> value
    # (presumably (idf, word) pairs -- confirm against diction).
    self._eligibleWordsList = [entry[1] for entry in top_entries]
    self._eligibleWords = {entry[1]: entry[0] for entry in top_entries}

    # Word-count containers, all starting empty.
    self._WordDict = {}
    self._testdictOfWordCount = {}
    self._dictOfWordCount = {}

    # Per-feature containers for the test set, all starting empty.
    self._testcategoryDict = {}  # e.g. {'teaching job':[1000,15000,...],...}
    self._testlocationDict = {}  # e.g. {'location':[1000,15000,...],...}
    self._testcompanyDict = {}  # e.g. {'company':[1000,5000,...],...}
    self._testsourceDict = {}  # e.g. {'source':[1000,5000,...],...}
    # e.g. {1:'[full, permanent]', 2:[None,None], 3:'[part, contact]'}
    # NOTE(review): the example above is dict-shaped but this starts as a
    # list -- confirm the intended container type.
    self._testjobTimeTermList = []
    self._testlocationDocs = {}
    self._testcompanyDocs = {}
    self._testcatDocs = {}

    # Parenthesised single-argument print emits the same text under the
    # Python 2 print statement.
    print('training features')
    training = diction(146860, 1)
    self._ssourceDocs = training._sourceDocs
    self._scatDocs = training._catDocs
    self._scompanyDocs = training._companyDocs
    self._slocDocs = training._locationDocs
    self._testsourceDocs = {}

    print('dicting...')
    # The calls below run after every attribute above has been set.
    self.setDicts()
    self.retrieveFeatures()
    self.writejobTimetoCSV()
    self.calculateTfIdf()
def __init__(self):
    """Load word statistics and per-feature lookups from two ``diction`` objects.

    ``diction(190)`` supplies the document length, word dictionaries, the
    TF-IDF dictionary and the IDF list. ``diction(190, 1)`` supplies the
    location, company, category and source lookups. ``timeTermDict``,
    ``locationSalary`` and ``companySalary`` are called part-way through,
    after the attributes assigned above each call have been set.
    """
    word_stats = diction(190)
    self._docLength = word_stats._documentLength
    self._wordDict = word_stats._universalWordDict
    self._dictOfWordCount = word_stats._dictOfWordCount

    # 1 => full time, -1 => part time
    self._timeDict = {1: [], -1: []}
    # 1 => permanent, -1 => part time
    # NOTE(review): "-1 => part time" duplicates the line above; presumably
    # this should describe the non-permanent term -- confirm.
    self._termDict = {1: [], -1: []}
    self.timeTermDict()

    self.tfIdfDict = word_stats.tfidfInDict()
    self.idfList = word_stats._wordIdfList

    feature_data = diction(190, 1)

    self._tempLocDict = feature_data._locationDict
    self._locDict = {}
    # each location -> the docs it appears in
    self._locDocs = feature_data._locationDocs

    self._tempCompanyDict = feature_data._companyDict
    self._companyDict = {}
    # each company -> the docs it appears in
    self._companyDocs = feature_data._companyDocs

    # e.g. _locDict = {'loc1':[1000, 5000,2000],...}
    self.locationSalary()
    # e.g. _companyDict = {'comp1':[1000, 5000,2000],...}
    self.companySalary()

    # e.g. {'cat1':[1,5,8,...],...}
    self._catDocs = feature_data._catDocs
    # e.g. similar to category
    self._sourceDocs = feature_data._sourceDocs