def testProcessPage():
    """Feed the 2014-07-11 Dice listing collection to DicePageGetter page by page.

    Pages of 100 entries are requested from ``jobaly_daily`` starting at
    page 1. Processing stops after the first page that reports fewer than
    100 entries, or once page 10000 has been handled.
    """
    db_client = DbClient('localhost', 27017, "jobaly_daily")
    list_coll = db_client.getCollection("daily_dice_list_2014-07-11")
    info_coll = db_client.getCollection("daily_dice_info_2014-07-11")
    page_getter = DicePageGetter(info_coll)

    page_size = 100
    last_page = 10000
    current_page = 1

    while current_page <= last_page:
        # No filter and no sort order: walk the collection as stored.
        page = db_client.getPage(list_coll, None, None, page_size, current_page)
        page_getter.processPage(page, current_page)
        current_page += 1
        # A short page means there is nothing left to fetch.
        if page.count(with_limit_and_skip=True) < page_size:
            break
def main():
    """Feed the ``job_lang_top_corps`` collection to IndeedPageGetter page by page.

    Pages of 10 entries are requested from the ``jobaly`` database, starting
    at page 149. Processing stops after the first page that reports fewer
    than 10 entries, or once page 10000 has been handled.
    """
    db_client = DbClient('localhost', 27017, "jobaly")
    source_coll = db_client.getCollection("job_lang_top_corps")
    info_coll = db_client.getCollection("jobinfo_lang_top_corps")
    page_getter = IndeedPageGetter(info_coll)

    page_size = 10
    first_page = 149
    last_page = 10000

    current_page = first_page
    while current_page <= last_page:
        # No filter and no sort order: walk the collection as stored.
        page = db_client.getPage(source_coll, None, None, page_size, current_page)
        page_getter.processPage(page, current_page)
        current_page += 1
        # A short page means there is nothing left to fetch.
        if page.count(with_limit_and_skip=True) < page_size:
            break
def testProcessPage():
    """Run DicePageGetter over every page of the 2014-07-11 Dice listing.

    Reads ``daily_dice_list_2014-07-11`` from the ``jobaly_daily`` database
    in pages of 100 and hands each page, with its page number, to
    ``DicePageGetter.processPage``. The walk ends at the first page holding
    fewer than 100 entries, and never goes past page 10000.
    """
    client = DbClient('localhost', 27017, "jobaly_daily")
    listing = client.getCollection("daily_dice_list_2014-07-11")
    details = client.getCollection("daily_dice_info_2014-07-11")
    getter = DicePageGetter(details)

    entries_per_page = 100
    max_page = 10000

    for page_no in range(1, max_page + 1):
        # Unfiltered, unsorted query for this page.
        page = client.getPage(listing, None, None, entries_per_page, page_no)
        getter.processPage(page, page_no)
        fetched = page.count(with_limit_and_skip=True)
        if fetched < entries_per_page:
            # Last (partial) page reached.
            break
class DataHandler:
    """Data-access helper for the job, job-model and resume collections.

    All database access goes through a ``DbClient``. The collections are
    bound either by ``setup_tfidfMatcher`` (names taken from ``gConfig``)
    or by ``connectJobColl`` (explicit database and collection name); one
    of them must be called before the query methods are used.
    """

    def __init__(self, dbclient=None):
        """Remember an optional, already-built ``DbClient``.

        Args:
            dbclient: client to reuse in ``setup_tfidfMatcher``; when
                ``None`` a client for the local ``jobaly`` database is
                created there instead.
        """
        self.dbclient = dbclient

    def setup_tfidfMatcher(self):
        """Bind the configured collections and build the TF-IDF matcher."""
        if self.dbclient is None:
            self.dbClient = DbClient('localhost', 27017, "jobaly")
        else:
            # Use the client injected through __init__ (the bare name
            # ``dbclient`` is not defined in this scope).
            self.dbClient = self.dbclient

        self.resumeCollection = self.dbClient.getCollection(gConfig["webResumeColName"])
        self.jobCollection = self.dbClient.getCollection(gConfig["webJobInfoCollName"])
        # NOTE(review): get_model/get_jobmodel_ids read ``self.modelCollection``,
        # which only connectJobColl sets -- confirm whether this attribute
        # is meant to be the same collection.
        self.jobModelCollection = self.dbClient.getCollection(gConfig["jobModelCollName"])
        self.matcher = TfIdfMatch(self.jobCollection)

    def save_resume(self, resume_text):
        """Insert ``resume_text`` as a new resume stamped with the current UTC time."""
        resume = {"content": resume_text,
                  "date": datetime.datetime.utcnow()}
        resume_id = self.resumeCollection.insert(resume)
        # Single-argument form prints the same text on Python 2 and 3.
        print("add resume id is: %s" % (resume_id,))

    def get_resumes(self):
        """Return the result of an unfiltered ``find()`` on the resume collection."""
        return self.resumeCollection.find()

    def get_resume(self, _id):
        """Return the resume whose ``_id`` is ``ObjectId(_id)``."""
        return self.resumeCollection.find_one({'_id': ObjectId(_id)})

    def get_jobs(self, page_no=1, page_size=20):
        """Return one unfiltered, unsorted page of the job collection."""
        return self.dbClient.getPage(self.jobCollection, None, None,
                                     page_size, page_no)

    def get_job(self, _id):
        """Return the job document with the given ``_id``, or ``None``."""
        return self.jobCollection.find_one({'_id': _id})

    def get_job_ids(self, ids):
        """Return a list of the job documents whose ``_id`` is in ``ids``."""
        return list(self.jobCollection.find({"_id": {"$in": ids}}))

    def get_jobmodel_ids(self, ids):
        """Return a list of the model documents whose ``_id`` is in ``ids``."""
        return list(self.modelCollection.find({"_id": {"$in": ids}}))

    def get_model(self, _id):
        """Return the model document with the given ``_id``, or ``None``."""
        return self.modelCollection.find_one({'_id': _id})

    def matchResume(self, resume):
        """Delegate to the matcher built by ``setup_tfidfMatcher``."""
        return self.matcher.matchResume(resume)

    def connectJobColl(self, dbName, collName):
        """Point the handler at ``collName`` (and ``collName + "_model"``) in ``dbName``.

        Also records the database/collection names and the collection size
        reported by the client.
        """
        self.dbname = dbName
        self.collname = collName
        self.dbClient = DbClient('localhost', 27017, dbName)
        self.jobCollection = self.dbClient.getCollection(collName)
        self.collSize = self.dbClient.getCollectionSize(collName)
        self.modelCollection = self.dbClient.getCollection(collName + "_model")

    def getJobsByPage(self, page_size, page_no):
        """Return one unfiltered, unsorted page of the job collection."""
        return self.dbClient.getPage(self.jobCollection, None, None,
                                     page_size, page_no)

    def searchjobs(self, query, qtype):
        """Search jobs by exact match on the field selected by ``qtype``.

        Args:
            query: value to match exactly.
            qtype: ``"jid"`` matches on ``_id``; ``"jobtitle"`` matches on
                ``jobtitle``. Any other value yields no matches instead of
                failing on an unbound local.

        Returns:
            A ``(result, pageno, resultnum)`` tuple: the list of matching
            documents, the page number (always 1) and the number of matches.
        """
        field_by_qtype = {"jid": "_id", "jobtitle": "jobtitle"}
        field = field_by_qtype.get(qtype)
        if field is None:
            result = []
        else:
            result = list(self.jobCollection.find({field: query}))
        pageno = 1
        return (result, pageno, len(result))