def main():
    """Page through the job_lang_top_corps collection and feed each page to
    an IndeedPageGetter that stores job details in jobinfo_lang_top_corps.

    Stops when a short page (fewer docs than page_size) signals the end of
    the collection, or after max_pages pages.
    """
    db_client = DbClient('localhost', 27017, "jobaly")
    collection = db_client.getCollection("job_lang_top_corps")
    info_collection = db_client.getCollection("jobinfo_lang_top_corps")
    getter = IndeedPageGetter(info_collection)

    page_size = 10
    max_pages = 10000
    find_spec = None
    find_sort = None

    # NOTE(review): starts at page 149 — presumably resuming a previous
    # partially-completed run; confirm before re-running from scratch.
    page_no = 149
    while page_no <= max_pages:
        page = db_client.getPage(collection, find_spec, find_sort, page_size, page_no)
        getter.processPage(page, page_no)
        fetched = page.count(with_limit_and_skip=True)
        page_no += 1
        if fetched < page_size:
            # Short page: no more data behind it.
            break
def main():
    """Walk the jobinfo_se_top_corps collection page by page and hand each
    page to processPage, stopping at the first short page (or after
    max_pages pages).
    """
    db_client = DbClient('localhost', 27017, "jobaly")
    collection = db_client.getCollection("jobinfo_se_top_corps")

    page_size = 100
    max_pages = 10000
    find_spec = None
    find_sort = None

    page_no = 1
    while page_no <= max_pages:
        page = db_client.getPage(collection, find_spec, find_sort, page_size, page_no)
        processPage(collection, page, page_no)
        fetched = page.count(with_limit_and_skip=True)
        page_no += 1
        if fetched < page_size:
            # Fewer docs than a full page: collection exhausted.
            break
def main():
    """Fan job pages out to a pool of JobGetter worker threads.

    Pages through job_lang_top_corps, queues (page, pageNo) tuples for 20
    daemon worker threads that write into jobinfo_lang_top_corps, then
    blocks on queue.join() until every queued page has been processed.
    """
    # Fix: removed dead assignments — the original set the *_se_10city
    # collection names and immediately overwrote them with these values,
    # so only the *_lang_top_corps names were ever used.
    collectionName = "job_lang_top_corps"
    infoCollectionName = "jobinfo_lang_top_corps"
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)

    pageSize = 20
    pageNo = 1
    has_more = True
    pageNum = 10000
    find_sort = None
    find_spec = None

    # Start the worker pool; daemon threads exit with the main thread.
    threadNum = 20
    queue = Queue.Queue()
    for _ in range(threadNum):
        t = JobGetter(queue, infoCollection)
        t.setDaemon(True)
        t.start()

    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo)
        queue.put((page, pageNo))
        pageNo += 1
        count = page.count(with_limit_and_skip=True)
        if count < pageSize:
            # A short page means the collection is exhausted.
            has_more = False
    queue.join()  # wait for workers to drain the queue
def main():
    """Threaded variant: distribute job pages to JobGetter workers.

    Pages through job_lang_top_corps, enqueues (page, pageNo) tuples for a
    pool of 20 daemon threads writing into jobinfo_lang_top_corps, and
    waits on queue.join() for all queued work to finish.
    """
    # Fix: dropped the dead job_se_10city/jobinfo_se_10city assignments
    # that the original immediately overwrote with the values below.
    collectionName = "job_lang_top_corps"
    infoCollectionName = "jobinfo_lang_top_corps"
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)

    pageSize = 20
    pageNo = 1
    has_more = True
    pageNum = 10000
    find_sort = None
    find_spec = None

    # Spin up the worker pool; daemon threads die with the main thread.
    threadNum = 20
    queue = Queue.Queue()
    for _ in range(threadNum):
        t = JobGetter(queue, infoCollection)
        t.setDaemon(True)
        t.start()

    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo)
        queue.put((page, pageNo))
        pageNo += 1
        count = page.count(with_limit_and_skip=True)
        if count < pageSize:
            # Short page: nothing left to fetch after this one.
            has_more = False
    queue.join()  # block until workers have processed every page
class DataProcessor:
    """Extracts compact question records from MongoDB collections and dumps
    them to newline-delimited JSON files.

    The process* static methods each map one raw question document to a
    small plain dict; dumpDataToFile drives the paging and file output.
    """

    def __init__(self):
        self.dbClient = DbClient("localhost", 27017, "SimilarQuestion")

    @staticmethod
    def processQuestion(question):
        """Return {qid, title} for a raw question document."""
        return {"qid": question["_id"], "title": question["title"]}

    @staticmethod
    def processLinkedQuestion(question):
        """Return {qid, title, linked: [{qid, title}, ...]} built from
        question["items"]; prints each linked entry as a debug trace."""
        a = {"qid": question["_id"], "title": question["title"], "linked": []}
        for item in question["items"]:
            b = {"qid": item["question_id"], "title": item["title"]}
            print(b)  # debug trace, kept from the original
            a["linked"].append(b)
        return a

    @staticmethod
    def processLinkedQuestion2(question):
        """Return {qid, linked: [question_id, ...]} — linked ids only."""
        return {
            "qid": question["_id"],
            "linked": [item["question_id"] for item in question["items"]],
        }

    @staticmethod
    def processRelatedQuestion(question):
        """Return {qid, title, related: [{qid, title}, ...]} built from
        question["items"]."""
        a = {"qid": question["_id"], "title": question["title"], "related": []}
        for item in question["items"]:
            a["related"].append({"qid": item["question_id"], "title": item["title"]})
        return a

    def dumpDataToFile(self, queFun, collection, find_spec, find_sort, fileName, pageNum):
        """Page through `collection` 1000 docs at a time, map each document
        through `queFun`, and write one JSON object per line to `fileName`.

        Stops after `pageNum` pages or at the first short page.
        """
        pageSize = 1000
        pageNo = 1
        has_more = True
        with open(fileName, "w") as the_file:
            while has_more and pageNo <= pageNum:
                page = self.dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo)
                pageNo += 1
                count = page.count(with_limit_and_skip=True)
                # Bracketed print form works identically on Python 2 and 3.
                print("count= %s" % count)
                if count < pageSize:
                    has_more = False
                for item in page:
                    jstr = json.dumps(queFun(item)) + "\n"
                    the_file.write(jstr)
                print(" page %d saved %d lines in file" % (pageNo - 1, count))

    def dumpPythonQuestions(self, pageNum):
        question_coll = self.dbClient.getCollection("question_test")
        # Fix: filename typo "pyton" -> "python".
        fileName = "..\..\data\python_questions.txt"
        # Fix: the original omitted find_spec/find_sort, which shifted
        # fileName/pageNum into the wrong parameter slots of dumpDataToFile.
        self.dumpDataToFile(DataProcessor.processQuestion, question_coll,
                            None, None, fileName, pageNum)

    def dumpLinkedQuestions(self, pageNum):
        question_coll = self.dbClient.getCollection("question_link_python")
        fileName = "..\..\data\question_link_python.txt"
        # Only questions with more than 5 linked items, largest first.
        find_spec = {"items": {"$exists": True}, "$where": "this.items.length > 5"}
        find_sort = {"items": {"$size": -1}}
        self.dumpDataToFile(DataProcessor.processLinkedQuestion, question_coll,
                            find_spec, find_sort, fileName, pageNum)

    def dumpLinkedQuestions2(self, pageNum):
        question_coll = self.dbClient.getCollection("question_link_python")
        fileName = "..\..\data\python_linked.txt"
        # Only questions with more than 1 linked item, largest first.
        find_spec = {"items": {"$exists": True}, "$where": "this.items.length > 1"}
        find_sort = {"items": {"$size": -1}}
        self.dumpDataToFile(DataProcessor.processLinkedQuestion2, question_coll,
                            find_spec, find_sort, fileName, pageNum)

    def dumpRelatedQuestions(self, pageNum):
        question_coll = self.dbClient.getCollection("related_python")
        fileName = "..\..\data\question_related_python.txt"
        # Only questions with more than 5 related items; no sort.
        find_spec = {"items": {"$exists": True}, "$where": "this.items.length > 5"}
        find_sort = None
        self.dumpDataToFile(DataProcessor.processRelatedQuestion, question_coll,
                            find_spec, find_sort, fileName, pageNum)