def main():
    """Fetch the Indeed detail page for every job stored in the
    job_lang_top_corps collection, writing the results into
    jobinfo_lang_top_corps.

    Resumes at page 149 — presumably the point where a previous run
    stopped (TODO confirm with run logs).
    """
    db_client = DbClient('localhost', 27017, "jobaly")
    job_coll = db_client.getCollection("job_lang_top_corps")
    info_coll = db_client.getCollection("jobinfo_lang_top_corps")
    getter = IndeedPageGetter(info_coll)

    page_size = 10
    page_no = 149       # resume point from an earlier partial run
    max_page = 10000
    while page_no <= max_page:
        page = db_client.getPage(job_coll, None, None, page_size, page_no)
        getter.processPage(page, page_no)
        page_no += 1
        # A short page means the collection is exhausted.
        if page.count(with_limit_and_skip=True) < page_size:
            break
def main():
    """Copy up to 1000 documents from one day's job-info collection in
    the jobaly_daily database into the job1000 collection of jobaly."""
    target_client = DbClient('localhost', 27017, "jobaly")
    src_client = DbClient('localhost', 27017, "jobaly_daily")

    src_coll = src_client.getCollection("daily_job_info_2014-06-16")
    target_coll = target_client.getCollection("job1000")

    copy_size = 1000
    copyCollection(src_coll, target_coll, copy_size)
def main(): cities = [ 'MoutainView, CA', 'Seattle, WA', 'San Diego, CA', 'San Francisco, CA', 'Austin, TX', 'San Jose, CA', 'Portland, OR', ' New York, NY', 'Houston, TX', 'Boston, MA', 'Davis, CA', 'Palo Alto, CA', ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA' ] cities = [ 'Austin, TX', 'San Jose, CA', 'Portland, OR', ' New York, NY', 'Houston, TX', 'Boston, MA', 'Davis, CA', 'Palo Alto, CA', ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA' ] _pageSize = 25 _fromage = 30 _location = 94040 _radius = 25 _query = "software engineer" collectionName = "job_se_10city" indeedClient = ApiClient(_query, _pageSize, _fromage, _location, _radius) # client.getPage(0) dbClient = DbClient('localhost', 27017, "jobaly") collection = dbClient.getCollection(collectionName) for city in cities: print "-----prcoss city %s -------" % city indeedClient.processCity(collection, city)
def getByCities(): cities = [ "Austin, TX", "San Jose, CA", "Portland, OR", " New York, NY", "Houston, TX", "Boston, MA", "Davis, CA", "Palo Alto, CA", " Irvine, CA", "Olathe, KS", "Columbia, MD", " Atlanta, GA", ] param = {"q": "software engineer", "fromage": "30"} collectionName = "job_se_10city" indeedClient = ApiClient(param) # client.getPage(0) dbClient = DbClient("localhost", 27017, "jobaly") collection = dbClient.getCollection(collectionName) for city in cities: print "-----prcoss city %s -------" % city indeedClient.processQuery(collection, "l", city)
def main():
    """Fetch Indeed job-detail pages concurrently (20 daemon threads) for
    every job listed in job_lang_top_corps, saving into
    jobinfo_lang_top_corps.

    NOTE(review): the original assigned the *_se_10city collection names
    and immediately overwrote them with the *_lang_top_corps names; the
    dead first assignments have been removed (behavior unchanged).
    """
    collectionName = "job_lang_top_corps"
    infoCollectionName = "jobinfo_lang_top_corps"
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)

    pageSize = 20
    pageNo = 1
    pageNum = 10000
    find_sort = None
    find_spec = None

    # Worker pool: each JobGetter consumes (page, pageNo) tuples.
    threadNum = 20
    queue = Queue.Queue()
    for _ in range(threadNum):
        worker = JobGetter(queue, infoCollection)
        worker.setDaemon(True)      # don't block interpreter exit
        worker.start()

    has_more = True
    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo)
        queue.put((page, pageNo))
        pageNo += 1
        # A short page signals the end of the collection.
        if page.count(with_limit_and_skip=True) < pageSize:
            has_more = False

    queue.join()    # wait until every queued page has been processed
def main():
    """Threaded Indeed detail-page fetcher: pages through the
    job_lang_top_corps collection and hands each page to a pool of 20
    JobGetter daemon threads that write into jobinfo_lang_top_corps.

    NOTE(review): the original first bound the *_se_10city collection
    names and then immediately rebound both variables; those dead
    assignments were dropped (behavior unchanged).
    """
    collectionName = "job_lang_top_corps"
    infoCollectionName = "jobinfo_lang_top_corps"
    dbClient = DbClient('localhost', 27017, "jobaly")
    collection = dbClient.getCollection(collectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)

    pageSize = 20
    pageNo = 1
    pageNum = 10000
    find_sort = None
    find_spec = None

    # Spin up the daemon worker pool feeding off a shared queue.
    threadNum = 20
    queue = Queue.Queue()
    for _ in range(threadNum):
        t = JobGetter(queue, infoCollection)
        t.setDaemon(True)
        t.start()

    has_more = True
    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo)
        queue.put((page, pageNo))
        pageNo += 1
        # Fewer documents than pageSize means this was the last page.
        if page.count(with_limit_and_skip=True) < pageSize:
            has_more = False

    queue.join()    # block until workers drain the queue
def main():
    """Sequentially fetch Indeed detail pages for jobs stored in
    job_lang_top_corps, starting from page 149 (presumably resuming a
    prior interrupted run — TODO confirm), saving into
    jobinfo_lang_top_corps.
    """
    client = DbClient('localhost', 27017, "jobaly")
    jobs = client.getCollection("job_lang_top_corps")
    details = client.getCollection("jobinfo_lang_top_corps")
    page_getter = IndeedPageGetter(details)

    size, current, last = 10, 149, 10000
    more = True
    while more and current <= last:
        batch = client.getPage(jobs, None, None, size, current)
        page_getter.processPage(batch, current)
        current += 1
        # Stop once a page comes back short of a full batch.
        more = batch.count(with_limit_and_skip=True) >= size
def main(): pageSize = 100 startPageNo = 13 endPageNo = 10000 dbClient = DbClient('localhost', 27017, "SimilarQuestion") collection = dbClient.getCollection("question_test") questionGetter = QuestionGetter(pageSize,"python") for pg in range(startPageNo, endPageNo): print "--get page at : %d -----" % pg items = questionGetter.getPage(pg) if items == "NO_ITEMS": break print "--page at : %d have %d questions--" % (pg, len(items)) questionGetter.savePage(collection,items) time.sleep(10)
def getByCities(): cities = [ 'Austin, TX', 'San Jose, CA', 'Portland, OR', ' New York, NY', 'Houston, TX', 'Boston, MA', 'Davis, CA', 'Palo Alto, CA', ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA' ] param = {"q": "software engineer", "fromage": "30"} collectionName = "job_se_10city" indeedClient = ApiClient(param) # client.getPage(0) dbClient = DbClient('localhost', 27017, "jobaly") collection = dbClient.getCollection(collectionName) for city in cities: print "-----prcoss city %s -------" % city indeedClient.processQuery(collection, "l", city)
def getByCorps(): print " --- get job by companies---" collectionName = "job_se_top_corps" param = {"q": "software engineer", "fromage": "30"} indeedClient = ApiClient(param) # client.getPage(0) dbClient = DbClient('localhost', 27017, "jobaly") collection = dbClient.getCollection(collectionName) corps = [] fileName = "topcorps.txt" with open(fileName, 'r') as the_file: for line in the_file: word = line.strip() if not len(word) == 0: corps.append(word) for corp in corps: q = indeedClient.buildQuery("software engineer", {"company": corp}) print "-----prcoss corp %s -------" % corp indeedClient.processQuery(collection, "q", q)
def getByCorps(): print " --- get job by companies---" collectionName = "job_se_top_corps" param = {"q": "software engineer", "fromage": "30"} indeedClient = ApiClient(param) # client.getPage(0) dbClient = DbClient("localhost", 27017, "jobaly") collection = dbClient.getCollection(collectionName) corps = [] fileName = "topcorps.txt" with open(fileName, "r") as the_file: for line in the_file: word = line.strip() if not len(word) == 0: corps.append(word) for corp in corps: q = indeedClient.buildQuery("software engineer", {"company": corp}) print "-----prcoss corp %s -------" % corp indeedClient.processQuery(collection, "q", q)
def getByLang(): print " --- get job by language and companies---" collectionName = "job_lang_top_corps" param = { "q" : "software engineer", "fromage" : "30" } lang_names = utils.loadArrayFromFile("pro_langs.txt") corps_names = utils.loadArrayFromFile("topcorps.txt") indeedClient= ApiClient( param ) # client.getPage(0) dbClient = DbClient('localhost', 27017, "jobaly") collection = dbClient.getCollection(collectionName) for corp in corps_names: for lang in lang_names: q = indeedClient.buildQuery(lang, {"company": corp }) print "-----prcoss corp %s with language %s -------" % (corp, lang) indeedClient.processQuery(collection, "q", q)
def main(): pageSize = 100 startPageNo = 1 endPageNo = 10000 dbClient = DbClient('localhost', 27017, "SimilarQuestion") collection = dbClient.getCollection("english_questions") questionGetter = QuestionGetter(pageSize,"") for pg in range(startPageNo, endPageNo): print "--- get page %d ---" %pg items = questionGetter.getPage(pg) # print items if ( items == "NO_MORE" ) : print "have no more questions, quit program !!" break print "--- page %d has %d questions ---" %(pg,len(items)) if ( items != "NO_ITEMS" ) : i = questionGetter.savePage(collection,items) print "--- page %d has save %d question " %(pg,i)
def main(): collectionName = "job_lang_top_corps" dbClient = DbClient('localhost', 27017, "jobaly") collection = dbClient.getCollection(collectionName) title_dict = {} for job in collection.find(): # print job["_id"], job["jobtitle"] title = job["jobtitle"] if title_dict.has_key(title): title_dict[title] += 1 else : title_dict[title] = 1 stat_file_name = "jobtitle_stat.txt" with open( stat_file_name , "w") as text_file: i = 0 for (key, value) in sorted(title_dict.iteritems(), key=operator.itemgetter(1), reverse = True): # print key, ":", value text_file.write("%s : %s \n" % (key.encode('utf8'),value)) i+=1 print i, " lines had been writen into file:", stat_file_name
def main(): cities = ['MoutainView, CA', 'Seattle, WA', 'San Diego, CA', 'San Francisco, CA', 'Austin, TX', 'San Jose, CA','Portland, OR',' New York, NY','Houston, TX','Boston, MA', 'Davis, CA', 'Palo Alto, CA', ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA' ] cities = [ 'Austin, TX', 'San Jose, CA','Portland, OR',' New York, NY','Houston, TX','Boston, MA', 'Davis, CA', 'Palo Alto, CA', ' Irvine, CA', 'Olathe, KS', 'Columbia, MD', ' Atlanta, GA' ] _pageSize = 25 _fromage = 30 _location = 94040 _radius = 25 _query = "software engineer" collectionName = "job_se_10city" indeedClient= ApiClient(_query, _pageSize, _fromage, _location, _radius ) # client.getPage(0) dbClient = DbClient('localhost', 27017, "jobaly") collection = dbClient.getCollection(collectionName) for city in cities: print "-----prcoss city %s -------" %city indeedClient.processCity(collection,city)
class DataProcessor: def __init__(self): self.dbClient = DbClient("localhost", 27017, "SimilarQuestion") @staticmethod def processQuestion(question): a = {} a["qid"] = question["_id"] a["title"] = question["title"] return a @staticmethod def processLinkedQuestion(question): a = {} a["qid"] = question["_id"] a["title"] = question["title"] a["linked"] = [] for item in question["items"]: b = {} b["qid"] = item["question_id"] b["title"] = item["title"] print b a["linked"].append(b) return a @staticmethod def processLinkedQuestion2(question): a = {} a["qid"] = question["_id"] a["linked"] = [] for item in question["items"]: a["linked"].append(item["question_id"]) return a @staticmethod def processRelatedQuestion(question): a = {} a["qid"] = question["_id"] a["title"] = question["title"] a["related"] = [] for item in question["items"]: b = {} b["qid"] = item["question_id"] b["title"] = item["title"] # print b a["related"].append(b) return a def dumpDataToFile(self, queFun, collection, find_spec, find_sort, fileName, pageNum): pageSize = 1000 pageNo = 1 has_more = True with open(fileName, "w") as the_file: # the_file.write('Hello\n') while has_more and pageNo <= pageNum: page = self.dbClient.getPage(collection, find_spec, find_sort, pageSize, pageNo) pageNo += 1 count = page.count(with_limit_and_skip=True) print "count=", count if count < pageSize: has_more = False for item in page: a = queFun(item) jstr = json.dumps(a) + "\n" the_file.write(jstr) print " page %d saved %d lines in file" % (pageNo - 1, count) def dumpPythonQuestions(self, pageNum): question_coll = self.dbClient.getCollection("question_test") fileName = "..\..\data\pyton_questions.txt" self.dumpDataToFile(DataProcessor.processQuestion, question_coll, fileName, pageNum) def dumpLinkedQuestions(self, pageNum): question_coll = self.dbClient.getCollection("question_link_python") fileName = "..\..\data\question_link_python.txt" find_spec = {"items": {"$exists": True}, "$where": "this.items.length > 5"} find_sort 
= {"items": {"$size": -1}} self.dumpDataToFile(DataProcessor.processLinkedQuestion, question_coll, find_spec, find_sort, fileName, pageNum) def dumpLinkedQuestions2(self, pageNum): question_coll = self.dbClient.getCollection("question_link_python") fileName = "..\..\data\python_linked.txt" find_spec = {"items": {"$exists": True}, "$where": "this.items.length > 1"} find_sort = {"items": {"$size": -1}} self.dumpDataToFile( DataProcessor.processLinkedQuestion2, question_coll, find_spec, find_sort, fileName, pageNum ) def dumpRelatedQuestions(self, pageNum): question_coll = self.dbClient.getCollection("related_python") fileName = "..\..\data\question_related_python.txt" find_spec = {"items": {"$exists": True}, "$where": "this.items.length > 5"} find_sort = None self.dumpDataToFile( DataProcessor.processRelatedQuestion, question_coll, find_spec, find_sort, fileName, pageNum )