def report():
    """Render the job report page for the ``word`` query parameter.

    The search term is read from ``request.args``; when it is missing or
    empty the client is redirected to ``'/'``. Otherwise the term is
    lower-cased and looked up in ``db``. A falsy lookup result (no entry,
    or an empty stored value) triggers a fresh ``get_jobs(word)`` call
    whose result is stored back under ``db[word]``.

    Returns:
        The redirect response, or ``report.html`` rendered with the
        search term, the number of jobs and the jobs themselves.
    """
    query = request.args.get('word')
    if not query:
        # Nothing to search for: send the visitor back to the start page.
        return redirect('/')

    word = query.lower()
    jobs = db.get(word)
    if not jobs:
        # Cache miss (or falsy cached value): fetch and remember the result.
        jobs = get_jobs(word)
        db[word] = jobs

    return render_template(
        "report.html",
        search=word,
        resultNumber=len(jobs),
        jobs=jobs,
    )
# Script: collect the job listings and hand them to the file writer.
from indeed import get_jobs
from save import save_to_file

# get_jobs() is called without arguments; whatever it returns is passed
# unchanged to save_to_file().
indeed_jobs = get_jobs()
save_to_file(indeed_jobs)
'''
This file is used to build the initial json corpus.
'''
import io
import json
import sys

import indeed

# Search settings
KEYWORD_FILTER = [
    'IT',
    'Software',
    'engineer',
    'developer',
    'scientist',
    'computer',
    'researcher',
    'technician',
    'data',
    'specialist',
    'designer',
]
LOCATION_FILTER = ""

# Other settings
MAX_PAGES_COMPANIES = 1000
MAX_PAGES_REVIEWS = 100
# Stop querying further keywords once the value returned by
# indeed.get_jobs exceeds this threshold.
MAX_JOB_ID = 3000

# NOTE(review): `jobs` is never assigned again before being dumped, so
# indeed.get_jobs presumably fills it in place and returns the updated
# running id -- confirm against indeed.get_jobs.
jobs = {}
job_id = 0
for key in KEYWORD_FILTER:
    job_id = indeed.get_jobs(key, LOCATION_FILTER, MAX_PAGES_COMPANIES, job_id, jobs)
    if job_id > MAX_JOB_ID:
        break

# Python 2 only: force the implicit str/unicode conversions to use UTF-8.
# `reload` is not a builtin and `sys.setdefaultencoding` does not exist on
# Python 3, where text is already Unicode, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

jsontext = json.dumps(jobs, ensure_ascii=False, indent=4)
if isinstance(jsontext, bytes):
    # On Python 2 json.dumps may return a byte string; io.open in text mode
    # only accepts unicode, so decode it first.
    jsontext = jsontext.decode('utf-8')

# The context manager closes the file even if the write fails, and the
# explicit encoding keeps the output UTF-8 regardless of the platform locale.
with io.open("jobcorpusupdate.json", "w", encoding="utf-8") as f:
    f.write(jsontext)
def get_indeed(old):
    """Forward ``old`` to ``indeed.get_jobs`` and return its result unchanged."""
    result = indeed.get_jobs(old)
    return result
# Override httplib's HTTPSConnection with the HTTPSConnection name already
# in scope (defined outside this section).
httplib.HTTPSConnection = HTTPSConnection
# ssl_version corrections are done

"""
1) Scrap indeed.com for jobs listings
2) Fill mongo b with results and reviews from indeed
3) For each company found, scrap glassdoor for additional reviews
"""

# Search settings
KEYWORD_FILTER = "Data Scientist"
LOCATION_FILTER = "Boston, MA"
KWFLAGS = [
    "Hadoop",
    "years experience",
    "years' experience",
    "years of experience",
]

# Other settings
MAX_PAGES_COMPANIES = 500
MAX_PAGES_REVIEWS = 500

# DB settings
client = MongoClient()  # no arguments: the client's default connection settings
indeed_db = client.indeed  # the "indeed" database
indeed_jobs = indeed_db.jobs  # collection for job ads
indeed_reviews = indeed_db.reviews  # collection for company reviews

"""1) scrap indeed for jobs"""
jobs = indeed.get_jobs(
    KEYWORD_FILTER,
    LOCATION_FILTER,
    indeed_jobs,
    MAX_PAGES_COMPANIES,
    KWFLAGS,
)

"""2) Get companies reviews into mongodb"""
indeed.get_all_company_reviews(jobs, indeed_reviews, MAX_PAGES_REVIEWS)