def get_resource(self, url_):
    # rotate the User-Agent header on every request to reduce the chance of a block
    user_agent = self.user_agents_cycle.next()
    try:
        resp = requests.get(url_, headers={"User-Agent": user_agent})
    except Exception:
        print "sleeping for 300 secs due to a block.."
        slp(300)
        user_agent = self.user_agents_cycle.next()
        resp = requests.get(url_, headers={"User-Agent": user_agent})
    if resp.status_code == 200:
        data = pq_(resp.text)
        data = data(".report2ed")
        if not data:
            # empty page: retry once with a fresh user agent
            user_agent = self.user_agents_cycle.next()
            resp = requests.get(url_, headers={"User-Agent": user_agent})
            if resp.status_code == 200:
                data = pq_(resp.text)
                data = data(".report2ed")
                return data
            else:
                return []
        else:
            return data
    else:
        return []

def get_resource(self, url_):
    user_agent = self.user_agents_cycle.next()
    try:
        resp = requests.get(url_, headers={'User-Agent': user_agent})
    except Exception:
        print 'sleeping for 100 secs due to a block..'
        slp(100)
        user_agent = self.user_agents_cycle.next()
        resp = requests.get(url_, headers={'User-Agent': user_agent})
    if resp.status_code == 200:
        data = pq_(resp.text)
        data = data('#resume_body').children()
        if not data:
            # empty page: retry once with a fresh user agent
            user_agent = self.user_agents_cycle.next()
            resp = requests.get(url_, headers={'User-Agent': user_agent})
            if resp.status_code == 200:
                data = pq_(resp.text)
                data = data('#resume_body').children()
                return data
            else:
                return []
        else:
            return data
    else:
        return []

def get_resource(self, url_):
    user_agent = self.user_agents_cycle.next()
    #--TODO: add a more solid step here; if the retry inside the except block also hits
    #--a max-retries error, the script still breaks (see the sketch below this function)
    try:
        resp = requests.get(url_, headers={'User-Agent': user_agent})
    except Exception:
        print 'sleeping for 300 secs due to a block..'
        slp(300)
        user_agent = self.user_agents_cycle.next()
        resp = requests.get(url_, headers={'User-Agent': user_agent})
    if resp.status_code == 200:
        data = pq_(resp.text)
        data = data('#results').children()
        if not data:
            user_agent = self.user_agents_cycle.next()
            resp = requests.get(url_, headers={'User-Agent': user_agent})
            if resp.status_code == 200:
                data = pq_(resp.text)
                data = data('#results').children()
                return data
            else:
                return []
        else:
            return data
    else:
        return []

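#-- A minimal sketch (not part of the original script) of the "more solid step" the TODO
#-- above asks for: bounded retries with a growing back-off instead of a single blind
#-- retry inside the except block. `fetch_with_retries`, `max_attempts` and `base_delay`
#-- are hypothetical names, not existing helpers in this codebase.
import requests
from time import sleep


def fetch_with_retries(url_, user_agents_cycle, max_attempts=5, base_delay=30):
    """Try a GET up to max_attempts times, rotating the User-Agent and backing off."""
    for attempt in range(1, max_attempts + 1):
        user_agent = user_agents_cycle.next()
        try:
            return requests.get(url_, headers={'User-Agent': user_agent})
        except requests.exceptions.RequestException, e:
            print 'attempt %d/%d failed (%s), sleeping %d secs..' % (
                attempt, max_attempts, str(e), base_delay * attempt)
            sleep(base_delay * attempt)
    return None  # caller treats None like a blocked/empty response

#-- example (hypothetical) usage:
#--   agents = itertools.cycle(['agent-a', 'agent-b'])
#--   resp = fetch_with_retries('http://www.indeed.com/resumes?q=python&co=US', agents)
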
def resource_collection(self, keyword_index, keyword, sort, rest_kewords=False):
    start_time = tm()
    n_all = 0
    n_profiles = {}
    keyword = keyword.replace('/', ' ').strip('\n')
    init_url = self.init_url % (keyword.replace(' ', '+'), 0, 50)
    filtering_urls, result_count = self.get_filter_urls(init_url, 0)
    # paginate in blocks of 100, capped at 10 pages (1000 results)
    if result_count >= 1000:
        counter = 10
    else:
        counter = int(max(float(result_count) / 100, 1))
    for route in filtering_urls:
        url_ = self.url_ % pq_(route).children('a').attr('href')
        for i in range(counter):
            if i == 0:
                beg = i
                end = i + 100
            else:
                beg = end
                end = end + 100
            postfix = '&start=%d&limit=%d&radius=100&%s&co=%s' % (beg, end, sort, self.country_code)
            print url_ + postfix
            data = self.get_resource(url_ + postfix, 0)
            for each in data:
                item = pq_(each)
                unique_id = item.attr('id')
                city_ = item('.location').text()
                n_profiles[unique_id] = city_
                profile_data = indeed_resumes_details(unique_id).resource_collection()
                self.save_to_disk(profile_data, unique_id)
                n_all += 1
        # the db can be locked by another writer, so keep retrying the insert
        db_success = False
        while not db_success:
            try:
                db_insert_hash(n_profiles, self.country_code)
                db_success = True
            except Exception:
                print 'db locked..will wait for 5 secs and try again..'
                slp(5)
        print 'inserted %d records to db.. %s, %d' % (len(n_profiles), keyword, keyword_index)
        n_profiles = {}
        slp(2)  #--sleep 2 secs after every filter so we don't make calls too fast and get blocked quickly
        gc.collect()
        gc.collect()
    current_time = tm()
    self.time_all.append((keyword, n_all, current_time - start_time))
    print 'current time passed..%d secs for one round of %s (%d)' % (int(current_time - start_time), keyword, keyword_index)
    return

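#-- A minimal sketch (assumed helper, not in the original code) that pulls the
#-- db-locked retry loop above into one place so every writer can share it.
#-- `insert_fn` stands for any callable like db_insert_hash; `wait` and `max_wait`
#-- bound how long we spin on a locked database instead of looping forever.
from time import sleep, time


def insert_when_unlocked(insert_fn, records, country_code, wait=5, max_wait=300):
    """Keep retrying insert_fn until it succeeds or max_wait seconds have passed."""
    deadline = time() + max_wait
    while True:
        try:
            insert_fn(records, country_code)
            return True
        except Exception, e:
            if time() >= deadline:
                print 'giving up on db insert after %d secs: %s' % (max_wait, str(e))
                return False
            print 'db locked..will wait for %d secs and try again..' % wait
            sleep(wait)
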
def resource_collection(self, keyword):
    t1 = tm()
    n_profiles = 0
    keyword = keyword.replace('/', ' ')
    file_ = open(os.path.join(os.path.dirname(self.directory), keyword + '.json'), 'a+')
    #--let's aim for collecting 1000+ profiles per skill/keyword (15 pages of 100 results)
    for i in range(15):
        if i == 0:
            beg = i
            end = i + 100
        else:
            beg = end
            end = end + 100
        url_ = browse_url_profiles % (keyword, beg, end)
        data = self.get_resource(url_)
        for each in data:
            item = pq_(each)
            unique_id = item.attr('id')
            item_data = self.get_info(item('.sre-content'))
            item_data.append({'type': 'resource_id', 'data': unique_id})
            if unique_id not in self.n_distinct:
                self.n_distinct[unique_id] = 1
                file_.write(json.dumps(item_data) + '\n')
                n_profiles += 1
                # if n_profiles % 500 == 0:
                #     print "%d profiles collected for %s - %s" % (n_profiles, self.area, keyword)
    file_.close()
    t2 = tm()
    print "done collecting %d records for (%s - %s) ..in %d seconds.." % (n_profiles, self.area, keyword, int(t2 - t1))
    print "TOTAL DISTINCT: %d " % len(self.n_distinct)
    print "\n"
    self.n_done.append(self.area)
    return

def extract_details(self, data):
    t1 = tm()
    details = {}
    if not data:
        return details
    details["name"] = data("#basic_info_row #basic_info_cell #resume-contact").text().strip("\n")
    details["title"] = data("#basic_info_row #basic_info_cell #headline").text().strip("\n")
    details["address"] = (
        data("#basic_info_row #basic_info_cell #contact_info_container .adr #headline_location").text().strip("\n")
    )
    details["skills"] = (
        data(".skills-content #skills-items .data_display .skill-container").text().strip("\n").split(",")
    )
    details["additional_info"] = (
        data(".additionalInfo-content #additionalinfo-items .data_display")
        .text()
        .strip("\n")
        .encode("ascii", "ignore")
    )
    identities = {}
    for k, v in self.profile_identities.iteritems():
        identities[k] = {"data": []}
        for item in data(v["content"]).children():
            data_ = {}
            it = pq_(item)
            if it.attr("id").startswith(k):
                it_id = it.attr("id")
                item = data(v["item_w_id"] % it_id)
                children = pq_(item.children())
                for each, splits in v["items"]:
                    if splits:
                        item_construct = children(each).text().strip("\n").split("-")
                        for sub, index in splits.iteritems():
                            data_[sub] = item_construct[index].strip("\n")
                    else:
                        data_[each] = children(each).text().encode("ascii", "ignore").strip("\n")
                identities[k]["data"].append(data_)
        details[k] = identities[k]
    t2 = tm()
    details["time_taken"] = t2 - t1
    details["timestamp"] = tm()
    return details

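#-- Illustrative only: the shape of a self.profile_identities entry that extract_details
#-- above expects. The actual selectors live elsewhere in the codebase; the key and the
#-- values shown here ('workExperience', '.work_dates', etc.) are assumptions, not the
#-- real config. Each entry names a container, an id-based item template, and a list of
#-- (selector, splits) pairs, where splits maps output field names to '-'-split indices.
profile_identities_example = {
    "workExperience": {
        "content": ".work-experience-content #work-experience-items",  # container whose children carry ids like 'workExperience-...'
        "item_w_id": "#%s",                                             # template filled with the child's id to re-select it
        "items": [
            (".work_dates", {"from": 0, "to": 1}),  # split "2010 - 2014" on '-' into two fields
            (".work_title", None),                  # no split: the selector itself becomes the key
        ],
    },
}
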
def get_info(self, item):
    item_data = []
    for i in item.children():
        entry = pq_(i)
        class_name = entry.attr('class')
        if class_name == 'app_name':
            # the 'app_name' cell holds "name - location"; split it into two entries
            nameloc = entry.text().split('-')
            name = nameloc[0]
            location = nameloc[-1]
            item_data.append({'type': 'name', 'data': name})
            item_data.append({'type': 'location', 'data': location})
        else:
            item_data.append({'type': class_name, 'data': entry.text()})
    return item_data

def extract_details(self, data):
    t1 = tm()
    details = {}
    if not data:
        return details
    details['name'] = data('#basic_info_row #basic_info_cell #resume-contact').text()
    details['title'] = data('#basic_info_row #basic_info_cell #headline').text()
    details['address'] = data('#basic_info_row #basic_info_cell #contact_info_container .adr #headline_location').text()
    details['skills'] = data('.skills-content #skills-items .data_display .skill-container').text().split(',')
    details['additional_info'] = data('.additionalInfo-content #additionalinfo-items .data_display').text().encode('ascii', 'ignore')
    identities = {}
    for k, v in self.profile_identities.iteritems():
        identities[k] = {'data': []}
        for item in data(v['content']).children():
            data_ = {}
            it = pq_(item)
            if it.attr('id').startswith(k):
                it_id = it.attr('id')
                item = data(v['item_w_id'] % it_id)
                children = pq_(item.children())
                for each, splits in v['items']:
                    if splits:
                        item_construct = children(each).text().split('-')
                        for sub, index in splits.iteritems():
                            data_[sub] = item_construct[index]
                    else:
                        data_[each] = children(each).text().encode('ascii', 'ignore')
                identities[k]['data'].append(data_)
        details[k] = identities[k]
    t2 = tm()
    details['time_taken'] = t2 - t1
    details['timestamp'] = tm()
    return details

def get_static_resource(self, url):
    data = []
    resp = None
    try:
        # keep retrying until we get any response, rotating the user agent on every attempt
        while not resp:
            try:
                user_agent = self.user_agents_cycle.next()
                resp = requests.get(url, headers={'User-Agent': user_agent})
            except Exception, e:
                print str(e), '!!!'
                slp(5)
        if resp.status_code == 200:
            data = pq_(resp.text)
            data = data('#results').children()
            return data
        else:
            return data
    except Exception:
        # any unexpected failure: return whatever we have (usually [])
        return data

def sorter(start, end):
    final = []
    ##get the keyword list from file
    f = open('indeed/SKILLS_ROUND_A.json', 'rb')
    y = []
    for a in f:
        y.append(a)
    f.close()
    #call indeed and get the result count for each keyword
    for i, each in enumerate(y[start:end]):
        keyword = each.strip('\n')
        url_ = 'http://www.indeed.com/resumes?q=%s&co=US' % keyword.replace(' ', '+')
        resp = None
        while not resp:
            try:
                #user_agent = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(15))
                user_agent = user_agents_cycle.next()
                resp = requests.get(url_, headers={'User-Agent': user_agent})
            except Exception:
                sleep(2)
        if resp.status_code == 200:
            html = pq_(resp.text)
            count = html('#search_header #rezsearch #search_table #result_count').text().split(' ')[0].replace(',', '')
            if count.isdigit():
                count = int(count)
            else:
                count = 0
            print (keyword, count)
            final.append((keyword, count))
        print i
        sleep(3)
    #reverse-sort so the highest-volume keywords come first
    final = sorted(final, key=lambda n: n[1], reverse=True)
    f = open('sorted_items_%d_%d.json' % (start, end), 'wb')
    for i in final:
        f.write(str(i[0]) + '\t' + str(i[1]) + '\n')
    f.close()
    print '%d, %d done...' % (start, end)

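#-- Hypothetical driver (not in the original script) showing how sorter's (start, end)
#-- slices and per-slice output files could be fanned out across worker processes.
#-- The chunk size of 200 is an assumption, not something the codebase defines.
from multiprocessing import Process


def run_sorter_in_chunks(total_keywords, chunk=200):
    jobs = []
    for start in range(0, total_keywords, chunk):
        p = Process(target=sorter, args=(start, min(start + chunk, total_keywords)))
        p.start()
        jobs.append(p)
    for p in jobs:
        p.join()
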
def collect_keywords(self):
    t1 = tm()
    for keyword in self.domains:
        domain_name = keyword.upper().replace(" ", "_")
        if domain_name in self.final_data:
            container = self.final_data[domain_name]
        else:
            self.final_data[domain_name] = []
            container = self.final_data[domain_name]
        url_ = self.url_base % keyword.replace(" ", "+")
        data = self.get_resource(url_)
        for each in data:
            child = pq_(each).text()
            container.append(child)
            self.n_concepts += 1
    t2 = tm()
    f = open("keywords/skills.json", "wb")
    f.write(json.dumps(self.final_data))
    f.close()
    print "total time taken: %d seconds.." % int(t2 - t1)
    print "%d concepts saved in keywords/skills.json" % self.n_concepts

    if counter >= self.max_recursion_depth:
        print 'max recursion depth achieved in the get_resource'
        #slp(300)
        return []
    data = []
    resp = None
    while not resp:
        try:
            user_agent = self.user_agents_cycle.next()
            resp = requests.get(url_, headers={'User-Agent': user_agent})
        except Exception, e:
            print str(e), '@@@'
            slp(10)
    # parse the page if we got a 200, or if the static test URL still returns results
    # (i.e. we are not blocked); otherwise wait briefly and retry with counter+1
    if resp.status_code == 200 or len(self.get_static_resource(self.fixed_test_url)):
        data = pq_(resp.text)
        data = data('#results').children()
        return data
    else:
        slp(1)
        return self.get_resource(url_, counter + 1)


def get_static_resource(self, url):
    data = []
    resp = None
    try:
        while not resp:
            try:
                user_agent = self.user_agents_cycle.next()
                resp = requests.get(url, headers={'User-Agent': user_agent})
            except Exception, e: