예제 #1
0
    def get_resource(self, url_):
        """Fetch ``url_`` and return the nodes selected by ``.report2ed``.

        Every request takes the next value from ``self.user_agents_cycle`` as
        its ``User-Agent``. The page is loaded at most twice: the second load
        happens only when the first answered 200 but the selector matched
        nothing.

        Returns whatever ``pq_(resp.text)(".report2ed")`` yields (presumably a
        PyQuery selection -- confirm against the ``pq_`` import), or ``[]``
        when a response is not 200 or the request fails twice in a row.
        """

        def fetch():
            # The header must be spelled "User-Agent"; a "user_agent" key is
            # sent as an unrelated header and the library default UA is kept.
            return requests.get(url_, headers={"User-Agent": next(self.user_agents_cycle)})

        data = []
        for _ in range(2):
            try:
                resp = fetch()
            except Exception:
                # Assume a temporary block: announce the pause, wait, then try
                # once more with the next user agent.
                print("sleeping for 300 secs due to a block..")
                slp(300)
                try:
                    resp = fetch()
                except Exception:
                    # Give up quietly, like every other failure path here.
                    return []

            if resp.status_code != 200:
                return []

            data = pq_(resp.text)(".report2ed")
            if data:
                break
        return data
	def get_resource(self, url_):
		"""Fetch ``url_`` and return the children of ``#resume_body``.

		Every request takes the next value from ``self.user_agents_cycle`` as
		its ``User-Agent``. The page is loaded at most twice: the second load
		happens only when the first answered 200 but nothing was selected.

		Returns the result of ``pq_(resp.text)('#resume_body').children()``
		(presumably a PyQuery selection -- confirm against the ``pq_``
		import), or ``[]`` when a response is not 200 or the request fails
		twice in a row.
		"""

		def fetch():
			# "User-Agent" is the real header name; a 'user_agent' key would be
			# sent as an unrelated header and leave the default UA in place.
			return requests.get(url_, headers={'User-Agent': next(self.user_agents_cycle)})

		data = []
		for _ in range(2):
			try:
				resp = fetch()
			except Exception:
				# Assume a temporary block: announce the pause, wait, then
				# try once more with the next user agent.
				print('sleeping for 100 secs due to a block..')
				slp(100)
				try:
					resp = fetch()
				except Exception:
					# Second failure: report "nothing found" instead of crashing.
					return []

			if resp.status_code != 200:
				return []

			data = pq_(resp.text)('#resume_body').children()
			if data:
				break
		return data
예제 #3
0
	def get_resource(self, url_):
		"""Fetch ``url_`` and return the children of ``#results``.

		Every request takes the next value from ``self.user_agents_cycle`` as
		its ``User-Agent``. The page is loaded at most twice: the second load
		happens only when the first answered 200 but nothing was selected.

		Returns the result of ``pq_(resp.text)('#results').children()``
		(presumably a PyQuery selection -- confirm against the ``pq_``
		import), or ``[]`` when a response is not 200 or the request fails
		twice in a row.
		"""

		def fetch():
			# "User-Agent" is the real header name; a 'user_agent' key would be
			# sent as an unrelated header and leave the default UA in place.
			return requests.get(url_, headers={'User-Agent': next(self.user_agents_cycle)})

		data = []
		for _ in range(2):
			try:
				resp = fetch()
			except Exception:
				# Assume a temporary block: announce the pause, wait, then
				# try once more with the next user agent.
				print('sleeping for 300 secs due to a block..')
				slp(300)
				try:
					resp = fetch()
				except Exception:
					# The retry used to be unguarded, so a second failure
					# (e.g. max retries exceeded) killed the whole script.
					return []

			if resp.status_code != 200:
				return []

			data = pq_(resp.text)('#results').children()
			if data:
				break
		return data
예제 #4
0
	def resource_collection(self, keyword_index, keyword, sort, rest_kewords=False):
		"""Collect and store every profile found for one keyword.

		For each filter route returned by ``self.get_filter_urls`` the result
		pages are fetched through ``self.get_resource``; every listed profile
		is downloaded via ``indeed_resumes_details`` and written with
		``self.save_to_disk``. The ``id -> location text`` map of one route is
		then handed to ``db_insert_hash``.

		Parameters:
			keyword_index: only used in the progress messages.
			keyword: search term; '/' is replaced by a space and newlines are
				stripped from both ends.
			sort: text inserted verbatim into the query string.
			rest_kewords: accepted but not used by this method.

		Returns None. Appends ``(keyword, profile_count, seconds)`` to
		``self.time_all``.
		"""
		start_time = tm()
		n_all = 0
		keyword = keyword.replace('/', ' ').strip('\n')
		init_url = self.init_url % (keyword.replace(' ', '+'), 0, 50)

		filtering_urls, result_count = self.get_filter_urls(init_url, 0)

		# One page per 100 results: at least 1 page, 10 pages from 1000 results up.
		if result_count >= 1000:
			counter = 10
		else:
			counter = int(max(float(result_count)/100, 1))

		for route in filtering_urls:
			n_profiles = {}
			url_ = self.url_ % pq_(route).children('a').attr('href')
			for i in range(counter):
				beg = i * 100
				end = beg + 100
				postfix = '&start=%d&limit=%d&radius=100&%s&co=%s' % (beg, end, sort, self.country_code)
				print(url_+postfix)

				for each in self.get_resource(url_+postfix, 0):
					item = pq_(each)
					unique_id = item.attr('id')
					n_profiles[unique_id] = item('.location').text()
					profile_data = indeed_resumes_details(unique_id).resource_collection()
					self.save_to_disk(profile_data, unique_id)
					n_all += 1

			# Keep retrying the insert until it succeeds. Only Exception is
			# caught so that Ctrl-C / SystemExit can still end the loop.
			while True:
				try:
					db_insert_hash(n_profiles, self.country_code)
					break
				except Exception:
					print('db locked..will wait for 5 secs and try again..')
					slp(5)
			print('inserted %d records to db.. %s, %d' % (len(n_profiles), keyword, keyword_index))
			# Throttle point between filter routes; the delay is currently 0 seconds.
			slp(0)
			gc.collect()
		gc.collect()
		current_time = tm()
		self.time_all.append((keyword, n_all, current_time - start_time))
		# NOTE(review): begin_time is not defined in this method; presumably a
		# module-level start timestamp -- confirm it is not meant to be start_time.
		print('current time passed..%d secs for one round of %s (%d)' % (int(current_time - begin_time), keyword, keyword_index))
		return
예제 #5
0
	def resource_collection(self, keyword):
		"""Scrape up to 15 result pages for ``keyword`` and append them to a file.

		Each listed item is converted with ``self.get_info``, tagged with its
		element id as ``resource_id`` and written as one JSON line to
		``<dirname(self.directory)>/<keyword>.json`` (opened in append mode).
		Ids are recorded in ``self.n_distinct`` and ``self.area`` is appended
		to ``self.n_done`` when the keyword is finished.

		Parameters:
			keyword: search term; '/' is replaced by a space before use.

		Returns None.
		"""
		t1 = tm()
		n_profiles = 0
		keyword = keyword.replace('/', ' ')
		out_path = os.path.join(os.path.dirname(self.directory), keyword+'.json')

		# The with-block closes the file even when a request or parse step raises.
		with open(out_path, 'a+') as file_:
			#--lets aim for collecting 1000+ profiles per skill/keyword
			for i in range(15):
				beg = i * 100
				end = beg + 100
				url_ = browse_url_profiles % (keyword, beg, end)

				for each in self.get_resource(url_):
					item = pq_(each)
					unique_id = item.attr('id')
					item_data = self.get_info(item('.sre-content'))
					item_data.append({'type': 'resource_id', 'data': unique_id})
					# Remember the id; an already known id keeps its entry.
					self.n_distinct.setdefault(unique_id, 1)
					file_.write(json.dumps(item_data)+'\n')
					n_profiles += 1

		t2 = tm()
		print("done collecting %d records  for (%s - %s) ..in %d seconds.." % (n_profiles, self.area, keyword, int(t2-t1)))
		print("TOTAL DISTINCT: %d " % len(self.n_distinct))
		print("\n")
		self.n_done.append(self.area)
		return
    def extract_details(self, data):
        """Build a dict of profile fields from a parsed resume page.

        Parameters:
            data: callable selection object for the page (presumably a PyQuery
                document -- confirm against the caller). A falsy value yields
                an empty dict.

        Returns a dict with ``name``, ``title``, ``address``, ``skills`` (the
        skills text split on ','), ``additional_info``, one
        ``{"data": [...]}`` entry per key of ``self.profile_identities``, and
        the ``time_taken`` / ``timestamp`` values read from ``tm()``.
        """
        if not data:
            return {}

        t1 = tm()
        details = {}
        details["name"] = data("#basic_info_row #basic_info_cell #resume-contact").text().strip("\n")
        details["title"] = data("#basic_info_row #basic_info_cell #headline").text().strip("\n")
        details["address"] = (
            data("#basic_info_row #basic_info_cell #contact_info_container .adr #headline_location").text().strip("\n")
        )
        details["skills"] = (
            data(".skills-content #skills-items .data_display .skill-container").text().strip("\n").split(",")
        )
        details["additional_info"] = (
            data(".additionalInfo-content #additionalinfo-items .data_display")
            .text()
            .strip("\n")
            .encode("ascii", "ignore")
        )

        for k, v in self.profile_identities.items():
            entries = []
            for item in data(v["content"]).children():
                data_ = {}
                it_id = pq_(item).attr("id")
                # A child without an id attribute gives None here (assumed
                # pyquery behaviour); treat it as a non-matching child instead
                # of failing on None.startswith.
                if it_id is not None and it_id.startswith(k):
                    section = data(v["item_w_id"] % it_id)
                    children = pq_(section.children())
                    for each, splits in v["items"]:
                        if splits:
                            # splits maps a field name to its position in the
                            # '-'-separated text of the selected node.
                            item_construct = children(each).text().strip("\n").split("-")
                            for sub, index in splits.items():
                                data_[sub] = item_construct[index].strip("\n")
                        else:
                            data_[each] = children(each).text().encode("ascii", "ignore").strip("\n")

                # One entry is recorded per child, even when it did not match
                # (it is then an empty dict).
                entries.append(data_)
            details[k] = {"data": entries}

        t2 = tm()
        details["time_taken"] = t2 - t1
        details["timestamp"] = tm()
        return details
예제 #7
0
	def get_info(self, item):
		"""Turn the children of ``item`` into a list of ``{'type', 'data'}`` dicts.

		A child whose class attribute is exactly 'app_name' has its text split
		on '-': the first piece is stored as 'name' and the last piece as
		'location'. Any other child contributes one dict whose 'type' is its
		class attribute and whose 'data' is its text.
		"""
		collected = []
		for node in item.children():
			element = pq_(node)
			kind = element.attr('class')
			if kind != 'app_name':
				collected.append({'type': kind, 'data': element.text()})
				continue

			pieces = element.text().split('-')
			collected.extend([
				{'type': 'name', 'data': pieces[0]},
				{'type': 'location', 'data': pieces[-1]},
			])
		return collected
	def extract_details(self, data):
		"""Build a dict of profile fields from a parsed resume page.

		Parameters:
			data: callable selection object for the page (presumably a PyQuery
				document -- confirm against the caller). A falsy value yields
				an empty dict.

		Returns a dict with ``name``, ``title``, ``address``, ``skills`` (the
		skills text split on ','), ``additional_info``, one
		``{'data': [...]}`` entry per key of ``self.profile_identities``, and
		the ``time_taken`` / ``timestamp`` values read from ``tm()``.
		"""
		if not data:
			return {}

		t1 = tm()
		details = {}
		details['name'] = data('#basic_info_row #basic_info_cell #resume-contact').text()
		details['title'] = data('#basic_info_row #basic_info_cell #headline').text()
		details['address'] = data('#basic_info_row #basic_info_cell #contact_info_container .adr #headline_location').text()
		details['skills'] = data('.skills-content #skills-items .data_display .skill-container').text().split(',')
		details['additional_info'] = data('.additionalInfo-content #additionalinfo-items .data_display').text().encode('ascii','ignore')

		for k, v in self.profile_identities.items():
			entries = []
			for item in data(v['content']).children():
				data_ = {}
				it_id = pq_(item).attr('id')
				# A child without an id attribute gives None here (assumed
				# pyquery behaviour); treat it as a non-matching child instead
				# of failing on None.startswith.
				if it_id is not None and it_id.startswith(k):
					section = data(v['item_w_id'] % it_id)
					children = pq_(section.children())
					for each, splits in v['items']:
						if splits:
							# splits maps a field name to its position in the
							# '-'-separated text of the selected node.
							item_construct = children(each).text().split('-')
							for sub, index in splits.items():
								data_[sub] = item_construct[index]
						else:
							data_[each] = children(each).text().encode('ascii','ignore')

				# One entry is recorded per child, even when it did not match
				# (it is then an empty dict).
				entries.append(data_)
			details[k] = {'data': entries}

		t2 = tm()
		details['time_taken'] = t2-t1
		details['timestamp'] = tm()
		return details
예제 #9
0
	def get_static_resource(self, url):
		# Fetch `url`, retrying until a usable response arrives, and return the
		# children of '#results'; the initial empty list is returned when the
		# status code is not 200.
		# NOTE(review): the outer `try:` below has no except/finally clause in
		# this snippet -- the tail of the method appears to be cut off.
		data = []
		resp = None
		try:
			# NOTE(review): if `requests` is the Requests library, a Response is
			# falsy for 4xx/5xx statuses, so `not resp` would also re-request on
			# HTTP errors, with no pause between attempts -- confirm intent.
			while not resp:
				try:
					# Use the next user agent from the cycle for each attempt.
					user_agent = self.user_agents_cycle.next()
					# NOTE(review): the header key is 'user_agent', not the
					# standard 'User-Agent' -- confirm the server honours it.
					resp = requests.get(url, headers = {'user_agent': user_agent})
				except Exception, e:
					# Report the failure, wait 5 seconds, then loop again.
					print str(e), '!!!'
					slp(5)
					pass
			if resp.status_code == 200:
				# Parse the page and keep only the children of '#results'.
				data = pq_(resp.text)
				data = data('#results').children()
				return data
			else:
				# Non-200 answer: hand back the empty list created above.
				return data
예제 #10
0
def sorter(start, end):
	"""Look up the result count of each keyword in a slice and save them sorted.

	Lines ``start:end`` of 'indeed/SKILLS_ROUND_A.json' are read as keywords.
	Each one is searched on indeed.com and the leading number of the
	'#result_count' text is taken as its count (0 when it is not all digits).
	The ``keyword<TAB>count`` pairs are written, highest count first, to
	'sorted_items_<start>_<end>.json'.

	Parameters:
		start: index of the first keyword line to process.
		end: index one past the last keyword line to process.

	Returns None.
	"""
	with open('indeed/SKILLS_ROUND_A.json', 'rb') as f:
		lines = f.readlines()

	final = []
	for i, each in enumerate(lines[start:end]):
		keyword = each.strip('\n')
		url_ = 'http://www.indeed.com/resumes?q=%s&co=US' % keyword.replace(' ', '+')

		# Retry only while no response object was obtained. Testing
		# `not resp` would also loop, without any delay, on 4xx/5xx answers
		# because a requests Response is falsy when its status is an error.
		resp = None
		while resp is None:
			try:
				# "User-Agent" is the real header name; 'user_agent' would be
				# sent as an unrelated header.
				resp = requests.get(url_, headers={'User-Agent': next(user_agents_cycle)})
			except Exception:
				sleep(2)

		if resp.status_code == 200:
			html = pq_(resp.text)
			count = html('#search_header #rezsearch #search_table #result_count').text().split(' ')[0].replace(',', '')
			count = int(count) if count.isdigit() else 0
			print((keyword, count))
			final.append((keyword, count))
			print(i)
		# Pause between keywords.
		sleep(3)

	# Highest count first.
	final.sort(key=lambda n: n[1], reverse=True)
	with open('sorted_items_%d_%d.json' % (start, end), 'wb') as f:
		for keyword, count in final:
			f.write(str(keyword)+'\t'+str(count)+'\n')
	print('%d, %d done...' % (start, end))
예제 #11
0
    def collect_keywords(self):
        """Gather the concept texts of every domain and dump them to disk.

        For each entry of ``self.domains`` the page at ``self.url_base`` is
        fetched through ``self.get_resource``; the text of every returned node
        is appended to ``self.final_data[<DOMAIN_NAME>]`` (upper-cased, spaces
        replaced by '_') and ``self.n_concepts`` is incremented. The whole
        mapping is finally written as JSON to 'keywords/skills.json'.

        Returns None.
        """
        t1 = tm()
        for keyword in self.domains:
            domain_name = keyword.upper().replace(" ", "_")
            # Reuse the list of an already known domain, else start a new one.
            container = self.final_data.setdefault(domain_name, [])

            url_ = self.url_base % keyword.replace(" ", "+")
            for each in self.get_resource(url_):
                container.append(pq_(each).text())
                self.n_concepts += 1
        t2 = tm()

        # The with-block closes the file even if serialising or writing fails.
        with open("keywords/skills.json", "wb") as f:
            f.write(json.dumps(self.final_data))
        print("total time taken: %d seconds.." % int(t2 - t1))
        print("%d concepts saved in keywords/skills.json" % self.n_concepts)
예제 #12
0
		if counter >= self.max_recursion_depth:
			print 'max recursion depth achieved in the get_resource'
			#slp(300)
			return []
		data = []
		resp = None
		while not resp:
			try:
				user_agent = self.user_agents_cycle.next()
				resp = requests.get(url_, headers = {'user_agent': user_agent})
			except Exception, e:
				print str(e), '@@@'
				slp(10)
				pass
		if resp.status_code == 200 or len(self.get_static_resource(self.fixed_test_url)):
			data = pq_(resp.text)
			data = data('#results').children()
			return data
		else:
			slp(1)
			return self.get_resource(url_, counter+1)

	def get_static_resource(self, url):
		data = []
		resp = None
		try:
			while not resp:
				try:
					user_agent = self.user_agents_cycle.next()
					resp = requests.get(url, headers = {'user_agent': user_agent})
				except Exception, e: