def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for careercast.com quick-search result pages.

    `doc` passed to each field extractor is one result <tbody> row as
    selected by `datafunc`.
    """
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    # City and state are both scraped out of the same location anchor.
    location = lambda doc: doc.find("a", id="results.job.location").string
    # The job-title anchor carries both the title text and the posting URL.
    title_anchor = lambda doc: doc.find("div", {'class': "jobTitle"}).a

    self.fields["company_name"].func = lambda doc: doc.find("td", {'class': "resultsCompanyUrl resultsStandard"}).string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: title_anchor(doc).string
    self.fields["company_joburl"].func = lambda doc: title_anchor(doc)["href"]
    self.fields["source_joburl"].func = lambda doc: title_anchor(doc)["href"]
    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda groups: groups[0].strip()
    self.fields["state"].func = location
    self.fields["state"].patterns = [r", (.*?)\s+\d", r", ([^,/]*)$"]
    self.fields["state"].process = common.shorten
    self.fields["source"].func = lambda doc: "careercast.com"
    # Posting date lives in the last plain cell of the row's first <tr>.
    self.fields["posting_date"].func = lambda doc: doc.tr.findAll("td", {'class': "resultsStandard"}, recursive=False).pop().string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def follow_next(doc, page):
        # Advance through the pager's "next" image link, when present.
        pager = doc.find("ul", {'class': "paginationLineup"})
        arrow = pager.find('img', id='pager.pagenext') if pager else None
        if arrow:
            return "http://www.careercast.com/careers/jobsearch/" + arrow.parent['href']
        return None

    self.nextlink = follow_next
    self.dev_mode = True
    # Result rows alternate between the even/odd display classes.
    self.datafunc = lambda doc: [row for row in doc.findAll("tbody")
                                 if row["class"] in ("displayTableRowEven", "displayTableRowOdd")]
    self.url = "http://www.careercast.com/careers/jobsearch/results?searchType=quick;kAndEntire=%s;lastUpdated=-30+days;pageSize=500;sortBy=moddate;lastUpdated_i18n_date_array[month]=9;lastUpdated_i18n_date_array[day]=4;lastUpdated_i18n_date_array[year]=2010;lastUpdated_i18n_date_mysql=2010-09-04;lastUpdated_i18n[date_array][month]=9;lastUpdated_i18n[date_array][day]=4;lastUpdated_i18n[date_array][year]=2010;lastUpdated_i18n[date_mysql]=2010-09-04;lastUpdated_i18n[utc_beginning_mysql]=2010-09-04+05%3A00%3A00;lastUpdated_i18n[utc_end_mysql]=2010-09-05+04%3A59%3A59;lastUpdated_i18n[timezone_used_for_conversion]=CST"
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for postjobfree.com job-list pages."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    # Shared extractors: title anchor, location span, and a common
    # "first regex group, stripped" post-processor.
    job_anchor = lambda doc: doc.h3.a
    location = lambda doc: doc.find("span", {'class': "Location"}).string
    first_group = lambda groups: groups[0].strip()

    self.fields["company_name"].func = lambda doc: doc.find("span", {'class': "Company"}).string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: job_anchor(doc).string
    self.fields["company_joburl"].func = lambda doc: "http://postjobfree.com/" + job_anchor(doc)["href"]
    self.fields["source_joburl"].func = lambda doc: "http://postjobfree.com/" + job_anchor(doc)["href"]
    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = location
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = first_group
    self.fields["source"].func = lambda doc: "postjobfree.com"
    self.fields["posting_date"].func = lambda doc: doc.find("span", {'class': "PostedDate"}).string
    self.fields["posting_date"].patterns = [r"(\w\w\w) (\d\d?)"]
    self.fields["posting_date"].process = common.mmm_dd
    # ZIP only appears on the detail page (depth 2): grab the text of the
    # row containing "ZIP:" and regex out the 5-digit code.
    self.filterfields["zipcode"].func = lambda doc: "".join(doc.find("td", text=re.compile("ZIP:")).parent.parent.findAll(text=True))
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = first_group
    self.filterfields["zipcode"].depth = 2
    self.fields.update(kwargs)

    self.datafunc = lambda doc: doc.findAll("div", {'class': "JobRow"})
    self.url = "http://postjobfree.com/JobList.aspx?q=%s&n=&t=&c=&jt=&l=&radius=25&r=50&lat=&lng=&lct=&lc=&ls=&lz=&accuracy=&address="
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for retailcareersnow.com search results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    title_anchor = lambda doc: doc.find("div", {'class': "jobTitle"}).a
    location = lambda doc: doc.find("a", {'id': "results.job.location"}).string
    first_group = lambda groups: groups[0].strip()

    self.fields["company_name"].func = lambda doc: doc.find("td", {'class': "resultsCompanyUrl resultsStandard"}).string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: title_anchor(doc).string
    self.fields["company_joburl"].func = lambda doc: "http://jobs.retailcareersnow.com" + title_anchor(doc)["href"]
    self.fields["source_joburl"].func = lambda doc: "http://jobs.retailcareersnow.com" + title_anchor(doc)["href"]
    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = location
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = first_group
    self.fields["source"].func = lambda doc: "retailcareersnow.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td", {'class': "resultsStandard"})[2].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # The last pager <li> holds the "next" anchor; no anchor means we
        # are on the final page.
        pager = doc.find("ul", {'class': "paginationLineup"})
        if pager is None:
            return None
        last_item = pager.findAll("li").pop()
        if last_item.a is None:
            return None
        return "http://jobs.retailcareersnow.com/careers/jobsearch/" + last_item.a["href"]

    self.datafunc = lambda doc: doc.findAll("tbody", {'class': re.compile("^displayTableRow")})
    self.url = "http://jobs.retailcareersnow.com/careers/jobsearch/results?searchType=quick;kAndEntire=%s;country=United+States"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for job.com search results.

    Fix: the original `nextpage` tested `links.findAll('a') is None`,
    which is never true — BeautifulSoup's findAll returns an empty list
    when nothing matches — so a pager with no anchors fell through to
    `[0]` and raised IndexError.  The check is now an emptiness test.
    """
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    self.fields["company_name"].func = lambda doc: doc.findNext('h4').string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.a.string
    self.fields["company_joburl"].func = lambda doc: doc.a["href"]
    self.fields["source_joburl"].func = lambda doc: doc.a["href"]
    # Location renders as "City, State" in the div following the title.
    self.fields["city"].func = lambda doc: doc.findNext('div').string.split(',')[0]
    self.fields["city"].process = lambda t: t.strip()
    self.fields["state"].func = lambda doc: doc.findNext('div').string.split(',')[1]
    self.fields["state"].process = lambda t: t.strip()
    self.fields["source"].func = lambda doc: "job.com"
    # Posting date comes from the job-detail page (depth 2).
    self.fields["posting_date"].func = lambda doc: doc.find('ul', {'id': 'jobSummary'}).findAll('li')[2]._lastRecursiveChild().strip()
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields["posting_date"].depth = 2
    self.fields.update(kwargs)

    def nextpage(doc, page):
        links = doc.find('p', {'class': 'resultsJumper'})
        if links is None:
            return None
        anchors = links.findAll('a')
        # findAll returns [] (never None) on no match; guard emptiness.
        if not anchors:
            return None
        return anchors[0]['href']

    # First two jobTitle_results headings are not result rows; skip them.
    self.datafunc = lambda doc: doc.findAll('h2', {'class': 'jobTitle_results'})[2:]
    self.url = "http://www.job.com/my.job/search/page=results/pt=2/qs=2/kw=%s/kt=3/ns=1/f=60/rpp=10/&b=2"
    self.nextlink = nextpage
    self.cookie = False
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for nj.com job-search results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    title_anchor = lambda doc: doc.find("a", id="results.job.title")
    location = lambda doc: doc.find("a", id="results.job.location").string
    first_group = lambda groups: groups[0].strip()

    # Company cell may mix tags and text, so join all text nodes.
    self.fields["company_name"].func = lambda doc: "".join(doc.find("td", {'class': "resultsCompanyUrl resultsStandard"}).findAll(text=True))
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: title_anchor(doc).string
    self.fields["company_joburl"].func = lambda doc: "http://jobs.nj.com" + title_anchor(doc)["href"]
    self.fields["source_joburl"].func = lambda doc: "http://jobs.nj.com" + title_anchor(doc)["href"]
    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = location
    self.fields["state"].patterns = [r", (.*?)\s+\d", r", ([^,/]*)$"]
    self.fields["state"].process = common.shorten
    self.fields["source"].func = lambda doc: "nj.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td")[3].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    # ZIP, when present, is embedded in the location anchor text.
    self.filterfields["zipcode"].func = location
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = first_group
    self.fields.update(kwargs)

    self.cookie = False
    self.datafunc = lambda doc: doc.findAll("tbody", {'class': re.compile("displayTableRow")})
    self.url = "http://jobs.nj.com/careers/jobsearch/results?searchType=quick;kAndEntire=%s;lastUpdated=-30+days;sortBy=moddate;pageSize=50;lastUpdated_i18n_date_array[month]=8;lastUpdated_i18n_date_array[day]=30;lastUpdated_i18n_date_array[year]=2010;lastUpdated_i18n_date_mysql=2010-08-30;lastUpdated_i18n[date_array][month]=8;lastUpdated_i18n[date_array][day]=30;lastUpdated_i18n[date_array][year]=2010;lastUpdated_i18n[date_mysql]=2010-08-30;lastUpdated_i18n[utc_beginning_mysql]=2010-08-30+04%3A00%3A00;lastUpdated_i18n[utc_end_mysql]=2010-08-31+03%3A59%3A59;lastUpdated_i18n[timezone_used_for_conversion]=EST"
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for ihispano.com quick-search results.

    Fix: `nextpage` used to dereference the "pager-list" span without
    checking that `doc.find(...)` found anything, raising AttributeError
    on pages without a pager (e.g. a single page of results); it now
    returns None in that case.
    """
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    self.fields["company_name"].func = lambda doc: doc.find('td', {'class': 'results_company'}).a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find('td', {'class': 'results_title'}).a.string
    self.fields["company_joburl"].func = lambda doc: doc.a["href"]
    self.fields["source_joburl"].func = lambda doc: doc.a["href"]
    # Location cell holds two anchors: city first, then state.
    self.fields["city"].func = lambda doc: doc.find('td', {'class': 'results_location'}).findAll('a')[0].string
    self.fields["city"].process = lambda t: t.strip()
    self.fields["state"].func = lambda doc: doc.find('td', {'class': 'results_location'}).findAll('a')[1].string
    self.fields["state"].process = lambda t: t.strip()
    self.fields["source"].func = lambda doc: "ihispano.com"
    self.fields["posting_date"].func = lambda doc: doc.find('td', {'class': 'results_create'}).string.strip()
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        links = doc.find("span", {'class': "pager-list"})
        if links is None:
            # No pager at all: nothing to advance to.
            return None
        # If the last pager child is <strong>, the current page is last.
        if links.findChildren()[-1].name != 'strong':
            return 'http://www.ihispano.com' + links.find('strong').findNext('a')['href']
        return None

    self.datafunc = lambda doc: doc.findAll('tr', {'class': 'top-result-row'})
    self.dev_mode = True
    self.url = "http://www.ihispano.com/careers/searchjob/results?key_words=%s&country=USA&state=&city=&searchtype=qck&Save=save&zip_code=&jobs_within_miles=10&category=&op=Search&form_id=candidate_searchjob_quick"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for the careerbuilder.com XML search API."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    detail_url = 'http://www.careerbuilder.com/JobSeeker/Jobs/JobDetails.aspx?job_did=%s'

    self.fields["company_name"].func = lambda doc: doc.company.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.jobtitle.string
    # The API exposes the job via its DID; build the detail-page URL.
    self.fields["company_joburl"].func = lambda doc: detail_url % (doc.did.string)
    self.fields["source_joburl"].func = self.fields["company_joburl"].func
    # <location> is "STATE-City"; split on the dash.
    self.fields["city"].func = lambda doc: doc.location.string.split('-')[-1]
    self.fields["state"].func = lambda doc: doc.location.string.split('-')[0]
    self.fields["source"].func = lambda doc: "careerbuilder.com"
    self.fields["posting_date"].func = lambda doc: doc.posteddate.string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # API paging: just bump PageNumber in the query URL.
        return 'http://api.careerbuilder.com/v1/jobsearch?DeveloperKey=WDAX88L6VM2F0BQG99WX&PostedWithin=1&Keywords=%s&PerPage=100&PageNumber=%d' % (self.query, page + 1)

    self.nextlink = nextpage
    self.datafunc = lambda doc: doc.findAll('jobsearchresult')
    self.cookie = False
    self.url = ['http://api.careerbuilder.com/v1/jobsearch?DeveloperKey=WDAX88L6VM2F0BQG99WX&PostedWithin=1&Keywords=%s&PerPage=100&PageNumber=1']
    self.query_sleeptime = 3600
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for higheredjobs.com advanced-search results."""
    JobsiteParser.__init__(self, logger)

    title_anchor = lambda doc: doc.find("div", {'class': "jobTitle"}).a
    location = lambda doc: doc.find("div", {'class': "jobLocation"}).string
    first_group = lambda groups: groups[0].strip()

    self.fields["company_name"].func = lambda doc: doc.find("div", {'class': "instName"}).string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: title_anchor(doc).string
    self.fields["company_joburl"].func = lambda doc: "http://higheredjobs.com/" + title_anchor(doc)["href"]
    self.fields["source_joburl"].func = lambda doc: "http://higheredjobs.com/" + title_anchor(doc)["href"]
    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = location
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = first_group
    self.fields["state"].mandatory = True
    self.fields["source"].func = lambda doc: "higheredjobs.com"
    self.fields["posting_date"].func = lambda doc: doc.find("div", {'class': "jobDetails"}).string
    # Two-digit year on this site (mm/dd/yy).
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yy
    self.fields.update(kwargs)

    self.cookie = False
    # Deliberately overrides the `keyword` argument with a fixed list.
    # Other terms tried previously: college, student, internship, major,
    # coop, "co-op", bachelors, gpa.
    self.keyword = ['intern']

    def get_data(doc):
        # The results table is absent when the search matches nothing.
        container = doc.find("div", {'id': "jobResults"})
        if container:
            return container.table.findAll("tr", {'valign': "top"}, recursive=False)
        return None

    self.datafunc = get_data
    self.url = "http://higheredjobs.com/search/advanced_action.cfm?Keyword=%s&PosType=&InstType=&JobCat=&Region=0&SubRegions=&Metros=&OnlyTitle=0&SortBy=1&ShowAll=yes"
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for jobcircle.com candidate search."""
    JobsiteParser.__init__(self, logger)

    title_p = lambda doc: doc.find("p", id="wrapJobTitle")
    first_group = lambda groups: groups[0].strip()

    self.fields["company_name"].func = lambda doc: "".join(doc.findAll("td").pop().a.findAll(text=True))
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: "".join(title_p(doc).a.findAll(text=True))
    self.fields["company_joburl"].func = lambda doc: "http://jobcircle.com" + title_p(doc).a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://jobcircle.com" + title_p(doc).a["href"]
    # Location is the text node right after the first <br> in the title <p>;
    # posting date follows the second <br>.
    self.fields["city"].func = lambda doc: title_p(doc).findAll("br")[0].nextSibling
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = lambda doc: title_p(doc).findAll("br")[0].nextSibling
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = first_group
    self.fields["source"].func = lambda doc: "jobcircle.com"
    self.fields["posting_date"].func = lambda doc: title_p(doc).findAll("br")[1].nextSibling
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    # ZIP comes from the job-detail page (depth 2).
    self.filterfields["zipcode"].func = lambda doc: "".join(doc.find("b", text=re.compile("ZIP Code:")).parent.parent.parent.findAll(text=True))
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = first_group
    self.filterfields["zipcode"].depth = 2
    self.fields.update(kwargs)

    # Broad single-letter query: effectively "all jobs".
    self.keyword = ['a']

    def nextpage(doc, page):
        # A "[More]" link signals another 50-result page.
        if doc.find('a', text='[More]'):
            start = (page * 50) + 1
            return 'http://jobcircle.com/public/csearch.mpl?search_string=%s&search_method=and&search_radius=&search_zip_code=&industry_code=ALL&chk_search_radius=0&reward=&start=%d&len=50&job_length=&search_scope=1' % \
                (self.query, start)
        return None

    self.nextlink = nextpage

    def data_rows(doc):
        # A result row has more than one colored/white cell directly inside.
        rows = []
        for tr in doc.findAll("tr"):
            colored = tr.findAll("td", {'class': "tblrowcolored"}, recursive=False)
            white = tr.findAll("td", {'class': "tblrowwhite"}, recursive=False)
            if len(colored) + len(white) > 1:
                rows.append(tr)
        return rows

    self.datafunc = data_rows
    self.url = "http://jobcircle.com/public/csearch.mpl?search_string=%s&search_method=and&search_radius=&search_zip_code=&industry_code=ALL&chk_search_radius=0&reward=&start=0&len=50&job_length=&search_scope=1"
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for hirelifescience.com seeker search."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    cells = lambda doc: doc.findAll("td")

    self.fields["company_name"].func = lambda doc: "".join(cells(doc)[0].findAll(text=True))
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: cells(doc)[4].b.a.string
    self.fields["company_joburl"].func = lambda doc: "http://hirelifescience.com/" + cells(doc)[4].b.a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://hirelifescience.com/" + cells(doc)[4].b.a["href"]
    self.fields["city"].func = lambda doc: cells(doc)[1].string
    self.fields["state"].func = lambda doc: cells(doc)[2].string
    self.fields["source"].func = lambda doc: "hirelifescience.com"
    self.fields["posting_date"].func = lambda doc: cells(doc)[3].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def getall(doc):
        # Each posting spans three <tr>s.  Merge the second row into the
        # first so a single node carries all of a posting's cells, then
        # keep every third row.
        trs = doc.find("div", {'class': "indent"}).div.findAll("table")[1].findAll("tr")[2:-1]
        for i in range(0, len(trs) - 1, 3):
            trs[i].append(trs[i + 1])
        return trs[::3]

    self.datafunc = getall

    def nextpage(doc, page):
        # Pager anchors live in the first row of the results table.
        header_row = doc.find("div", {'class': "indent"}).div.findAll("table")[1].findAll("tr")[0]
        links = header_row.findAll("a")
        if len(links) < page:
            return None
        return "http://hirelifescience.com/" + links[page - 1]["href"]

    self.nextlink = nextpage
    self.url = "http://hirelifescience.com/seeker_jobs.asp?search=yes&page=&keyword=%s&pagesize=500&updown=&orderby=date"
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for jobpath.com job results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    location = lambda doc: doc.find("td", id=re.compile(r"Location")).string
    first_group = lambda groups: groups[0].strip()

    self.fields["company_name"].func = lambda doc: "".join(doc.find("td", id=re.compile(r"Company")).findAll(text=True))
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("a", {'class': "jt"}).string
    self.fields["company_joburl"].func = lambda doc: doc.find("a", {'class': "jt"})["href"]
    self.fields["source_joburl"].func = self.fields["company_joburl"].func
    # Location renders as "ST - City": city trails the last dash, state
    # leads the string.
    self.fields["city"].func = location
    self.fields["city"].patterns = [r"-([^-]*)$"]
    self.fields["city"].process = first_group
    self.fields["state"].func = location
    self.fields["state"].patterns = [r"^(\w\w)\W"]
    self.fields["state"].process = first_group
    self.fields["source"].func = lambda doc: "jobpath.com"
    # The "Posted" span keeps the full date in its title attribute.
    self.fields["posting_date"].func = lambda doc: doc.find("span", id=re.compile(r"Posted"))["title"]
    self.fields["posting_date"].patterns = [r"(\w\w\w)-(\d\d?)"]
    self.fields["posting_date"].process = common.mmm_dd
    self.fields.update(kwargs)

    def nextpage(doc, page):
        nav = doc.find("td", {'class': "nav_btm_cell"})
        if nav and nav.find('a', text='Next Page'):
            return nav.a["href"]
        return None

    self.datafunc = lambda doc: doc.findAll("tr", {'class': re.compile(r"^jl_\w+_row$")})
    self.url = "http://www.jobpath.com/JobSeeker/Jobs/JobResults.aspx?IPath=QHKCV&excrit=QID%3dA6657255451511%3bst%3da%3buse%3dALL%3brawWords%3d%s%3bCID%3dUS%3bSID%3d%3f%3bTID%3d0%3bENR%3dNO%3bDTP%3dDRNS%3bYDI%3dYES%3bIND%3dALL%3bPDQ%3dAll%3bPDQ%3dAll%3bPAYL%3d0%3bPAYH%3dgt120%3bPOY%3dNO%3bETD%3dALL%3bRE%3dALL%3bMGT%3dDC%3bSUP%3dDC%3bFRE%3d30%3bQS%3dsid_unknown%3bSS%3dNO%3bTITL%3d0%3bJQT%3dRAD%3bJDV%3dFalse%3bExpHigh%3dgt50%3bExpLow%3d0%3bMaxLowExp%3d-1&sc=3&ff=21&sd=2"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for idealist.org job search."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    item_anchor = lambda doc: doc.li.h2.a
    location = lambda doc: doc.find("div", {'class': "assetLocation"}).string
    first_group = lambda groups: groups[0].strip()

    self.fields["company_name"].func = lambda doc: doc.find("div", {'class': "assetOwner"}).a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: "".join(item_anchor(doc).findAll(text=True))
    self.fields["company_joburl"].func = lambda doc: "http://www.idealist.org" + item_anchor(doc)["href"]
    self.fields["source_joburl"].func = lambda doc: "http://www.idealist.org" + item_anchor(doc)["href"]
    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = location
    # Only keep US listings: the state pattern requires "United States".
    self.fields["state"].patterns = [r", (.*?)\s+United States"]
    self.fields["state"].process = common.shorten
    self.fields["state"].mandatory = True
    self.fields["source"].func = lambda doc: "idealist.org"
    self.fields["posting_date"].func = lambda doc: "".join(doc.find("div", {'class': "assetDates"}).findAll(text=True))
    self.fields["posting_date"].patterns = [r"(\w\w\w)\w* (\d\d?), (\d\d\d\d)"]
    self.fields["posting_date"].process = common.mmm_dd_yyyy
    # ZIP comes from the detail page's "Location:" block (depth 2).
    self.filterfields["zipcode"].func = lambda doc: "".join(doc.find("span", text=re.compile("Location:")).parent.parent.findAll(text=True))
    self.filterfields["zipcode"].patterns = [r"\D(\d{5})\D", r"\D(\d{5})$"]
    self.filterfields["zipcode"].process = first_group
    self.filterfields["zipcode"].depth = 2
    self.fields.update(kwargs)

    self.datafunc = lambda doc: doc.find('ul', {'class': 'itemsList'}).findAll('li') if doc else None
    self.url = 'http://www.idealist.org/search?search_keywords=%s&search_type=job'
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for hirediversity.com job-seeker search."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    cells = lambda doc: doc.findAll("td")
    title_link = lambda doc: doc.find("a", {'class': "colour"})
    first_group = lambda groups: groups[0].strip()

    self.fields["company_name"].func = lambda doc: cells(doc)[3].p.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: "".join(title_link(doc).parent.findAll(text=True))
    self.fields["company_joburl"].func = lambda doc: "http://www.hirediversity.com/jobseekers/jobs/" + title_link(doc)["href"]
    self.fields["source_joburl"].func = lambda doc: "http://www.hirediversity.com/jobseekers/jobs/" + title_link(doc)["href"]
    self.fields["city"].func = lambda doc: cells(doc)[4].p.string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = lambda doc: cells(doc)[4].p.string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = first_group
    self.fields["source"].func = lambda doc: "hirediversity.com"
    self.fields["posting_date"].func = lambda doc: cells(doc)[6].p.string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # Pager anchors sit inside the <h3> of the "text" div; the page
        # argument indexes into them (1-based).
        links = doc.find("div", {'class': "text"}).h3.findAll("a")
        if len(links) < page:
            return None
        return "http://www.hirediversity.com/jobseekers/jobs/" + links[page - 1]["href"]

    self.datafunc = lambda doc: doc.find("div", {'class': "content"}).table.findAll("tr")[1:] if doc else None
    self.url = "http://www.hirediversity.com/jobseekers/jobs/list.asp?quicksearch=yes&ambiguouslocation=City%2C+State&zipcode=ZipCode&industryids=&keywords=%s&Search.x=57&Search.y=10"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for chronicle.com job search."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    self.fields["company_name"].func = lambda doc: doc.findAll("p")[0].string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.h4.a.string
    self.fields["company_joburl"].func = lambda doc: "http://chronicle.com" + doc.h4.a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://chronicle.com" + doc.h4.a["href"]
    self.fields["state"].func = lambda doc: doc.div.find("dl", {'class': None}).dd.string
    self.fields["state"].patterns = [r"(.*)"]
    # Deliberately passes the raw match through (not common.shorten).
    self.fields["state"].process = lambda groups: groups[0]
    self.fields["state"].mandatory = True
    self.fields["source"].func = lambda doc: "chronicle.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("p")[1].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        pager = doc.find("div", {'class': "pagination"})
        items = pager.ul.findAll("li")
        # Advance only if another page item exists AND the last pager item
        # still carries a link (i.e. we are not on the final page).
        if len(items) >= page + 1 and pager.findChildren('li')[-1].findChild('a'):
            return "http://chronicle.com/jobSearch" + items[page].a["href"]
        return None

    self.datafunc = lambda doc: doc.findAll("div", {'class': "result"})
    self.url = "http://chronicle.com/jobSearch?contextId=434&facetClear=1&searchQueryString=%s&position=&location=&locationmulti[]=ODg6OjpVbml0ZWQgU3RhdGVz"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for nursing-rehab-jobs.com search."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    title_anchor = lambda doc: doc.find("span", {'class': "jobTitle"}).a
    location = lambda doc: doc.find("span", {'class': "jobLocation"}).string
    first_group = lambda groups: groups[0].strip()

    self.fields["title"].func = lambda doc: title_anchor(doc).string
    self.fields["company_joburl"].func = lambda doc: "http://www.nursing-rehab-jobs.com" + title_anchor(doc)["href"]
    self.fields["source_joburl"].func = lambda doc: "http://www.nursing-rehab-jobs.com" + title_anchor(doc)["href"]
    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = location
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = first_group
    self.fields["source"].func = lambda doc: "nursing-rehab-jobs.com"
    self.fields["posting_date"].func = lambda doc: doc.find("span", {'class': "jobDate"}).string
    self.fields["posting_date"].patterns = [r"(\w\w\w) (\d\d?), (\d\d\d\d)"]
    self.fields["posting_date"].process = common.mmm_dd_yyyy
    self.filterfields["zipcode"].func = location
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = first_group
    self.fields.update(kwargs)

    self.cookie = False

    def nextpage(doc, page):
        links = doc.find("span", {'class': "pagination-links"}).findAll("a")
        if len(links) < page + 2:
            return None
        # Keep only the query keyword and startrow parameters of the
        # pager link, dropping everything between them.
        m = re.search(r"^(.*?&q=[^&]+).*?(&startrow=.*)",
                      "http://www.nursing-rehab-jobs.com" + links[page]["href"])
        return "".join(m.groups())

    self.datafunc = lambda doc: doc.findAll("tr", {'class': re.compile(r"dbOutputRow")})
    self.url = "http://www.nursing-rehab-jobs.com/search?q=%s"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for nationjob.com keyword search.

    Fix: `posting_date.func` was assigned `datetime.datetime.now()` — a
    datetime *value* computed once at construction time — while every
    other field's `.func` in this file is a callable taking `doc`.  It is
    now a lambda so the framework can invoke it per row like the others
    (the site exposes no posting date, so the scrape time is used).
    """
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    cells = lambda doc: doc.findAll("td")

    self.fields["company_name"].func = lambda doc: cells(doc)[1].a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: cells(doc)[0].a.string
    self.fields["company_joburl"].func = lambda doc: "http://nationjob.com" + cells(doc)[0].a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://nationjob.com" + cells(doc)[0].a["href"]
    self.fields["city"].func = lambda doc: cells(doc)[2].a.string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: cells(doc)[2].a.string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$", r", ([^/]*)$"]
    self.fields["state"].process = common.shorten
    self.fields["source"].func = lambda doc: "nationjob.com"
    # No posting date on the results page: stamp with scrape time.
    self.fields["posting_date"].func = lambda doc: datetime.datetime.now()
    self.fields.update(kwargs)

    self.cookie = False

    def getall(doc):
        # Result rows alternate between the row1/row2 classes.
        rows = doc.findAll("tr", {'class': "row1"})
        rows.extend(doc.findAll("tr", {'class': "row2"}))
        return rows

    self.datafunc = getall
    self.url = "http://nationjob.com/jobsearch/?keywords=%s&searchnow=1&pos=NA&STATE=&ZIP=zipcode&radius=25&go=Search+Jobs"
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for physiciancrossroads.com candidate search."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    first_group = lambda groups: groups[0].strip()
    # "Location: ..." text node sits two siblings past the title anchor;
    # "Company: ..." three nodes further on.
    location_text = lambda doc: doc.a.nextSibling.next
    company_text = lambda doc: doc.a.nextSibling.next.next.next

    self.fields["company_name"].func = company_text
    self.fields["company_name"].patterns = [r"Company:\s*(.+)"]
    self.fields["company_name"].process = first_group
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["company_id"].patterns = self.fields["company_name"].patterns
    self.fields["company_id"].process = self.fields["company_name"].process
    self.fields["title"].func = lambda doc: doc.a.string
    self.fields["company_joburl"].func = lambda doc: doc.a["href"]
    self.fields["source_joburl"].func = lambda doc: doc.a["href"]
    self.fields["city"].func = location_text
    self.fields["city"].patterns = [r"Location:\s*([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = location_text
    self.fields["state"].patterns = [r"Location:\s*.*?, (\w\w)\W", r"Location:\s*.*?, (\w\w)$"]
    self.fields["state"].process = first_group
    self.fields["source"].func = lambda doc: "physiciancrossroads.com"
    # Relative dates ("N days ago" / "N hours ago") from the detail page.
    self.fields["posting_date"].func = lambda doc: doc.find("span", {'class': "date"}).string
    self.fields["posting_date"].patterns = [r"(\d+) days? ago", r"()\d+ hours? ago"]
    self.fields["posting_date"].process = common.daysago
    self.fields["posting_date"].depth = 2
    self.fields.update(kwargs)

    # Skip header/footer "copy" cells at both ends of the listing.
    self.datafunc = lambda doc: doc.findAll("td", {'class': "copy"})[4:-2]
    self.url = "http://physiciancrossroads.com/candidate/search.php?new_search=1&search_string=%s&city=&period-s=&limit=50&submit=Search"
def __init__(self, keyword, logger, **kwargs):
    """Parser configuration for dice.com job search."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    # Untyped cells of a result row: [title, company, location, date].
    plain_cells = lambda doc: doc.findAll("td", {'class': None})
    first_group = lambda groups: groups[0].strip()

    self.fields["company_name"].func = lambda doc: plain_cells(doc)[1].a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: plain_cells(doc)[0].a.string
    self.fields["company_joburl"].func = lambda doc: "http://seeker.dice.com" + plain_cells(doc)[0].a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://seeker.dice.com" + plain_cells(doc)[0].a["href"]
    self.fields["city"].func = lambda doc: plain_cells(doc)[2].string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = first_group
    self.fields["state"].func = lambda doc: plain_cells(doc)[2].string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = first_group
    self.fields["source"].func = lambda doc: "dice.com"
    self.fields["posting_date"].func = lambda doc: plain_cells(doc)[3].string
    self.fields["posting_date"].patterns = [r"(\w\w\w)-(\d\d)"]
    self.fields["posting_date"].process = common.mmm_dd
    self.fields.update(kwargs)

    self.dev_mode = True

    def nextpage(doc, page):
        pager = doc.find('div', {'class': 'pageProg'})
        if pager:
            last = pager.findAll('a')[-1]
            if last.string.startswith('Next'):
                return 'http://seeker.dice.com' + last['href']
            return None

    self.nextlink = nextpage
    # A genuine result row always contains an icon cell.
    self.datafunc = lambda doc: [row for row in doc.tbody.findAll("tr")
                                 if row.findAll('td', {'class': "icon"})]
    self.url = "http://seeker.dice.com/jobsearch/servlet/JobSearch?QUICK=1&NUM_PER_PAGE=500&TRAVEL=0&FRMT=0&LOCATION_OPTION=2&Ntx=mode+matchall&DAYSBACK=30&RADIUS=64.37376&op=300&Hf=0&N=0&ZC_COUNTRY=0&FREE_TEXT=%s&Ntk=JobSearchRanking&TAXTERM=0&Ns=p_PostedAge|0&SORTDIR=7&SORTSPEC=0"
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for amightyriver.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # title_1 paragraph holds anchors in order: title, company, city, state.
    self.fields["company_name"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[1].string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[0].string
    self.fields["company_joburl"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[0]["href"]
    self.fields["source_joburl"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[0]["href"]
    self.fields["city"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[2].string
    self.fields["state"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[3].string
    self.fields["source"].func = lambda doc: "amightyriver.com"
    self.fields["posting_date"].func = lambda doc: doc.find("div", {'id': "title_1"}).font.string
    self.fields["posting_date"].patterns = [r"(\d\d)-(\d\d)-(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    # Zip code scraped from the detail page (depth 2) near "Location:".
    self.filterfields["zipcode"].func = lambda doc: "".join(doc.find("div", text=re.compile("Location:")).parent.parent.findAll(text=True))
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = lambda t: t[0].strip()
    self.filterfields["zipcode"].depth = 2
    self.fields.update(kwargs)

    def nextpage(doc, page):
        links = doc.find("div", id="note_right").findAll("a")
        # Stop when we run out of page links or wrapped around to 'First'.
        if len(links) < page + 1 or links[-1].string == 'First':
            return None
        return links[page]["href"]

    self.datafunc = lambda doc: doc.findAll("div", id="l_item")
    self.url = "http://www.amightyriver.com/job/search-result?sh_keyword=%s"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """careerbuilder.com parser driven by its public job-search XML API."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.company.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.jobtitle.string
    # Detail URL is rebuilt from the job DID in the API response.
    self.fields["company_joburl"].func = lambda doc: 'http://www.careerbuilder.com/JobSeeker/Jobs/JobDetails.aspx?job_did=%s' % (doc.did.string)
    self.fields["source_joburl"].func = self.fields["company_joburl"].func
    # <location> looks like "STATE-City": state before the dash, city after.
    self.fields["city"].func = lambda doc: doc.location.string.split('-')[-1]
    self.fields["state"].func = lambda doc: doc.location.string.split('-')[0]
    self.fields["source"].func = lambda doc: "careerbuilder.com"
    self.fields["posting_date"].func = lambda doc: doc.posteddate.string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # The API is paged purely via the PageNumber query parameter;
        # the response document never needs to be inspected.
        url = 'http://api.careerbuilder.com/v1/jobsearch?DeveloperKey=WDAX88L6VM2F0BQG99WX&PostedWithin=1&Keywords=%s&PerPage=100&PageNumber=%d' % (self.query, page + 1)
        return url

    self.nextlink = nextpage
    self.datafunc = lambda doc: doc.findAll('jobsearchresult')
    self.cookie = False
    self.url = ['http://api.careerbuilder.com/v1/jobsearch?DeveloperKey=WDAX88L6VM2F0BQG99WX&PostedWithin=1&Keywords=%s&PerPage=100&PageNumber=1']
    # Throttle repeat queries against the API (seconds).
    self.query_sleeptime = 3600
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for careers.nwjobs.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # Direct-child cells of the row: [1]=title, [2]=location, [3]=date, [4]=company.
    self.fields["company_name"].func = lambda doc: doc.tr.findAll("td", recursive=False)[4].a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.tr.findAll("td", recursive=False)[1].a.string
    self.fields["company_joburl"].func = lambda doc: "http://careers.nwjobs.com" + doc.tr.findAll("td", recursive=False)[1].a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://careers.nwjobs.com" + doc.tr.findAll("td", recursive=False)[1].a["href"]
    self.fields["city"].func = lambda doc: doc.tr.findAll("td", recursive=False)[2].a.string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.tr.findAll("td", recursive=False)[2].a.string
    # Location may spell out the state name; common.shorten maps it to the code.
    self.fields["state"].patterns = [r", (.*?)\s+\d", r", ([^,/]*)$"]
    self.fields["state"].process = common.shorten
    self.fields["source"].func = lambda doc: "nwjobs.com"
    self.fields["posting_date"].func = lambda doc: doc.tr.findAll("td", recursive=False)[3].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.filterfields["zipcode"].func = lambda doc: doc.tr.findAll("td", recursive=False)[2].a.string
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = lambda t: t[0].strip()
    self.fields.update(kwargs)
    self.datafunc = lambda doc: doc.findAll("tbody", {'class': re.compile("displayTableRow")})
    self.url = "http://careers.nwjobs.com/careers/jobsearch/results?kAndEntire=%s;pageSize=50;sortBy=moddate"
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for jobs.pharmacyjobcenter.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # Company cell may mix tags and text; join all text nodes.
    self.fields["company_name"].func = lambda doc: "".join(doc.find("td", {'class': "resultsCompanyUrl resultsStandard"}).findAll(text=True))
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("a", id="results.job.title").string
    self.fields["company_joburl"].func = lambda doc: "http://jobs.pharmacyjobcenter.com" + doc.find("a", id="results.job.title")["href"]
    self.fields["source_joburl"].func = lambda doc: "http://jobs.pharmacyjobcenter.com" + doc.find("a", id="results.job.title")["href"]
    self.fields["city"].func = lambda doc: doc.find("a", id="results.job.location").string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find("a", id="results.job.location").string
    self.fields["state"].patterns = [r", (\w\w)\s+United States"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "pharmacyjobcenter.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td")[3].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    # Zip code scraped from the detail page (depth 2) near "Location :".
    self.filterfields["zipcode"].func = lambda doc: "".join(doc.find("span", text=re.compile("Location :")).parent.parent.findAll(text=True))
    self.filterfields["zipcode"].patterns = [r"\D(\d{5})\D", r"\D(\d{5})$"]
    self.filterfields["zipcode"].process = lambda t: t[0].strip()
    self.filterfields["zipcode"].depth = 2
    self.fields.update(kwargs)
    self.datafunc = lambda doc: doc.findAll("tbody", {'class': re.compile("displayTableRow")})
    self.url = "http://jobs.pharmacyjobcenter.com/careers/jobsearch/results?searchType=quick;kAndTitle=%s;country=United+States;sortBy=moddate;pageSize=50"
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for nationjob.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.findAll("td")[1].a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.findAll("td")[0].a.string
    self.fields["company_joburl"].func = lambda doc: "http://nationjob.com" + doc.findAll("td")[0].a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://nationjob.com" + doc.findAll("td")[0].a["href"]
    self.fields["city"].func = lambda doc: doc.findAll("td")[2].a.string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.findAll("td")[2].a.string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$", r", ([^/]*)$"]
    self.fields["state"].process = common.shorten
    self.fields["source"].func = lambda doc: "nationjob.com"
    # The site shows no posting date; fall back to scrape time. Wrapped in a
    # lambda: every field .func is a callable taking the row document (the
    # original assigned the datetime value itself, which was a bug).
    self.fields["posting_date"].func = lambda doc: datetime.datetime.now()
    self.fields.update(kwargs)
    self.cookie = False

    def getall(doc):
        # Result rows alternate between the row1 and row2 stripe classes.
        x = doc.findAll("tr", {'class': "row1"})
        x.extend(doc.findAll("tr", {'class': "row2"}))
        return x

    self.datafunc = getall
    self.url = "http://nationjob.com/jobsearch/?keywords=%s&searchnow=1&pos=NA&STATE=&ZIP=zipcode&radius=25&go=Search+Jobs"
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for hcareers.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: "".join(doc.findAll("td")[-2].findAll(text=True))
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.findAll("td")[3].a.string
    self.fields["company_joburl"].func = lambda doc: "http://www.hcareers.com" + doc.findAll("td")[3].a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://www.hcareers.com" + doc.findAll("td")[3].a["href"]
    # Location cell is "Region - City"; city is the part after the last dash.
    self.fields["city"].func = lambda doc: doc.findAll("td")[2].string
    self.fields["city"].patterns = [r"-([^-]*)$"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.findAll("td")[2].string
    self.fields["state"].patterns = [r"\W(\w\w)\W"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "hcareers.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td")[0].string
    self.fields["posting_date"].patterns = [r"(\w\w\w) (\d\d?), (\d\d\d\d)"]
    self.fields["posting_date"].process = common.mmm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        link = doc.find("div", {'class': "search-results-nav"}).a
        if link is None:
            return None
        return "http://www.hcareers.com" + link["href"]

    # Skip the header row of the results table.
    self.datafunc = lambda doc: doc.find("table", id="table1").findAll("tr")[1:]
    self.url = "http://www.hcareers.com/seeker/search/advanced?jobDetectiveId=&booleanKeyWordSearch=%s&industryCodes=&management=&managementCheckbox=on&nonmanagementCheckbox=on&form.commit=Search&h_v=XG_20071127_1"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for healthcarejobsite.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # Company div has a generated id starting with "companyname".
    self.fields["company_name"].func = lambda doc: doc.find("div", id=re.compile("^companyname")).span.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.h3.findAll("div")[0].a.string
    self.fields["company_joburl"].func = lambda doc: "http://www.healthcarejobsite.com" + doc.h3.findAll("div")[0].a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://www.healthcarejobsite.com" + doc.h3.findAll("div")[0].a["href"]
    self.fields["city"].func = lambda doc: doc.find("div", id=re.compile("^companyname")).i.string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find("div", id=re.compile("^companyname")).i.string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "healthcarejobsite.com"
    self.fields["posting_date"].func = lambda doc: doc.h3.findAll("div")[1].i.string
    self.fields["posting_date"].patterns = [r"(\w\w\w) (\d\d?)"]
    self.fields["posting_date"].process = common.mmm_dd
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # Advance past the current page number while a 'Next' link exists.
        x = doc.findAll("td", {'class': "paging"})[-1]
        links = doc.find("td", {'class': "paging"}).findAll("a")
        if links[-1].string == 'Next':
            return "http://www.healthcarejobsite.com" + x.find('span', {'class': 'currentPage'}).findNext()['href']
        return None

    self.datafunc = lambda doc: doc.findAll("td", {'class': re.compile("^job_title")})
    self.url = "http://www.healthcarejobsite.com/jobs/job-search.asp?fkeywords=%s&forderby=M"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for retirementjobs.retiredbrains.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.findAll("div", {'class': "jt_jobs_company"}).pop().a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("div", {'class': "jt_jobs_title"}).string
    # The title div is wrapped in the job anchor; href lives on the parent.
    self.fields["company_joburl"].func = lambda doc: "http://retirementjobs.retiredbrains.com" + doc.find("div", {'class': "jt_jobs_title"}).parent["href"]
    self.fields["source_joburl"].func = lambda doc: "http://retirementjobs.retiredbrains.com" + doc.find("div", {'class': "jt_jobs_title"}).parent["href"]
    self.fields["city"].func = lambda doc: doc.find("td", {'class': "jt_jobs_location"}).a.string
    self.fields["city"].patterns = [r"^([^,]{3,})"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find("td", {'class': "jt_jobs_location"}).a.string
    self.fields["state"].patterns = [r"\W+(\w\w)\W", r"\W+(\w\w)$", r"^(\w\w)\W", r"^(\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "retiredbrains.com"
    self.fields["posting_date"].func = lambda doc: doc.find("td", {'class': "jt_jobs_date"}).a.string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    # Zip code scraped from the detail page (depth 2) near "Location".
    self.filterfields["zipcode"].func = lambda doc: "".join(doc.find("th", text=re.compile("Location")).parent.parent.findAll(text=True))
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = lambda t: t[0].strip()
    self.filterfields["zipcode"].depth = 2
    self.fields.update(kwargs)
    self.cookie = False
    self.datafunc = lambda doc: doc.findAll("tr", id=re.compile("^jt_jobrow_\d+$"))
    self.url = "http://retirementjobs.retiredbrains.com/c/search_results.cfm?site_id=9182&vnet=0&max=50&keywords=%s&search=Search"
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for jobsearch.monster.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.find("span", {'class': "company"}).string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("a", {'class': "jobTitle"})["title"]
    self.fields["company_joburl"].func = lambda doc: doc.find("a", {'class': "jobTitle"})["href"]
    self.fields["source_joburl"].func = lambda doc: doc.find("a", {'class': "jobTitle"})["href"]
    self.fields["city"].func = lambda doc: "".join(doc.find("span", {'class': "jobplace"}).findAll(text=True))
    self.fields["city"].patterns = [r"^([^,]*),"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: "".join(doc.find("span", {'class': "jobplace"}).findAll(text=True))
    self.fields["state"].patterns = [r"\W(\w\w),", r"\W(\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "monster.com"
    self.fields["posting_date"].func = lambda doc: doc.find("span", {'class': "postingdate"}).string
    # "today" matches an empty group -> 0 days ago.
    self.fields["posting_date"].patterns = [r"()today", r"(\d+)"]
    self.fields["posting_date"].process = common.daysago
    self.fields.update(kwargs)
    self.dev_mode = True

    def nextpage(doc, page):
        # The last nav anchor only carries an href while a next page exists.
        nav = doc.find('div', {'class': 'navigationBar'})
        links = nav.findAll('a')
        if links[-1].has_key('href'):
            url = 'http://jobsearch.monster.com/PowerSearch.aspx?q=%s&rad=20&rad_units=miles&tm=60&dv=&pg=%d&pp=500&sort=dt.rv' % (self.query, page + 1)
            return url
        return None

    self.nextlink = nextpage
    self.datafunc = lambda doc: doc.findAll("div", {'class': "itemHeader"})
    self.url = "http://jobsearch.monster.com/PowerSearch.aspx?q=%s&rad=20&rad_units=miles&tm=60&dv=&pg=1&pp=500&sort=dt.rv"
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for texashealth-jobs.org.

    NOTE(review): no company fields are configured here — presumably the
    framework defaults apply for a single-employer site; verify upstream.
    """
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["title"].func = lambda doc: doc.find("span", {'class': "jobTitle"}).a.string
    self.fields["company_joburl"].func = lambda doc: "http://www.texashealth-jobs.org" + doc.find("span", {'class': "jobTitle"}).a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://www.texashealth-jobs.org" + doc.find("span", {'class': "jobTitle"}).a["href"]
    self.fields["city"].func = lambda doc: doc.find("span", {'class': "jobLocation"}).string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find("span", {'class': "jobLocation"}).string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "texashealth-jobs.org"
    self.fields["posting_date"].func = lambda doc: "".join(doc.find("p", id="job-date").findAll(text=True))
    self.fields["posting_date"].patterns = [r"(\w\w\w) (\d\d?), (\d\d\d\d)"]
    self.fields["posting_date"].process = common.mmm_dd_yyyy
    self.fields["posting_date"].depth = 2
    self.fields.update(kwargs)
    self.cookie = False

    def nextpage(doc, page):
        links = doc.find("span", {'class': "pagination-links"}).findAll("a")
        if len(links) < page + 2:
            return None
        # Keep only the query prefix up to &q=... plus the &startrow=... tail.
        m = re.search(r"^(.*?&q=[^&]+).*?(&startrow=.*)",
                      "http://www.texashealth-jobs.org" + links[page]["href"])
        return "".join(m.groups())

    self.datafunc = lambda doc: doc.findAll("tr", {'class': re.compile("dbOutputRow")}) if doc else None
    self.url = "http://www.texashealth-jobs.org/search?q=%s"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for retirementjobs.retiredbrains.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.findAll("div", {'class': "jt_jobs_company"}).pop().a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("div", {'class': "jt_jobs_title"}).string
    # The title div is wrapped in the job anchor; href lives on the parent.
    self.fields["company_joburl"].func = lambda doc: "http://retirementjobs.retiredbrains.com" + doc.find("div", {'class': "jt_jobs_title"}).parent["href"]
    self.fields["source_joburl"].func = lambda doc: "http://retirementjobs.retiredbrains.com" + doc.find("div", {'class': "jt_jobs_title"}).parent["href"]
    self.fields["city"].func = lambda doc: doc.find("td", {'class': "jt_jobs_location"}).a.string
    self.fields["city"].patterns = [r"^([^,]{3,})"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find("td", {'class': "jt_jobs_location"}).a.string
    self.fields["state"].patterns = [r"\W+(\w\w)\W", r"\W+(\w\w)$", r"^(\w\w)\W", r"^(\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "retiredbrains.com"
    self.fields["posting_date"].func = lambda doc: doc.find("td", {'class': "jt_jobs_date"}).a.string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    # Zip code scraped from the detail page (depth 2) near "Location".
    self.filterfields["zipcode"].func = lambda doc: "".join(doc.find("th", text=re.compile("Location")).parent.parent.findAll(text=True))
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = lambda t: t[0].strip()
    self.filterfields["zipcode"].depth = 2
    self.fields.update(kwargs)
    self.cookie = False
    self.datafunc = lambda doc: doc.findAll("tr", id=re.compile("^jt_jobrow_\d+$"))
    self.url = "http://retirementjobs.retiredbrains.com/c/search_results.cfm?site_id=9182&vnet=0&max=50&keywords=%s&search=Search"
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for the regionalhelpwanted.com network of domains."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["source"].func = lambda doc: 'regionalhelpwanted.com'
    self.fields["title"].func = lambda doc: doc.findAll('a')[0].string
    self.fields["company_joburl"].func = lambda doc: doc.findAll('a')[0]['href']
    self.fields["source_joburl"].func = self.fields["company_joburl"].func
    # Company/location/date come off the detail page, hence depth 2.
    self.fields["company_name"].func = lambda doc: get_company(doc)
    self.fields["company_name"].depth = 2
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["company_id"].depth = 2
    self.fields["city"].func = lambda doc: get_location(doc)[0]
    self.fields["city"].depth = 2
    self.fields["state"].func = lambda doc: get_location(doc)[1]
    self.fields["state"].depth = 2
    self.fields["posting_date"].func = lambda doc: get_posting_date(doc)
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yy
    self.fields["posting_date"].depth = 2
    self.fields.update(kwargs)

    def get_company(doc):
        doc = get_tab(doc)
        comp = doc.find('td', text=re.compile('Company Name:'))
        if comp:
            return comp.next.next.firstText().string
        return 'Unknown'

    def get_location(doc):
        # Only a city is published; the state slot is a fixed 'US'.
        doc = get_tab(doc)
        loc = doc.find('td', text=re.compile('Location:'))
        if loc:
            return [loc.next.next.string, 'US']
        return [None, None]

    def get_posting_date(doc):
        # Fall back to today's date when the page shows no mm/dd/yy cell.
        doc = get_tab(doc)
        d = doc.find('td', text=re.compile(r"(\d\d?)/(\d\d?)/(\d\d)"))
        return d.strip() if d else datetime.now().strftime('%m/%d/%y')

    def get_tab(doc):
        # The second table on the detail page holds the job metadata.
        if doc:
            tab = doc.findAll('table')
            return tab[1] if tab else ''
        return ''

    self.dev_mode = True
    self.datafunc = lambda doc: doc.findAll("div", {'class': 'detailswhite'})
    # One search URL per regional domain.
    self.url = [e[0] for e in xdomains]
def __init__(self, keyword, logger, **kwargs):
    """Parser for job boards hosted on the adicio.com platform.

    self.url is the list of hosted domains (xdomains); per-domain URLs are
    derived from it via urlparse.
    """
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: get_company(doc)
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find('div', {'class': 'jobTitle'}).a.string
    self.fields["company_joburl"].func = lambda doc: get_joburl(doc)
    self.fields["source_joburl"].func = self.fields["company_joburl"].func
    # The location anchor's href query string carries city/state parameters.
    self.fields["city"].func = lambda doc: get_location(doc)['city'].strip()
    self.fields["state"].func = lambda doc: get_location(doc)['state'].strip()
    self.fields["source"].func = lambda doc: "adicio.com"
    self.fields["posting_date"].func = lambda doc: get_date(doc)
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)
    self.fields["all_text"].func = lambda doc: all_text(doc)

    def all_text(doc):
        # Full job description, cleaned by the shared get_desc helper.
        doc = str(doc.find('div', {'class': 'remoteJobDescriptionContainer'}))
        return get_desc(doc, self.url)

    def get_date(doc):
        # Fall back to today's date when the date cell is missing/odd.
        try:
            d = doc.findAll('td', {'class': 'resultsStandard'})[-2].string.strip()
        except Exception:
            x = datetime.now()
            d = "%s/%s/%s" % (x.month, x.day, x.year)
        return d

    def get_company(doc):
        # Company may be plain text or wrapped in a child tag (e.g. a link).
        cell = doc.find('td', {'class': re.compile('^resultsCompanyUrl.*')})
        child = cell.findChild()
        return child.string if child else cell.string

    def get_location(doc):
        data = urlparse(doc.find('a', id='results.job.location')['href'])
        location_dict = dict(e.split('=') for e in data.query.split('&'))
        return location_dict

    def get_joburl(doc):
        job_url = 'http://%s%s' % (urlparse(self.url).netloc,
                                   doc.find('div', {'class': 'jobTitle'}).a['href'])
        return job_url

    def nextpage(doc, page):
        links = doc.find('ul', {'class': 'paginationLineup'})
        if links is None or links.findAll('a') is None:
            return None
        # Rewrite the first pagination href's trailing offset for page+1.
        return 'http://%s/careers/jobsearch/%s%d' % (urlparse(self.url).netloc,
                                                     links.findAll('a')[0]['href'][:-2],
                                                     (page + 1) * 10)

    self.nextlink = nextpage
    self.datafunc = lambda doc: doc.findAll("tbody", {'class': re.compile("^displayTableRow.*")})
    self.url = xdomains
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for amightyriver.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # title_1 paragraph holds anchors in order: title, company, city, state.
    self.fields["company_name"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[1].string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[0].string
    self.fields["company_joburl"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[0]["href"]
    self.fields["source_joburl"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[0]["href"]
    self.fields["city"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[2].string
    self.fields["state"].func = lambda doc: doc.find("p", {'class': "title_1"}).findAll("a")[3].string
    self.fields["source"].func = lambda doc: "amightyriver.com"
    self.fields["posting_date"].func = lambda doc: doc.find("div", {'id': "title_1"}).font.string
    self.fields["posting_date"].patterns = [r"(\d\d)-(\d\d)-(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    # Zip code scraped from the detail page (depth 2) near "Location:".
    self.filterfields["zipcode"].func = lambda doc: "".join(doc.find("div", text=re.compile("Location:")).parent.parent.findAll(text=True))
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = lambda t: t[0].strip()
    self.filterfields["zipcode"].depth = 2
    self.fields.update(kwargs)

    def nextpage(doc, page):
        links = doc.find("div", id="note_right").findAll("a")
        # Stop when we run out of page links or wrapped around to 'First'.
        if len(links) < page + 1 or links[-1].string == 'First':
            return None
        return links[page]["href"]

    self.datafunc = lambda doc: doc.findAll("div", id="l_item")
    self.url = "http://www.amightyriver.com/job/search-result?sh_keyword=%s"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for tiptopjob.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # The listing exposes no company; leave both company fields blank.
    self.fields["company_name"].func = lambda doc: ""
    self.fields["company_id"].func = lambda doc: ""
    # Title/link live two siblings back from the data row.
    self.fields["title"].func = lambda doc: doc.previousSibling.previousSibling.find("a", {'class': 'tl'}).findChild().string
    self.fields["company_joburl"].func = lambda doc: "http://www.tiptopjob.com" + doc.previousSibling.previousSibling.find("a", {'class': 'tl'})['href']
    self.fields["source_joburl"].func = self.fields["company_joburl"].func
    self.fields["city"].func = lambda doc: loc(doc).previous.split("(")[0]
    self.fields["state"].func = lambda doc: loc(doc).string
    # self.fields["state"].process = common.shorten
    self.fields["source"].func = lambda doc: "tiptopjob.com"
    self.fields["posting_date"].func = lambda doc: x(doc)
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    # This site lists dates day-first.
    self.fields["posting_date"].process = common.dd_mm_yyyy
    self.fields.update(kwargs)
    self.datafunc = lambda doc: doc.find("table", id='tbljobresults').findAll('tr', {'class': 'cr'})
    self.url = "http://www.tiptopjob.com/search/tiptopresults.asp?qs=1&srchtype=1&jobtype=3&keyword=%s&searchby=1&country=USA&orderby=4&sortdirection=1&newsearch=1&PageNo=1"

    def x(doc):
        # Posting date: last cell of the last row of the job-detail table.
        return doc.find('table', {'class': 'sjd'}).findAll('tr')[-1].findAll('td')[-1]

    def loc(doc):
        # Location anchor: second cell of the first row of the job-detail table.
        return doc.find('table', {'class': 'sjd'}).findAll('tr')[0].findAll('td')[1].a

    def nextpage(doc, page):
        # The next-arrow image is wrapped in an <a> only when a page follows.
        arrow = doc.find('img', src='http://img.tiptopjob.com/jobs_images/next_arrow.png')
        if arrow.parent.name == 'a':
            return "http://www.tiptopjob.com/search/tiptopresults.asp?qs=1&srchtype=1&jobtype=3&keyword=%s&searchby=1&country=USA&orderby=4&sortdirection=1&newsearch=1&PageNo=%d" % (self.query, page + 1,)
        return None

    self.dev_mode = True
    self.nextlink = nextpage
    self.clean_html = True
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for jobs.retailcareersnow.com."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.find("td", {'class': "resultsCompanyUrl resultsStandard"}).string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("div", {'class': "jobTitle"}).a.string
    self.fields["company_joburl"].func = lambda doc: "http://jobs.retailcareersnow.com" + doc.find("div", {'class': "jobTitle"}).a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://jobs.retailcareersnow.com" + doc.find("div", {'class': "jobTitle"}).a["href"]
    self.fields["city"].func = lambda doc: doc.find("a", {'id': "results.job.location"}).string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find("a", {'id': "results.job.location"}).string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "retailcareersnow.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td", {'class': "resultsStandard"})[2].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # The last pagination <li> holds the next link while one exists.
        links = doc.find("ul", {'class': "paginationLineup"})
        if links is None or links.findAll("li").pop().a is None:
            return None
        return "http://jobs.retailcareersnow.com/careers/jobsearch/" + links.findAll("li").pop().a["href"]

    self.datafunc = lambda doc: doc.findAll("tbody", {'class': re.compile("^displayTableRow")})
    self.url = "http://jobs.retailcareersnow.com/careers/jobsearch/results?searchType=quick;kAndEntire=%s;country=United+States"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Search-result parser for the regionalhelpwanted.com network of domains."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["source"].func = lambda doc: 'regionalhelpwanted.com'
    self.fields["title"].func = lambda doc: doc.findAll('a')[0].string
    self.fields["company_joburl"].func = lambda doc: doc.findAll('a')[0]['href']
    self.fields["source_joburl"].func = self.fields["company_joburl"].func
    # Company/location/date come off the detail page, hence depth 2.
    self.fields["company_name"].func = lambda doc: get_company(doc)
    self.fields["company_name"].depth = 2
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["company_id"].depth = 2
    self.fields["city"].func = lambda doc: get_location(doc)[0]
    self.fields["city"].depth = 2
    self.fields["state"].func = lambda doc: get_location(doc)[1]
    self.fields["state"].depth = 2
    self.fields["posting_date"].func = lambda doc: get_posting_date(doc)
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yy
    self.fields["posting_date"].depth = 2
    self.fields.update(kwargs)

    def get_company(doc):
        doc = get_tab(doc)
        comp = doc.find('td', text=re.compile('Company Name:'))
        if comp:
            return comp.next.next.firstText().string
        return 'Unknown'

    def get_location(doc):
        # Only a city is published; the state slot is a fixed 'US'.
        doc = get_tab(doc)
        loc = doc.find('td', text=re.compile('Location:'))
        if loc:
            return [loc.next.next.string, 'US']
        return [None, None]

    def get_posting_date(doc):
        # Fall back to today's date when the page shows no mm/dd/yy cell.
        doc = get_tab(doc)
        d = doc.find('td', text=re.compile(r"(\d\d?)/(\d\d?)/(\d\d)"))
        return d.strip() if d else datetime.now().strftime('%m/%d/%y')

    def get_tab(doc):
        # The second table on the detail page holds the job metadata.
        if doc:
            tab = doc.findAll('table')
            return tab[1] if tab else ''
        return ''

    self.dev_mode = True
    self.datafunc = lambda doc: doc.findAll("div", {'class': 'detailswhite'})
    # One search URL per regional domain.
    self.url = [e[0] for e in xdomains]
def __init__(self, keyword, logger, **kwargs):
    """Course-catalog parser for the UT Austin undergraduate catalog page."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # Course code = department prefix (from enclosing <h3>) + number suffix.
    self.fields["code"].func = lambda x: (x["pre"] or "") + " " + (x["post"] or "")
    self.fields["code"].depth = 2
    self.fields["title"].func = lambda doc: doc.p.string
    self.fields["title"].patterns = [r"\. (.*)"]
    self.fields["title"].process = lambda t: t[0].strip()
    self.fields["text"].func = lambda doc: "".join(doc.findAll("p")[1].findAll(text=True))
    self.filterfields = {
        "pre": CourseField.CourseField(False),
        "post": CourseField.CourseField(False)
    }

    def prefunc(doc):
        # Walk backwards through siblings to the nearest <h3> department
        # heading; text nodes have no .name, hence the AttributeError guard.
        d = doc
        while True:
            try:
                if d.name == 'h3':
                    break
            except AttributeError:
                pass
            d = d.previousSibling
        return d.string

    self.filterfields["pre"].func = prefunc
    self.filterfields["pre"].patterns = [r":(.*)"]
    self.filterfields["pre"].process = lambda t: t[0].strip()
    self.filterfields["post"].func = lambda doc: doc.p.string
    self.filterfields["post"].patterns = [r"(.*?)\."]
    self.filterfields["post"].process = lambda t: t[0].strip()
    self.fields.update(kwargs)

    def blow(data):
        # Expand entries like "301, 302. Title ..." into one soup per number,
        # preserving each clone's previousSibling so prefunc still works.
        lst = []
        for entry in data:
            en = str(entry)
            en = en.replace(str(entry.strong), "")
            s = entry.strong.string
            x = s.find(".")
            for num in s[:x].split(","):
                new = num.strip() + s[x:]
                soup = self.getsoup(new + en)
                soup.previousSibling = entry.previousSibling
                lst.append(soup)
        return lst

    self.datafunc = lambda doc: blow(doc.findAll("p", {'class': "desc"}))
    self.url = "http://registrar.utexas.edu/catalogs/ug08-10/ch02/ug08.cr02.html#courses"
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for hotjobs.yahoo.com search results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # Result row cells: class "c" = company, "t" = title, "d" = date, "l" = location.
    self.fields["company_name"].func = lambda doc: doc.find("td", {'class': "c"}).a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("td", {'class': "t"}).a.string
    # split(';') drops the session-tracking suffix from the href.
    self.fields["company_joburl"].func = lambda doc: "http://hotjobs.yahoo.com" + doc.find("td", {'class': "t"}).a["href"].split(';')[0]
    self.fields["source_joburl"].func = lambda doc: "http://hotjobs.yahoo.com" + doc.find("td", {'class': "t"}).a["href"].split(';')[0]
    self.fields["source"].func = lambda doc: "hotjobs.yahoo.com"
    self.fields["posting_date"].func = lambda doc: doc.find("td", {'class': "d"}).string
    self.fields["posting_date"].patterns = [r"(\w\w\w) (\d\d)"]
    self.fields["posting_date"].process = common.mmm_dd
    self.fields.update(kwargs)
    self.datafunc = lambda doc: doc.findAll("tr", {'class': " top"})
    self.url = "http://hotjobs.yahoo.com/job-search?jobtype=PERM&jobtype=CONT&commitment=FT&commitment=PT&locations=&country=&industry=&kw=%s&sort[type]=date"

    def getexp(doc):
        # Returns the "Experience" row's value; currently unused (see the
        # commented-out filterfields below).
        for tr in doc.findAll("tr"):
            if tr.th.string == "Experience":
                return tr.td.string or ""

    def getloc(doc):
        # Location cell, with a fallback selector for the alternate layout.
        try:
            return doc.find("td", {'class': "l"}).string
        except:
            return doc.find("span", {'class': "first-of-type"}).string

    def nextpage(doc, page):
        # Follow the anchor right after the current-page marker while the
        # final pager anchor still has a child (a "next" image, presumably).
        linkSet = doc.find("div", {'class': "pageCtrl"})
        links = linkSet.findAll("a")
        if links[-1].findChild():
            return 'http://hotjobs.yahoo.com' + linkSet.find('a', {'class': 'current'}).findNext()['href']
        return None

    self.fields["city"].func = getloc
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = getloc
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    # self.filterfields["experience"].func = getexp
    # self.filterfields["experience"].patterns = [r"(\d+)-(\d+)"]
    # self.filterfields["experience"].process = common.expminmax
    # self.filterfields["experience"].depth = 2
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for hirefinders.com browse-jobs results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # depth = 2 fields come from the job-detail page layout ("block" divs).
    self.fields["company_name"].func = lambda doc: doc.find(
        "div", {'class': "block"}).findAll("div")[2].p.a.string
    self.fields["company_name"].depth = 2
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["company_id"].depth = self.fields["company_name"].depth
    self.fields["title"].func = lambda doc: doc.find(
        "div", {'class': "block"}).findAll("div")[1].p.string
    self.fields["title"].depth = 2
    self.fields["company_joburl"].func = lambda doc: doc.findAll("td")[2].a["href"]
    self.fields["source_joburl"].func = lambda doc: doc.findAll("td")[2].a["href"]
    self.fields["city"].func = lambda doc: doc.findAll("td")[3].string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.findAll("td")[3].string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "hirefinders.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td")[0].string
    self.fields["posting_date"].patterns = [r"(\d\d?)-(\d\d?)-(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    # self.filterfields["zipcode"].func = lambda doc: "".join(doc.find("span", text=re.compile("Job Locations:")).parent.parent.findAll(text=True))
    # self.filterfields["zipcode"].patterns = [r"\D(\d{5})\D", r"\D(\d{5})$"]
    # self.filterfields["zipcode"].process = lambda t: t[0].strip()
    # self.filterfields["zipcode"].depth = 2
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # Last <span> of the pager holds a page number -- presumably the
        # total page count; advance while it exceeds the current page.
        curl = doc.find("ul", {'class': "pagenavigator"}).findAll('span')[-1]
        links = doc.find("ul", {'class': "pagenavigator"}).findAll("li")
        if int(curl.string) >= page + 1:
            return links[page].a["href"]
        return None

    self.datafunc = lambda doc: doc.findAll("tr", {'class': re.compile("tr")})
    self.url = "http://www.hirefinders.com/browsejobs?ddlSearchType=browsejobs&tbSearchKeywords=%s&tbSearchLocation=&ddlDistance=20&tbSearchExcludeKeywords=&ddlJobType=&ddlSearchDateAdded=&ddlSearchDateJoined=&ddlSearchIndustry=&ddlSearchIn=&amsSearchJobFunctions=&jfValues=&hdSearchMode=&hdAdvSearch=&lng_val=&lat_val="
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for internships.com search results."""
    JobsiteParser.__init__(self, logger)
    self.fields["company_name"].func = lambda doc: doc.find(
        'span', {'class': 'company-name'}).next.split('—')[0]
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find(
        "h3", {'class': "title"}).a.string
    self.fields["company_joburl"].func = lambda doc: "http://www.internships.com" + doc.find(
        "h3", {'class': "title"}).a['href']
    self.fields["source_joburl"].func = lambda doc: "http://www.internships.com" + doc.find(
        "h3", {'class': "title"}).a['href']
    self.fields["city"].func = lambda doc: get_loc(doc)[0]
    self.fields["state"].func = lambda doc: get_loc(doc)[1]
    self.fields["source"].func = lambda doc: "internships.com"
    # No date on the listing: stamp entries with the crawl time instead.
    self.fields["posting_date"].func = lambda doc: datetime.now()
    self.fields.update(kwargs)
    # NOTE(review): the keyword parameter is ignored here; the search term
    # is hard-coded to 'the' (matches nearly every posting) -- confirm intent.
    self.keyword = [
        'the',
    ]

    def get_loc(doc):
        # Splits "City, ST" into [city, state]; state is '' when absent.
        x = doc.find('span', {'class': 'internship-location'}).string.strip().split(',')
        if len(x) > 1:
            return [x[0], x[1]]
        else:
            return [x[0], '']

    def nextpage(doc, page):
        # Always returns a URL: pagination never terminates via this hook.
        x = 'http://www.internships.com/search/post/results?keywords=%s&start=%d&limit=100' % (
            self.query, page * 10)
        return x

    self.datafunc = lambda doc: doc.findAll(
        "div", {'class': 'search-result-item item-wrap '})
    self.url = 'http://www.internships.com/search/post/results?keywords=%s&limit=100'
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for monster.com power-search results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.find(
        "span", {'class': "company"}).string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find(
        "a", {'class': "jobTitle"})["title"]
    self.fields["company_joburl"].func = lambda doc: doc.find(
        "a", {'class': "jobTitle"})["href"]
    self.fields["source_joburl"].func = lambda doc: doc.find(
        "a", {'class': "jobTitle"})["href"]
    self.fields["city"].func = lambda doc: "".join(
        doc.find("span", {'class': "jobplace"}).findAll(text=True))
    self.fields["city"].patterns = [r"^([^,]*),"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: "".join(
        doc.find("span", {'class': "jobplace"}).findAll(text=True))
    self.fields["state"].patterns = [r"\W(\w\w),", r"\W(\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "monster.com"
    self.fields["posting_date"].func = lambda doc: doc.find(
        "span", {'class': "postingdate"}).string
    # "today" captures an empty group; otherwise capture the days-ago count.
    self.fields["posting_date"].patterns = [r"()today", r"(\d+)"]
    self.fields["posting_date"].process = common.daysago
    self.fields.update(kwargs)
    self.dev_mode = True

    def nextpage(doc, page):
        # A final nav anchor that still has an href means more pages exist.
        nav = doc.find('div', {'class': 'navigationBar'})
        links = nav.findAll('a')
        if links[-1].has_key('href'):
            url = 'http://jobsearch.monster.com/PowerSearch.aspx?q=%s&rad=20&rad_units=miles&tm=60&dv=&pg=%d&pp=500&sort=dt.rv' % (
                self.query, page + 1)
            return url
        return None

    self.nextlink = nextpage
    self.datafunc = lambda doc: doc.findAll("div", {'class': "itemHeader"})
    self.url = "http://jobsearch.monster.com/PowerSearch.aspx?q=%s&rad=20&rad_units=miles&tm=60&dv=&pg=1&pp=500&sort=dt.rv"
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for careerrookie.com job results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: "".join(
        doc.findAll("td")[1].findAll(text=True))
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("a", {'class': "jt"}).string
    self.fields["company_joburl"].func = lambda doc: doc.find(
        "a", {'class': "jt"})["href"]
    self.fields["source_joburl"].func = lambda doc: doc.find(
        "a", {'class': "jt"})["href"]
    # The patterns assume a dash-delimited location ("..-ST-..City" style):
    # state between dashes, city after the last dash.
    self.fields["city"].func = lambda doc: doc.find(
        "td", id=re.compile(r"Location")).string
    self.fields["city"].patterns = [r"-([^-]*)$"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find(
        "td", id=re.compile(r"Location")).string
    self.fields["state"].patterns = [r"-(\w\w)-"]
    self.fields["state"].process = lambda t: t[0].strip()
    # Coordinates come from the map link's query string on the detail page.
    self.fields["latitude"].func = lambda doc: doc.find(
        "a", id=re.compile("MapJob"))["href"]
    self.fields["latitude"].patterns = [r"lat=(.*?),"]
    self.fields["latitude"].process = lambda t: float(t[0].strip())
    self.fields["latitude"].depth = 2
    self.fields["longitude"].func = lambda doc: doc.find(
        "a", id=re.compile("MapJob"))["href"]
    self.fields["longitude"].patterns = [r"lon=(.*?),"]
    self.fields["longitude"].process = lambda t: float(t[0].strip())
    self.fields["longitude"].depth = 2
    self.fields["source"].func = lambda doc: "careerrookie.com"
    self.fields["posting_date"].func = lambda doc: doc.find(
        "span", id=re.compile(r"Posted")).string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # Next-page anchor lives in the bottom navigation cell.
        link = doc.find("td", {'class': "nav_btm_cell"}).span.a
        if link is None:
            return None
        return link["href"]

    self.datafunc = lambda doc: doc.findAll(
        "tr", {'class': re.compile(r"^jl_\w+_row$")})
    self.url = "http://www.careerrookie.com/CC/jobseeker/jobs/jobresults.aspx?mxjobsrchcriteria_rawwords=%s&s_freeloc=&_SearchJobBySkills%3As_jobtypes=ALL&s_emptype=JTFT&s_emptype=JTPT&s_emptype=JTIN&s_emptype=JTSE&s_emptype=JTIO&_SearchJobBySkills%3AImage1.x=64&_SearchJobBySkills%3AImage1.y=26&subtbtn=true"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for ihispano.com quick-search results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.find(
        'td', {'class': 'results_company'}).a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find(
        'td', {'class': 'results_title'}).a.string
    self.fields["company_joburl"].func = lambda doc: doc.a["href"]
    self.fields["source_joburl"].func = lambda doc: doc.a["href"]
    # City and state are separate anchors inside the location cell.
    self.fields["city"].func = lambda doc: doc.find(
        'td', {'class': 'results_location'}).findAll('a')[0].string
    self.fields["city"].process = lambda t: t.strip()
    self.fields["state"].func = lambda doc: doc.find(
        'td', {'class': 'results_location'}).findAll('a')[1].string
    self.fields["state"].process = lambda t: t.strip()
    self.fields["source"].func = lambda doc: "ihispano.com"
    self.fields["posting_date"].func = lambda doc: doc.find(
        'td', {'class': 'results_create'}).string.strip()
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    # self.fields["posting_date"].depth = 2
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # <strong> marks the current page; stop once it is the last child.
        links = doc.find("span", {'class': "pager-list"})
        if links.findChildren()[-1].name != 'strong':
            url = 'http://www.ihispano.com' + links.find(
                'strong').findNext('a')['href']
            return url
        return None

    self.datafunc = lambda doc: doc.findAll('tr', {'class': 'top-result-row'})
    self.dev_mode = True
    self.url = "http://www.ihispano.com/careers/searchjob/results?key_words=%s&country=USA&state=&city=&searchtype=qck&Save=save&zip_code=&jobs_within_miles=10&category=&op=Search&form_id=candidate_searchjob_quick"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for computerjobs.com search results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.find(
        "span", id=re.compile("spanCompanyName")).string
    self.fields["company_name"].depth = 2
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["company_id"].depth = 2
    self.fields["title"].func = lambda doc: doc.find(
        "a", id=re.compile("lnkTitle")).string
    self.fields["company_joburl"].func = lambda doc: "http://computerjobs.com" + doc.find(
        "a", id=re.compile("lnkTitle"))["href"]
    self.fields["source_joburl"].func = lambda doc: "http://computerjobs.com" + doc.find(
        "a", id=re.compile("lnkTitle"))["href"]
    # Strip the per-search "&searchid=..." tracking suffix from job URLs.
    self.fields["company_joburl"].patterns = [r"^(.*?)&searchid"]
    self.fields["source_joburl"].patterns = [r"^(.*?)&searchid"]
    self.fields["company_joburl"].process = lambda t: t[0].strip()
    self.fields["source_joburl"].process = lambda t: t[0].strip()
    self.fields["city"].func = lambda doc: doc.find(
        "span", id=re.compile("spanLocation")).string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find(
        "span", id=re.compile("spanLocation")).string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "computerjobs.com"
    self.fields["posting_date"].func = lambda doc: doc.find(
        "span", id=re.compile("spanPosted")).string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields["posting_date"].depth = 2
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # Absent "next page" link ends pagination.
        link = doc.find("a", id=re.compile(r"hlNextPage"))
        if link is None:
            return None
        return "http://computerjobs.com" + link["href"]

    self.datafunc = lambda doc: doc.findAll("table", id=re.compile(r"jobResults"))
    self.url = "http://computerjobs.com/jresults.aspx?s_kw=%s&s_sl=&s_excNonIT=off&s_excNatl=off"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Field extraction rules for postjobfree.com search-result rows."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    def span_text(cls):
        # Build an extractor for the text of <span class=cls> in a row.
        return lambda row: row.find("span", {'class': cls}).string

    def job_link(row):
        # Result links are relative; prefix the site root.
        return "http://postjobfree.com/" + row.h3.a["href"]

    def first_group(t):
        # Standard post-processor: first regex capture, whitespace-trimmed.
        return t[0].strip()

    flds = self.fields
    flds["company_name"].func = span_text("Company")
    flds["company_id"].func = flds["company_name"].func
    flds["title"].func = lambda row: row.h3.a.string
    flds["company_joburl"].func = job_link
    flds["source_joburl"].func = job_link
    flds["city"].func = span_text("Location")
    flds["city"].patterns = [r"^([^,]*)"]
    flds["city"].process = first_group
    flds["state"].func = span_text("Location")
    flds["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    flds["state"].process = first_group
    flds["source"].func = lambda row: "postjobfree.com"
    flds["posting_date"].func = span_text("PostedDate")
    flds["posting_date"].patterns = [r"(\w\w\w) (\d\d?)"]
    flds["posting_date"].process = common.mmm_dd
    # ZIP is scraped from the detail page (depth 2): the whole text of the
    # row containing the "ZIP:" label.
    zipfield = self.filterfields["zipcode"]
    zipfield.func = lambda row: "".join(
        row.find("td", text=re.compile("ZIP:")).parent.parent.findAll(
            text=True))
    zipfield.patterns = [r"(\d{5})"]
    zipfield.process = first_group
    zipfield.depth = 2
    flds.update(kwargs)
    self.datafunc = lambda doc: doc.findAll("div", {'class': "JobRow"})
    self.url = "http://postjobfree.com/JobList.aspx?q=%s&n=&t=&c=&jt=&l=&radius=25&r=50&lat=&lng=&lct=&lc=&ls=&lz=&accuracy=&address="
def __init__(self, keyword, logger, **kwargs):
    """Parse UT Austin undergraduate-catalog course listings.

    NOTE(review): this parser appears to be duplicated elsewhere in this
    module -- consider consolidating.
    """
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # Course code = "pre" filter value + space + "post" filter value.
    self.fields["code"].func = lambda x: (x["pre"] or "") + " " + (x["post"] or "")
    self.fields["code"].depth = 2
    self.fields["title"].func = lambda doc: doc.p.string
    self.fields["title"].patterns = [r"\. (.*)"]
    self.fields["title"].process = lambda t: t[0].strip()
    self.fields["text"].func = lambda doc: "".join(doc.findAll("p")[1].findAll(text=True))
    self.filterfields = {"pre": CourseField.CourseField(False),
                         "post": CourseField.CourseField(False)}

    def prefunc(doc):
        # Walk backwards through siblings to the nearest <h3> header.
        # NOTE(review): loops forever if no <h3> precedes the entry --
        # assumed guaranteed by the catalog page structure; confirm.
        d = doc
        while True:
            try:
                if d.name == 'h3':
                    break
            except:
                # Nodes without a .name (text nodes) are skipped.
                pass
            d = d.previousSibling
        return d.string

    self.filterfields["pre"].func = prefunc
    self.filterfields["pre"].patterns = [r":(.*)"]
    self.filterfields["pre"].process = lambda t: t[0].strip()
    self.filterfields["post"].func = lambda doc: doc.p.string
    self.filterfields["post"].patterns = [r"(.*?)\."]
    self.filterfields["post"].process = lambda t: t[0].strip()
    self.fields.update(kwargs)

    def blow(data):
        # One <p class="desc"> may cover several course numbers
        # ("307, 308. ..."); expand into one soup node per number.
        lst = []
        for entry in data:
            en = str(entry)
            en = en.replace(str(entry.strong), "")
            s = entry.strong.string
            x = s.find(".")
            for num in s[:x].split(","):
                new = num.strip() + s[x:]
                soup = self.getsoup(new+en)
                # Preserve the original's sibling link so prefunc still works.
                soup.previousSibling = entry.previousSibling
                lst.append(soup)
        return lst

    self.datafunc = lambda doc: blow(doc.findAll("p", {'class': "desc"}))
    self.url = "http://registrar.utexas.edu/catalogs/ug08-10/ch02/ug08.cr02.html#courses"
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for hirelifescience.com job results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: "".join(
        doc.findAll("td")[0].findAll(text=True))
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.findAll("td")[4].b.a.string
    self.fields["company_joburl"].func = lambda doc: "http://hirelifescience.com/" + doc.findAll(
        "td")[4].b.a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://hirelifescience.com/" + doc.findAll(
        "td")[4].b.a["href"]
    self.fields["city"].func = lambda doc: doc.findAll("td")[1].string
    self.fields["state"].func = lambda doc: doc.findAll("td")[2].string
    self.fields["source"].func = lambda doc: "hirelifescience.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td")[3].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def getall(doc):
        # Each job spans three <tr>s: merge the second into the first, then
        # keep every third row so one node carries a whole listing.
        trs = doc.find("div", {
            'class': "indent"
        }).div.findAll("table")[1].findAll("tr")[2:-1]
        for i in range(0, len(trs) - 1, 3):
            trs[i].append(trs[i + 1])
        return trs[::3]

    self.datafunc = getall

    def nextpage(doc, page):
        # Pager links live in the first row of the results table; 1-based.
        trs = doc.find("div", {
            'class': "indent"
        }).div.findAll("table")[1].findAll("tr")[0]
        links = trs.findAll("a")
        if len(links) < page:
            return None
        return "http://hirelifescience.com/" + links[page - 1]["href"]

    self.nextlink = nextpage
    self.url = "http://hirelifescience.com/seeker_jobs.asp?search=yes&page=&keyword=%s&pagesize=500&updown=&orderby=date"
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for healthcarejobsite.com search results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.find(
        "div", id=re.compile("^companyname")).span.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.h3.findAll("div")[0].a.string
    self.fields["company_joburl"].func = lambda doc: "http://www.healthcarejobsite.com" + doc.h3.findAll(
        "div")[0].a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://www.healthcarejobsite.com" + doc.h3.findAll(
        "div")[0].a["href"]
    # Location is the italic text inside the company-name div.
    self.fields["city"].func = lambda doc: doc.find(
        "div", id=re.compile("^companyname")).i.string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find(
        "div", id=re.compile("^companyname")).i.string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "healthcarejobsite.com"
    self.fields["posting_date"].func = lambda doc: doc.h3.findAll("div")[1].i.string
    self.fields["posting_date"].patterns = [r"(\w\w\w) (\d\d?)"]
    self.fields["posting_date"].process = common.mmm_dd
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # While a "Next" anchor remains, follow the link just after the
        # current-page marker in the last paging cell.
        x = doc.findAll("td", {'class': "paging"})[-1]
        links = doc.find("td", {'class': "paging"}).findAll("a")
        if links[-1].string == 'Next':
            return "http://www.healthcarejobsite.com" + x.find(
                'span', {'class': 'currentPage'}).findNext()['href']
        return None

    self.datafunc = lambda doc: doc.findAll(
        "td", {'class': re.compile("^job_title")})
    self.url = "http://www.healthcarejobsite.com/jobs/job-search.asp?fkeywords=%s&forderby=M"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for dice.com job-search results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # Fields are read from a row's unclassed <td>s by position
    # (0=title, 1=company, 2=location, 3=date -- per the extractors below).
    self.fields["company_name"].func = lambda doc: doc.findAll(
        "td", {'class': None})[1].a.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.findAll(
        "td", {'class': None})[0].a.string
    self.fields["company_joburl"].func = lambda doc: "http://seeker.dice.com" + doc.findAll(
        "td", {'class': None})[0].a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://seeker.dice.com" + doc.findAll(
        "td", {'class': None})[0].a["href"]
    self.fields["city"].func = lambda doc: doc.findAll(
        "td", {'class': None})[2].string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.findAll(
        "td", {'class': None})[2].string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "dice.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll(
        "td", {'class': None})[3].string
    self.fields["posting_date"].patterns = [r"(\w\w\w)-(\d\d)"]
    self.fields["posting_date"].process = common.mmm_dd
    self.fields.update(kwargs)
    self.dev_mode = True

    def nextpage(doc, page):
        # Implicitly returns None when the progress bar div is missing.
        links = doc.find('div', {'class': 'pageProg'})
        if links:
            last_link = links.findAll('a')[-1]
            return 'http://seeker.dice.com' + last_link[
                'href'] if last_link.string.startswith('Next') else None

    self.nextlink = nextpage
    # Keep only rows that contain an icon cell (actual job rows).
    self.datafunc = lambda doc: [
        elem for elem in doc.tbody.findAll("tr")
        if elem('td', {'class': "icon"})
    ]
    self.url = "http://seeker.dice.com/jobsearch/servlet/JobSearch?QUICK=1&NUM_PER_PAGE=500&TRAVEL=0&FRMT=0&LOCATION_OPTION=2&Ntx=mode+matchall&DAYSBACK=30&RADIUS=64.37376&op=300&Hf=0&N=0&ZC_COUNTRY=0&FREE_TEXT=%s&Ntk=JobSearchRanking&TAXTERM=0&Ns=p_PostedAge|0&SORTDIR=7&SORTSPEC=0"
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for hirediversity.com job listings."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.findAll("td")[3].p.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: "".join(
        doc.find("a", {'class': "colour"}).parent.findAll(text=True))
    self.fields["company_joburl"].func = lambda doc: "http://www.hirediversity.com/jobseekers/jobs/" + doc.find(
        "a", {'class': "colour"})["href"]
    self.fields["source_joburl"].func = lambda doc: "http://www.hirediversity.com/jobseekers/jobs/" + doc.find(
        "a", {'class': "colour"})["href"]
    self.fields["city"].func = lambda doc: doc.findAll("td")[4].p.string
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.findAll("td")[4].p.string
    self.fields["state"].patterns = [r", (\w\w)\W", r", (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "hirediversity.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td")[6].p.string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # Pager anchors map 1:1 to page numbers; stop past the last one.
        links = doc.find("div", {'class': "text"}).h3.findAll("a")
        if len(links) < page:
            return None
        return "http://www.hirediversity.com/jobseekers/jobs/" + links[
            page - 1]["href"]

    # Skip the header row; guard against an empty document.
    self.datafunc = lambda doc: doc.find("div", {
        'class': "content"
    }).table.findAll("tr")[1:] if doc else None
    self.url = "http://www.hirediversity.com/jobseekers/jobs/list.asp?quicksearch=yes&ambiguouslocation=City%2C+State&zipcode=ZipCode&industryids=&keywords=%s&Search.x=57&Search.y=10"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for careerboard.com listing pages."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.find("ul", {'class': "details"}).findAll("li")[1].string
    self.fields["company_name"].patterns = [r"^\s*(.*?)\s*$"]
    self.fields["company_name"].process = lambda doc: doc[0]
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["company_id"].patterns = self.fields["company_name"].patterns
    self.fields["company_id"].process = self.fields["company_name"].process
    self.fields["title"].func = lambda doc: doc.h3.a["title"]
    self.fields["company_joburl"].func = lambda doc: "http://www.careerboard.com" + doc.h3.a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://www.careerboard.com" + doc.h3.a["href"]
    # The patterns assume a pipe-delimited first <li> ending in "City, ST".
    self.fields["city"].func = lambda doc: doc.find("ul", {'class': "details"}).findAll("li")[0].string
    self.fields["city"].patterns = [r".*?\|.*?\|([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find("ul", {'class': "details"}).findAll("li")[0].string
    self.fields["state"].patterns = [r".*?\|.*?\|.*?, (\w\w)\W", r".*?\|.*?\|.*?, (\w\w)$"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "careerboard.com"
    self.fields["posting_date"].func = lambda doc: doc.find("p", {'class': "floatr small"}).string
    self.fields["posting_date"].patterns = [r"\s*(\d\d)/(\d\d)/(\d\d\d\d)\s*"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)
    self.fields["all_text"].func = lambda doc: all_text(doc)

    def all_text(doc):
        # Drop the heading from the description block before extracting text.
        doc = doc.find('div', {'class': 'job-description'})
        doc.find('h2').extract() if doc.find('h2') else None
        return get_desc(doc, self.url)

    def nextpage(doc, page):
        # A <span> marks the current page; follow the anchor right after it
        # unless the current page is the pager's last child.
        linkSet = doc.find("div", {'class': "pages"})
        if linkSet.findAll()[-1].name != 'span':
            return 'http://www.careerboard.com' + linkSet.find('span').findNext('a')['href']
        return None

    self.datafunc = lambda doc: doc.findAll('div', {'class': " listing1 "})
    self.url = "http://www.careerboard.com/jobs/containing-any-of-%s"
    self.nextlink = nextpage
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for chronicle.com job-search results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: doc.findAll("p")[0].string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.h4.a.string
    self.fields["company_joburl"].func = lambda doc: "http://chronicle.com" + doc.h4.a["href"]
    self.fields["source_joburl"].func = lambda doc: "http://chronicle.com" + doc.h4.a["href"]
    # No city extractor is configured for this site; state is mandatory.
    self.fields["state"].func = lambda doc: doc.div.find(
        "dl", {'class': None}).dd.string
    self.fields["state"].patterns = [r"(.*)"]
    self.fields["state"].process = lambda doc: doc[0]  # common.shorten
    self.fields["state"].mandatory = True
    self.fields["source"].func = lambda doc: "chronicle.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("p")[1].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # Advance while more <li> entries exist and the last one still
        # holds an anchor (i.e. the current page is not the final one).
        links = doc.find("div", {'class': "pagination"})
        if len(links.ul.findAll("li")) >= page + 1 and links.findChildren(
                'li')[-1].findChild('a'):
            url = "http://chronicle.com/jobSearch" + links.ul.findAll(
                "li")[page].a["href"]
            return url
        return None

    self.datafunc = lambda doc: doc.findAll("div", {'class': "result"})
    self.url = "http://chronicle.com/jobSearch?contextId=434&facetClear=1&searchQueryString=%s&position=&location=&locationmulti[]=ODg6OjpVbml0ZWQgU3RhdGVz"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Field extraction rules for hcareers.com advanced-search result rows."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    def cell_text(idx):
        # Build an extractor for the bare text of the idx-th <td> of a row.
        return lambda row: row.findAll("td")[idx].string

    def title_link(row):
        # Job links are site-relative; prepend the host.
        return "http://www.hcareers.com" + row.findAll("td")[3].a["href"]

    def strip_first(t):
        # Standard post-processor: first regex capture, whitespace-trimmed.
        return t[0].strip()

    flds = self.fields
    flds["company_name"].func = lambda row: "".join(
        row.findAll("td")[-2].findAll(text=True))
    flds["company_id"].func = flds["company_name"].func
    flds["title"].func = lambda row: row.findAll("td")[3].a.string
    flds["company_joburl"].func = title_link
    flds["source_joburl"].func = title_link
    flds["city"].func = cell_text(2)
    flds["city"].patterns = [r"-([^-]*)$"]
    flds["city"].process = strip_first
    flds["state"].func = cell_text(2)
    flds["state"].patterns = [r"\W(\w\w)\W"]
    flds["state"].process = strip_first
    flds["source"].func = lambda row: "hcareers.com"
    flds["posting_date"].func = cell_text(0)
    flds["posting_date"].patterns = [r"(\w\w\w) (\d\d?), (\d\d\d\d)"]
    flds["posting_date"].process = common.mmm_dd_yyyy
    flds.update(kwargs)

    def nextpage(doc, page):
        # First anchor in the results nav points at the next page, if any.
        nav_link = doc.find("div", {'class': "search-results-nav"}).a
        return None if nav_link is None else "http://www.hcareers.com" + nav_link["href"]

    # Rows of the results table, minus the header row.
    self.datafunc = lambda doc: doc.find("table", id="table1").findAll("tr")[1:]
    self.url = "http://www.hcareers.com/seeker/search/advanced?jobDetectiveId=&booleanKeyWordSearch=%s&industryCodes=&management=&managementCheckbox=on&nonmanagementCheckbox=on&form.commit=Search&h_v=XG_20071127_1"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for jobpath.com job results."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    self.fields["company_name"].func = lambda doc: "".join(
        doc.find("td", id=re.compile(r"Company")).findAll(text=True))
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.find("a", {'class': "jt"}).string
    self.fields["company_joburl"].func = lambda doc: doc.find(
        "a", {'class': "jt"})["href"]
    self.fields["source_joburl"].func = self.fields["company_joburl"].func
    # The patterns assume a dash-delimited location: leading two-letter
    # state, city after the last dash.
    self.fields["city"].func = lambda doc: doc.find(
        "td", id=re.compile(r"Location")).string
    self.fields["city"].patterns = [r"-([^-]*)$"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = lambda doc: doc.find(
        "td", id=re.compile(r"Location")).string
    self.fields["state"].patterns = [r"^(\w\w)\W"]
    self.fields["state"].process = lambda t: t[0].strip()
    self.fields["source"].func = lambda doc: "jobpath.com"
    # The date lives in the Posted span's title attribute, "Mon-DD" style.
    self.fields["posting_date"].func = lambda doc: doc.find(
        "span", id=re.compile(r"Posted"))["title"]
    self.fields["posting_date"].patterns = [r"(\w\w\w)-(\d\d?)"]
    self.fields["posting_date"].process = common.mmm_dd
    self.fields.update(kwargs)

    def nextpage(doc, page):
        # Only follow the bottom-nav anchor when a "Next Page" link exists.
        links = doc.find("td", {'class': "nav_btm_cell"})
        if links:
            return links.a["href"] if links.find(
                'a', text='Next Page') else None
        return None

    self.datafunc = lambda doc: doc.findAll(
        "tr", {'class': re.compile(r"^jl_\w+_row$")})
    self.url = "http://www.jobpath.com/JobSeeker/Jobs/JobResults.aspx?IPath=QHKCV&excrit=QID%3dA6657255451511%3bst%3da%3buse%3dALL%3brawWords%3d%s%3bCID%3dUS%3bSID%3d%3f%3bTID%3d0%3bENR%3dNO%3bDTP%3dDRNS%3bYDI%3dYES%3bIND%3dALL%3bPDQ%3dAll%3bPDQ%3dAll%3bPAYL%3d0%3bPAYH%3dgt120%3bPOY%3dNO%3bETD%3dALL%3bRE%3dALL%3bMGT%3dDC%3bSUP%3dDC%3bFRE%3d30%3bQS%3dsid_unknown%3bSS%3dNO%3bTITL%3d0%3bJQT%3dRAD%3bJDV%3dFalse%3bExpHigh%3dgt50%3bExpLow%3d0%3bMaxLowExp%3d-1&sc=3&ff=21&sd=2"
    self.nextlink = nextpage
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Configure field extractors for the indeed.com XML results feed."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword
    # Results arrive as XML <result> elements with one child tag per field.
    self.fields["company_name"].func = lambda doc: doc.company.string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.jobtitle.string
    self.fields["company_joburl"].func = lambda doc: get_company_url(doc)
    self.fields["source_joburl"].func = lambda doc: doc.url.string
    self.fields["city"].func = lambda doc: doc.city.string
    self.fields["state"].func = lambda doc: doc.state.string
    self.fields["source"].func = lambda doc: "indeed.com"
    # Feed dates are RFC-822 style, e.g. "Mon, 06 Sep 2010 12:00:00 GMT".
    self.fields["posting_date"].func = lambda doc: datetime.strptime(doc.date.string, '%a, %d %b %Y %H:%M:%S %Z')
    self.fields.update(kwargs)
    self.cookie = False

    def get_company_url(doc):
        # Follow indeed's redirect link to the employer's own posting URL.
        # Hosts matching an ignore_list pattern are dropped (returns None);
        # any network failure falls back to the feed's own URL.
        try:
            url = 'http://www.indeed.com/rc/clk?jk=%s&from=vj' % doc.jobkey.string
            request = urllib2.Request(url)
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
            u = opener.open(request, timeout=10)
            url = u.url
            urlp = urlparse(url)
            for e in ignore_list:
                m = re.search(e, urlp.netloc)
                if m:
                    self.logger.info(" Ignoring : " + url + " For : " + e)
                    return None
            return url
        except Exception as ex:
            self.logger.warning(str(ex))
            return doc.url.string

    self.datafunc = lambda doc: doc.findAll('result') or None
    # The feed is fetched once per domain; never paginate.
    self.nextlink = lambda x, y: None
    self.url = domains
    self.dev_mode = True
def __init__(self, keyword, logger, **kwargs):
    """Configure extraction rules for jobs.nj.com search-result rows."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    # Company: join all text nodes of the company cell.
    self.fields["company_name"].func = lambda doc: "".join(
        doc.find("td", {'class': "resultsCompanyUrl resultsStandard"}).findAll(
            text=True))
    self.fields["company_id"].func = self.fields["company_name"].func

    # Job links are site-relative, so prefix the host.
    self.fields["title"].func = lambda doc: doc.find(
        "a", id="results.job.title").string
    self.fields["company_joburl"].func = lambda doc: "http://jobs.nj.com" + doc.find(
        "a", id="results.job.title")["href"]
    self.fields["source_joburl"].func = lambda doc: "http://jobs.nj.com" + doc.find(
        "a", id="results.job.title")["href"]

    # City, state and zipcode all parse the same location anchor text.
    def location(doc):
        return doc.find("a", id="results.job.location").string

    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = location
    self.fields["state"].patterns = [r", (.*?)\s+\d", r", ([^,/]*)$"]
    self.fields["state"].process = common.shorten
    self.fields["source"].func = lambda doc: "nj.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td")[3].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yyyy
    self.filterfields["zipcode"].func = location
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = lambda t: t[0].strip()
    self.fields.update(kwargs)

    self.cookie = False
    # One result per <tbody> whose class contains "displayTableRow".
    self.datafunc = lambda doc: doc.findAll(
        "tbody", {'class': re.compile("displayTableRow")})
    self.url = "http://jobs.nj.com/careers/jobsearch/results?searchType=quick;kAndEntire=%s;lastUpdated=-30+days;sortBy=moddate;pageSize=50;lastUpdated_i18n_date_array[month]=8;lastUpdated_i18n_date_array[day]=30;lastUpdated_i18n_date_array[year]=2010;lastUpdated_i18n_date_mysql=2010-08-30;lastUpdated_i18n[date_array][month]=8;lastUpdated_i18n[date_array][day]=30;lastUpdated_i18n[date_array][year]=2010;lastUpdated_i18n[date_mysql]=2010-08-30;lastUpdated_i18n[utc_beginning_mysql]=2010-08-30+04%3A00%3A00;lastUpdated_i18n[utc_end_mysql]=2010-08-31+03%3A59%3A59;lastUpdated_i18n[timezone_used_for_conversion]=EST"
def __init__(self, keyword, logger, **kwargs):
    """Configure extraction rules for therapyjobs.com result cells."""
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    self.fields["title"].func = lambda doc: doc.div.a.string
    # Job links are site-relative, so prefix the host.
    self.fields["company_joburl"].func = (
        lambda doc: "http://www.therapyjobs.com" + doc.div.a["href"])
    self.fields["source_joburl"].func = (
        lambda doc: "http://www.therapyjobs.com" + doc.div.a["href"])

    # City, state and zipcode all read the last span of the cell's third
    # top-level div.
    def location(doc):
        return doc.findAll("div", recursive=False)[2].findAll("span").pop().string

    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["state"].func = location
    self.fields["state"].patterns = [r", (.*?)\s+\d", r", ([^,/]*)$"]
    self.fields["state"].process = common.shorten
    self.fields["source"].func = lambda doc: "therapyjobs.com"

    # Date lives in the span of the cell's last top-level div; the second
    # pattern matches the "N hours ago" form with an empty capture.
    self.fields["posting_date"].func = (
        lambda doc: doc.findAll("div", recursive=False).pop().span.string)
    self.fields["posting_date"].patterns = [
        r"(\w\w\w) (\d\d?), (\d\d\d\d)", r"()\d+ hours? ago"]
    self.fields["posting_date"].process = common.mmm_dd_yyyy

    self.filterfields["zipcode"].func = location
    self.filterfields["zipcode"].patterns = [r"(\d{5})"]
    self.filterfields["zipcode"].process = lambda t: t[0].strip()
    self.fields.update(kwargs)

    self.datafunc = lambda doc: doc.findAll("div", {'class': "detailcell"})
    self.url = "http://www.therapyjobs.com/Results.aspx?srch=%s&rpp=50"
def __init__(self, keyword, logger, **kwargs):
    """Configure extraction rules for adminfinder.com job listings.

    City and state are scraped from the job-detail page (field depth 2);
    all other fields come from the result-list row.
    """
    JobsiteParser.__init__(self, logger)
    self.keyword = keyword

    self.fields["company_name"].func = lambda doc: doc.findAll("td")[2].string
    self.fields["company_id"].func = self.fields["company_name"].func
    self.fields["title"].func = lambda doc: doc.findAll("td")[0].span.a.string
    self.fields["company_joburl"].func = lambda doc: doc.findAll("td")[0].span.a["href"]
    self.fields["source_joburl"].func = lambda doc: doc.findAll("td")[0].span.a["href"]

    # Location cell on the detail page: the last td of the third row of the
    # details table nested under div#jd.
    def location(doc):
        return doc.find("div", {'id': "jd"}).findAll(
            "div", recursive=False)[2].findAll(
            "div", recursive=False)[1].div.table.findAll(
            "tr")[2].findAll("td").pop().string

    self.fields["city"].func = location
    self.fields["city"].patterns = [r"^([^,]*)"]
    self.fields["city"].process = lambda t: t[0].strip()
    self.fields["city"].depth = 2  # fetched from the job-detail page
    self.fields["state"].func = location
    self.fields["state"].patterns = [r", ([^,/]*)$"]
    self.fields["state"].process = common.shorten
    self.fields["state"].depth = 2
    self.fields["source"].func = lambda doc: "adminfinder.com"
    self.fields["posting_date"].func = lambda doc: doc.findAll("td")[3].string
    self.fields["posting_date"].patterns = [r"(\d\d?)/(\d\d?)/(\d\d)"]
    self.fields["posting_date"].process = common.mm_dd_yy
    self.fields.update(kwargs)

    # Skip the header row of the results table.
    self.datafunc = lambda doc: doc.find("table", {'id': "joblist"}).findAll(
        "tr", recursive=False)[1:]
    self.url = "http://adminfinder.com/index.php?order_by=post_date&ord=asc&action=search&5=%s"
    self.dev_mode = True

    def nextpage(doc, page):
        """Return the href of the trailing 'Next' pagination link, or None."""
        links = doc.find("p", {'class': "nav_page_links"})
        if links is None:
            return None
        anchors = links.findAll("a")
        # Fix: original indexed [-1] unconditionally and raised IndexError
        # when the nav paragraph contained no anchors.
        if not anchors:
            return None
        link = anchors[-1]
        # Fix: original called .strip() on link.string, which raised
        # AttributeError when the anchor had no direct string child
        # (e.g. nested markup); treat that as "no next page".
        if link.string and link.string.strip().startswith('Next'):
            return link['href']
        return None

    self.nextlink = nextpage