class Datasource(CommonCrawlDatasource):
    """Gumtree AU job ads from Common Crawl.

    Listings live under ``/s-ad/``; the query filter excludes URLs whose path
    contains one of the known job-category slugs (those are category listing
    pages, not individual ads).
    """

    name = module_name(__name__)
    query = "gumtree.com.au/s-ad/*"
    # Exclude category browse pages; the alternation lists every known
    # Gumtree job-category slug.
    query_filters = [
        "~url:.*/(account-manager|account-relationship-management|accounting|accounts-officer-clerk|accounts-payable|accounts-receivable-credit-control|admin|administration-office-support|administrative-assistant|advertising-arts-media|aged-disability-support|agronomy-farm-services|air-conditioning-refrigeration|analysis-reporting|architecture|art-director|assembly-process-work|automotive-engineering|automotive-trades|bakers-pastry-chefs|banking-financial-services|banking-retail-branch|bar-beverage-staff|bookkeeping-small-practice-accounting|building-services-engineering|building-trades|business-services-corporate-advisory|butcher|call-centre-customer-service|carpentry-cabinet-making|chef-cook|child-welfare-youth-family-services|childcare-after-school-care|civil-structural-engineering|cleaner-housekeeper|coaching-instruction|commercial-sales-leasing-property-mgmt|community-services-development|construction|consulting-generalist-hr|contract-management|corporate-commercial-law|courier-driver-postal-service|customer-service-call-centre|customer-service-customer-facing|defence|dental-dentist|design-architecture|developer-programmer|digital-search-marketing|education-teaching|electrician|employment-services|engineering|event-management|facilities-management-body-corporate|farm-management|farming-veterinary|financial-accounting-reporting|financial-manager-controller|financial-planning|fitter-turner-machinist|florist|foreman-supervisor|freight-cargo-forwarding|front-office-guest-services|funds-management|gardening-landscaping|general-practitioner-gp-|generalist|government-defence|government|graphic-design|hair-beauty-services|healthcare-administration|healthcare-nursing|horticulture|hospitality-tourism|information-communication-technology|interaction-web-design|interior-design|it-support-help-desk|kitchen-sandwich-hand|labourer|legal-secretary|legal|locksmith|machine-operators|machine-plant-operator|maintenance-handyman|maintenance|management|manufacturing-transport-logistics|marketing-assistants|marketing-communications|marketing-communications|marketing-manager|mechanical-engineering|media-planning-strategy-buying|merchandiser|mining-engineering-maintenance|mining-operations|mining-resources-energy|mortgage-broker|nanny-babysitter|new-business-development|nursing|oil-gas-engineering-maintenance|oil-gas-operations|other-jobs|other|pa-ea-secretary|painter-sign-writer|paralegal-law-clerk|payroll-accounting|performing-arts|personal-trainer|pharmacy|physiotherapy-ot-rehabilitation|plumber|police-corrections-officer|printing-publishing-services|production-planning-scheduling|project-management|property-law|public-relations-corporate-affairs|purchasing-procurement-inventory|real-estate-property|receptionist|recruitment-agency|recruitment-hr|recruitment-internal|relationship-account-management|removalist|residential-leasing-property-management|residential-sales|retail-assistant|retail-management|retail|road-transport|sales-call-centre|sales-coordinator|sales-customer-facing|sales-management|sales-representative-consultant|sales|security-services|sports-management|sports-recreation|systems-business-analyst|tailor-dressmaker|taxation|teaching|technician|tour-guide|trade-marketing|trades-services|training-development|travel-agent-consultant|tutoring|vet-animal-welfare|waiting-staff|warehousing-storage-distribution|web-development-production|workplace-training-assessment|writing-journalist|welder-boilermaker)/"
    ]

    def extract(self, html: bytes, uri, view_date):
        """Pull the embedded JS app-state object and keep only real job offers.

        Returns a list with zero or one raw records; empty when the page has
        no parseable app object or is not a job offer.
        """
        text = html.decode("utf-8")
        obj = parse_js_obj(text, JS_STR_APP)
        if obj is None:
            return []
        else:
            data = obj["vip"]["item"]
            # adType: OFFER is job ad, WANTED is ask for work
            if data["isJobsCategory"] and data["adType"] == "OFFER":
                return [{"data": data, "uri": uri, "view_date": view_date}]
            else:
                return []

    def normalise(self, data, uri, view_date):
        """Map a raw Gumtree item dict to the pipeline's common schema."""
        # NOTE(review): this keys the dict by row["value"] and looks up
        # "Salary Detail", which reads like an attribute *name* — confirm the
        # key/value orientation of mainAttributes against live data.
        metadata = {row["value"]: row["name"] for row in data["mainAttributes"]}
        salary_raw = metadata.get("Salary Detail")
        salary_data = get_salary_data(salary_raw)
        return {
            "title": data["title"],
            "description": html2plain(data["description"]),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            # Gumtree ads carry no employer field in this record.
            "org": None,
            **salary_data,
            "location_raw": data["mapAddress"],
            **AU_GEOCODER.geocode(data["mapAddress"]),
        }
class Datasource(CommonCrawlDatasource):
    """Generic extractor for pages embedding schema.org JobPosting as JSON-LD."""

    name = module_name(__name__)

    def extract(self, html: bytes, base_url: str, view_date):
        """Return one raw record per JSON-LD JobPosting found in ``html``."""
        data = extruct.extract(html, base_url, syntaxes=["json-ld"])["json-ld"]
        # Use .get: JSON-LD blobs are not guaranteed to carry an @type key;
        # a missing key should mean "not a job post", not a KeyError.
        job_posts = [datum for datum in data if datum.get("@type") == "JobPosting"]
        return [
            {"data": post, "uri": base_url, "view_date": view_date}
            for post in job_posts
        ]

    def normalise(self, data, uri, view_date):
        """Map a raw JobPosting dict to the pipeline's common schema.

        ``description`` is optional; ``hiringOrganization`` may be a plain
        string or a schema.org Organization dict.
        """
        if "description" in data:
            description = html2plain(data["description"])
        else:
            description = None
        org = data.get("hiringOrganization")
        if isinstance(org, dict):
            # .get for consistency with the microdata datasource: an
            # Organization without a "name" yields None rather than a crash.
            org = org.get("name")
        return {
            "title": data["title"],
            "description": description,
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": org,
        }
class Datasource(CommonCrawlDatasource):
    """Generic extractor for pages embedding schema.org JobPosting as microdata."""

    name = module_name(__name__)

    def extract(self, html: bytes, base_url: str, view_date):
        """Return one raw record per microdata JobPosting found in ``html``."""
        data = extruct.extract(html, base_url, syntaxes=["microdata"])["microdata"]
        # .get guards against items with no "type" key.
        job_posts = [
            datum["properties"]
            for datum in data
            if datum.get("type") == "http://schema.org/JobPosting"
        ]
        return [
            {"data": post, "uri": base_url, "view_date": view_date}
            for post in job_posts
        ]

    def normalise(self, data, uri, view_date):
        """Map raw JobPosting properties to the pipeline's common schema.

        ``hiringOrganization`` is optional and may be a string or an
        Organization dict; normalise to a plain string or None.
        """
        # .get: the property can be absent entirely (the isinstance checks
        # below already show absence/odd shapes are expected).
        org = data.get("hiringOrganization")
        if isinstance(org, dict):
            org = org.get("name")
        if not isinstance(org, str):
            org = None
        return {
            "title": data["title"],
            "description": html2plain(data.get("description", "")),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": org,
        }
class Datasource(CommonCrawlDatasource):
    """Job ads scraped from Pro Bono Australia listing pages."""

    name = module_name(__name__)
    query = "probonoaustralia.com.au/jobs/*"

    def extract(self, html: Union[bytes, str], uri, view_date):
        """Parse the listing page into title, descriptions and key/value metadata."""
        soup = bs4.BeautifulSoup(html, "html5lib")
        metadata = {}
        # Each info paragraph is "<b>Label :</b> value..."; the label's
        # siblings carry the value text.
        for paragraph in soup.select(".org-basic-info > div > p.org-add"):
            label = paragraph.select_one("b")
            if not label:
                logging.warning("Missing key in %s; %s", uri, paragraph)
                continue
            pieces = []
            for sibling in label.next_siblings:
                text = (
                    sibling.get_text()
                    if isinstance(sibling, bs4.element.Tag)
                    else sibling
                )
                pieces.append(str(text).strip())
            metadata[label.get_text().strip()] = "".join(pieces)
        heading = soup.select_one("h1")
        if heading:
            title = heading.get_text().strip()
        else:
            logging.warning("Missing header: %s", uri)
            title = None
        return [
            {
                "title": title,
                "description": str(soup.select_one("#about-role") or ""),
                "organisation_description": str(
                    soup.select_one("#about-organisation") or ""
                ),
                "metadata": metadata,
                "uri": uri,
                "view_date": view_date,
            }
        ]

    def normalise(
        self, title, description, organisation_description, metadata, uri, view_date
    ):
        """Map the extracted fields to the pipeline's common schema."""
        location_raw = metadata["Location :"]
        return {
            "title": title,
            "description": html2plain(description),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": metadata.get("Organisation :"),
            **get_salary_data(metadata.get("Salary :")),
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(fix_probono_location(location_raw)),
        }
class Datasource(JSONLinkedDatasource): name = module_name(__name__) query = "ethicaljobs.com.au/members/*" def normalise(self, data, uri, view_date): ans = super().normalise(data, uri, view_date) # Salary not in metadata location_raw = location_jsonld(data) return { **ans, "location_raw": location_raw, **AU_GEOCODER.geocode(location_raw), }
class Datasource(JSONLinkedDatasource):
    """CGC Recruitment job pages; adds salary and geocoded location to the base fields."""

    name = module_name(__name__)
    query = "www.cgcrecruitment.com/job/*"

    def normalise(self, data, uri, view_date):
        """Extend the base JSON-LD normalisation with salary and location."""
        base = super().normalise(data, uri, view_date)
        # baseSalary.value is a nested object whose inner "value" is optional.
        raw_salary = data["baseSalary"]["value"].get("value")
        location_raw = location_jsonld(data)
        return {
            **base,
            **get_salary_data(raw_salary),
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(location_raw),
        }
class Datasource(MicrodataDatasource):
    """CSIRO job pages via microdata; adds a geocoded location."""

    name = module_name(__name__)
    query = "jobs.csiro.au/job/*"

    def normalise(self, data, uri, view_date):
        """Extend the base microdata normalisation with location data."""
        # Description is sometimes a list, e.g. CC-MAIN-2019-18
        description = data.get("description")
        if isinstance(description, list):
            data["description"] = "\n".join(description)
        base = super().normalise(data, uri, view_date)
        # jobLocation *can* be an array; str() flattens either shape.
        location_raw = str(data.get("jobLocation") or "")
        geocoded = AU_GEOCODER.geocode(location_raw)
        return {**base, "location_raw": location_raw, **geocoded}
class Datasource(CommonCrawlDatasource):
    """NSW government job board (iworkfor.nsw.gov.au)."""

    name = module_name(__name__)
    query = "iworkfor.nsw.gov.au/job/*"

    def extract(self, html: Union[bytes, str], uri, view_date):
        """Parse the detail page's th/td table plus title and description."""
        soup = bs4.BeautifulSoup(html, "html5lib")
        table = soup.select_one("tbody")
        # Some pages are missing a body; e.g. CC-MAIN-2018-17
        if not table:
            return []
        metadata = {}
        for row in table.select("tr"):
            header_cell = row.select_one("th")
            value_cell = row.select_one("td")
            if header_cell and value_cell:
                metadata[header_cell.get_text().strip()] = (
                    value_cell.get_text().strip()
                )
        heading = soup.select_one(".job-detail-title")
        if heading:
            title = heading.get_text().strip()
        else:
            logging.warning("Missing title tag in %s, %s", uri, view_date)
            title = None
        return [
            {
                "title": title,
                "description": str(soup.select_one(".job-detail-des") or ""),
                "metadata": metadata,
                "uri": uri,
                "view_date": view_date,
            }
        ]

    def normalise(self, title, description, metadata, uri, view_date):
        """Map the extracted fields to the pipeline's common schema."""
        location_raw = metadata["Job Location:"]
        return {
            "title": title,
            "description": html2plain(description),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": metadata["Organisation/Entity:"],
            **get_salary_data(metadata.get("Total Remuneration Package:") or ""),
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(fixup_iworkfornsw_loc(location_raw)),
        }
class Datasource(CommonCrawlDatasource):
    """Victorian government careers site (careers.vic.gov.au)."""

    name = module_name(__name__)
    query = "careers.vic.gov.au/job/*"

    def extract(self, html: Union[bytes, str], uri, view_date) -> List[Dict[Any, Any]]:
        """Parse labelled ``.txt-info`` blocks plus title and description."""
        soup = bs4.BeautifulSoup(html, "html5lib")
        metadata = {}
        # Each block is "<span class=txt-bold>Label:</span> value..."; the
        # label's siblings carry the value markup.
        for block in soup.select(".txt-info"):
            label = block.select_one(".txt-bold")
            if not label:
                continue
            value = "".join(str(node).strip() for node in label.next_siblings)
            metadata[label.get_text().strip()] = value
        heading = soup.select_one(".txt-title")
        if heading:
            title = str(heading.get_text())
        else:
            logging.warning("Missing title tag in %s", uri)
            title = None
        return [
            {
                "title": title,
                "description": str(soup.select_one(".txt-pre-line") or ""),
                "metadata": metadata,
                "uri": uri,
                "view_date": view_date,
            }
        ]

    def normalise(self, title, description, metadata, uri, view_date):
        """Map the extracted fields to the pipeline's common schema."""
        # The site has used two label spellings over time for both fields.
        salary_data = get_salary_data(
            metadata.get("Salary:") or metadata["Salary Range:"]
        )
        location_raw = metadata.get("Location:") or metadata["Work location:"]
        return {
            "title": title,
            "description": html2plain(description),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": metadata.get("Organisation:"),
            **salary_data,
            "location_raw": location_raw,
            **AU_GEOCODER.geocode(fixup_careers_vic_location(location_raw)),
        }
class Datasource(KaggleDatasource):
    """PromptCloud sample of Seek Australia listings, 2019 Q3/Q4 (Kaggle)."""

    # License: CC0: Public Domain
    dataset = "promptcloud/latest-seek-australia-job-dataset"
    sources = {
        "seekau_2019q3": "marketing_sample_for_seek_au-jobs_listing__20190901_20191231__10k_data.json"
    }
    name = module_name(__name__)
    raw_extension = ".json"

    def extract_one(self, path: Path) -> Generator[Dict[Any, Any], None, None]:
        """Yield one record per line of the JSON-Lines file."""
        with open(path, "r") as f:
            for line in f:
                yield json.loads(line)

    def normalise(self, *args, **data) -> Dict[str, Any]:
        """Map a raw record to the pipeline's common schema."""
        location_text = ", ".join(
            [
                data["city"],
                data["state"],
                data.get("country") or data["inferred_country"],
            ]
        )
        view_date = datetime.datetime.strptime(
            data["crawl_timestamp"], "%Y-%m-%d %H:%M:%S +0000"
        )
        return {
            "title": data["job_title"],
            "description": data["job_description"],
            "uri": data["url"],
            "view_date": view_date,
            "org": data["company_name"],
            **get_salary_data(data.get("salary_offered")),
            "location_raw": location_text,
            **AU_GEOCODER.geocode(location_text),
        }
class Datasource(KaggleDatasource):
    """Data-science jobs in Australia, single Indeed snapshot of 2019-10-25 (Kaggle)."""

    # License: CC0: Public Domain
    dataset = "santokalayil/data-scientist-jobs-in-australia-october-25-2019"
    sources = {
        "indeedau_datascience_202010": "datascientist_jobs_in_australia_Oct_25_2019.csv"
    }
    raw_extension = ".csv"
    name = module_name(__name__)

    def extract_one(self, path: Path) -> Generator[Dict[Any, Any], None, None]:
        """Yield one dict per CSV row, skipping columns without a header name."""
        with open(path, "r", encoding="latin-1") as f:
            for record in csv.DictReader(f):
                yield {key: value for key, value in record.items() if key}

    def normalise(self, *args, **data) -> Dict[str, Any]:
        """Map a raw row to the pipeline's common schema."""
        return {
            "title": data["title"],
            "description": data["summary"],
            # The dataset is a single-day snapshot; use its crawl date.
            "view_date": datetime.datetime(2019, 10, 25),
            "org": data["company"],
            "location_raw": data["location"],
            **AU_GEOCODER.geocode(data["location"]),
        }
class Datasource(KaggleDatasource):
    """PromptCloud sample of Seek Australia listings, CSV edition (Kaggle)."""

    # License: CC BY-SA 4.0
    dataset = "PromptCloudHQ/australian-job-listings-data-from-seek-job-board"
    sources = {"seekau": "seek_australia_sample.csv"}
    raw_extension = ".csv"
    name = module_name(__name__)

    def extract_one(self, path: Path) -> Generator[Dict[Any, Any], None, None]:
        """Yield one dict per CSV row."""
        with open(path, "r", encoding="latin-1") as f:
            yield from csv.DictReader(f)

    def normalise(self, *args, **data) -> Dict[str, Any]:
        """Map a raw row to the pipeline's common schema."""
        # Join whichever of city/state/geo are present and non-empty.
        location_text = ", ".join(
            part
            for part in (data.get("city"), data.get("state"), data.get("geo"))
            if part
        )
        view_date = datetime.datetime.strptime(
            data["crawl_timestamp"], "%Y-%m-%d %H:%M:%S +0000"
        )
        return {
            "title": data["job_title"],
            "description": data["job_description"],
            "uri": data["pageurl"],
            "view_date": view_date,
            "org": data["company_name"],
            **get_salary_data(data.get("salary_offered")),
            "location_raw": location_text,
            **AU_GEOCODER.geocode(location_text),
        }
class Datasource(CommonCrawlDatasource):
    """Seek job pages from Common Crawl, parsed from the embedded Redux state."""

    name = module_name(__name__)
    query = "seek.com.au/job/*"
    # Skip application-form pages; only the ad itself is wanted.
    query_filters = ["!~url:.*/apply/*"]

    def extract(self, html: bytes, uri, view_date):
        """Pull the job-details object out of the page's Redux store, if any."""
        obj = parse_js_obj(html.decode("utf-8"), JS_STR_REDUX)
        if obj is None:
            return []
        return [
            {
                "data": obj["jobdetails"]["result"],
                "uri": uri,
                "view_date": view_date,
            }
        ]

    def normalise(self, data, uri, view_date):
        """Map a raw Seek job record to the pipeline's common schema."""
        hierarchy = data["locationHierarchy"]
        location_text = ", ".join(
            [
                hierarchy["suburb"],
                hierarchy["city"],
                hierarchy["state"],
                hierarchy["nation"],
            ]
        )
        return {
            "title": data["title"],
            "description": html2plain(data["mobileAdTemplate"]),
            "uri": uri,
            "view_date": datetime_from_iso_utc(view_date),
            "org": data["advertiser"]["description"],
            **get_salary_data(data["salary"]),
            "location_raw": location_text,
            **AU_GEOCODER.geocode(location_text),
        }
class Datasource(KaggleDatasource):
    """PromptCloud sample of Gumtree AU job ads (Kaggle)."""

    # License: CC BY-NC-SA 4.0
    dataset = "PromptCloudHQ/australian-jobs-on-gumtreecomau"
    sources = {"gumtreeau": "gumtree_com_au-sample.csv"}
    raw_extension = ".csv"
    name = module_name(__name__)

    def extract_one(self, path: Path) -> Generator[Dict[Any, Any], None, None]:
        """Yield one dict per CSV row."""
        with open(path, "r") as f:
            yield from csv.DictReader(f)

    def normalise(self, *args, **data) -> Dict[str, Any]:
        """Map a raw row to the pipeline's common schema."""
        return {
            "title": data["job_title"],
            "description": data["job_description"],
            "uri": data["page_url"],
            # Not quite true; this is date added
            "view_date": datetime.datetime.strptime(data["date_added"], "%d/%m/%Y"),
            **get_salary_data(data["salary"]),
            "location_raw": data["location"],
            **AU_GEOCODER.geocode(data["location"]),
        }