def parse(self, source, graph, vocab_expansion=False, vocab_cache=False, rdfOutput=False):
    """
    @param source: one of the input sources that the RDFLib package defines
    @type source: InputSource class instance
    @param graph: target graph for the triples; output graph, in RDFa spec. parlance
    @type graph: RDFLib Graph
    @keyword vocab_expansion: whether the RDFa @vocab attribute should also mean vocabulary expansion (see the RDFa 1.1 spec for further details)
    @type vocab_expansion: Boolean
    @keyword vocab_cache: in case vocab expansion is used, whether the expansion data (i.e., vocabulary) should be cached locally. This requires that the local application is allowed to write to the local file system
    @type vocab_cache: Boolean
    @keyword rdfOutput: whether exceptions should be caught and added, as triples, to the processor graph, or whether they should be raised
    @type rdfOutput: Boolean
    """
    from pyMicrodata import pyMicrodata
    if isinstance(source, StringInputSource):
        orig_source = source.getByteStream()
    elif isinstance(source, URLInputSource):
        orig_source = source.url
    elif isinstance(source, FileInputSource):
        orig_source = source.file.name
        source.file.close()
    baseURI = source.getPublicId()
    processor = pyMicrodata(base=baseURI, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache)
    processor.graph_from_source(orig_source, graph=graph, rdfOutput=rdfOutput)
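# A minimal usage sketch for the parser plugin above, assuming pyMicrodata is
# installed and the plugin is registered with RDFLib under the format name
# "microdata" (the registration name may differ in your setup; the URL is a
# placeholder).
import rdflib

g = rdflib.Graph()
g.parse("http://example.org/page.html", format="microdata")
print(g.serialize(format="turtle"))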
def __init__(self, source):
    super(CompoundGraph, self).__init__()
    try:
        self.microdata_graph = pyMicrodata().graph_from_source(source)
    except Exception:
        self.microdata_graph = None
    try:
        self.rdfa_graph = pyRdfa().graph_from_source(source)
    except Exception:
        self.rdfa_graph = None
def parse(self, source, graph, pgraph=None, embedded_rdf=True, vocab_expansion=False, vocab_cache=False, rdfOutput=False):
    """
    @param source: one of the input sources that the RDFLib package defines
    @type source: InputSource class instance
    @param graph: target graph for the triples; output graph, in RDFa spec. parlance
    @type graph: RDFLib Graph
    @keyword pgraph: target for error and warning triples; processor graph, in RDFa spec. parlance. If set to None, these triples are ignored
    @type pgraph: RDFLib Graph
    @keyword embedded_rdf: some host languages allow other RDF serializations to be embedded: (X)HTML can contain Turtle in a special <script> element, SVG can have RDF/XML embedded in a <metadata> element. This flag controls whether those triples should be interpreted and added to the output graph. Some host languages (e.g., SVG) require this, in which case the flag is ignored.
    @type embedded_rdf: Boolean
    @keyword vocab_expansion: whether the RDFa @vocab attribute should also mean vocabulary expansion (see the RDFa 1.1 spec for further details)
    @type vocab_expansion: Boolean
    @keyword vocab_cache: in case vocab expansion is used, whether the expansion data (i.e., vocabulary) should be cached locally. This requires that the local application is allowed to write to the local file system
    @type vocab_cache: Boolean
    @keyword rdfOutput: whether exceptions should be caught and added, as triples, to the processor graph, or whether they should be raised
    @type rdfOutput: Boolean
    """
    if isinstance(source, StringInputSource):
        orig_source = source.getByteStream()
    elif isinstance(source, URLInputSource):
        orig_source = source.url
    elif isinstance(source, FileInputSource):
        orig_source = source.file.name
        source.file.close()
    baseURI = source.getPublicId()

    # The RDFa part
    from pyRdfa import pyRdfa, Options
    self.options = Options(output_processor_graph=(pgraph is not None),
                           embedded_rdf=embedded_rdf,
                           vocab_expansion=vocab_expansion,
                           vocab_cache=vocab_cache)
    processor = pyRdfa(self.options, base=baseURI, media_type='text/html', rdfa_version='1.1')
    processor.graph_from_source(orig_source, graph=graph, pgraph=pgraph, rdfOutput=rdfOutput)

    # The Microdata part
    try:
        from pyMicrodata import pyMicrodata
        processor = pyMicrodata(base=baseURI, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache)
        processor.graph_from_source(orig_source, graph=graph, rdfOutput=rdfOutput)
    except ImportError:
        # no pyMicrodata installed!
        pass
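# A minimal sketch of driving the two processors directly, mirroring what the
# combined parser above does; the URL is a placeholder and error handling is
# omitted.
from rdflib import Graph
from pyRdfa import pyRdfa
from pyMicrodata import pyMicrodata

g = Graph()
pyRdfa().graph_from_source("http://example.org/page.html", graph=g)
pyMicrodata().graph_from_source("http://example.org/page.html", graph=g)
print(g.serialize(format="turtle"))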
def __init__(self, url, impl):
    self.ns_ont = {}
    self.attribs_by_class = defaultdict(list)
    self.ontologies = []  # these initializations may be redundant
    self.attributes = []
    self.source = url
    self.impl = impl
    if 'rdfa' == impl:
        self.range_uri = "http://www.w3.org/2000/01/rdf-schema#range"
        self.domain_uri = "http://www.w3.org/2000/01/rdf-schema#domain"
        self.type_uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
        self.subclass_uri = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.parser = pyRdfa()
    elif 'microdata' == impl:
        self.range_uri = "http://schema.org/range"
        self.domain_uri = "http://schema.org/domain"
        self.type_uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
        self.subclass_uri = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.parser = pyMicrodata()
    super(Graph, self).__init__()
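# A brief sketch of how the predicate URIs above can be used once self.parser
# has filled an rdflib graph; attributes_of is a hypothetical helper, not part
# of the original class.
from rdflib import URIRef

def attributes_of(graph, class_uri, domain_uri):
    """Return every property whose declared domain (rdfs:domain, or
    schema.org's domain, depending on impl) is class_uri."""
    return list(graph.subjects(URIRef(domain_uri), URIRef(class_uri)))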
import sys, getopt
from pyMicrodata import pyMicrodata

base = ""
format = "turtle"  # output defaults
try:
    opts, value = getopt.getopt(sys.argv[1:], "xtjnpb:")
    for o, a in opts:
        if o == "-t":
            format = "turtle"
        elif o == "-j":
            format = "json-ld"
        elif o == "-n":
            format = "nt"
        elif o == "-p" or o == "-x":
            format = "pretty-xml"
        elif o == "-b":
            base = a
        else:
            usage()
            sys.exit(1)
except getopt.GetoptError:
    usage()
    sys.exit(1)

processor = pyMicrodata(base)
if len(value) >= 1:
    print(processor.rdf_from_sources(value, outputFormat=format))
else:
    print(processor.rdf_from_source(sys.stdin, outputFormat=format))
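# Example invocations (the script name microdata.py is illustrative):
#
#   python microdata.py -t page.html                           # Turtle output
#   python microdata.py -j -b http://example.org/ page.html    # JSON-LD with an explicit base
#   cat page.html | python microdata.py -n                     # N-Triples from stdin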
Run the microdata testing locally
"""
import sys
sys.path.insert(0, "/Users/ivan/Library/Python")  # You may want to adapt this to your environment...
from pyMicrodata import pyMicrodata, __version__

###########################################
test_path = "/Users/ivan/W3C/github/microdata-rdf/tests/"
test_file_base = test_path + ("%04d" % int(sys.argv[1]))
#test_file_base = test_path + ("sdo_eg_md_%d" % int(sys.argv[1]))
test_html = test_file_base + ".html"
test_ttl = test_file_base + ".ttl"

processor = pyMicrodata()
print(processor.rdf_from_source(test_html))
print("----")
with open(test_ttl) as f:
    for l in f:
        print(l, end="")
print("----")
with open(test_html) as f:
    for l in f:
        print(l, end="")
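# Example run (assuming the script is saved as run_test.py; the name is
# illustrative): "python run_test.py 12" prints pyMicrodata's output for
# tests/0012.html, followed by the expected tests/0012.ttl and the HTML source.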
class Crawler(Spider):
    name = "crawler"
    microdata = pyMicrodata()
    no_duplicated_items = 0
    context = None
    standard_sample = None
    map_schema = None
    data_reduction = None
    parse_job = None
    # vananh
    currentDomain = ""
    no_not_vi_doc = 0
    custom_settings = {
        # 'FEED_FORMAT': 'json',
        # 'FEED_URI': 'topcv.json',
        'ITEM_PIPELINES': {
            'pipelines.MongoPipeline': 300
        },
        'MONGO_URI': MONGO_URI,
        'MONGO_DATABASE': MONGO_DATABASE,
        'MONGO_COLLECTION': MONGO_COLLECTION
    }

    def __init__(self, name=None, **kwargs):
        self.domain = kwargs.get('domain')
        Crawler.currentDomain = self.domain
        super(Crawler, self).__init__(name, **kwargs)

    def start_requests(self):
        if os.path.exists(get_context_file(self.domain)):
            with open(get_context_file(self.domain), mode='r', encoding='utf8') as f:
                self.context = json.load(f)
            if not self.context['is_finished']:
                raise Exception('Context file is not complete')
            if self.context['data_format'] == 'json+ld':
                self.parse_job = self.parse_job_json
            else:
                self.parse_job = self.parse_job_microdata
            self.standard_sample = self.get_standard_sample(STANDARD_ATTRIBUTES_FN)
            self.map_schema = self.get_map_schema(self.context['schema'])
            self.data_reduction = self.get_data_reduction(MONGO_URI, MONGO_DATABASE, MONGO_COLLECTION)
        else:
            raise Exception('Context file does not exist: ' + get_context_file(self.domain))
        yield Request(url=self.context['start_url'], callback=self.parse)

    def parse(self, response):
        next_page = response.xpath(self.context['selectors']['next_page'] + '/@href').get()
        job_urls = response.xpath(self.context['selectors']['job_url'] + '/@href').getall()
        # vananh
        #yield Request(url=get_correct_url(job_urls[0], response), callback=self.parse_job)
        for job_url in job_urls:
            # job_url = response.urljoin(job_url)
            yield Request(url=get_correct_url(job_url, response), callback=self.parse_job)
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(url=get_correct_url(next_page, response), callback=self.parse)

    def parse_job_json(self, response):
        job_url = response.request.url
        jobs = self.get_json_from_response_json(response)
        job_selectors = self.context['selectors']['job_selectors']
        for job in jobs:
            job = self.change_to_right_form(job)
            if job_selectors is not None:
                for field, selector in job_selectors.items():
                    print(selector)
                    job[field] = ','.join(
                        text.strip()
                        for text in response.xpath(selector + '/text()').extract()
                        if text is not None)
            job = self.normalize(job, job_url)
            yield job

    def parse_job_microdata(self, response):
        job_url = response.request.url
        jobs = self.get_json_from_response_microdata(response)
        job_selectors = self.context['selectors']['job_selectors']
        for job in jobs:
            job = self.change_to_right_form(job)
            if job_selectors is not None:
                for field, selector in job_selectors.items():
                    #print(selector)
                    job[field] = ','.join(
                        text.strip()
                        for text in response.xpath(selector + '/text()').extract()
                        if text is not None)
            job = self.normalize(job, job_url)
            print(job_url)
            yield job

    @staticmethod
    def get_json_from_response_json(response):
        result = []
        dom = etree.HTML(response.body.decode("utf8"))
        json_node = dom.xpath("//script[text()]")
        for node in json_node:
            try:
                job = json.loads(node.text, strict=False)
                if job['@type'] == 'JobPosting':
                    # van anh
                    # run the language detection here
                    vi_lang = Crawler.is_vi_language(job["description"])
                    if not vi_lang:
                        Crawler.no_not_vi_doc = Crawler.no_not_vi_doc + 1
                        return result
                    #
                    if Crawler.currentDomain == "topcv":
                        temp_job = job
                        job = Crawler.seperate_attributes_topcv(temp_job, dom)
                    result.append(job)
            except (ValueError, TypeError):
                pass
        return result

    def get_json_from_response_microdata(self, response):
        raw_json = json.loads(
            self.microdata.rdf_from_source(response.body, 'json-ld').decode('utf8'))
        result = parse_json(raw_json)
        return result

    def change_to_right_form(self, job):
        norm_job = self.standard_sample.copy()
        flatten_job = flatten_dict(job)
        for key, value in self.map_schema.items():
            real_value = flatten_job.get(key)
            if real_value is None:
                continue
            attribute = norm_job
            for attribute_level in value[:-1]:
                attribute = attribute.get(attribute_level)
            if type(real_value) is str:
                attribute[value[-1]] = re.sub(r'<[^<>]*>', '', str(real_value))
            elif type(attribute[value[-1]]) == dict and type(real_value) == list:
                attribute[value[-1]] = real_value[0]
            else:
                attribute[value[-1]] = real_value
        return norm_job

    def normalize(self, job, url):
        result = normalize_job(job)
        result['url'] = url
        # Check duplicate
        if self.data_reduction.is_match(self.get_filter_data(job)):
            self.no_duplicated_items += 1
            result = None
        return result

    @staticmethod
    def get_standard_sample(file_name):
        if os.path.exists(file_name):
            with open(file_name, mode='r', encoding='utf8') as f:
                standard_sample = json.load(f)
        else:
            raise Exception('Standard file does not exist: ' + file_name)
        return standard_sample

    @staticmethod
    def get_map_schema(schema):
        return {key: value.split('_') for key, value in schema.items()}

    def get_data_reduction(self, uri, database, collection):
        collection = pymongo.MongoClient(uri)[database][collection]
        # Fetch title, hiring organization, location; skip the _id of each job
        jobs = list(
            collection.find({}, {
                'title': 1,
                'hiringOrganization.name': 1,
                'jobLocation.address.addressRegion': 1,
                'validThrough': 1,
                'datePosted': 1,
                '_id': 0
            }))
        data = [self.get_filter_data(job) for job in jobs]
        data_reduction = DataReduction(3, data)
        return data_reduction

    @staticmethod
    def get_filter_data(job):
        title = job['title']
        hiring_organization_name = job['hiringOrganization']['name']
        if type(job['jobLocation']) is list:
            address_region = ','.join([
                location['address']['addressRegion']
                for location in job['jobLocation']
            ])
        else:
            address_region = job['jobLocation']['address']['addressRegion']
        # vananh
        # validThrough is stored as a date in the DB => convert to str
        valid_through = job['validThrough']
        #validThroughDate = pd.to_datetime(date_str)
        #valid_through = str(validThroughDate.year) + "-" + str(validThroughDate.month) + "-" + str(validThroughDate.day)
        date_posted = job['datePosted']
        return [
            title, hiring_organization_name, address_region, date_posted,
            valid_through
        ]
        #return [title, hiring_organization_name, address_region, valid_through]

    # van anh
    # vananh
    @staticmethod
    def seperate_attributes_topcv(job, dom):
        inital_description = job['description']
        description_dom = etree.HTML(inital_description)
        first_benefit = ""
        first_requirement = ""
        if "jobBenefits" not in job:
            raw_benefits = description_dom.xpath(
                "//*[contains(text(),'Quyền lợi')]/following-sibling::*")
            raw_benefits_str = ""
            for bnf in raw_benefits:
                bnf_str = etree.tostring(bnf, method='html', encoding="unicode")
                raw_benefits_str = raw_benefits_str + bnf_str
            first_benefit = etree.tostring(raw_benefits[0], method='html', encoding="unicode")
            job["jobBenefits"] = raw_benefits_str
        if "experienceRequirements" not in job:
            raw_requirements = description_dom.xpath(
                "//*[contains(text(),'Yêu cầu')]/following-sibling::*")
            requirements_str = ""
            req_length = len(raw_requirements)
            i = 0
            while i < req_length:
                req_str = etree.tostring(raw_requirements[i], method='html', encoding="unicode")
                if first_benefit == req_str:
                    folowing_req_str = etree.tostring(raw_requirements[i - 1], method='html', encoding="unicode")
                    requirements_str = requirements_str.replace(folowing_req_str, "")
                    break
                requirements_str = requirements_str + req_str
                i += 1
            first_requirement = etree.tostring(raw_requirements[0], method='html', encoding="unicode")
            job["experienceRequirements"] = requirements_str
        #
        if first_requirement.strip() != "":
            std_description = description_dom.xpath(
                "//*[contains(text(),'Mô tả')]/following-sibling::*")
            std_description_str = ""
            i = 0
            std_description_length = len(std_description)
            while i < std_description_length:
                des_str = etree.tostring(std_description[i], method='html', encoding="unicode")
                if first_requirement == des_str:
                    folowing_des_str = etree.tostring(std_description[i - 1], method='html', encoding="unicode")
                    std_description_str = std_description_str.replace(folowing_des_str, "")
                    break
                std_description_str = std_description_str + des_str
                i += 1
            job["description"] = std_description_str
        if job["experienceRequirements"] == "" or job["jobBenefits"] == "":
            job["jobBenefits"] = seperate.extract_info(inital_description, "quyền lợi")
            job["description"] = seperate.extract_info(inital_description, "mô tả")
            job["experienceRequirements"] = seperate.extract_info(inital_description, "yêu cầu")
        if job["experienceRequirements"] == "" and job["jobBenefits"] == "" and job["description"] == "":
            #print("lala")
            meta_description = dom.xpath("//meta[@name='description']")
            for temp in meta_description:
                job["jobBenefits"] = ""
                job["description"] = temp.attrib['content']
                job["experienceRequirements"] = temp.attrib['content']
        # get the number of openings:
        job_available_node = dom.xpath(
            "//div[@id='col-job-right']//div[@id='box-info-job']//div[@class='job-info-item']//*[contains(text(),'cần tuyển')]/following-sibling::*[1]"
        )
        if len(job_available_node) == 0:
            job_available_node = dom.xpath("///*[@data-original-title='Số lượng cần tuyển']")
        if len(job_available_node) > 0:
            job_available_text = job_available_node[0].text
            if "không giới hạn" in job_available_text.lower():
                job["totalJobOpenings"] = 10
            elif "người" in job_available_text.lower():
                num_job_available = job_available_text.split(" ")[0]
                if num_job_available.isdigit():
                    job["totalJobOpenings"] = int(num_job_available)
                else:
                    job["totalJobOpenings"] = 1
            else:
                job["totalJobOpenings"] = 1
        #print(job["totalJobOpenings"])
        return job

    @staticmethod
    def is_vi_language(raw_text):
        tag_re = re.compile(r'<[^>]+>')
        text = tag_re.sub('', raw_text)
        text = text.strip()
        result = detect(text)
        if result != "vi":
            return False
        return True

    #
    def close(self, spider, reason):
        print("Number of English items: ", self.no_not_vi_doc)
        print('Number of duplicated items: %d' % self.no_duplicated_items)
        print("Finished!")
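# The helpers flatten_dict and parse_json are imported from elsewhere in this
# project. A minimal sketch of flatten_dict that is consistent with how
# change_to_right_form uses it (map_schema keys are underscore-joined paths,
# split with value.split('_')); this is an assumption, not the project's
# actual implementation.
def flatten_dict(d, parent_key=""):
    """Flatten nested dicts: {'a': {'b': 1}} -> {'a_b': 1}."""
    items = {}
    for key, value in d.items():
        new_key = parent_key + "_" + key if parent_key else key
        if isinstance(value, dict):
            items.update(flatten_dict(value, new_key))
        else:
            items[new_key] = value
    return items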
import sys, getopt
from pyMicrodata import pyMicrodata, __version__

base = ""
format = "turtle"  # output defaults
version_only = False
try:
    opts, value = getopt.getopt(sys.argv[1:], "vxtjnpb:")
    for o, a in opts:
        if o == "-t":
            format = "turtle"
        elif o == "-j":
            format = "json-ld"
        elif o == "-n":
            format = "nt"
        elif o == "-p" or o == "-x":
            format = "pretty-xml"
        elif o == "-b":
            base = a
        elif o == "-v":
            version_only = True
        else:
            usage()
            sys.exit(1)
except getopt.GetoptError:
    usage()
    sys.exit(1)

if version_only:
    print("pyMicrodata version: %s" % __version__)
    sys.exit(0)

processor = pyMicrodata(base)
if len(value) >= 1:
    print(processor.rdf_from_sources(value, outputFormat=format))
else:
    print(processor.rdf_from_source(sys.stdin, outputFormat=format))
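# Example (script name illustrative): "python microdata.py -v" prints the
# pyMicrodata version and exits without processing any input.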
class SchemaCrawler(Spider):
    name = "schema_crawler"
    microdata = pyMicrodata()
    schema = None
    get_job_sample = None
    samples = []
    selectors = {
        # 'job_url': "//*[@id='box-job-result']/div[1]/div/div/div[2]/h4/a",
        # 'next_page': "//*[@id='box-job-result']/div[2]/ul/li[last()]/a",
    }
    # start_url = 'https://www.topcv.vn/viec-lam/moi-nhat.html?utm_source=click-search-job&utm_medium=page-job&utm_campaign=tracking-job'
    context = {}
    domain = ""
    currentDomain = ""

    def __init__(self, name=None, **kwargs):
        self.start_url = kwargs.get('start_url')
        self.selectors['job_url'] = kwargs.get('job_url')
        self.selectors['next_page'] = kwargs.get('next_page')
        self.domain = kwargs.get('domain')
        #self.driver = webdriver.Firefox()
        super(SchemaCrawler, self).__init__(name, **kwargs)

    def start_requests(self):
        self.context['start_url'] = self.start_url
        self.context['domain'] = self.domain
        SchemaCrawler.currentDomain = self.domain
        if not os.path.exists(get_context_file(self.domain)):
            if not os.path.exists(STANDARD_ATTRIBUTES_FN):
                raise Exception('Standard file does not exist: ' + STANDARD_ATTRIBUTES_FN)
            yield Request(url=self.start_url, callback=self.get_data_format)
        # vananh
        #yield Request(url=self.start_url, callback=self.get_data_format)

    def parse(self, response):
        pass

    def get_data_format(self, response):
        # only take the first URL
        sample_job_url = response.xpath(self.selectors['job_url'] + '/@href').get()
        # vananh
        #print(len(sample_job_url))
        #print(sample_job_url)
        yield Request(url=get_correct_url(sample_job_url, response),
                      callback=self.decide_data_format)

    def decide_data_format(self, response):
        can_crawl = True
        if self.is_data_json_format(response):
            print("json")
            # bind the handler here; it is called later
            self.get_job_sample = self.get_job_sample_json
            self.context['data_format'] = 'json+ld'
        elif self.is_data_microdata_format(response):
            self.get_job_sample = self.get_job_sample_microdata
            self.context['data_format'] = 'microdata'
        else:
            print('Cannot crawl')
            can_crawl = False
        # vananh
        if can_crawl:
            yield Request(url=self.start_url, callback=self.get_job_url_samples)

    def get_job_url_samples(self, response):
        job_urls = response.meta.setdefault('job_urls', [])
        next_page = response.xpath(self.selectors['next_page'] + '/@href').get()
        #print(next_page)
        # collect up to MAX_NO_SAMPLES job_urls
        job_urls += response.xpath(self.selectors['job_url'] + '/@href').getall()
        #print(job_urls)
        if next_page is not None and len(job_urls) < MAX_NO_SAMPLES:
            yield Request(url=get_correct_url(next_page, response),
                          callback=self.get_job_url_samples,
                          meta={'job_urls': job_urls})
        else:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample,
                          meta={'job_urls': job_urls[1:MAX_NO_SAMPLES]})

    def decide_schema(self):
        #print("VanAnh\n\n")
        print("number of samples: ", len(self.samples))
        schema = JobSchemaDetection(self.samples, MODEL_DIR,
                                    STANDARD_ATTRIBUTES_FN,
                                    WEIGHT_MODEL_FN).get_mapping_schema()
        self.context['schema'] = schema
        self.context['selectors'] = self.selectors
        self.context['is_finished'] = False
        self.logger.error(self.context)
        with open(get_context_file(self.domain), mode='w', encoding='utf8') as f:
            json.dump(self.context, f)

    def get_job_sample_json(self, response):
        samples = response.meta.setdefault('samples', [])
        '''
        print("-------")
        print(response.meta['job_urls'])
        print("------")
        '''
        job_urls = response.meta['job_urls']
        #print(response.meta)
        samples += self.get_json_from_response_json(response)
        #print(samples)
        # walk through the remaining job_urls, extracting each one
        if len(job_urls) > 0:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample_json,
                          meta={
                              'samples': samples,
                              'job_urls': job_urls[1:]
                          })
        else:
            self.samples = samples
            self.decide_schema()

    def get_job_sample_microdata(self, response):
        samples = response.meta.setdefault('samples', [])
        job_urls = response.meta['job_urls']
        samples.append(self.get_json_from_response_microdata(response))
        if len(job_urls) > 0:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample_microdata,
                          meta={
                              'samples': samples,
                              'job_urls': job_urls[1:]
                          })
        else:
            self.samples = samples
            self.decide_schema()

    def is_data_json_format(self, response):
        return len(self.get_json_from_response_json(response, True)) > 0  # True added

    def is_data_microdata_format(self, response):
        return len(self.get_json_from_response_microdata(response)) > 0

    @staticmethod
    def get_json_from_response_json(response, is_sample=False):
        print("url:")
        print(response.url)
        result = []
        dom = etree.HTML(response.body.decode("utf8"))
        json_node = dom.xpath("//script[text()]")  # locate the JSON-LD script blocks
        for node in json_node:
            try:
                job = json.loads(node.text, strict=False)
                if job['@type'] == 'JobPosting':
                    # van anh
                    if is_sample == False:  # added by me
                        # run the language detection here
                        vi_lang = SchemaCrawler.is_vi_language(job["description"])
                        if not vi_lang:
                            print("not Vietnamese")
                            print(response.url)
                            print("-----")
                            return result
                        #
                        if SchemaCrawler.currentDomain == "topcv":
                            temp_job = job
                            job = SchemaCrawler.seperate_attributes_topcv(temp_job, dom)
                            # also extracts the number of openings
                        elif SchemaCrawler.currentDomain == "timviecnhanh":
                            temp_job = job
                            job = SchemaCrawler.seperate_attributes_timviecnhanh(temp_job, dom)
                    #print(job)
                    result.append(job)
            except (ValueError, TypeError):
                pass
        return result

    def get_json_from_response_microdata(self, response):
        print("microdata")
        raw_json = json.loads(
            self.microdata.rdf_from_source(response.body, 'json-ld').decode('utf8'))
        #print(raw_json)
        result = parse_json(raw_json)
        return result

    # vananh
    @staticmethod
    def seperate_attributes_topcv(job, dom):
        print("ok cv")
        inital_description = job['description']
        # check for English first - if it is English, this returns None
        description_dom = etree.HTML(inital_description)
        first_benefit = ""
        first_requirement = ""
        if "jobBenefits" not in job:
            raw_benefits = description_dom.xpath(
                "//*[contains(text(),'Quyền lợi')]/following-sibling::*")
            raw_benefits_str = ""
            for bnf in raw_benefits:
                bnf_str = etree.tostring(bnf, method='html', encoding="unicode")
                raw_benefits_str = raw_benefits_str + bnf_str
            first_benefit = etree.tostring(raw_benefits[0], method='html', encoding="unicode")
            job["jobBenefits"] = raw_benefits_str
        if "experienceRequirements" not in job:
            raw_requirements = description_dom.xpath(
                "//*[contains(text(),'Yêu cầu')]/following-sibling::*")
            requirements_str = ""
            req_length = len(raw_requirements)
            i = 0
            while i < req_length:
                req_str = etree.tostring(raw_requirements[i], method='html', encoding="unicode")
                if first_benefit == req_str:
                    folowing_req_str = etree.tostring(raw_requirements[i - 1], method='html', encoding="unicode")
                    requirements_str = requirements_str.replace(folowing_req_str, "")
                    break
                requirements_str = requirements_str + req_str
                i += 1
            first_requirement = etree.tostring(raw_requirements[0], method='html', encoding="unicode")
            job["experienceRequirements"] = requirements_str
        #
        if first_requirement.strip() != "":
            std_description = description_dom.xpath(
                "//*[contains(text(),'Mô tả')]/following-sibling::*")
            std_description_str = ""
            i = 0
            std_description_length = len(std_description)
            while i < std_description_length:
                des_str = etree.tostring(std_description[i], method='html', encoding="unicode")
                if first_requirement == des_str:
                    folowing_des_str = etree.tostring(std_description[i - 1], method='html', encoding="unicode")
                    std_description_str = std_description_str.replace(folowing_des_str, "")
                    break
                std_description_str = std_description_str + des_str
                i += 1
            job["description"] = std_description_str
        # get the number of openings:
        job_available_node = dom.xpath(
            "//div[@id='col-job-right']//div[@id='box-info-job']//div[@class='job-info-item']//*[contains(text(),'cần tuyển')]/following-sibling::*[1]"
        )
        #print("number of openings:")
        #print(job_available_node)
        if len(job_available_node) == 0:
            job_available_node = dom.xpath("///*[@data-original-title='Số lượng cần tuyển']")
        if len(job_available_node) > 0:
            job_available_text = job_available_node[0].text
            if "không giới hạn" in job_available_text.lower():
                job["totalJobOpenings"] = 50
            elif "người" in job_available_text.lower():
                num_job_available = job_available_text.split(" ")[0]
                if num_job_available.isdigit():
                    job["totalJobOpenings"] = int(num_job_available)
                else:
                    job["totalJobOpenings"] = 1
            else:
                job["totalJobOpenings"] = 1
        #print(job["totalJobOpenings"])
        return job

    @staticmethod
    def seperate_attributes_timviecnhanh(job, dom):
        '''
        https://www.timviecnhanh.com/dxmbhn/nhan-vien-kinh-doanh-bds-kv-quan-4-quan-7-nha-be-dat-xanh-mien-bac-ho-chi-minh-id4352813.html
        '''
        jobOpenings = 0
        if "totalJobOpenings" not in job:
            job_available_values = dom.xpath(
                "//*[@id='left-content']//*[contains(text(),'Số lượng tuyển dụng')]/parent::*/text()"
            )
            if len(job_available_values) == 0:
                job_available_values = dom.xpath(
                    "//div[@class='info-left']//*[contains(text(),'Số lượng cần tuyển')]/parent::*/text()"
                )
            for value in job_available_values:
                #print("value")
                #print(value)
                temp = value.strip()
                if temp != "" and temp.isdigit():
                    job["totalJobOpenings"] = int(temp)
                    jobOpenings = job["totalJobOpenings"]
                elif temp != "" and "giới hạn" in temp:
                    job["totalJobOpenings"] = 10
                    jobOpenings = job["totalJobOpenings"]
        if jobOpenings == 0:
            job["totalJobOpenings"] = 2
        #print("timviecnhanh")
        #print(job["totalJobOpenings"])
        #print(job)
        return job

    @staticmethod
    def is_vi_language(raw_text):
        tag_re = re.compile(r'<[^>]+>')
        text = tag_re.sub('', raw_text)
        text = text.strip()
        result = detect(text)
        if result != "vi":
            return False
        return True
def _process(self, graph, baseURI, orig_source):
    from pyMicrodata import pyMicrodata
    processor = pyMicrodata(base=baseURI)
    processor.graph_from_source(orig_source, graph=graph, rdfOutput=False)
def __init__(self, graph, doc_lines, url=""): super(MicrodataValidator, self).__init__(graph, doc_lines, url=url) self.parser = pyMicrodata() self.graph = self.graph.microdata_graph # use the microdata half of the compound
sys.path.insert(0,"/Users/ivan/Library/Python") # You may want to adapt this to your environment... import sys, getopt from pyMicrodata import pyMicrodata, __version__ ########################################### test_path = "/Users/ivan/W3C/github/microdata-rdf/tests/" test_file_base = test_path + ("%04d" % int(sys.argv[1])) #test_file_base = test_path + ("sdo_eg_md_%d" % int(sys.argv[1])) test_html = test_file_base + ".html" test_ttl = test_file_base + ".ttl" processor = pyMicrodata() print processor.rdf_from_source(test_html) print "----" with open(test_ttl) as f : for l in f: print l, print "----" with open(test_html) as f : for l in f: print l,
class Crawler(Spider):
    name = "crawler"
    microdata = pyMicrodata()
    no_duplicated_items = 0
    context = None
    standard_sample = None
    map_schema = None
    data_reduction = None
    parse_job = None
    # vananh
    currentDomain = ""
    no_not_vi_doc = 0
    home = 0
    custom_settings = {
        # 'FEED_FORMAT': 'json',
        # 'FEED_URI': 'topcv.json',
        'ITEM_PIPELINES': {
            'pipelines.MongoPipeline': 300
        },
        'MONGO_URI': MONGO_URI,
        'MONGO_DATABASE': MONGO_DATABASE,
        'MONGO_COLLECTION': MONGO_COLLECTION,
        'USER_AGENT': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0",
        'HTTPERROR_ALLOW_ALL': True,
        'COOKIES_ENABLED': False,
    }
    handle_httpstatus_list = [404, 410]

    def __init__(self, name=None, **kwargs):
        self.domain = kwargs.get('domain')
        Crawler.currentDomain = self.domain
        super(Crawler, self).__init__(name, **kwargs)

    def start_requests(self):
        if os.path.exists(get_context_file(self.domain)):
            with open(get_context_file(self.domain), mode='r', encoding='utf8') as f:
                self.context = json.load(f)
            if not self.context['is_finished']:
                raise Exception('Context file is not complete')
            if self.context['data_format'] == 'json+ld':
                self.parse_job = self.parse_job_json
            else:
                self.parse_job = self.parse_job_microdata
            self.standard_sample = self.get_standard_sample(STANDARD_ATTRIBUTES_FN)
            self.map_schema = self.get_map_schema(self.context['schema'])
            self.data_reduction = self.get_data_reduction(MONGO_URI, MONGO_DATABASE, MONGO_COLLECTION)
            self.inserted_data = []
            self.inserted_data_reduction = DataReduction(3, self.inserted_data)
            #self.eng_collection = pymongo.MongoClient(MONGO_URI)[MONGO_DATABASE]["english_job"]
        else:
            raise Exception('Context file does not exist: ' + get_context_file(self.domain))
        yield Request(url=self.context['start_url'], callback=self.parse)

    def parse(self, response):
        next_page = response.xpath(self.context['selectors']['next_page'] + '/@href').get()
        job_urls = response.xpath(self.context['selectors']['job_url'] + '/@href').getall()
        #job_urls = []
        default_url = "https://www.timviecnhanh.com/tuyen-nhan-vien-phuc-vu-nha-hang-part-time-ho-chi-minh-"
        # errors -- 3041261 3021301 3021261
        # 3191261 not done yet
        # wrong: 3891261, 3991261 3907733
        # 3891261-3901261 ok
        # wrong: 3491261, 3591261 not done yet - 3562917 ok
        for i in range(3271261, 3281261):
            job_url = "https://www.timviecnhanh.com/tuyen-ke-toan-tong-hop-ho-chi-minh-" + str(i) + ".html"
            #job_url = 'https://www.timviecnhanh.com/tuyen-ke-toan-van-phong-3084598.html'
            #headers = {'User-Agent': 'whatever'}
            yield Request(url=get_correct_url(job_url, response),
                          callback=self.parse_job,
                          errback=self.error_parse)
        '''
        for job_url in job_urls:
            # job_url = response.urljoin(job_url)
            yield Request(url=get_correct_url(job_url, response), callback=self.parse_job)
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(url=get_correct_url(next_page, response), callback=self.parse)
        '''

    def error_parse(self, response):
        print(response.status)
        print("errrrrr")

    def parse_job_json(self, response):
        job_url = response.request.url
        #error_log = response.request.url + '-----' + str(response.status) + '\n'
        #self.logger.error(error_log)
        jobs = self.get_json_from_response_json(response)
        job_selectors = self.context['selectors']['job_selectors']
        for job in jobs:
            # if there is no industry, look it up in a neighbouring posting
            if "industry" not in job:
                print("not in")
                job["url"] = job_url
                yield self.get_from_neighbor(response, job)
                continue
            language = job["language"]
            job = self.change_to_right_form(job)
            if job_selectors is not None:
                for field, selector in job_selectors.items():
                    print(selector)
                    job[field] = ','.join(
                        text.strip()
                        for text in response.xpath(selector + '/text()').extract()
                        if text is not None)
            job = self.normalize(job, job_url)
            if job is not None:
                job["language"] = language
                yield job

    def parse_job_microdata(self, response):
        job_url = response.request.url
        jobs = self.get_json_from_response_microdata(response)
        job_selectors = self.context['selectors']['job_selectors']
        for job in jobs:
            job = self.change_to_right_form(job)
            if job_selectors is not None:
                for field, selector in job_selectors.items():
                    #print(selector)
                    job[field] = ','.join(
                        text.strip()
                        for text in response.xpath(selector + '/text()').extract()
                        if text is not None)
            job = self.normalize(job, job_url)
            yield job

    @staticmethod
    def get_json_from_response_json(response):
        result = []
        # vananh
        '''
        # for topcv
        if response.url == "https://www.topcv.vn/viec-lam":
            Crawler.home = Crawler.home + 1
            return result
        '''
        #
        dom = etree.HTML(response.body.decode("utf8"))
        #print("-------raw------------")
        #print(dom.xpath("//title/text()"),'-------',response.url,'-----',response.status)
        #print("---------------------")
        # for timviecnhanh
        raw_title_list = dom.xpath("//title/text()")
        if len(raw_title_list) == 0:
            error_log = 'title0' + response.url + '-----' + str(response.status) + '\n'
            print(error_log)  # no self.logger in a @staticmethod
            return result
        raw_title = raw_title_list[0]
        if raw_title.lower() == "error":
            Crawler.home = Crawler.home + 1
            return result
        json_node = dom.xpath("//script[text()]")
        extract_job = None
        '''
        job_node = dom.xpath("//script[@type='application/ld+json']/text()")
        for jb in job_node:
            #print(jb)
            jb_str = jb.strip()
            jb_str = jb_str.replace('""', '"')
            jb_obj = json.loads(jb_str, strict=False)
            print("lalal")
            print(jb_obj["industry"])
        '''
        for node in json_node:
            try:
                job_str = node.text.strip()
                job_str = job_str.replace('""', '"')
                job = json.loads(job_str, strict=False)
                if job['@type'] == 'JobPosting':
                    print("---------jobPosting-----")
                    # van anh
                    # run the language detection here
                    extract_job = job
                    vi_lang = Crawler.is_vi_language(job['description'])
                    if not vi_lang:
                        job["language"] = "en"
                        temp_job = job
                        if Crawler.currentDomain == "topcv":
                            print(response.url)
                            job = Crawler.seperate_attributes_topcv(temp_job, dom, False)
                            Crawler.no_not_vi_doc = Crawler.no_not_vi_doc + 1
                            result.append(job)
                            return result
                        elif Crawler.currentDomain == "timviecnhanh":
                            print(response.url)
                            job = Crawler.extract_job_openings_tvn(temp_job, dom)
                            Crawler.no_not_vi_doc = Crawler.no_not_vi_doc + 1
                            result.append(job)
                            return result
                    else:
                        job["language"] = "vi"
                        temp_job = job
                        if Crawler.currentDomain == "topcv":
                            print(response.url)
                            job = Crawler.seperate_attributes_topcv(temp_job, dom)
                        elif Crawler.currentDomain == "timviecnhanh":
                            print(response.url)
                            job = Crawler.extract_job_openings_tvn(temp_job, dom)
                        result.append(job)
            except ValueError:
                pass
        if extract_job is None:
            if Crawler.currentDomain == "timviecnhanh":
                print(response.url)
                job = Crawler.seperate_attributes_timviecnhanh(dom)
                if job is not None:
                    result.append(job)
        return result

    def get_json_from_response_microdata(self, response):
        raw_json = json.loads(
            self.microdata.rdf_from_source(response.body, 'json-ld').decode('utf8'))
        result = parse_json(raw_json)
        return result

    def change_to_right_form(self, job):
        norm_job = self.standard_sample.copy()
        #print(norm_job)
        flatten_job = flatten_dict(job)
        for key, value in self.map_schema.items():
            real_value = flatten_job.get(key)
            if real_value is None:
                continue
            attribute = norm_job
            for attribute_level in value[:-1]:
                attribute = attribute.get(attribute_level)
            if type(real_value) is str:
                attribute[value[-1]] = re.sub(r'<[^<>]*>', '', str(real_value))
            elif type(attribute[value[-1]]) == dict and type(real_value) == list:
                attribute[value[-1]] = real_value[0]
            else:
                attribute[value[-1]] = real_value
        #print("norm_job")
        #print(norm_job)
        return norm_job

    def normalize(self, job, url):
        result = normalize_job(job)
        result['url'] = url
        # Check duplicates, also against the items inserted earlier in this run
        if self.data_reduction.is_match(self.get_filter_data(job)):
            self.no_duplicated_items += 1
            return None
        if self.inserted_data_reduction.is_match(self.get_filter_data(job)):
            self.no_duplicated_items += 1
            return None
        #self.inserted_data.append(self.get_filter_data(job))
        self.inserted_data_reduction.add_job(self.get_filter_data(job))
        return result

    @staticmethod
    def get_standard_sample(file_name):
        if os.path.exists(file_name):
            with open(file_name, mode='r', encoding='utf8') as f:
                standard_sample = json.load(f)
        else:
            raise Exception('Standard file does not exist: ' + file_name)
        return standard_sample

    @staticmethod
    def get_map_schema(schema):
        return {key: value.split('_') for key, value in schema.items()}

    def get_data_reduction(self, uri, database, collection):
        collection = pymongo.MongoClient(uri)[database][collection]
        # Fetch title, hiring organization, location; skip the _id of each job
        jobs = list(
            collection.find({}, {
                'title': 1,
                'hiringOrganization.name': 1,
                'jobLocation.address.addressRegion': 1,
                'validThrough': 1,
                'datePosted': 1,
                '_id': 0
            }))
        data = [self.get_filter_data(job) for job in jobs]
        data_reduction = DataReduction(3, data)
        return data_reduction

    @staticmethod
    def get_filter_data(job):
        title = job['title']
        hiring_organization_name = job['hiringOrganization']['name']
        if type(job['jobLocation']) is list:
            address_region = ','.join([
                location['address']['addressRegion']
                for location in job['jobLocation']
            ])
        else:
            address_region = job['jobLocation']['address']['addressRegion']
        # vananh
        # validThrough is stored as a date in the DB => convert to str
        valid_through = job['validThrough']
        #validThroughDate = pd.to_datetime(date_str)
        #valid_through = str(validThroughDate.year) + "-" + str(validThroughDate.month) + "-" + str(validThroughDate.day)
        date_posted = job['datePosted']
        return [
            title, hiring_organization_name, address_region, date_posted,
            valid_through
        ]

    # van anh
    # vananh
    @staticmethod
    def seperate_attributes_topcv(job, dom, is_vi=True):
        '''
        if not is_vi:
            # get the number of openings:
            job_available_node = dom.xpath("//div[@id='col-job-right']//div[@id='box-info-job']//div[@class='job-info-item']//*[contains(text(),'cần tuyển')]/following-sibling::*[1]")
            if len(job_available_node) == 0:
                job_available_node = dom.xpath("///*[@data-original-title='Số lượng cần tuyển']")
            if len(job_available_node) > 0:
                job_available_text = job_available_node[0].text
                if "không giới hạn" in job_available_text.lower():
                    job["totalJobOpenings"] = 10
                elif "người" in job_available_text.lower():
                    num_job_available = job_available_text.split(" ")[0]
                    if num_job_available.isdigit():
                        job["totalJobOpenings"] = int(num_job_available)
                    else:
                        job["totalJobOpenings"] = 2
                else:
                    job["totalJobOpenings"] = 2
            job["language"] = "en"
            return job
        '''
        inital_description = job['description']
        description_dom = etree.HTML(inital_description)
        first_benefit = ""
        first_requirement = ""
        if "jobBenefits" not in job:
            raw_benefits = description_dom.xpath(
                "//*[contains(text(),'Quyền lợi')]/following-sibling::*")
            print(len(raw_benefits))
            raw_benefits_str = ""
            for bnf in raw_benefits:
                bnf_str = etree.tostring(bnf, method='html', encoding="unicode")
                raw_benefits_str = raw_benefits_str + bnf_str
            if len(raw_benefits) > 0:
                first_benefit = etree.tostring(raw_benefits[0], method='html', encoding="unicode")
                job["jobBenefits"] = raw_benefits_str
            else:
                job["jobBenefits"] = ""
        if "experienceRequirements" not in job:
            raw_requirements = description_dom.xpath(
                "//*[contains(text(),'Yêu cầu')]/following-sibling::*")
            requirements_str = ""
            req_length = len(raw_requirements)
            i = 0
            while i < req_length:
                req_str = etree.tostring(raw_requirements[i], method='html', encoding="unicode")
                if first_benefit == req_str:
                    folowing_req_str = etree.tostring(raw_requirements[i - 1], method='html', encoding="unicode")
                    requirements_str = requirements_str.replace(folowing_req_str, "")
                    break
                requirements_str = requirements_str + req_str
                i += 1
            # if there is only one node, the requirements are empty and we are already at the benefits
            if len(raw_requirements) > 1:
                first_requirement = etree.tostring(raw_requirements[0], method='html', encoding="unicode")
                job["experienceRequirements"] = requirements_str
            else:
                job["experienceRequirements"] = ""
        #
        if first_requirement.strip() != "":
            print("hehe")
            std_description = description_dom.xpath(
                "//*[contains(text(),'Mô tả')]/following-sibling::*")
            std_description_str = ""
            i = 0
            std_description_length = len(std_description)
            while i < std_description_length:
                des_str = etree.tostring(std_description[i], method='html', encoding="unicode")
                if first_requirement == des_str:
                    folowing_des_str = etree.tostring(std_description[i - 1], method='html', encoding="unicode")
                    std_description_str = std_description_str.replace(folowing_des_str, "")
                    break
                std_description_str = std_description_str + des_str
                i += 1
            job["description"] = std_description_str
        # fix the out-of-range error
        print("exp ", job["experienceRequirements"])
        print("be ", job["jobBenefits"] == "")
        if job["experienceRequirements"] == "" or job["jobBenefits"] == "":
            job["jobBenefits"] = seperate.extract_info(inital_description, "quyền lợi")
            job["description"] = seperate.extract_info(inital_description, "mô tả")
            job["experienceRequirements"] = seperate.extract_info(inital_description, "yêu cầu")
        if job["experienceRequirements"] == "" and job["jobBenefits"] == "" and job["description"] == "":
            #print("lala")
            meta_description = dom.xpath("//meta[@name='description']")
            for temp in meta_description:
                job["jobBenefits"] = ""
                job["description"] = temp.attrib['content']
                job["experienceRequirements"] = temp.attrib['content']
            '''
            print(meta_description[0])
            content = meta_description[0].attrib['content']
            print("lalal2")
            print(content)
            '''
        # get the number of openings:
        job_available_node = dom.xpath(
            "//div[@id='col-job-right']//div[@id='box-info-job']//div[@class='job-info-item']//*[contains(text(),'cần tuyển')]/following-sibling::*[1]"
        )
        if len(job_available_node) == 0:
            job_available_node = dom.xpath("///*[@data-original-title='Số lượng cần tuyển']")
        if len(job_available_node) > 0:
            job_available_text = job_available_node[0].text
            if "không giới hạn" in job_available_text.lower():
                job["totalJobOpenings"] = 10
            elif "người" in job_available_text.lower():
                num_job_available = job_available_text.split(" ")[0]
                if num_job_available.isdigit():
                    job["totalJobOpenings"] = int(num_job_available)
                else:
                    job["totalJobOpenings"] = 2
            else:
                job["totalJobOpenings"] = 2
        if is_vi:
            job["language"] = "vi"
        else:
            job["language"] = "en"
        #print(job["totalJobOpenings"])
        return job

    @staticmethod
    def seperate_attributes_timviecnhanh(dom):
        job = {}
        meta_description = dom.xpath("//meta[@property='og:description']")
        for temp in meta_description:
            job["jobBenefits"] = ""
            job["description"] = temp.attrib['content']
            job["experienceRequirements"] = ""
        vi_lang = Crawler.is_vi_language(job['description'])
        if not vi_lang:
            Crawler.no_not_vi_doc = Crawler.no_not_vi_doc + 1
            return None
        job["language"] = "vi"
        raw_title = dom.xpath("//title/text()")[0]
        raw_title = raw_title.strip()
        title_list = raw_title.split("|")
        if len(title_list) > 1:
            raw_title = title_list[0].strip()
        job["title"] = raw_title
        job["validThrough"] = seperate.extract_info_tvn(job["description"], "ngày hết hạn")
        job["validThrough"] = seperate.normalize_date_tvn(job["validThrough"])
        job["hiringOrganization"] = {}
        job["hiringOrganization"]["name"] = seperate.extract_info_tvn(job["description"], "công ty")
        job["hiringOrganization"]["name"] = seperate.normalize_org_name_tvn(job["hiringOrganization"]["name"])
        raw_salary = seperate.extract_info_tvn(job["description"], "lương")
        job["baseSalary"] = {}
        job["baseSalary"] = seperate.extract_salary_tvn(raw_salary)
        job["totalJobOpenings"] = 2
        job["jobLocation"] = {"address": {}}
        # location
        raw_location = dom.xpath(
            "//div[@class='bottom-article']//a[contains(text(), 'Việc làm')]/text()")
        city = raw_location[0].strip()
        city = city.replace('Việc làm', '')
        city = city.strip()
        if city == "":
            city = "Việt Nam"
        job["jobLocation"]["address"]["addressLocality"] = city
        job["jobLocation"]["address"]["streetAddress"] = city
        job["jobLocation"]["address"]["addressCountry"] = "Việt Nam"
        category = dom.xpath(
            "//div[@class='line-full breadcrumb-line']//ol/li[4]//a/text()")
        job["industry"] = category[0]
        #print(job)
        return job

    @staticmethod
    def extract_job_openings_tvn(job, dom):
        jobOpenings = 0
        if "totalJobOpenings" not in job:
            job_available_values = dom.xpath(
                "//*[@id='left-content']//*[contains(text(),'Số lượng tuyển dụng')]/parent::*/text()"
            )
            if len(job_available_values) == 0:
                job_available_values = dom.xpath(
                    "//div[@class='info-left']//*[contains(text(),'Số lượng cần tuyển')]/parent::*/text()"
                )
            for value in job_available_values:
                #print("value")
                #print(value)
                temp = value.strip()
                if temp != "" and temp.isdigit():
                    job["totalJobOpenings"] = int(temp)
                    jobOpenings = job["totalJobOpenings"]
                elif temp != "" and "giới hạn" in temp:
                    job["totalJobOpenings"] = 10
                    jobOpenings = job["totalJobOpenings"]
        if jobOpenings == 0:
            job["totalJobOpenings"] = 2
        return job

    @staticmethod
    def is_vi_language(raw_text):
        tag_re = re.compile(r'<[^>]+>')
        text = tag_re.sub('', raw_text)
        text = text.strip()
        result = detect(text)
        if result != "vi":
            return False
        return True

    def get_from_neighbor(self, response, ini_job):
        #print("-----neighbor_job----")
        dom = etree.HTML(response.body.decode("utf8"))
        neighbor_urls = dom.xpath("//*[@id='job-hot-content']//tr[1]//a[1]")
        if len(neighbor_urls) == 0:
            neighbor_urls = dom.xpath("//*[@id='job-week-content']//tr[1]//a[1]")
        for neighbor_url in neighbor_urls:
            url = neighbor_url.attrib["href"]
            neighbor_request = Request(url=get_correct_url(url, response),
                                       callback=self.get_job_from_neighbor)
            neighbor_request.cb_kwargs["ini_job"] = ini_job
            #print("-----ok-----")
            return neighbor_request  # only the first neighbour is used

    def get_job_from_neighbor(self, response, ini_job):
        print("xxxxxxxxxxxxxxxxxxx")
        neighbor_jobs = self.get_json_from_response_json(response)
        for neighbor_job in neighbor_jobs:
            ini_job["industry"] = neighbor_job["industry"]
        job_selectors = self.context['selectors']['job_selectors']
        job = ini_job
        language = job["language"]
        job_url = job["url"]
        job = self.change_to_right_form(job)
        if job_selectors is not None:
            for field, selector in job_selectors.items():
                job[field] = ','.join(
                    text.strip()
                    for text in response.xpath(selector + '/text()').extract()
                    if text is not None)
        job = self.normalize(job, job_url)
        if job is not None:
            job["language"] = language
            yield job

    def close(self, spider, reason):
        print("Number of English items: ", self.no_not_vi_doc)
        print("Number of broken items: ", self.home)
        print('Number of duplicated items: %d' % self.no_duplicated_items)
        print("Finished!")
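# DataReduction is imported from elsewhere in the project. Judging only by its
# use above (DataReduction(3, rows), is_match(row), add_job(row)), it acts as a
# duplicate detector over fixed field lists. A naive, purely illustrative
# sketch; the meaning of the first constructor argument in the real class is
# unknown and it is ignored here.
class DataReduction:
    def __init__(self, n, data):
        self.n = n
        self.seen = {tuple(row) for row in data}

    def is_match(self, row):
        """Return True if an identical row has been seen before."""
        return tuple(row) in self.seen

    def add_job(self, row):
        """Register a row so later duplicates are detected."""
        self.seen.add(tuple(row))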
class SchemaCrawler(Spider):
    name = "schema_crawler"
    microdata = pyMicrodata()
    schema = None
    get_job_sample = None
    samples = []
    selectors = {
        # 'job_url': "//*[@id='box-job-result']/div[1]/div/div/div[2]/h4/a",
        # 'next_page': "//*[@id='box-job-result']/div[2]/ul/li[last()]/a",
    }
    # start_url = 'https://www.topcv.vn/viec-lam/moi-nhat.html?utm_source=click-search-job&utm_medium=page-job&utm_campaign=tracking-job'
    context = {}
    domain = None

    def __init__(self, name=None, **kwargs):
        self.start_url = kwargs.get('start_url')
        self.selectors['job_url'] = kwargs.get('job_url')
        self.selectors['next_page'] = kwargs.get('next_page')
        self.domain = kwargs.get('domain')
        #self.driver = webdriver.Firefox()
        super(SchemaCrawler, self).__init__(name, **kwargs)

    def start_requests(self):
        self.context['start_url'] = self.start_url
        self.context['domain'] = self.domain
        '''
        if not os.path.exists(get_context_file(self.domain)):
            if not os.path.exists(STANDARD_ATTRIBUTES_FN):
                raise Exception('Standard file does not exist: ' + STANDARD_ATTRIBUTES_FN)
            yield Request(url=self.start_url, callback=self.get_data_format)
        '''
        # vananh
        yield Request(url=self.start_url, callback=self.get_data_format)

    def parse(self, response):
        pass

    def get_data_format(self, response):
        # only take the first URL
        sample_job_url = response.xpath(self.selectors['job_url'] + '/@href').get()
        # vananh
        print(len(sample_job_url))
        #print(sample_job_url)
        yield Request(url=get_correct_url(sample_job_url, response),
                      callback=self.decide_data_format)

    def decide_data_format(self, response):
        can_crawl = True
        if self.is_data_json_format(response):
            print("json")
            # bind the handler here; it is called later
            self.get_job_sample = self.get_job_sample_json
            self.context['data_format'] = 'json+ld'
        elif self.is_data_microdata_format(response):
            self.get_job_sample = self.get_job_sample_microdata
            self.context['data_format'] = 'microdata'
        else:
            print('Cannot crawl')
            can_crawl = False
        # vananh
        if can_crawl:
            yield Request(url=self.start_url, callback=self.get_job_url_samples)

    def get_job_url_samples(self, response):
        job_urls = response.meta.setdefault('job_urls', [])
        next_page = response.xpath(self.selectors['next_page'] + '/@href').get()
        #print(next_page)
        # collect up to MAX_NO_SAMPLES job_urls
        job_urls += response.xpath(self.selectors['job_url'] + '/@href').getall()
        #print(job_urls)
        if next_page is not None and len(job_urls) < MAX_NO_SAMPLES:
            yield Request(url=get_correct_url(next_page, response),
                          callback=self.get_job_url_samples,
                          meta={'job_urls': job_urls})
        else:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample,
                          meta={'job_urls': job_urls[1:MAX_NO_SAMPLES]})

    def decide_schema(self):
        print("VanAnh\n\n")
        print("number of samples: ", len(self.samples))
        schema = JobSchemaDetection(self.samples, MODEL_DIR,
                                    STANDARD_ATTRIBUTES_FN,
                                    WEIGHT_MODEL_FN).get_mapping_schema()
        self.context['schema'] = schema
        self.context['selectors'] = self.selectors
        self.context['is_finished'] = False
        self.logger.error(self.context)
        with open(get_context_file(self.domain), mode='w', encoding='utf8') as f:
            json.dump(self.context, f)

    def get_job_sample_json(self, response):
        samples = response.meta.setdefault('samples', [])
        print("-------")
        print(response.meta['job_urls'])
        print("------")
        job_urls = response.meta['job_urls']
        #print(response.meta)
        samples += self.get_json_from_response_json(response)
        #print(samples)
        # walk through the remaining job_urls, extracting each one
        if len(job_urls) > 0:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample_json,
                          meta={
                              'samples': samples,
                              'job_urls': job_urls[1:]
                          })
        else:
            self.samples = samples
            #self.decide_schema()

    def get_job_sample_microdata(self, response):
        samples = response.meta.setdefault('samples', [])
        job_urls = response.meta['job_urls']
        samples.append(self.get_json_from_response_microdata(response))
        if len(job_urls) > 0:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample_microdata,
                          meta={
                              'samples': samples,
                              'job_urls': job_urls[1:]
                          })
        else:
            self.samples = samples
            self.decide_schema()

    def is_data_json_format(self, response):
        return len(self.get_json_from_response_json(response)) > 0

    def is_data_microdata_format(self, response):
        return len(self.get_json_from_response_microdata(response)) > 0

    @staticmethod
    def get_json_from_response_json(response):
        result = []
        dom = etree.HTML(response.body.decode("utf8"))
        json_node = dom.xpath("//script[text()]")  # locate the JSON-LD script blocks
        for node in json_node:
            try:
                job = json.loads(node.text, strict=False)
                if job['@type'] == 'JobPosting':
                    # van anh
                    # split benefits and requirements out of the description
                    inital_description = job['description']
                    description_dom = etree.HTML(inital_description)
                    first_benefit = ""
                    first_requirement = ""
                    if "jobBenefits" not in job:
                        raw_benefits = description_dom.xpath(
                            "//*[contains(text(),'Quyền lợi')]/following-sibling::*")
                        raw_benefits_str = ""
                        for bnf in raw_benefits:
                            bnf_str = etree.tostring(bnf, method='html', encoding="unicode")
                            raw_benefits_str = raw_benefits_str + bnf_str
                        first_benefit = etree.tostring(raw_benefits[0], method='html', encoding="unicode")
                        job["jobBenefits"] = raw_benefits_str
                    if "experienceRequirements" not in job:
                        raw_requirements = description_dom.xpath(
                            "//*[contains(text(),'Yêu cầu')]/following-sibling::*")
                        requirements_str = ""
                        req_length = len(raw_requirements)
                        i = 0
                        while i < req_length:
                            req_str = etree.tostring(raw_requirements[i], method='html', encoding="unicode")
                            if first_benefit == req_str:
                                folowing_req_str = etree.tostring(raw_requirements[i - 1], method='html', encoding="unicode")
                                requirements_str = requirements_str.replace(folowing_req_str, "")
                                break
                            requirements_str = requirements_str + req_str
                            i += 1
                        """
                        for req in raw_requirements:
                            req_str = etree.tostring(req, method='html', encoding="unicode")
                            if first_benefit == req_str:
                                break
                            requirements_str = requirements_str + req_str
                        """
                        first_requirement = etree.tostring(raw_requirements[0], method='html', encoding="unicode")
                        job["experienceRequirements"] = requirements_str
                    #
                    if first_requirement.strip() != "":
                        std_description = description_dom.xpath(
                            "//*[contains(text(),'Mô tả')]/following-sibling::*")
                        std_description_str = ""
                        i = 0
                        std_description_length = len(std_description)
                        while i < std_description_length:
                            des_str = etree.tostring(std_description[i], method='html', encoding="unicode")
                            if first_requirement == des_str:
                                folowing_des_str = etree.tostring(std_description[i - 1], method='html', encoding="unicode")
                                std_description_str = std_description_str.replace(folowing_des_str, "")
                                break
                            std_description_str = std_description_str + des_str
                            i += 1
                        '''
                        for des in std_description:
                            des_str = etree.tostring(des, method='html', encoding="unicode")
                            if first_requirement == des_str:
                                break
                            std_description_str = std_description_str + des_str
                        '''
                        job["description"] = std_description_str
                    print("*******************")
                    print(job)
                    print("\n\n")
                    print("**************")
                    #
                    result.append(job)
            except (ValueError, TypeError):
                pass
        return result

    def get_json_from_response_microdata(self, response):
        print("microdata")
        raw_json = json.loads(
            self.microdata.rdf_from_source(response.body, 'json-ld').decode('utf8'))
        print(raw_json)
        result = parse_json(raw_json)
        return result

    # vananh
    def seperate_attributes_topcv(self, job):
        inital_description = job['description']
        description_dom = etree.HTML(inital_description)
        first_benefit = ""
        first_requirement = ""
        if "jobBenefits" not in job:
            raw_benefits = description_dom.xpath(
                "//*[contains(text(),'Quyền lợi')]/following-sibling::*")
            raw_benefits_str = ""
            for bnf in raw_benefits:
                bnf_str = etree.tostring(bnf, method='html', encoding="unicode")
                raw_benefits_str = raw_benefits_str + bnf_str
            first_benefit = etree.tostring(raw_benefits[0], method='html', encoding="unicode")
            job["jobBenefits"] = raw_benefits_str
        if "experienceRequirements" not in job:
            raw_requirements = description_dom.xpath(
                "//*[contains(text(),'Yêu cầu')]/following-sibling::*")
            requirements_str = ""
            req_length = len(raw_requirements)
            i = 0
            while i < req_length:
                req_str = etree.tostring(raw_requirements[i], method='html', encoding="unicode")
                if first_benefit == req_str:
                    folowing_req_str = etree.tostring(raw_requirements[i - 1], method='html', encoding="unicode")
                    requirements_str = requirements_str.replace(folowing_req_str, "")
                    break
                requirements_str = requirements_str + req_str
                i += 1
            """
            for req in raw_requirements:
                req_str = etree.tostring(req, method='html', encoding="unicode")
                if first_benefit == req_str:
                    break
                requirements_str = requirements_str + req_str
            """
            first_requirement = etree.tostring(raw_requirements[0], method='html', encoding="unicode")
            job["experienceRequirements"] = requirements_str
        #
        if first_requirement.strip() != "":
            std_description = description_dom.xpath(
                "//*[contains(text(),'Mô tả')]/following-sibling::*")
            std_description_str = ""
            i = 0
            std_description_length = len(std_description)
            while i < std_description_length:
                des_str = etree.tostring(std_description[i], method='html', encoding="unicode")
                if first_requirement == des_str:
                    folowing_des_str = etree.tostring(std_description[i - 1], method='html', encoding="unicode")
                    std_description_str = std_description_str.replace(folowing_des_str, "")
                    break
                std_description_str = std_description_str + des_str
                i += 1
            '''
            for des in std_description:
                des_str = etree.tostring(des, method='html', encoding="unicode")
                if first_requirement == des_str:
                    break
                std_description_str = std_description_str + des_str
            '''
            job["description"] = std_description_str
        print("*******************")
        print(job)
        print("\n\n")
        print("**************")
        return job