Example No. 1
    def parse(self, source, graph,
              vocab_expansion=False,
              vocab_cache=False,
              rdfOutput=False):
        """
        @param source: one of the input sources that the RDFLib package defines
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa spec. parlance
        @type graph: RDFLib Graph
        @keyword vocab_expansion: whether the RDFa @vocab attribute should also trigger vocabulary expansion (see the RDFa 1.1 spec for further details)
        @type vocab_expansion: Boolean
        @keyword vocab_cache: if vocab expansion is used, whether the expansion data (i.e., the vocabulary) should be cached locally. This requires that the local application can write to the local file system
        @type vocab_cache: Boolean
        @keyword rdfOutput: whether exceptions should be caught and added, as triples, to the processor graph, or whether they should be raised
        @type rdfOutput: Boolean
        """
        from pyMicrodata import pyMicrodata

        # Extract the raw source (byte stream, URL, or file name) from the RDFLib input source
        if isinstance(source, StringInputSource):
            orig_source = source.getByteStream()
        elif isinstance(source, URLInputSource):
            orig_source = source.url
        elif isinstance(source, FileInputSource):
            orig_source = source.file.name
            source.file.close()

        baseURI = source.getPublicId()
        processor = pyMicrodata(base=baseURI, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache)
        processor.graph_from_source(orig_source, graph=graph, rdfOutput=rdfOutput)
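
For reference, here is a minimal, self-contained sketch of what the hook above does, calling pyMicrodata directly instead of going through an RDFLib InputSource. The URL is a placeholder and the keyword arguments simply mirror the ones used in parse():

from rdflib import Graph
from pyMicrodata import pyMicrodata

g = Graph()
processor = pyMicrodata(base="http://example.org/page.html",
                        vocab_expansion=False, vocab_cache=False)
# graph_from_source accepts a URL, file name, or file-like object, as in the parse() hook above
processor.graph_from_source("http://example.org/page.html", graph=g, rdfOutput=True)
print(g.serialize(format="turtle"))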
Example No. 2
    def __init__(self, source):
        super(CompoundGraph, self).__init__()
        try:
            self.microdata_graph = pyMicrodata().graph_from_source(source)
        except Exception:
            # microdata extraction failed; leave that half of the compound graph empty
            self.microdata_graph = None

        try:
            self.rdfa_graph = pyRdfa().graph_from_source(source)
        except Exception:
            # RDFa extraction failed; leave that half of the compound graph empty
            self.rdfa_graph = None
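
A minimal usage sketch for the CompoundGraph class above, assuming pyRdfa and pyMicrodata are importable and graph_from_source accepts the given source (the URL is a placeholder). Either half may be None when the corresponding extraction failed:

cg = CompoundGraph("http://example.org/page.html")

for name, g in (("microdata", cg.microdata_graph), ("rdfa", cg.rdfa_graph)):
    if g is None:
        print(name, "extraction failed")
    else:
        print(name, "graph holds", len(g), "triples")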
Example No. 3
    def parse(self, source, graph,
              pgraph=None,
              embedded_rdf=True,
              vocab_expansion=False,
              vocab_cache=False,
              rdfOutput=False):
        """
        @param source: one of the input sources that the RDFLib package defines
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa spec. parlance
        @type graph: RDFLib Graph
        @keyword pgraph: target for error and warning triples; processor graph, in RDFa spec. parlance. If set to None, these triples are ignored
        @type pgraph: RDFLib Graph
        @keyword embedded_rdf: some formats allow embedding RDF in other formats: (X)HTML can contain Turtle in a special <script> element, SVG can have RDF/XML embedded in a <metadata> element. This flag controls whether those triples should be interpreted and added to the output graph. Some languages (e.g., SVG) mandate this, in which case the flag is ignored.
        @type embedded_rdf: Boolean
        @keyword vocab_expansion: whether the RDFa @vocab attribute should also trigger vocabulary expansion (see the RDFa 1.1 spec for further details)
        @type vocab_expansion: Boolean
        @keyword vocab_cache: if vocab expansion is used, whether the expansion data (i.e., the vocabulary) should be cached locally. This requires that the local application can write to the local file system
        @type vocab_cache: Boolean
        @keyword rdfOutput: whether exceptions should be caught and added, as triples, to the processor graph, or whether they should be raised
        @type rdfOutput: Boolean
        """
        # Extract the raw source (byte stream, URL, or file name) from the RDFLib input source
        if isinstance(source, StringInputSource):
            orig_source = source.getByteStream()
        elif isinstance(source, URLInputSource):
            orig_source = source.url
        elif isinstance(source, FileInputSource):
            orig_source = source.file.name
            source.file.close()
        baseURI = source.getPublicId()

        # The RDFa part
        from pyRdfa import pyRdfa, Options
        self.options = Options(output_processor_graph=(pgraph is not None),
                               embedded_rdf=embedded_rdf,
                               vocab_expansion=vocab_expansion,
                               vocab_cache=vocab_cache)

        processor = pyRdfa(self.options, base=baseURI, media_type='text/html', rdfa_version='1.1')
        processor.graph_from_source(orig_source, graph=graph, pgraph=pgraph, rdfOutput=rdfOutput)

        # The Microdata part
        try:
            from pyMicrodata import pyMicrodata
            processor = pyMicrodata(base=baseURI, vocab_expansion=vocab_expansion, vocab_cache=vocab_cache)
            processor.graph_from_source(orig_source, graph=graph, rdfOutput=rdfOutput)
        except ImportError:
            # pyMicrodata is not installed; skip the microdata extraction
            pass
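
A minimal sketch of driving pyRdfa directly with a separate processor graph, mirroring the Options and keyword arguments used in the parse() method above; the source URL is a placeholder and both graphs are plain rdflib Graphs:

from rdflib import Graph
from pyRdfa import pyRdfa, Options

output = Graph()            # triples extracted from the page
processor_graph = Graph()   # error / warning triples emitted by the processor

options = Options(output_processor_graph=True,
                  embedded_rdf=True,
                  vocab_expansion=False,
                  vocab_cache=False)

processor = pyRdfa(options, base="http://example.org/page.html",
                   media_type='text/html', rdfa_version='1.1')
processor.graph_from_source("http://example.org/page.html",
                            graph=output, pgraph=processor_graph, rdfOutput=True)

print(len(output), "content triples,", len(processor_graph), "processor triples")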
Example No. 4
    def __init__(self, url, impl):
        self.ns_ont = {}
        self.attribs_by_class = defaultdict(list)
        self.ontologies = []  # are these initializations necessary?
        self.attributes = []
        self.source = url
        self.impl = impl
        if 'rdfa' == impl:
            self.range_uri = "http://www.w3.org/2000/01/rdf-schema#range"
            self.domain_uri = "http://www.w3.org/2000/01/rdf-schema#domain"
            self.type_uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
            self.subclass_uri = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
            self.parser = pyRdfa()
        elif 'microdata' == impl:
            self.range_uri = "http://schema.org/range"
            self.domain_uri = "http://schema.org/domain"
            self.type_uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
            self.subclass_uri = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
            self.parser = pyMicrodata()
        return super(Graph, self).__init__()
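
A minimal sketch of how the predicate URIs configured above could be used once self.parser has filled an rdflib graph; graph_from_source and the triples() filtering follow the rdflib/pyRdfa APIs shown elsewhere on this page, and the ontology URL is a placeholder:

from rdflib import Graph as RDFGraph, URIRef
from pyRdfa import pyRdfa

g = RDFGraph()
pyRdfa().graph_from_source("http://example.org/ontology.html", graph=g)

range_uri = URIRef("http://www.w3.org/2000/01/rdf-schema#range")
for prop, _, rng in g.triples((None, range_uri, None)):
    print(prop, "has range", rng)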
Example No. 5
try :
	opts, value = getopt.getopt(sys.argv[1:],"xtjnpb:")
	for o,a in opts:
		if o == "-t" :
			format = "turtle"
		elif o == "-j" :
			format = "json-ld"
		elif o == "-n" :
			format = "nt"
		elif o == "-p" or o == "-x":
			format = "pretty-xml"
		elif o == "-b" :
			base = a
		else :
			usage()
			sys.exit(1)
except :
	usage()
	sys.exit(1)

processor = pyMicrodata(base)
if len(value) >= 1 :
	print processor.rdf_from_sources(value, outputFormat = format)
else :
	print processor.rdf_from_source(sys.stdin, outputFormat = format)
Example No. 6
"""
Run the microdata testing locally
"""

import sys
sys.path.insert(0, "/Users/ivan/Library/Python")

# You may want to adapt this to your environment...
import sys, getopt

from pyMicrodata import pyMicrodata, __version__

###########################################

test_path = "/Users/ivan/W3C/github/microdata-rdf/tests/"
test_file_base = test_path + ("%04d" % int(sys.argv[1]))
#test_file_base = test_path + ("sdo_eg_md_%d" % int(sys.argv[1]))
test_html = test_file_base + ".html"
test_ttl = test_file_base + ".ttl"

processor = pyMicrodata()
print processor.rdf_from_source(test_html)
print "----"
with open(test_ttl) as f:
    for l in f:
        print l,

print "----"
with open(test_html) as f:
    for l in f:
        print l,
Example No. 7
class Crawler(Spider):
    name = "crawler"
    microdata = pyMicrodata()
    no_duplicated_items = 0
    context = None
    standard_sample = None
    map_schema = None
    data_reduction = None
    parse_job = None
    #vananh
    currentDomain = ""
    no_not_vi_doc = 0

    custom_settings = {
        # 'FEED_FORMAT': 'json',
        # 'FEED_URI': 'topcv.json',
        'ITEM_PIPELINES': {
            'pipelines.MongoPipeline': 300
        },
        'MONGO_URI': MONGO_URI,
        'MONGO_DATABASE': MONGO_DATABASE,
        'MONGO_COLLECTION': MONGO_COLLECTION
    }

    def __init__(self, name=None, **kwargs):
        self.domain = kwargs.get('domain')
        Crawler.currentDomain = self.domain
        super(Crawler, self).__init__(name, **kwargs)

    def start_requests(self):
        if os.path.exists(get_context_file(self.domain)):
            with open(get_context_file(self.domain), mode='r',
                      encoding='utf8') as f:
                self.context = json.load(f)
                f.close()
            if not self.context['is_finished']:
                raise Exception('Context file is not complete')
            else:
                if self.context['data_format'] == 'json+ld':
                    self.parse_job = self.parse_job_json
                else:
                    self.parse_job = self.parse_job_microdata

                self.standard_sample = self.get_standard_sample(
                    STANDARD_ATTRIBUTES_FN)
                self.map_schema = self.get_map_schema(self.context['schema'])
                self.data_reduction = self.get_data_reduction(
                    MONGO_URI, MONGO_DATABASE, MONGO_COLLECTION)
        else:
            raise Exception('Context file does not exist: ' +
                            get_context_file(self.domain))
        yield Request(url=self.context['start_url'], callback=self.parse)

    def parse(self, response):
        next_page = response.xpath(self.context['selectors']['next_page'] +
                                   '/@href').get()
        job_urls = response.xpath(self.context['selectors']['job_url'] +
                                  '/@href').getall()
        #vananh
        #yield Request(url=get_correct_url(job_urls[0], response), callback=self.parse_job)

        for job_url in job_urls:
            # job_url = response.urljoin(job_url)
            yield Request(url=get_correct_url(job_url, response),
                          callback=self.parse_job)

        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(url=get_correct_url(next_page, response),
                          callback=self.parse)

    def parse_job_json(self, response):
        job_url = response.request.url
        jobs = self.get_json_from_response_json(response)
        job_selectors = self.context['selectors']['job_selectors']
        for job in jobs:
            job = self.change_to_right_form(job)
            if job_selectors is not None:
                for field, selector in job_selectors.items():
                    print(selector)
                    job[field] = ','.join(
                        text.strip()
                        for text in response.xpath(selector +
                                                   '/text()').extract()
                        if text is not None)
                job = self.normalize(job, job_url)
                yield job

    def parse_job_microdata(self, response):
        job_url = response.request.url
        jobs = self.get_json_from_response_microdata(response)
        job_selectors = self.context['selectors']['job_selectors']
        for job in jobs:
            job = self.change_to_right_form(job)
            if job_selectors is not None:
                for field, selector in job_selectors.items():
                    #print(selector)
                    job[field] = ','.join(
                        text.strip()
                        for text in response.xpath(selector +
                                                   '/text()').extract()
                        if text is not None)
                job = self.normalize(job, job_url)
                print(job_url)
                yield job

    @staticmethod
    def get_json_from_response_json(response):
        result = []
        dom = etree.HTML(response.body.decode("utf8"))
        json_node = dom.xpath("//script[text()]")
        for node in json_node:
            try:
                job = json.loads(node.text, strict=False)
                if job['@type'] == 'JobPosting':
                    #van anh
                    # detect the language of the description here
                    vi_lang = Crawler.is_vi_language(job["description"])
                    if not vi_lang:
                        Crawler.no_not_vi_doc = Crawler.no_not_vi_doc + 1
                        return result
                        #
                    if Crawler.currentDomain == "topcv":
                        temp_job = job
                        job = Crawler.seperate_attributes_topcv(temp_job, dom)
                    result.append(job)

            except (ValueError, TypeError):
                pass
        return result

    def get_json_from_response_microdata(self, response):
        raw_json = json.loads(
            self.microdata.rdf_from_source(response.body,
                                           'json-ld').decode('utf8'))
        result = parse_json(raw_json)
        return result

    def change_to_right_form(self, job):
        norm_job = self.standard_sample.copy()
        flatten_job = flatten_dict(job)

        for key, value in self.map_schema.items():
            real_value = flatten_job.get(key)
            if real_value is None:
                continue
            else:
                attribute = norm_job
                for attribute_level in value[:-1]:
                    attribute = attribute.get(attribute_level)
                if type(real_value) is str:
                    attribute[value[-1]] = re.sub(r'<[^<>]*>', '',
                                                  str(real_value))
                elif type(attribute[value[-1]]) == dict and type(
                        real_value) == list:
                    attribute[value[-1]] = real_value[0]
                else:
                    attribute[value[-1]] = real_value

        return norm_job

    def normalize(self, job, url):
        result = normalize_job(job)
        result['url'] = url

        # Check duplicate
        if self.data_reduction.is_match(self.get_filter_data(job)):
            self.no_duplicated_items += 1
            result = None

        return result

    @staticmethod
    def get_standard_sample(file_name):
        if os.path.exists(file_name):
            with open(file_name, mode='r', encoding='utf8') as f:
                standard_sample = json.load(f)
                f.close()
        else:
            raise Exception('Standard file does not exist: ' + file_name)

        return standard_sample

    @staticmethod
    def get_map_schema(schema):
        return {key: value.split('_') for key, value in schema.items()}

    def get_data_reduction(self, uri, database, collection):
        collection = pymongo.MongoClient(uri)[database][collection]
        # Fetch title, hiring organization and location; exclude the _id of each job document
        jobs = list(
            collection.find({}, {
                'title': 1,
                'hiringOrganization.name': 1,
                'jobLocation.address.addressRegion': 1,
                'validThrough': 1,
                'datePosted': 1,
                '_id': 0
            }))
        data = [self.get_filter_data(job) for job in jobs]

        data_reduction = DataReduction(3, data)
        return data_reduction

    @staticmethod
    def get_filter_data(job):
        title = job['title']
        hiring_organization_name = job['hiringOrganization']['name']
        if type(job['jobLocation']) is list:
            address_region = ','.join([
                location['address']['addressRegion']
                for location in job['jobLocation']
            ])
        else:
            address_region = job['jobLocation']['address']['addressRegion']
        #vananh
        #validThrough =
        # stored as a date in the DB => convert to str
        valid_through = job['validThrough']
        #validThroughDate = pd.to_datetime(date_str)
        #valid_through = str(validThroughDate.year) + "-" + str(validThroughDate.month) + "-" + str(validThroughDate.day)
        date_posted = job['datePosted']
        return [
            title, hiring_organization_name, address_region, date_posted,
            valid_through
        ]
        #return [title, hiring_organization_name, address_region, valid_through]

    #van anh
    #vananh
    @staticmethod
    def seperate_attributes_topcv(job, dom):
        inital_description = job['description']
        description_dom = etree.HTML(inital_description)
        first_benefit = ""
        first_requirement = ""
        if "jobBenefits" not in job:
            raw_benefits = description_dom.xpath(
                "//*[contains(text(),'Quyền lợi')]/following-sibling::*")
            raw_benefits_str = ""
            for bnf in raw_benefits:
                bnf_str = etree.tostring(bnf,
                                         method='html',
                                         encoding="unicode")
                raw_benefits_str = raw_benefits_str + bnf_str
            first_benefit = etree.tostring(raw_benefits[0],
                                           method='html',
                                           encoding="unicode")
            jobBenefits = raw_benefits_str
            job["jobBenefits"] = jobBenefits
        if "experienceRequirements" not in job:
            raw_requirements = description_dom.xpath(
                "//*[contains(text(),'Yêu cầu')]/following-sibling::*")
            requirements_str = ""
            req_length = len(raw_requirements)
            i = 0
            while i < req_length:
                req_str = etree.tostring(raw_requirements[i],
                                         method='html',
                                         encoding="unicode")
                if (first_benefit == req_str):
                    folowing_req_str = etree.tostring(raw_requirements[i - 1],
                                                      method='html',
                                                      encoding="unicode")
                    requirements_str = requirements_str.replace(
                        folowing_req_str, "")
                    break
                requirements_str = requirements_str + req_str
                i += 1
            first_requirement = etree.tostring(raw_requirements[0],
                                               method='html',
                                               encoding="unicode")
            experienceRequirements = requirements_str
            job["experienceRequirements"] = experienceRequirements
        #
        if first_requirement.strip() != "":
            std_description = description_dom.xpath(
                "//*[contains(text(),'Mô tả')]/following-sibling::*")
            std_description_str = ""
            i = 0
            std_description_length = len(std_description)
            while i < std_description_length:
                des_str = etree.tostring(std_description[i],
                                         method='html',
                                         encoding="unicode")
                if first_requirement == des_str:
                    folowing_des_str = etree.tostring(std_description[i - 1],
                                                      method='html',
                                                      encoding="unicode")
                    std_description_str = std_description_str.replace(
                        folowing_des_str, "")
                    break
                std_description_str = std_description_str + des_str
                i += 1
            job["description"] = std_description_str
        if job["experienceRequirements"] == "" or job["jobBenefits"] == "":
            job["jobBenefits"] = seperate.extract_info(inital_description,
                                                       "quyền lợi")
            job["description"] = seperate.extract_info(inital_description,
                                                       "mô tả")
            job["experienceRequirements"] = seperate.extract_info(
                inital_description, "yêu cầu")
        if job["experienceRequirements"] == "" and job[
                "jobBenefits"] == "" and job["description"] == "":
            #print("lala")
            meta_description = dom.xpath("//meta[@name='description']")
            for temp in meta_description:
                job["jobBenefits"] = ""
                job["description"] = temp.attrib['content']
                job["experienceRequirements"] = temp.attrib['content']

        # get the number of job openings:
        job_available_node = dom.xpath(
            "//div[@id='col-job-right']//div[@id='box-info-job']//div[@class='job-info-item']//*[contains(text(),'cần tuyển')]/following-sibling::*[1]"
        )
        if (len(job_available_node) == 0):
            job_available_node = dom.xpath(
                "///*[@data-original-title='Số lượng cần tuyển']")
        if (len(job_available_node) > 0):
            job_available_text = job_available_node[0].text
            if "không giới hạn" in job_available_text.lower():
                job["totalJobOpenings"] = 10
            elif "người" in job_available_text.lower():
                num_job_available = (job_available_text.split(" ")[0])
                if (num_job_available.isdigit()):
                    job["totalJobOpenings"] = int(num_job_available)
            else:
                job["totalJobOpenings"] = 1
        else:
            job["totalJobOpenings"] = 1
        #print(job["totalJobOpenings"])
        return job

    @staticmethod
    def is_vi_language(raw_text):
        tag_re = re.compile(r'<[^>]+>')
        text = tag_re.sub('', raw_text)
        text = text.strip()
        result = detect(text)
        if result != "vi":
            return False
        return True

    #

    def close(self, spider, reason):
        print("Number of English items: ", self.no_not_vi_doc)
        print('Number of duplicated items: %d' % self.no_duplicated_items)
        print("Finished!")
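
The microdata branch of the spider above reduces to converting the response body to JSON-LD and post-processing it. A minimal sketch of that step; parse_json is the project's own helper (assumed from the spider's imports) and the HTML snippet is a stand-in for response.body:

import json
from pyMicrodata import pyMicrodata

html = b"""<div itemscope itemtype="http://schema.org/JobPosting">
  <span itemprop="title">Data engineer</span>
</div>"""

# rdf_from_source serializes the extracted microdata as JSON-LD, exactly as the spider does
raw_json = json.loads(pyMicrodata().rdf_from_source(html, 'json-ld').decode('utf8'))
# jobs = parse_json(raw_json)   # project-specific flattening helper, not shown here
print(raw_json)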
Example No. 8
    opts, value = getopt.getopt(sys.argv[1:], "vxtjnpb:")
    for o, a in opts:
        if o == "-t":
            format = "turtle"
        elif o == "-j":
            format = "json-ld"
        elif o == "-n":
            format = "nt"
        elif o == "-p" or o == "-x":
            format = "pretty-xml"
        elif o == "-b":
            base = a
        elif o == "-v":
            version_only = True
        else:
            usage()
            sys.exit(1)
except:
    usage()
    sys.exit(1)

if version_only:
    print "pyMicrodata version: %s" % __version__
    sys.exit(0)

processor = pyMicrodata(base)
if len(value) >= 1:
    print processor.rdf_from_sources(value, outputFormat=format)
else:
    print processor.rdf_from_source(sys.stdin, outputFormat=format)
Example No. 9
class SchemaCrawler(Spider):
    name = "schema_crawler"
    microdata = pyMicrodata()
    schema = None
    get_job_sample = None
    samples = []
    selectors = {
        # 'job_url': "//*[@id='box-job-result']/div[1]/div/div/div[2]/h4/a",
        # 'next_page': "//*[@id='box-job-result']/div[2]/ul/li[last()]/a",
    }
    # start_url = 'https://www.topcv.vn/viec-lam/moi-nhat.html?utm_source=click-search-job&utm_medium=page-job&utm_campaign=tracking-job'
    context = {}
    domain = ""
    currentDomain = ""

    def __init__(self, name=None, **kwargs):
        self.start_url = kwargs.get('start_url')
        self.selectors['job_url'] = kwargs.get('job_url')
        self.selectors['next_page'] = kwargs.get('next_page')
        self.domain = kwargs.get('domain')
        #self.driver = webdriver.Firefox()
        super(SchemaCrawler, self).__init__(name, **kwargs)

    def start_requests(self):
        self.context['start_url'] = self.start_url
        self.context['domain'] = self.domain
        SchemaCrawler.currentDomain = self.domain

        if not os.path.exists(get_context_file(self.domain)):
            if not os.path.exists(STANDARD_ATTRIBUTES_FN):
                raise Exception('Standard file does not exist: ' +
                                STANDARD_ATTRIBUTES_FN)
            yield Request(url=self.start_url, callback=self.get_data_format)

        #vananh
        #yield Request(url=self.start_url, callback=self.get_data_format)

    def parse(self, response):
        pass

    def get_data_format(self, response):
        # take only the first job URL
        sample_job_url = response.xpath(self.selectors['job_url'] +
                                        '/@href').get()
        #vananh
        #print(len(sample_job_url))
        #print(sample_job_url)
        yield Request(url=get_correct_url(sample_job_url, response),
                      callback=self.decide_data_format)

    def decide_data_format(self, response):
        can_crawl = True
        if self.is_data_json_format(response):
            print("json")
            # just bind the handler here; it is not called yet
            self.get_job_sample = self.get_job_sample_json
            self.context['data_format'] = 'json+ld'
        elif self.is_data_microdata_format(response):
            self.get_job_sample = self.get_job_sample_microdata
            self.context['data_format'] = 'microdata'
        else:
            print('Cannot crawl')
            can_crawl = False
        #vananh
        if can_crawl:
            yield Request(url=self.start_url,
                          callback=self.get_job_url_samples)

    def get_job_url_samples(self, response):
        job_urls = response.meta.setdefault('job_urls', [])
        next_page = response.xpath(self.selectors['next_page'] +
                                   '/@href').get()
        #print(next_page)
        # collect job_urls (up to MAX_NO_SAMPLES)
        job_urls += response.xpath(self.selectors['job_url'] +
                                   '/@href').getall()
        #print(job_urls)

        if next_page is not None and len(job_urls) < MAX_NO_SAMPLES:
            yield Request(url=get_correct_url(next_page, response),
                          callback=self.get_job_url_samples,
                          meta={'job_urls': job_urls})
        else:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample,
                          meta={'job_urls': job_urls[1:MAX_NO_SAMPLES]})

    def decide_schema(self):
        #print("VanAnh\n\n")
        print("number of samples: ", len(self.samples))
        schema = JobSchemaDetection(self.samples, MODEL_DIR,
                                    STANDARD_ATTRIBUTES_FN,
                                    WEIGHT_MODEL_FN).get_mapping_schema()
        self.context['schema'] = schema
        self.context['selectors'] = self.selectors
        self.context['is_finished'] = False
        self.logger.error(self.context)
        with open(get_context_file(self.domain), mode='w',
                  encoding='utf8') as f:
            json.dump(self.context, f)
            f.close()

    def get_job_sample_json(self, response):
        samples = response.meta.setdefault('samples', [])
        '''
        print("-------")
        print(response.meta['job_urls'])
        print("------")
        '''
        job_urls = response.meta['job_urls']
        #print(response.meta)
        samples += self.get_json_from_response_json(response)
        #print(samples)
        # step through the remaining job_urls one by one for extraction???
        if len(job_urls) > 0:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample_json,
                          meta={
                              'samples': samples,
                              'job_urls': job_urls[1:]
                          })
        else:
            self.samples = samples
            self.decide_schema()

    def get_job_sample_microdata(self, response):
        samples = response.meta.setdefault('samples', [])
        job_urls = response.meta['job_urls']
        samples.append(self.get_json_from_response_microdata(response))

        if len(job_urls) > 0:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample_microdata,
                          meta={
                              'samples': samples,
                              'job_urls': job_urls[1:]
                          })
        else:
            self.samples = samples
            self.decide_schema()

    def is_data_json_format(self, response):
        return len(self.get_json_from_response_json(response,
                                                    True)) > 0  # added the True (is_sample) flag

    def is_data_microdata_format(self, response):
        return len(self.get_json_from_response_microdata(response)) > 0

    @staticmethod
    def get_json_from_response_json(response, is_sample=False):
        print("url:")
        print(response.url)
        result = []
        dom = etree.HTML(response.body.decode("utf8"))
        json_node = dom.xpath(
            "//script[text()]")  # locate the JSON-LD <script> blocks
        for node in json_node:
            try:
                job = json.loads(node.text, strict=False)
                if job['@type'] == 'JobPosting':
                    #van anh
                    if not is_sample:  # added: skip the language check when only sampling
                        # detect the language here
                        vi_lang = SchemaCrawler.is_vi_language(
                            job["description"])
                        if not vi_lang:
                            print("english")
                            print(response.url)
                            print("-----")
                            return result
                        #
                        if SchemaCrawler.currentDomain == "topcv":
                            temp_job = job
                            job = SchemaCrawler.seperate_attributes_topcv(
                                temp_job, dom)
                            # extract the number of job openings
                        elif SchemaCrawler.currentDomain == "timviecnhanh":
                            temp_job = job
                            job = SchemaCrawler.seperate_attributes_timviecnhanh(
                                temp_job, dom)

                        #print(job)
                    result.append(job)

            except (ValueError, TypeError):
                pass
        return result

    def get_json_from_response_microdata(self, response):
        print("microdata")
        raw_json = json.loads(
            self.microdata.rdf_from_source(response.body,
                                           'json-ld').decode('utf8'))
        #print(raw_json)
        result = parse_json(raw_json)
        return result

    #vananh
    @staticmethod
    def seperate_attributes_topcv(job, dom):
        print("ok cv")
        inital_description = job['description']
        # check whether the text is English - if so, return None
        description_dom = etree.HTML(inital_description)
        first_benefit = ""
        first_requirement = ""
        if "jobBenefits" not in job:
            raw_benefits = description_dom.xpath(
                "//*[contains(text(),'Quyền lợi')]/following-sibling::*")
            raw_benefits_str = ""
            for bnf in raw_benefits:
                bnf_str = etree.tostring(bnf,
                                         method='html',
                                         encoding="unicode")
                raw_benefits_str = raw_benefits_str + bnf_str
            first_benefit = etree.tostring(raw_benefits[0],
                                           method='html',
                                           encoding="unicode")
            jobBenefits = raw_benefits_str
            job["jobBenefits"] = jobBenefits
        if "experienceRequirements" not in job:
            raw_requirements = description_dom.xpath(
                "//*[contains(text(),'Yêu cầu')]/following-sibling::*")
            requirements_str = ""
            req_length = len(raw_requirements)
            i = 0
            while i < req_length:
                req_str = etree.tostring(raw_requirements[i],
                                         method='html',
                                         encoding="unicode")
                if (first_benefit == req_str):
                    folowing_req_str = etree.tostring(raw_requirements[i - 1],
                                                      method='html',
                                                      encoding="unicode")
                    requirements_str = requirements_str.replace(
                        folowing_req_str, "")
                    break
                requirements_str = requirements_str + req_str
                i += 1
            first_requirement = etree.tostring(raw_requirements[0],
                                               method='html',
                                               encoding="unicode")
            experienceRequirements = requirements_str
            job["experienceRequirements"] = experienceRequirements
        #
        if first_requirement.strip() != "":
            std_description = description_dom.xpath(
                "//*[contains(text(),'Mô tả')]/following-sibling::*")
            std_description_str = ""
            i = 0
            std_description_length = len(std_description)
            while i < std_description_length:
                des_str = etree.tostring(std_description[i],
                                         method='html',
                                         encoding="unicode")
                if first_requirement == des_str:
                    folowing_des_str = etree.tostring(std_description[i - 1],
                                                      method='html',
                                                      encoding="unicode")
                    std_description_str = std_description_str.replace(
                        folowing_des_str, "")
                    break
                std_description_str = std_description_str + des_str
                i += 1
            job["description"] = std_description_str
        # get the number of job openings:

        job_available_node = dom.xpath(
            "//div[@id='col-job-right']//div[@id='box-info-job']//div[@class='job-info-item']//*[contains(text(),'cần tuyển')]/following-sibling::*[1]"
        )
        #print("so luong:")
        #print(job_available_node)
        if (len(job_available_node) == 0):
            job_available_node = dom.xpath(
                "///*[@data-original-title='Số lượng cần tuyển']")
        if (len(job_available_node) > 0):
            job_available_text = job_available_node[0].text
            if "không giới hạn" in job_available_text.lower():
                job["totalJobOpenings"] = 50
            elif "người" in job_available_text.lower():
                num_job_available = (job_available_text.split(" ")[0])
                if (num_job_available.isdigit()):
                    job["totalJobOpenings"] = int(num_job_available)
            else:
                job["totalJobOpenings"] = 1
        else:
            job["totalJobOpenings"] = 1
        #print(job["totalJobOpenings"])
        return job

    @staticmethod
    def seperate_attributes_timviecnhanh(job, dom):
        '''
        https://www.timviecnhanh.com/dxmbhn/nhan-vien-kinh-doanh-bds-kv-quan-4-quan-7-nha-be-dat-xanh-mien-bac-ho-chi-minh-id4352813.html
        '''
        jobOpenings = 0
        if "totalJobOpenings" not in job:
            job_available_values = dom.xpath(
                "//*[@id='left-content']//*[contains(text(),'Số lượng tuyển dụng')]/parent::*/text()"
            )
            if len(job_available_values) == 0:
                job_available_values = dom.xpath(
                    "//div[@class='info-left']//*[contains(text(),'Số lượng cần tuyển')]/parent::*/text()"
                )
            for value in job_available_values:
                #print("value")
                #print(value)
                temp = value.strip()
                if temp != "" and temp.isdigit():
                    job["totalJobOpenings"] = int(temp)
                    jobOpenings = job["totalJobOpenings"]
                elif temp != "" and "giới hạn" in temp:
                    job["totalJobOpenings"] = 10
                    jobOpenings = job["totalJobOpenings"]
            if jobOpenings == 0:
                job["totalJobOpenings"] = 2
        #print("timviecnhanh")
        #print(job["totalJobOpenings"])
        #print(job)
        return job

    @staticmethod
    def is_vi_language(raw_text):
        tag_re = re.compile(r'<[^>]+>')
        text = tag_re.sub('', raw_text)
        text = text.strip()
        result = detect(text)
        if result != "vi":
            return False
        return True
Example No. 10
    def _process(self, graph, baseURI, orig_source):
        from pyMicrodata import pyMicrodata
        processor = pyMicrodata(base=baseURI)
        processor.graph_from_source(
            orig_source, graph=graph, rdfOutput=False)
Example No. 11
    def __init__(self, graph, doc_lines, url=""):
        super(MicrodataValidator, self).__init__(graph, doc_lines, url=url)
        self.parser = pyMicrodata()
        self.graph = self.graph.microdata_graph  # use the microdata half of the compound graph
Example No. 12
sys.path.insert(0,"/Users/ivan/Library/Python")

# You may want to adapt this to your environment...
import sys, getopt

from pyMicrodata import pyMicrodata, __version__

###########################################

test_path      = "/Users/ivan/W3C/github/microdata-rdf/tests/"
test_file_base = test_path + ("%04d" % int(sys.argv[1]))
#test_file_base = test_path + ("sdo_eg_md_%d" % int(sys.argv[1]))
test_html = test_file_base + ".html"
test_ttl  = test_file_base + ".ttl"

processor = pyMicrodata()
print processor.rdf_from_source(test_html)
print "----"
with open(test_ttl) as f :
	for l in f:
		print l,

print "----"
with open(test_html) as f :
	for l in f:
		print l,
Example No. 13
class Crawler(Spider):
    name = "crawler"
    microdata = pyMicrodata()
    no_duplicated_items = 0
    context = None
    standard_sample = None
    map_schema = None
    data_reduction = None
    parse_job = None
    #vananh
    currentDomain = ""
    no_not_vi_doc = 0
    home = 0

    custom_settings = {
        # 'FEED_FORMAT': 'json',
        # 'FEED_URI': 'topcv.json',
        'ITEM_PIPELINES': {
            'pipelines.MongoPipeline': 300
        },
        'MONGO_URI': MONGO_URI,
        'MONGO_DATABASE': MONGO_DATABASE,
        'MONGO_COLLECTION': MONGO_COLLECTION,
        'USER_AGENT':
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0",
        'HTTPERROR_ALLOW_ALL': True,
        'COOKIES_ENABLED': False,
    }
    handle_httpstatus_list = [404, 410]

    def __init__(self, name=None, **kwargs):
        self.domain = kwargs.get('domain')
        Crawler.currentDomain = self.domain
        super(Crawler, self).__init__(name, **kwargs)

    def start_requests(self):
        if os.path.exists(get_context_file(self.domain)):
            with open(get_context_file(self.domain), mode='r',
                      encoding='utf8') as f:
                self.context = json.load(f)
                f.close()
            if not self.context['is_finished']:
                raise Exception('Context file is not complete')
            else:
                if self.context['data_format'] == 'json+ld':
                    self.parse_job = self.parse_job_json
                else:
                    self.parse_job = self.parse_job_microdata

                self.standard_sample = self.get_standard_sample(
                    STANDARD_ATTRIBUTES_FN)
                self.map_schema = self.get_map_schema(self.context['schema'])
                self.data_reduction = self.get_data_reduction(
                    MONGO_URI, MONGO_DATABASE, MONGO_COLLECTION)
                self.inserted_data = []
                self.inserted_data_reduction = DataReduction(
                    3, self.inserted_data)
                #self.eng_collection = pymongo.MongoClient(MONGO_URI)[MONGO_DATABASE]["english_job"]
        else:
            raise Exception('Context file does not exist: ' +
                            get_context_file(self.domain))
        yield Request(url=self.context['start_url'], callback=self.parse)

    def parse(self, response):
        next_page = response.xpath(self.context['selectors']['next_page'] +
                                   '/@href').get()
        job_urls = response.xpath(self.context['selectors']['job_url'] +
                                  '/@href').getall()
        #job_urls = []
        default_url = "https://www.timviecnhanh.com/tuyen-nhan-vien-phuc-vu-nha-hang-part-time-ho-chi-minh-"
        # errors -- 3041261 3021301 3021261
        # 3191261 not done yet
        # mistaken: 3891261,3991261 3907733
        # 3891261-3901261 ok
        # mistaken: 3491261,3591261 not yet - 3562917 ok

        for i in range(3271261, 3281261):
            job_url = "https://www.timviecnhanh.com/tuyen-ke-toan-tong-hop-ho-chi-minh-" + str(
                i) + ".html"
            #job_url = 'https://www.timviecnhanh.com/tuyen-ke-toan-van-phong-3084598.html'
            #headers = {'User-Agent': 'whatever'}
            yield Request(url=get_correct_url(job_url, response),
                          callback=self.parse_job,
                          errback=self.error_parse)
        '''
        for job_url in job_urls:
            # job_url = response.urljoin(job_url)
            yield Request(url=get_correct_url(job_url, response), callback=self.parse_job)
        
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(url=get_correct_url(next_page, response), callback=self.parse)
        '''

    def error_parse(self, response):
        print(response.status)
        print("errrrrr")

    def parse_job_json(self, response):
        job_url = response.request.url
        #error_log = response.request.url + '-----' + str(response.status) + '\n'
        #self.logger.error(error_log)
        jobs = self.get_json_from_response_json(response)
        job_selectors = self.context['selectors']['job_selectors']
        for job in jobs:
            # if "industry" is missing, look it up from neighbouring elements
            if "industry" not in job:
                print("not in")
                job["url"] = job_url
                yield self.get_from_neighbor(response, job)
                continue
            language = job["language"]
            job = self.change_to_right_form(job)
            if job_selectors is not None:
                for field, selector in job_selectors.items():
                    print(selector)
                    job[field] = ','.join(
                        text.strip()
                        for text in response.xpath(selector +
                                                   '/text()').extract()
                        if text is not None)
                job = self.normalize(job, job_url)
                if job is not None:
                    job["language"] = language
                    yield job

    def parse_job_microdata(self, response):
        job_url = response.request.url
        jobs = self.get_json_from_response_microdata(response)
        job_selectors = self.context['selectors']['job_selectors']
        for job in jobs:
            job = self.change_to_right_form(job)
            if job_selectors is not None:
                for field, selector in job_selectors.items():
                    #print(selector)
                    job[field] = ','.join(
                        text.strip()
                        for text in response.xpath(selector +
                                                   '/text()').extract()
                        if text is not None)
                job = self.normalize(job, job_url)
                yield job

    @staticmethod
    def get_json_from_response_json(response):
        result = []
        #vananh
        ''' #for topcv
        if response.url == "https://www.topcv.vn/viec-lam":
            Crawler.home = Crawler.home + 1
            return result
        '''
        #
        dom = etree.HTML(response.body.decode("utf8"))
        #print("-------raw------------")
        #print(dom.xpath("//title/text()"),'-------',response.url,'-----',response.status)
        #print("---------------------")
        #for timviecnhanh
        raw_title_list = dom.xpath("//title/text()")
        if len(raw_title_list) == 0:
            error_log = 'title0' + response.url + '-----' + str(
                response.status) + '\n'
            # NOTE: self is unavailable in this @staticmethod; print the error instead of using the spider logger
            print(error_log)
            return result

        raw_title = raw_title_list[0]
        if raw_title.lower() == "error":
            Crawler.home = Crawler.home + 1
            return result
        json_node = dom.xpath("//script[text()]")
        extract_job = None
        '''
        job_node = dom.xpath("//script[@type='application/ld+json']/text()")
        for jb in job_node:
            #print(jb)
            jb_str = jb.strip()
            jb_str = jb_str.replace('""', '"')
            jb_obj = json.loads(jb_str,strict=False)
            print("lalal")
            print(jb_obj["industry"])
        '''
        for node in json_node:
            try:
                job_str = node.text.strip()
                job_str = job_str.replace('""', '"')
                job = json.loads(job_str, strict=False)
                if job['@type'] == 'JobPosting':
                    print("---------jobPosting-----")
                    #van anh
                    # detect the language here
                    extract_job = job
                    vi_lang = Crawler.is_vi_language(job['description'])
                    if not vi_lang:
                        job["language"] = "en"
                        temp_job = job
                        if Crawler.currentDomain == "topcv":
                            print(response.url)
                            job = Crawler.seperate_attributes_topcv(
                                temp_job, dom, False)
                            Crawler.no_not_vi_doc = Crawler.no_not_vi_doc + 1
                            result.append(job)
                            return result
                        elif Crawler.currentDomain == "timviecnhanh":
                            print(response.url)
                            job = Crawler.extract_job_openings_tvn(
                                temp_job, dom)
                            Crawler.no_not_vi_doc = Crawler.no_not_vi_doc + 1
                            result.append(job)
                            return result
                    else:
                        job["language"] = "vi"
                        temp_job = job
                        if Crawler.currentDomain == "topcv":
                            print(response.url)
                            job = Crawler.seperate_attributes_topcv(
                                temp_job, dom)
                        elif Crawler.currentDomain == "timviecnhanh":
                            print(response.url)
                            job = Crawler.extract_job_openings_tvn(
                                temp_job, dom)

                    result.append(job)

            except ValueError as e:
                pass

        if extract_job is None:
            if Crawler.currentDomain == "timviecnhanh":
                print(response.url)
                job = Crawler.seperate_attributes_timviecnhanh(dom)
                if job is not None:
                    result.append(job)
        return result

    def get_json_from_response_microdata(self, response):
        raw_json = json.loads(
            self.microdata.rdf_from_source(response.body,
                                           'json-ld').decode('utf8'))
        result = parse_json(raw_json)
        return result

    def change_to_right_form(self, job):
        norm_job = self.standard_sample.copy()
        #print(norm_job)
        flatten_job = flatten_dict(job)

        for key, value in self.map_schema.items():
            real_value = flatten_job.get(key)
            if real_value is None:
                continue
            else:
                attribute = norm_job
                for attribute_level in value[:-1]:
                    attribute = attribute.get(attribute_level)
                if type(real_value) is str:
                    attribute[value[-1]] = re.sub(r'<[^<>]*>', '',
                                                  str(real_value))
                elif type(attribute[value[-1]]) == dict and type(
                        real_value) == list:
                    attribute[value[-1]] = real_value[0]
                else:
                    attribute[value[-1]] = real_value
        #print("norm_job")
        #print(norm_job)
        return norm_job

    def normalize(self, job, url):
        result = normalize_job(job)
        result['url'] = url

        # Check duplicate
        # must also check against postings inserted earlier in this run
        if self.data_reduction.is_match(self.get_filter_data(job)):
            self.no_duplicated_items += 1
            result = None
            return result
        else:
            if self.inserted_data_reduction.is_match(
                    self.get_filter_data(job)):
                self.no_duplicated_items += 1
                result = None
                return result
            else:
                #self.inserted_data.append(self.get_filter_data(job))
                self.inserted_data_reduction.add_job(self.get_filter_data(job))
        return result

    @staticmethod
    def get_standard_sample(file_name):
        if os.path.exists(file_name):
            with open(file_name, mode='r', encoding='utf8') as f:
                standard_sample = json.load(f)
                f.close()
        else:
            raise Exception('Standard file does not exist: ' + file_name)

        return standard_sample

    @staticmethod
    def get_map_schema(schema):
        return {key: value.split('_') for key, value in schema.items()}

    def get_data_reduction(self, uri, database, collection):
        collection = pymongo.MongoClient(uri)[database][collection]
        # Fetch title, hiring organization and location; exclude the _id of each job document
        jobs = list(
            collection.find({}, {
                'title': 1,
                'hiringOrganization.name': 1,
                'jobLocation.address.addressRegion': 1,
                'validThrough': 1,
                'datePosted': 1,
                '_id': 0
            }))
        data = [self.get_filter_data(job) for job in jobs]

        data_reduction = DataReduction(3, data)
        return data_reduction

    @staticmethod
    def get_filter_data(job):
        title = job['title']
        hiring_organization_name = job['hiringOrganization']['name']
        if type(job['jobLocation']) is list:
            address_region = ','.join([
                location['address']['addressRegion']
                for location in job['jobLocation']
            ])
        else:
            address_region = job['jobLocation']['address']['addressRegion']
        #vananh
        #validThrough =
        # stored as a date in the DB => convert to str
        valid_through = job['validThrough']
        #validThroughDate = pd.to_datetime(date_str)
        #valid_through = str(validThroughDate.year) + "-" + str(validThroughDate.month) + "-" + str(validThroughDate.day)
        date_posted = job['datePosted']
        return [
            title, hiring_organization_name, address_region, date_posted,
            valid_through
        ]

    #van anh
    #vananh
    @staticmethod
    def seperate_attributes_topcv(job, dom, is_vi=True):
        '''
        if not is_vi:
            #lay so luong tuyen dung:
            job_available_node = dom.xpath("//div[@id='col-job-right']//div[@id='box-info-job']//div[@class='job-info-item']//*[contains(text(),'cần tuyển')]/following-sibling::*[1]")
            if(len(job_available_node) == 0):
                job_available_node = dom.xpath("///*[@data-original-title='Số lượng cần tuyển']")
            if(len(job_available_node) > 0):
                job_available_text = job_available_node[0].text
                if "không giới hạn" in job_available_text.lower():
                    job["totalJobOpenings"] = 10
                elif "người" in job_available_text.lower():
                    num_job_available = (job_available_text.split(" ")[0])
                    if(num_job_available.isdigit()):
                        job["totalJobOpenings"] = int(num_job_available)
                else:
                    job["totalJobOpenings"] = 2
            else:
                job["totalJobOpenings"] = 2
            job["language"] = "en"
            return job
        '''
        inital_description = job['description']
        description_dom = etree.HTML(inital_description)
        first_benefit = ""
        first_requirement = ""
        if "jobBenefits" not in job:
            raw_benefits = description_dom.xpath(
                "//*[contains(text(),'Quyền lợi')]/following-sibling::*")
            print(len(raw_benefits))
            raw_benefits_str = ""
            for bnf in raw_benefits:
                bnf_str = etree.tostring(bnf,
                                         method='html',
                                         encoding="unicode")
                raw_benefits_str = raw_benefits_str + bnf_str
            if len(raw_benefits) > 0:
                first_benefit = etree.tostring(raw_benefits[0],
                                               method='html',
                                               encoding="unicode")
                jobBenefits = raw_benefits_str
                job["jobBenefits"] = jobBenefits
            else:
                job["jobBenefits"] = ""
        if "experienceRequirements" not in job:
            raw_requirements = description_dom.xpath(
                "//*[contains(text(),'Yêu cầu')]/following-sibling::*")
            requirements_str = ""
            req_length = len(raw_requirements)
            i = 0
            while i < req_length:
                req_str = etree.tostring(raw_requirements[i],
                                         method='html',
                                         encoding="unicode")
                if (first_benefit == req_str):
                    folowing_req_str = etree.tostring(raw_requirements[i - 1],
                                                      method='html',
                                                      encoding="unicode")
                    requirements_str = requirements_str.replace(
                        folowing_req_str, "")
                    break
                requirements_str = requirements_str + req_str
                i += 1
            if len(raw_requirements) > 1:  # if it is 1, the requirements are empty and we are already at the benefits
                first_requirement = etree.tostring(raw_requirements[0],
                                                   method='html',
                                                   encoding="unicode")
                experienceRequirements = requirements_str
                job["experienceRequirements"] = experienceRequirements
            else:
                job["experienceRequirements"] = ""
        #
        if first_requirement.strip() != "":
            print("hehe")
            std_description = description_dom.xpath(
                "//*[contains(text(),'Mô tả')]/following-sibling::*")
            std_description_str = ""
            i = 0
            std_description_length = len(std_description)
            while i < std_description_length:
                des_str = etree.tostring(std_description[i],
                                         method='html',
                                         encoding="unicode")
                if first_requirement == des_str:
                    folowing_des_str = etree.tostring(std_description[i - 1],
                                                      method='html',
                                                      encoding="unicode")
                    std_description_str = std_description_str.replace(
                        folowing_des_str, "")
                    break
                std_description_str = std_description_str + des_str
                i += 1
            job["description"] = std_description_str
        # fix the out-of-range error
        print("exp ", job["experienceRequirements"])
        print("be ", job["jobBenefits"] == "")
        if job["experienceRequirements"] == "" or job["jobBenefits"] == "":
            job["jobBenefits"] = seperate.extract_info(inital_description,
                                                       "quyền lợi")
            job["description"] = seperate.extract_info(inital_description,
                                                       "mô tả")
            job["experienceRequirements"] = seperate.extract_info(
                inital_description, "yêu cầu")
        if job["experienceRequirements"] == "" and job[
                "jobBenefits"] == "" and job["description"] == "":
            #print("lala")
            meta_description = dom.xpath("//meta[@name='description']")
            for temp in meta_description:
                job["jobBenefits"] = ""
                job["description"] = temp.attrib['content']
                job["experienceRequirements"] = temp.attrib['content']
        # extract the number of openings ("số lượng cần tuyển")
        job_available_node = dom.xpath(
            "//div[@id='col-job-right']//div[@id='box-info-job']//div[@class='job-info-item']//*[contains(text(),'cần tuyển')]/following-sibling::*[1]"
        )
        if (len(job_available_node) == 0):
            job_available_node = dom.xpath(
                "///*[@data-original-title='Số lượng cần tuyển']")
        if (len(job_available_node) > 0):
            job_available_text = job_available_node[0].text
            if "không giới hạn" in job_available_text.lower():
                job["totalJobOpenings"] = 10
            elif "người" in job_available_text.lower():
                num_job_available = (job_available_text.split(" ")[0])
                if (num_job_available.isdigit()):
                    job["totalJobOpenings"] = int(num_job_available)
            else:
                job["totalJobOpenings"] = 2
        else:
            job["totalJobOpenings"] = 2

        if is_vi:
            job["language"] = "vi"
        else:
            job["language"] = "en"

        return job
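
    # A minimal sketch (hypothetical helper, not part of the original spider) of the
    # heading-based splitting used above: gather the HTML of every sibling that follows a
    # heading containing `label`, stopping before the first sibling that looks like the
    # start of another section. Assumes lxml's etree is imported at module level.
    @staticmethod
    def _section_after_heading(description_html, label, stop_labels=()):
        if not description_html:
            return ""
        dom = etree.HTML(description_html)
        siblings = dom.xpath(
            "//*[contains(text(),'%s')]/following-sibling::*" % label)
        parts = []
        for node in siblings:
            text = "".join(node.itertext())
            if any(stop in text for stop in stop_labels):
                # the next section ("Yêu cầu", "Quyền lợi", ...) starts here
                break
            parts.append(etree.tostring(node, method='html', encoding="unicode"))
        return "".join(parts)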

    @staticmethod
    def seperate_attributes_timviecnhanh(dom):
        job = {}
        meta_description = dom.xpath("//meta[@property='og:description']")
        for temp in meta_description:
            job["jobBenefits"] = ""
            job["description"] = temp.attrib['content']
            job["experienceRequirements"] = ""
        vi_lang = Crawler.is_vi_language(job['description'])
        if not vi_lang:
            Crawler.no_not_vi_doc = Crawler.no_not_vi_doc + 1
            return None
        job["language"] = "vi"
        raw_title = dom.xpath("//title/text()")[0]
        raw_title = raw_title.strip()
        title_list = raw_title.split("|")
        if len(title_list) > 1:
            raw_title = title_list[0].strip()
        job["title"] = raw_title
        job["validThrough"] = seperate.extract_info_tvn(
            job["description"], "ngày hết hạn")
        job["validThrough"] = seperate.normalize_date_tvn(job["validThrough"])
        job["hiringOrganization"] = {}
        job["hiringOrganization"]["name"] = seperate.extract_info_tvn(
            job["description"], "công ty")
        job["hiringOrganization"]["name"] = seperate.normalize_org_name_tvn(
            job["hiringOrganization"]["name"])
        raw_salary = seperate.extract_info_tvn(job["description"], "lương")
        job["baseSalary"] = {}
        job["baseSalary"] = seperate.extract_salary_tvn(raw_salary)
        job["totalJobOpenings"] = 2
        job["jobLocation"] = {"address": {}}
        #location
        raw_location = dom.xpath(
            "//div[@class='bottom-article']//a[contains(text(), 'Việc làm')]/text()"
        )
        city = raw_location[0].strip()
        city = city.replace('Việc làm', '')
        city = city.strip()
        if city == "":
            city = "Việt Nam"
        job["jobLocation"]["address"]["addressLocality"] = city
        job["jobLocation"]["address"]["streetAddress"] = city
        job["jobLocation"]["address"]["addressCountry"] = "Việt Nam"
        category = dom.xpath(
            "//div[@class='line-full breadcrumb-line']//ol/li[4]//a/text()")
        job["industry"] = category[0]
        return job
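
    # Hedged sketch of the label-based extraction that the seperate.extract_info_tvn calls
    # above rely on (hypothetical helper, not the real `seperate` module): grab whatever
    # follows a label such as "lương" (salary) or "ngày hết hạn" (expiry date) on the same
    # line of a plain-text description.
    @staticmethod
    def _value_after_label(text, label):
        match = re.search(r'%s\s*:?\s*([^\n]+)' % re.escape(label), text,
                          re.IGNORECASE)
        return match.group(1).strip() if match else ""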

    @staticmethod
    def extract_job_openings_tvn(job, dom):
        jobOpenings = 0
        if "totalJobOpenings" not in job:
            job_available_values = dom.xpath(
                "//*[@id='left-content']//*[contains(text(),'Số lượng tuyển dụng')]/parent::*/text()"
            )
            if len(job_available_values) == 0:
                job_available_values = dom.xpath(
                    "//div[@class='info-left']//*[contains(text(),'Số lượng cần tuyển')]/parent::*/text()"
                )
            for value in job_available_values:
                #print("value")
                #print(value)
                temp = value.strip()
                if temp != "" and temp.isdigit():
                    job["totalJobOpenings"] = int(temp)
                    jobOpenings = job["totalJobOpenings"]
                elif temp != "" and "giới hạn" in temp:
                    job["totalJobOpenings"] = 10
                    jobOpenings = job["totalJobOpenings"]
            if jobOpenings == 0:
                job["totalJobOpenings"] = 2

        return job
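
    # Hedged sketch of the openings-count parsing repeated above (helper name is
    # hypothetical): "5 người" maps to 5, "không giới hạn" (unlimited) to a fixed cap of
    # 10, and anything unparseable falls back to the default of 2.
    @staticmethod
    def _parse_job_openings(raw_text, unlimited_cap=10, default=2):
        text = raw_text.strip().lower()
        if "giới hạn" in text:
            return unlimited_cap
        first_token = text.split(" ")[0]
        if first_token.isdigit():
            return int(first_token)
        return default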

    @staticmethod
    def is_vi_language(raw_text):
        tag_re = re.compile(r'<[^>]+>')
        text = tag_re.sub('', raw_text)
        text = text.strip()
        result = detect(text)
        if result != "vi":
            return False
        return True
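
    # `detect` above is assumed to be langdetect.detect, which raises an exception on
    # empty or ambiguous input; a defensive variant (hypothetical, not used by the spider)
    # could treat such input as non-Vietnamese instead of crashing.
    @staticmethod
    def _is_vi_language_safe(raw_text):
        text = re.sub(r'<[^>]+>', '', raw_text).strip()
        if not text:
            return False
        try:
            return detect(text) == "vi"
        except Exception:
            return False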

    def get_from_neighbor(self, response, ini_job):
        #print("-----neighbor_job----")
        dom = etree.HTML(response.body.decode("utf8"))
        neighbor_urls = dom.xpath("//*[@id='job-hot-content']//tr[1]//a[1]")
        if len(neighbor_urls) == 0:

            neighbor_urls = dom.xpath(
                "//*[@id='job-week-content']//tr[1]//a[1]")
        for neighbor_url in neighbor_urls:
            url = neighbor_url.attrib["href"]
            neighbor_request = Request(url=get_correct_url(url, response),
                                       callback=self.get_job_from_neighbor)
            neighbor_request.cb_kwargs["ini_job"] = ini_job
            # only the first neighbouring job link is followed
            return neighbor_request

    def get_job_from_neighbor(self, response, ini_job):
        print("xxxxxxxxxxxxxxxxxxx")
        neighbor_jobs = self.get_json_from_response_json(response)
        for neighbor_job in neighbor_jobs:
            ini_job["industry"] = neighbor_job["industry"]
        job_selectors = self.context['selectors']['job_selectors']
        job = ini_job
        language = job["language"]
        job_url = job["url"]
        job = self.change_to_right_form(job)
        if job_selectors is not None:
            for field, selector in job_selectors.items():
                job[field] = ','.join(
                    text.strip()
                    for text in response.xpath(selector + '/text()').extract()
                    if text is not None)
            job = self.normalize(job, job_url)
            if job is not None:
                job["language"] = language
                yield job
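
    # get_from_neighbor / get_job_from_neighbor implement a fallback: when a posting lacks
    # an industry, the spider follows the first link in the "job-hot" (or "job-week") box,
    # copies that neighbouring posting's industry into the original job, then re-applies
    # the configured selectors and normalisation before yielding it.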

    def close(self, spider, reason):
        print("Number of english items: ", self.no_not_vi_doc)
        print("Number of broken items: ", self.home)
        print('Number of duplicated items: %d' % self.no_duplicated_items)
        print("Finished!")
Exemplo n.º 14
0
class SchemaCrawler(Spider):
    name = "schema_crawler"
    microdata = pyMicrodata()
    schema = None
    get_job_sample = None
    samples = []
    selectors = {
        # 'job_url': "//*[@id='box-job-result']/div[1]/div/div/div[2]/h4/a",
        # 'next_page': "//*[@id='box-job-result']/div[2]/ul/li[last()]/a",
    }
    # start_url = 'https://www.topcv.vn/viec-lam/moi-nhat.html?utm_source=click-search-job&utm_medium=page-job&utm_campaign=tracking-job'
    context = {}
    domain = None

    def __init__(self, name=None, **kwargs):
        self.start_url = kwargs.get('start_url')
        self.selectors['job_url'] = kwargs.get('job_url')
        self.selectors['next_page'] = kwargs.get('next_page')
        self.domain = kwargs.get('domain')
        #self.driver = webdriver.Firefox()
        super(SchemaCrawler, self).__init__(name, **kwargs)

    def start_requests(self):
        self.context['start_url'] = self.start_url
        self.context['domain'] = self.domain
        '''
        if not os.path.exists(get_context_file(self.domain)):
            if not os.path.exists(STANDARD_ATTRIBUTES_FN):
                raise Exception('Not exist standard file: ' + STANDARD_ATTRIBUTES_FN)
            yield Request(url=self.start_url, callback=self.get_data_format)
        '''
        #vananh
        yield Request(url=self.start_url, callback=self.get_data_format)

    def parse(self, response):
        pass

    def get_data_format(self, response):
        # only take the first job URL as a sample
        sample_job_url = response.xpath(self.selectors['job_url'] +
                                        '/@href').get()
        #vananh
        print("Sample job URL:", sample_job_url)
        yield Request(url=get_correct_url(sample_job_url, response),
                      callback=self.decide_data_format)

    def decide_data_format(self, response):
        can_crawl = True
        if self.is_data_json_format(response):
            print("json")
            #buoc gan, chua goi
            self.get_job_sample = self.get_job_sample_json
            self.context['data_format'] = 'json+ld'
        elif self.is_data_microdata_format(response):
            self.get_job_sample = self.get_job_sample_microdata
            self.context['data_format'] = 'microdata'
        else:
            print('Cannot crawl')
            can_crawl = False
        #vananh
        if can_crawl:
            yield Request(url=self.start_url,
                          callback=self.get_job_url_samples)

    def get_job_url_samples(self, response):
        job_urls = response.meta.setdefault('job_urls', [])
        next_page = response.xpath(self.selectors['next_page'] +
                                   '/@href').get()
        # collect job URLs until MAX_NO_SAMPLES (20 in the original note) have been gathered
        job_urls += response.xpath(self.selectors['job_url'] +
                                   '/@href').getall()

        if next_page is not None and len(job_urls) < MAX_NO_SAMPLES:
            yield Request(url=get_correct_url(next_page, response),
                          callback=self.get_job_url_samples,
                          meta={'job_urls': job_urls})
        else:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample,
                          meta={'job_urls': job_urls[1:MAX_NO_SAMPLES]})
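
    # get_job_url_samples pages through the listing: each Request carries the URLs
    # collected so far in response.meta['job_urls']; once next_page runs out or
    # MAX_NO_SAMPLES URLs have been gathered, the first sample URL is fetched with
    # get_job_sample and the remaining URLs are passed along in meta.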

    def decide_schema(self):
        print("VanAnh\n\n")
        print("so luong samples: ", len(self.samples))
        schema = JobSchemaDetection(self.samples, MODEL_DIR,
                                    STANDARD_ATTRIBUTES_FN,
                                    WEIGHT_MODEL_FN).get_mapping_schema()
        self.context['schema'] = schema
        self.context['selectors'] = self.selectors
        self.context['is_finished'] = False
        self.logger.error(self.context)
        with open(get_context_file(self.domain), mode='w',
                  encoding='utf8') as f:
            json.dump(self.context, f)
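        # For reference, the context file written above ends up with roughly this shape
        # (values are illustrative):
        # {
        #     "start_url": "...",
        #     "domain": "...",
        #     "data_format": "json+ld",
        #     "schema": {...},
        #     "selectors": {"job_url": "...", "next_page": "..."},
        #     "is_finished": false
        # }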

    def get_job_sample_json(self, response):
        samples = response.meta.setdefault('samples', [])
        print("-------")
        print(response.meta['job_urls'])
        print("------")
        job_urls = response.meta['job_urls']
        #print(response.meta)
        samples += self.get_json_from_response_json(response)
        #print(samples)
        #lay dan cac job_urls de boc tach???
        if len(job_urls) > 0:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample_json,
                          meta={
                              'samples': samples,
                              'job_urls': job_urls[1:]
                          })
        else:
            self.samples = samples
            #self.decide_schema()
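            # NOTE: unlike get_job_sample_microdata below, this branch currently leaves
            # self.decide_schema() commented out, so no schema is inferred for the
            # JSON-LD path.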

    def get_job_sample_microdata(self, response):
        samples = response.meta.setdefault('samples', [])
        job_urls = response.meta['job_urls']
        samples.append(self.get_json_from_response_microdata(response))

        if len(job_urls) > 0:
            yield Request(url=get_correct_url(job_urls[0], response),
                          callback=self.get_job_sample_microdata,
                          meta={
                              'samples': samples,
                              'job_urls': job_urls[1:]
                          })
        else:
            self.samples = samples
            self.decide_schema()

    def is_data_json_format(self, response):
        return len(self.get_json_from_response_json(response)) > 0

    def is_data_microdata_format(self, response):
        return len(self.get_json_from_response_microdata(response)) > 0

    @staticmethod
    def get_json_from_response_json(response):
        result = []
        dom = etree.HTML(response.body.decode("utf8"))
        json_node = dom.xpath(
            "//script[text()]")  #xac dinh cac doan script json+ld
        for node in json_node:
            try:
                job = json.loads(node.text, strict=False)
                if isinstance(job, dict) and job.get('@type') == 'JobPosting':
                    #van anh
                    # split the benefits and requirements out of the description

                    inital_description = job['description']
                    description_dom = etree.HTML(inital_description)
                    first_benefit = ""
                    first_requirement = ""
                    if "jobBenefits" not in job:
                        raw_benefits = description_dom.xpath(
                            "//*[contains(text(),'Quyền lợi')]/following-sibling::*"
                        )
                        raw_benefits_str = ""
                        for bnf in raw_benefits:
                            bnf_str = etree.tostring(bnf,
                                                     method='html',
                                                     encoding="unicode")
                            raw_benefits_str = raw_benefits_str + bnf_str
                        if len(raw_benefits) > 0:
                            first_benefit = etree.tostring(raw_benefits[0],
                                                           method='html',
                                                           encoding="unicode")
                        job["jobBenefits"] = raw_benefits_str
                    if "experienceRequirements" not in job:
                        raw_requirements = description_dom.xpath(
                            "//*[contains(text(),'Yêu cầu')]/following-sibling::*"
                        )
                        requirements_str = ""
                        req_length = len(raw_requirements)
                        i = 0
                        while i < req_length:
                            req_str = etree.tostring(raw_requirements[i],
                                                     method='html',
                                                     encoding="unicode")
                            if first_benefit == req_str:
                                # drop the last appended element; it already belongs to the benefits section
                                following_req_str = etree.tostring(
                                    raw_requirements[i - 1],
                                    method='html',
                                    encoding="unicode")
                                requirements_str = requirements_str.replace(
                                    following_req_str, "")
                                break
                            requirements_str = requirements_str + req_str
                            i += 1
                        """
                        for req in raw_requirements:
                            req_str = etree.tostring(req,method='html',encoding="unicode")
                            if(first_benefit == req_str):
                                break
                            requirements_str = requirements_str + req_str
                        """
                        if len(raw_requirements) > 0:
                            first_requirement = etree.tostring(
                                raw_requirements[0],
                                method='html',
                                encoding="unicode")
                        job["experienceRequirements"] = requirements_str
                    #
                    if first_requirement.strip() != "":
                        std_description = description_dom.xpath(
                            "//*[contains(text(),'Mô tả')]/following-sibling::*"
                        )
                        std_description_str = ""
                        i = 0
                        std_description_length = len(std_description)
                        while i < std_description_length:
                            des_str = etree.tostring(std_description[i],
                                                     method='html',
                                                     encoding="unicode")
                            if first_requirement == des_str:
                                # drop the last appended element; it already belongs to the requirements section
                                following_des_str = etree.tostring(
                                    std_description[i - 1],
                                    method='html',
                                    encoding="unicode")
                                std_description_str = std_description_str.replace(
                                    following_des_str, "")
                                break
                            std_description_str = std_description_str + des_str
                            i += 1
                        job["description"] = std_description_str
                    print("*******************")
                    print(job)
                    print("\n\n")
                    print("**************")
                    #
                    result.append(job)

            except (ValueError, TypeError):
                pass
        return result
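
    # A narrower selector for the JSON-LD blocks would be
    # //script[@type='application/ld+json']; the broader //script[text()] above relies on
    # json.loads failing for non-JSON scripts, which the except clause silently absorbs.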

    def get_json_from_response_microdata(self, response):
        print("microdata")
        raw_json = json.loads(
            self.microdata.rdf_from_source(response.body,
                                           'json-ld').decode('utf8'))
        print(raw_json)
        result = parse_json(raw_json)
        return result
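
    # pyMicrodata.rdf_from_source serialises the page's embedded microdata as JSON-LD
    # bytes; parse_json (a helper presumably imported elsewhere in this module) then
    # flattens that graph into plain job dictionaries.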

    #vananh
    def seperate_attributes_topcv(self, job):
        inital_description = job['description']
        description_dom = etree.HTML(inital_description)
        first_benefit = ""
        first_requirement = ""
        if "jobBenefits" not in job:
            raw_benefits = description_dom.xpath(
                "//*[contains(text(),'Quyền lợi')]/following-sibling::*")
            raw_benefits_str = ""
            for bnf in raw_benefits:
                bnf_str = etree.tostring(bnf,
                                         method='html',
                                         encoding="unicode")
                raw_benefits_str = raw_benefits_str + bnf_str
            if len(raw_benefits) > 0:
                first_benefit = etree.tostring(raw_benefits[0],
                                               method='html',
                                               encoding="unicode")
            job["jobBenefits"] = raw_benefits_str
        if "experienceRequirements" not in job:
            raw_requirements = description_dom.xpath(
                "//*[contains(text(),'Yêu cầu')]/following-sibling::*")
            requirements_str = ""
            req_length = len(raw_requirements)
            i = 0
            while i < req_length:
                req_str = etree.tostring(raw_requirements[i],
                                         method='html',
                                         encoding="unicode")
                if first_benefit == req_str:
                    # drop the last appended element; it already belongs to the benefits section
                    following_req_str = etree.tostring(raw_requirements[i - 1],
                                                       method='html',
                                                       encoding="unicode")
                    requirements_str = requirements_str.replace(
                        following_req_str, "")
                    break
                requirements_str = requirements_str + req_str
                i += 1
            """
            for req in raw_requirements:
                req_str = etree.tostring(req,method='html',encoding="unicode")
                if(first_benefit == req_str):
                    break
                requirements_str = requirements_str + req_str
            """
            if len(raw_requirements) > 0:
                first_requirement = etree.tostring(raw_requirements[0],
                                                   method='html',
                                                   encoding="unicode")
            job["experienceRequirements"] = requirements_str
        #
        if first_requirement.strip() != "":
            std_description = description_dom.xpath(
                "//*[contains(text(),'Mô tả')]/following-sibling::*")
            std_description_str = ""
            i = 0
            std_description_length = len(std_description)
            while i < std_description_length:
                des_str = etree.tostring(std_description[i],
                                         method='html',
                                         encoding="unicode")
                if first_requirement == des_str:
                    # drop the last appended element; it already belongs to the requirements section
                    following_des_str = etree.tostring(std_description[i - 1],
                                                       method='html',
                                                       encoding="unicode")
                    std_description_str = std_description_str.replace(
                        following_des_str, "")
                    break
                std_description_str = std_description_str + des_str
                i += 1
            job["description"] = std_description_str

        print("*******************")
        print(job)
        print("\n\n")
        print("**************")
        return job
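
    # seperate_attributes_topcv mirrors the inline splitting logic in
    # get_json_from_response_json above; a natural refactor (not present here) would have
    # both paths share a single heading-based helper such as the _section_after_heading
    # sketch shown earlier.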