Example #1
    def parse_article(self, response):
        url = response.url
        #time = "".join(response.xpath('//span[@class="post-list-info"][1]//a[1]/text()').extract()) + ":00"
        #time = '2016-3-20 20:27:00'
        time = "".join(
            response.xpath('//div[@class="postinfo"][1]/text()').extract()
        ).strip()[3:] + ":00"
        title = "".join(response.xpath('//h1/text()').extract())
        # Pull the whole post body as flattened text via string(.) so that
        # text inside nested tags is kept as well.
        contentwrapper = response.xpath('//div[@class="t_msgfont"][1]')
        content = "\n".join(contentwrapper.xpath('string(.)').extract())
        #content = "\n".join(response.xpath('//div[@class="t_msgfont"][1]/text()').extract())
        # Use the first e-mail address found in the body, if any.
        mails = get_mail(content)
        email = mails[0] if mails else ''
        # TODO use machine learning to tag the info
        tags = 'a,b'

        item = JobItem()
        item['title'] = title
        item['url'] = url
        item['email'] = email
        item['content'] = content
        item['time'] = time
        item['tags'] = tags
        item['type'] = 'intern'
        yield item
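All of these callbacks rely on a JobItem item class and a get_mail helper defined elsewhere in the project; neither is shown in the snippets. A minimal sketch consistent with how they are used here (fields assigned by key, get_mail returning a possibly empty list of addresses) could look like the following; the real definitions may differ.

# Sketch only: the actual JobItem and get_mail live elsewhere in the project.
# Field names are taken from the assignments in the callbacks above.
import re

import scrapy


class JobItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    email = scrapy.Field()
    content = scrapy.Field()
    time = scrapy.Field()
    tags = scrapy.Field()
    type = scrapy.Field()


def get_mail(text):
    # Return every e-mail-like token found in the text (possibly an empty list).
    return re.findall(r'[\w.+-]+@[\w-]+\.[\w.-]+', text)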
Example #2
    def parse_article(self, response):
        url = response.url
        # Posting date is the second <a class="plant"> link inside the first
        # <div class="nav hl"> block.
        time = response.xpath(
            '//div[@class="nav hl"][1]//a[@class="plant"][2]/text()'
        )[0].extract()
        time = time.strip()
        #print time,url
        title = "".join(
            response.xpath(
                '//ul[@class="list sec"]//li[1]/text()').extract())[3:]
        content = "\n".join(
            response.xpath(
                '//ul[@class="list sec"]//li[2]//div[@class="sp"][1]/text()')
            [:-2].extract())
        mails = get_mail(content)
        email = mails[0] if mails else ''
        # TODO use machine learning to tag the info
        tags = 'a,b'

        item = JobItem()
        item['title'] = title
        item['url'] = url
        item['email'] = email
        item['content'] = content
        item['time'] = time
        item['tags'] = tags
        item['type'] = 'job'
        yield item
Example #3
    def parse_article(self, response):
        url = response.url
        #print response.body
        # Posting time: skip the four-character label in front of the date
        # and append seconds.
        time = response.xpath(
            '//form[@name="delpost"]/div[1]//table[@class="t_msg"]//tr[1]/td/div[1]/div[3]/text()'
        ).extract()[0].strip()[4:] + ':00'
        # Title is whatever follows the "标题:" ("Title:") label in the header row.
        title = ''.join(
            response.xpath('//tr[@class="header"]/td/text()').extract()).strip(
            ).split(u'标题:')[1].strip()
        contentwrapper = response.xpath(
            '//form[@name="delpost"]/div[1]//table[@class="t_msg"]//div[@class="t_msgfont"]'
        )
        content = '\n'.join(contentwrapper.xpath('string(.)').extract())
        mails = get_mail(content)
        email = mails[0] if mails else ''
        tags = 'a,b'
        # Classify the post as an internship when the body mentions 实习
        # (internship) or 兼职 (part-time); otherwise treat it as a regular job.
        if (u'实习' in content) or (u'兼职' in content):
            types = 'intern'
        else:
            types = 'job'

        item = JobItem()
        item['title'] = title
        item['url'] = url
        item['email'] = email
        item['content'] = content
        item['time'] = time
        item['tags'] = tags
        item['type'] = types
        yield item
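The snippets above are per-article callbacks only; the spider class, start URLs and link-following logic are not part of them. As a rough idea of how such a callback is typically wired up in Scrapy (the class name, URL and listing-page XPath below are placeholders, not taken from the project):

import scrapy


class JobBoardSpider(scrapy.Spider):
    # Hypothetical spider; name, start_urls and the link XPath are placeholders.
    name = 'job_board'
    start_urls = ['http://example.com/forum/jobs']

    def parse(self, response):
        # Follow every article link on the listing page into parse_article.
        for href in response.xpath('//a[@class="article"]/@href').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_article)

    def parse_article(self, response):
        # One of the parse_article bodies shown above goes here.
        pass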
Example #4
                contentwrapper = response.xpath(
                    '//*[@id="Main"]/div[2]/div[3]/div')
            else:
                contentwrapper = response.xpath(
                    '//*[@id="Main"]/div[2]/div[2]/div')
        except Exception as e:
            # Write the traceback info into a file.
            traceback.print_exc(file=open('tb.txt', 'w+'))

        content = "\n".join(contentwrapper.xpath('string(.)').extract())
        # An internship if 实习 (internship) appears in the title or the body,
        # otherwise a regular job.
        if (u'实习' in title) or (u'实习' in content):
            job_type = 'intern'
        else:
            job_type = 'job'

        mails = get_mail(content)
        email = mails[0] if mails else ''
        # TODO use machine learning to tag the info
        tags = 'a,b'

        item = JobItem()
        try:
            item['title'] = title
            item['url'] = url
            item['email'] = email
            item['content'] = content
            item['time'] = time_t
            item['tags'] = tags
            item['type'] = job_type
            yield item
        except Exception as e: