Exemplo n.º 1
0
 def page_time(self):
     """Return the formatted time taken from the current document.

     A ``div`` with ``class="resume-info"`` is looked up first; if it is
     not found, a ``div`` with ``class="tab"`` (the headhunter page layout)
     is tried. The first matching tag is passed to
     ``TimeHandler.fmt_time`` and that result is returned.

     Returns ``None`` when neither tag is found. If any exception is
     raised along the way, the current document's ``cur_jdid`` is handed
     to ``self._save_not_need_cv`` and ``None`` is returned.
     """
     try:
         # Regular layout first, then the headhunter page layout.
         for attr in ('class="resume-info"', 'class="tab"'):
             found = htmlfind.findTag(self.get_cur_doc().cur_content,
                                      'div', attr)
             if found:
                 return TimeHandler.fmt_time(found[0])
         return None
     except Exception:
         # Best effort: any failure marks this document as not needed.
         self._save_not_need_cv(self.get_cur_doc().cur_jdid)
         return None
Exemplo n.º 2
0
    def page_time(self):
        """Return the formatted publish date of the current document.

        The content is encoded to UTF-8 when it is a ``unicode`` object.
        Two places are then tried in order:

        1. the ``<dd class="text_dd">`` cell following the publish-date
           label;
        2. a date inside the first ``div`` with ``class="jtag inbox"``.

        The captured text is passed to ``TimeHandler.fmt_time``. Returns
        ``None`` when neither place yields a match.
        """
        content = self.get_cur_doc().cur_content
        if isinstance(content, unicode):
            content = content.encode('utf-8')

        labelled = re.search(r'发布日期:</dt>.*?<dd class="text_dd">(.*?)</dd>', content, re.S)
        if labelled is not None:
            return TimeHandler.fmt_time(labelled.group(1))

        boxes = htmlfind.findTag(content, 'div', 'class="jtag inbox"')
        if not boxes:
            return None

        dated = re.search(r'(\d*-?\d+-\d+发布)', boxes[0])
        if dated is None:
            return None
        return TimeHandler.fmt_time(dated.group(1))
Exemplo n.º 3
0
 def page_time(self):
     tag = htmlfind.findTag(self.get_cur_doc().cur_content, 'div', 'class="jtag inbox"')
     if tag:
         m = re.search(ur'(\d*-?\d+-\d+发布)', tag[0])
         if m:
             t = TimeHandler.fmt_time(m.group(1))
             return t
Exemplo n.º 4
0
    def page_time(self):
        """Return the formatted posting date of the current document.

        Looks up ``li`` tags carrying
        ``class="posted" itemprop="datePosted"`` and passes the first one
        to ``TimeHandler.fmt_time``.

        Returns ``None`` when no such tag is found.
        """
        # TODO: marker carried over from the original; no details given.
        posted = htmlfind.findTag(self.get_cur_doc().cur_content, 'li',
                                  'class="posted" itemprop="datePosted"')

        # An empty or missing result is falsy, so no separate len() check
        # is needed before indexing.
        if not posted:
            return None

        return TimeHandler.fmt_time(posted[0])
Exemplo n.º 5
0
    def page_time(self):
        """Return the formatted publish time of the current document.

        The element matching ``class="publish-time"`` is looked up with
        ``spider.util.htmlfind`` and its ``get_text()`` result is passed
        to ``TimeHandler.fmt_time``.

        Returns ``None`` when the text cannot be extracted; in that case
        the document URL and content are reported through
        ``Log.errorbin``.
        """
        node = spider.util.htmlfind(self.get_cur_doc().cur_content, 'class="publish-time"', 0)
        try:
            text = node.get_text()
        except Exception:
            # Best effort by design: log the page and give up on it.
            # ``except Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url, self.get_cur_doc().cur_content)
            return None

        return TimeHandler.fmt_time(text)
Exemplo n.º 6
0
    def page_time(self):
        """Return the formatted update time of the current document.

        The content is encoded to UTF-8 when it is a ``unicode`` object,
        then searched for an ``<em>`` element that starts with the
        update-time label. The text after the label is passed to
        ``TimeHandler.fmt_time``.

        Returns ``None`` when no such element is found.
        """
        content = self.get_cur_doc().cur_content
        if isinstance(content, unicode):
            content = content.encode('utf-8')

        # Non-greedy on purpose: with re.S a greedy ``.*`` would run on to
        # the last ``</em>`` in the page instead of the one closing this
        # element, dragging unrelated markup into the captured text.
        found = re.search(r'<em>更新时间:(.*?)</em>', content, re.S)
        if found is None:
            return None

        return TimeHandler.fmt_time(found.group(1))
Exemplo n.º 7
0
    def page_time(self):
        """Return the formatted update time of the current document.

        The text between the ``class="uptime common-icon"></em>`` marker
        and the following ``</dd>`` is captured and passed to
        ``TimeHandler.fmt_time``.

        If the text cannot be extracted (for example, the marker is not
        in the page), the document URL and content are reported through
        ``Log.errorbin`` and the original exception is re-raised.
        """
        # TODO: marker carried over from the original; no details given.
        # An earlier variant located the time via spider.util.htmlfind on
        # class="publish_time" and read it with get_text().
        found = re.search('class="uptime common-icon"></em>(.*?)</dd>',
                          self.get_cur_doc().cur_content)
        try:
            raw = found.group(1)
        except:
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise
        else:
            return TimeHandler.fmt_time(raw)
Exemplo n.º 8
0
    def page_time(self):
        """Return the formatted last-modified time of the current document.

        The content is encoded to UTF-8 when it is a ``unicode`` object.
        The first ``span`` with ``class="last-modified"`` is run through
        ``htmlfind.remove_tag`` and the result (UTF-8 encoded if it is
        ``unicode``) is passed to ``TimeHandler.fmt_time``.

        If the span cannot be processed, the document URL and its
        original content are reported through ``Log.errorbin`` and the
        exception is re-raised.
        """
        html = self.get_cur_doc().cur_content
        if isinstance(html, unicode):
            html = html.encode('utf-8')

        spans = htmlfind.findTag(html, 'span', 'class="last-modified"')
        try:
            stamp = htmlfind.remove_tag(spans[0], 1)
        except:
            # Log the offending page, then let the caller see the error.
            Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise

        if isinstance(stamp, unicode):
            stamp = stamp.encode('utf-8')
        return TimeHandler.fmt_time(stamp)
Exemplo n.º 9
0
 def page_time(self):
     """Return the formatted time shown next to the clock icon.

     The text between the ``"icons24 icons24-time"></i>`` marker and the
     following ``</span>`` in the current document is captured and passed
     to ``TimeHandler.fmt_time``.

     Returns ``None`` when the marker is not present.
     """
     hit = re.search(r'"icons24 icons24-time"></i>(.*?)</span>',
                     self.get_cur_doc().cur_content, re.S)
     if hit is None:
         return None
     return TimeHandler.fmt_time(hit.group(1))
Exemplo n.º 10
0
 def page_time(self):
     """Return today's local date as formatted by ``TimeHandler.fmt_time``.

     Nothing is read from the document: the current local date is
     rendered as ``YYYY-MM-DD`` and passed to ``TimeHandler.fmt_time``.
     """
     today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
     return TimeHandler.fmt_time(today)