Example #1
    def _filter_links(self,
                      links,
                      text=None,
                      text_regex=None,
                      url=None,
                      url_regex=None,
                      predicate=None):
        predicates = []
        if text is not None:
            predicates.append(lambda link: link.string == text)
        if text_regex is not None:
            predicates.append(
                lambda link: re_compile(text_regex).search(link.string or ''))
        if url is not None:
            predicates.append(lambda link: link.get('href') == url)
        if url_regex is not None:
            predicates.append(lambda link: re_compile(url_regex).search(
                link.get('href', '')))
        if predicate:
            predicates.append(predicate)

        def f(link):
            for p in predicates:
                if not p(link):
                    return False
            return True

        return [link for link in links if f(link)]
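The two _filter_links variants in this listing (here and in Example #7) build a list of predicates and keep only the links that satisfy all of them. A minimal standalone sketch of that composition, using a hypothetical FakeLink stand-in for the parsed <a> tags the real caller passes in:

from re import compile as re_compile

class FakeLink:
    """Hypothetical stand-in for a parsed <a> tag."""
    def __init__(self, string, href):
        self.string = string
        self._attrs = {'href': href}

    def get(self, key, default=None):
        return self._attrs.get(key, default)

links = [FakeLink('Home', '/'), FakeLink('Docs', '/docs'), FakeLink('Blog', '/blog/1')]

# the same kind of predicate list _filter_links builds for text_regex/url_regex
predicates = [
    lambda link: re_compile('^D').search(link.string or ''),     # text_regex='^D'
    lambda link: re_compile('^/').search(link.get('href', '')),  # url_regex='^/'
]

print([l.string for l in links if all(p(l) for p in predicates)])  # ['Docs']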
Example #2
    def _match(self, mapping, value):
        for pat, what in utils.group(mapping, 2):
            rx = utils.re_compile('^' + pat + '$')
            result = rx.match(value)
            if result:
                return what, [x and urllib.unquote(x) for x in result.groups()]
        return None, None
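For context: in web.py, a URL mapping is a flat tuple such as ('/users/(\d+)', 'view_user', ...), and utils.group(mapping, 2) walks it in (pattern, handler) pairs. A self-contained sketch of the same anchored-regex dispatch using only the standard library (zip stands in for utils.group):

import re

def match(mapping, value):
    """Return (handler, captured groups) for the first anchored pattern that matches value."""
    for pat, what in zip(mapping[::2], mapping[1::2]):  # stand-in for utils.group(mapping, 2)
        result = re.compile('^' + pat + '$').match(value)
        if result:
            return what, list(result.groups())
    return None, None

urls = (r'/users/(\d+)', 'view_user',
        r'/posts/(\d+)', 'view_post')
print(match(urls, '/users/42'))  # ('view_user', ['42'])
print(match(urls, '/about'))     # (None, None)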
Example #3
    def _match(self, mapping, value):
        for pat, what in utils.group(mapping, 2):
            rx = utils.re_compile("^" + pat + "$")
            result = rx.match(value)
            if result:
                return what, [x and urllib.unquote(x) for x in result.groups()]
        return None, None
Example #4
    def _match(self, mapping, value):
        for pat, what in utils.group(mapping, 2):
            if isinstance(what, basestring):
                what, result = utils.re_subm("^" + pat + "$", what, web.ctx.path)
            else:
                result = utils.re_compile("^" + pat + "$").match(web.ctx.path)
            if result:  # it's a match
                return what, [x and urllib.unquote(x) for x in result.groups()]
        return None, None
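The string branch above relies on utils.re_subm, web.py's helper that performs a regex substitution and also returns the match object, so the rewritten handler name and the captured groups come back together. A rough stdlib approximation of that behavior (a hypothetical stand-in, not web.py's actual implementation):

import re

def re_subm(pat, repl, string):
    """Hypothetical stand-in: like re.sub, but also return the match object."""
    compiled = re.compile(pat)
    return compiled.sub(repl, string), compiled.search(string)

what, result = re_subm(r'^/users/(\d+)$', r'/u/\1', '/users/42')
print(what)             # /u/42
print(result.groups())  # ('42',)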
Example #5
    def _match(self, mapping, value):
        for pat, what in mapping:
            if isinstance(what, basestring):
                what, result = utils.re_subm('^' + pat + '$', what, value)
            else:
                result = utils.re_compile('^' + pat + '$').match(value)

            if result: # it's a match
                return what, [x for x in set(result.groups()).difference(set(result.groupdict().values()))], result.groupdict()   #microhuang
        return None, None, None
Example #6
    def _match(self, mapping, value):
        for pat, what in mapping:
            if isinstance(what, basestring):
                what, result = utils.re_subm('^' + pat + '$', what, value)
            else:
                result = utils.re_compile('^' + pat + '$').match(value)

            if result:  # it's a match
                return what, [x for x in result.groups()]
        return None, None
Example #7
    def _filter_links(self, links, text=None, text_regex=None, url=None, url_regex=None, predicate=None):
        predicates = []
        if text is not None:
            predicates.append(lambda link: link.string == text)
        if text_regex is not None:
            predicates.append(lambda link: re_compile(text_regex).search(link.string or ""))
        if url is not None:
            predicates.append(lambda link: link.get("href") == url)
        if url_regex is not None:
            predicates.append(lambda link: re_compile(url_regex).search(link.get("href", "")))
        if predicate:
            predicates.append(predicate)

        def f(link):
            for p in predicates:
                if not p(link):
                    return False
            return True

        return [link for link in links if f(link)]
Example #8
    def validates_email(self, key, email):
        msg = 'Invalid email address'
        if '@' not in email:
            raise ValidationError(msg)

        user, domain = email.rsplit('@', 1)  # rsplit: a quoted local part may itself contain '@'
        user_regex = re_compile(
            r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*\Z"
            r'|^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-\011\013\014\016-\177])*"\Z)',
            re.IGNORECASE)
        if not user_regex.match(user):
            raise ValidationError(msg)

        domain_regex = re_compile(
            r'((?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+)(?:[A-Z0-9-]{2,63}(?<!-))\Z',
            re.IGNORECASE)
        if not domain_regex.match(domain):
            raise ValidationError(msg)

        return email
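To see what those two regexes accept without the ORM context, here is a standalone sketch (the original runs as a model validator and raises ValidationError instead of returning a bool):

import re

user_regex = re.compile(
    r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*\Z"
    r'|^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-\011\013\014\016-\177])*"\Z)',
    re.IGNORECASE)
domain_regex = re.compile(
    r'((?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+)(?:[A-Z0-9-]{2,63}(?<!-))\Z',
    re.IGNORECASE)

def is_valid_email(email):
    if '@' not in email:
        return False
    user, domain = email.rsplit('@', 1)
    return bool(user_regex.match(user) and domain_regex.match(domain))

print(is_valid_email('alice@example.com'))  # True
print(is_valid_email('no-at-sign'))         # False
print(is_valid_email('alice@-bad-.com'))    # False (labels may not start or end with '-')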
Example #9
def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile('^', re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames if not f.startswith('.')
            and not f.endswith('~') and not f.startswith('__init__.py')
        ]

        for d in dirnames[:]:
            if d.startswith('.'):
                dirnames.remove(d)  # don't visit this dir

        out = open(os.path.join(dirpath, '__init__.py'), 'w')
        out.write('from web.template import CompiledTemplate, ForLoop\n\n')
        if dirnames:
            out.write("import " + ", ".join(dirnames))

        for f in filenames:
            path = os.path.join(dirpath, f)

            if '.' in f:
                name, _ = f.split('.', 1)
            else:
                name = f

            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)
            code = re_start.sub('    ', code)

            _gen = '' + \
            '\ndef %s():' + \
            '\n    loop = ForLoop()' + \
            '\n    _dummy  = CompiledTemplate(lambda: None, "dummy")' + \
            '\n    join_ = _dummy._join' + \
            '\n    escape_ = _dummy._escape' + \
            '\n' + \
            '\n%s' + \
            '\n    return __template__'

            gen_code = _gen % (name, code)
            out.write(gen_code)
            out.write('\n\n')
            out.write('%s = CompiledTemplate(%s(), %s)\n\n' %
                      (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()
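A note on the re_start trick used above: re_compile('^', re.M) matches at the start of every line, so .sub('    ', code) indents the whole generated body by four spaces, letting it be pasted inside the generated def. A two-line demonstration:

import re

re_start = re.compile('^', re.M)  # zero-width match at the start of each line
print(re_start.sub('    ', 'x = 1\nreturn x'))
#     x = 1
#     return x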
Example #10
def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile('^', re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames if not f.startswith('.')
            and not f.endswith('~') and not f.startswith('__init__.py')
        ]

        for d in dirnames[:]:
            if d.startswith('.'):
                dirnames.remove(d)  # don't visit this dir

        out = open(os.path.join(dirpath, '__init__.py'), 'w')
        out.write(
            'from web.template import CompiledTemplate, ForLoop, TemplateResult\n\n'
        )
        if dirnames:
            out.write("import " + ", ".join(dirnames))

        out.write("_dummy = CompiledTemplate(lambda: None, 'dummy')\n")
        out.write("join_ = _dummy._join\n")
        out.write("escape_ = _dummy._escape\n")
        out.write("\n")

        for f in filenames:
            path = os.path.join(dirpath, f)

            if '.' in f:
                name, _ = f.split('.', 1)
            else:
                name = f

            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)

            code = code.replace("__template__", name, 1)

            out.write(code)

            out.write('\n\n')
            out.write('%s = CompiledTemplate(%s, %s)\n\n' %
                      (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()
Example #11
    def _match(self, mapping, value):
        for pat, what in mapping:
            if isinstance(what, application):
                if value.startswith(pat):
                    f = lambda: self._delegate_sub_application(pat, what)
                    return f, None
                else:
                    continue
            elif isinstance(what, basestring):
                what, result = utils.re_subm('^' + pat + '$', what, value)
            else:
                result = utils.re_compile('^' + pat + '$').match(value)

            if result:  # it's a match
                return what, [x for x in result.groups()]
        return None, None
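The application branch adds prefix delegation: when the mapped target is itself a (sub)application, any path starting with the prefix is handed off instead of regex-matched. A toy illustration of just that control flow, with a hypothetical SubApp stand-in:

import re

class SubApp:
    """Hypothetical stand-in for a nested application object."""

mapping = [('/admin', SubApp()), (r'/(.*)', 'index')]

def route(value):
    for pat, what in mapping:
        if isinstance(what, SubApp):
            if value.startswith(pat):
                return 'delegate %s to sub-app' % value
            continue
        result = re.compile('^' + pat + '$').match(value)
        if result:
            return 'call %s%r' % (what, result.groups())
    return None

print(route('/admin/users'))  # delegate /admin/users to sub-app
print(route('/hello'))        # call index('hello',)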
Example #12
    def _match(self, mapping, value):
        for pat, what in utils.group(mapping, 2):
            if isinstance(what, application):
                if value.startswith(pat):
                    f = lambda: self._delegate_sub_application(pat, what)
                    return f, None
                else:
                    continue
            elif isinstance(what, basestring):
                what, result = utils.re_subm("^" + pat + "$", what, value)
            else:
                result = utils.re_compile("^" + pat + "$").match(value)

            if result:  # it's a match
                return what, [x and urllib.unquote(x) for x in result.groups()]
        return None, None
Example #13
    def _match(self, mapping, value):
        for pat, what in mapping:
            if isinstance(what, application):
                if value.startswith(pat):
                    f = lambda: self._delegate_sub_application(pat, what)
                    return f, None
                else:
                    continue
            elif isinstance(what, basestring):
                what, result = utils.re_subm('^' + pat + '$', what, value)
            else:
                result = utils.re_compile('^' + pat + '$').match(value)

            if result:  # it's a match
                return what, [x for x in result.groups()]
        return None, None
Example #14
def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile('^', re.M)
    
    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [f for f in filenames if not f.startswith('.') and not f.endswith('~') and not f.startswith('__init__.py')]

        for d in dirnames[:]:
            if d.startswith('.'):
                dirnames.remove(d) # don't visit this dir

        out = open(os.path.join(dirpath, '__init__.py'), 'w')
        out.write('from web.template import CompiledTemplate, ForLoop\n\n')
        if dirnames:
            out.write("import " + ", ".join(dirnames))

        for f in filenames:
            path = os.path.join(dirpath, f)

            if '.' in f:
                name, _ = f.split('.', 1)
            else:
                name = f
                
            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)
            code = re_start.sub('    ', code)
                        
            _gen = '' + \
            '\ndef %s():' + \
            '\n    loop = ForLoop()' + \
            '\n    _dummy  = CompiledTemplate(lambda: None, "dummy")' + \
            '\n    join_ = _dummy._join' + \
            '\n    escape_ = _dummy._escape' + \
            '\n' + \
            '\n%s' + \
            '\n    return __template__'
            
            gen_code = _gen % (name, code)
            out.write(gen_code)
            out.write('\n\n')
            out.write('%s = CompiledTemplate(%s, %s)\n\n' % (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()
Example #15
def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile("^", re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames if not f.startswith(".") and not f.endswith("~") and not f.startswith("__init__.py")
        ]

        out = open(os.path.join(dirpath, "__init__.py"), "w")
        out.write("from web.template import CompiledTemplate, ForLoop\n\n")
        if dirnames:
            out.write("import " + ", ".join(dirnames))

        for f in filenames:
            path = os.path.join(dirpath, f)

            # create template to make sure it compiles
            t = Template(open(path).read(), path)

            if "." in f:
                name, _ = f.split(".", 1)
            else:
                name = f

            code = Template.generate_code(open(path).read(), path)
            code = re_start.sub("    ", code)

            _gen = (
                ""
                + "\ndef %s():"
                + "\n    loop = ForLoop()"
                + '\n    _dummy  = CompiledTemplate(lambda: None, "dummy")'
                + "\n    join_ = _dummy._join"
                + "\n    escape_ = _dummy._escape"
                + "\n"
                + "\n%s"
                + "\n    return __template__"
            )

            gen_code = _gen % (name, code)
            out.write(gen_code)
            out.write("\n\n")
            out.write("%s = CompiledTemplate(%s(), %s)\n\n" % (name, name, repr(path)))
        out.close()
Example #16
def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile("^", re.M)

    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [
            f for f in filenames if not f.startswith(".") and not f.endswith("~") and not f.startswith("__init__.py")
        ]

        for d in dirnames[:]:
            if d.startswith("."):
                dirnames.remove(d)  # don't visit this dir

        out = open(os.path.join(dirpath, "__init__.py"), "w")
        out.write("from web.template import CompiledTemplate, ForLoop, TemplateResult\n\n")
        if dirnames:
            out.write("import " + ", ".join(dirnames))

        out.write("_dummy = CompiledTemplate(lambda: None, 'dummy')\n")
        out.write("join_ = _dummy._join\n")
        out.write("escape_ = _dummy._escape\n")
        out.write("\n")

        for f in filenames:
            path = os.path.join(dirpath, f)

            if "." in f:
                name, _ = f.split(".", 1)
            else:
                name = f

            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)

            code = code.replace("__template__", name, 1)

            out.write(code)

            out.write("\n\n")
            out.write("%s = CompiledTemplate(%s, %s)\n\n" % (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()
Example #17
def compile_templates(root):
    """Compiles templates to python code."""
    re_start = re_compile('^', re.M)
    
    for dirpath, dirnames, filenames in os.walk(root):
        filenames = [f for f in filenames if not f.startswith('.') and not f.endswith('~') and not f.startswith('__init__.py')]

        for d in dirnames[:]:
            if d.startswith('.'):
                dirnames.remove(d) # don't visit this dir

        out = open(os.path.join(dirpath, '__init__.py'), 'w')
        out.write('from web.template import CompiledTemplate, ForLoop, TemplateResult\n\n')
        if dirnames:
            out.write("import " + ", ".join(dirnames))
        out.write("\n")

        for f in filenames:
            path = os.path.join(dirpath, f)

            if '.' in f:
                name, _ = f.split('.', 1)
            else:
                name = f
                
            text = open(path).read()
            text = Template.normalize_text(text)
            code = Template.generate_code(text, path)

            code = code.replace("__template__", name, 1)
            
            # inject "join_ = ..; escape_ = .." into the code. 
            # That is required to make escape functionality work correctly.
            code = code.replace("\n", "\n    join_ = %s._join; escape_ = %s._escape\n" % (name, name), 1)

            out.write(code)

            out.write('\n\n')
            out.write('%s = CompiledTemplate(%s, %s)\n\n' % (name, name, repr(path)))

            # create template to make sure it compiles
            t = Template(open(path).read(), path)
        out.close()
Example #18
def find_indent(text):
    rx = re_compile('  +')
    match = rx.match(text)
    first_indent = match and match.group(0)
    return first_indent or ""
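find_indent only reports a run of two or more leading spaces; anything else comes back as the empty string. The behavior is easy to verify directly (web.py's re_compile is essentially a memoized re.compile):

import re

rx = re.compile('  +')  # two or more spaces

match = rx.match('    x = 1')
print(repr(match and match.group(0)))  # '    '
print(repr(rx.match('x = 1')))         # None, so find_indent falls back to ""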
Example #19
__all__ = ["render"]

import re, urlparse, pprint, traceback, sys
from Cheetah.Compiler import Compiler
from Cheetah.Filters import Filter
from utils import re_compile, memoize, dictadd
from net import htmlquote, websafe
from webapi import ctx, header, output, input, cookies, loadhooks

def upvars(level=2):
    """Guido van Rossum sez: don't use this function."""
    return dictadd(
      sys._getframe(level).f_globals,
      sys._getframe(level).f_locals)

r_include = re_compile(r'(?!\\)#include \"(.*?)\"($|#)', re.M)
def __compiletemplate(template, base=None, isString=False):
    if isString: 
        text = template
    else: 
        text = open('templates/'+template).read()
    # implement #include at compile-time
    def do_include(match):
        text = open('templates/'+match.groups()[0]).read()
        return text
    while r_include.findall(text): 
        text = r_include.sub(do_include, text)

    execspace = _compiletemplate.bases.copy()
    tmpl_compiler = Compiler(source=text, mainClassName='GenTemplate')
    tmpl_compiler.addImportedVarNames(execspace.keys())
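The snippet ends before the compile step, but the #include expansion above it is self-contained: r_include keeps substituting file contents until no directive is left. A sketch of the same loop with an in-memory template store (the hypothetical TEMPLATES dict replaces the open('templates/...') reads):

import re

r_include = re.compile(r'(?!\\)#include \"(.*?)\"($|#)', re.M)

TEMPLATES = {  # hypothetical in-memory stand-in for the templates/ directory
    'page.html': 'header\n#include "nav.html"\nfooter',
    'nav.html': '<nav>links</nav>',
}

def do_include(match):
    return TEMPLATES[match.groups()[0]]

text = TEMPLATES['page.html']
while r_include.findall(text):
    text = r_include.sub(do_include, text)
print(text)
# header
# <nav>links</nav>
# footer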
Example #20
    def _vaild_session_id(self, session_id):
        rx = utils.re_compile('^[0-9a-fA-F]+$')
        if rx.match(session_id):
            return True

        return False
Example #21
def ssdut_news_list(page_raw):
    '''Parse the news_list page and return a list of news items,
    in the same sequence as the page.

    result.soup
          .page_no
          .news_list
          .total_records
    '''
    result = Storage()
    soup = bsoup(page_raw)
    result.soup = soup

    # get current page number
    r = soup.find(text=ur"\u4e0b\u4e00\u9875")  # text=u"下一页"
    if r:
        '''not the last page'''
        next_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(next_page_link).group(1)
        page_no = int(page_no)  # - 1
    else:
        ''' the last page'''
        r = soup.find(text=ur'\u4e0a\u4e00\u9875')
        prev_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(prev_page_link).group(1)
        page_no = int(page_no)  # + 1
    result.page_no = page_no

    # get the news list
    res = soup.findAll(attrs={"bgcolor": "#EEEEEE"})
    news_list = []
    counter = 1
    for r in res:
        a = r.findChildren("a")
        date_str = r.find(text=re_compile(r"\d{4}-\d{2}-\d{2}")).encode("utf-8")
        news_list.append({
            "link":
            a[0].get("href").encode("utf-8"),
            "title":
            a[0].text.encode("utf-8"),
            "source":
            a[1].text.encode("utf-8"),
            "source_link":
            a[1].get("href").encode("utf-8"),
            "date_str":
            date_str,
            "date":
            datetime.date(*[int(n) for n in date_str.split("-")]),
            "no":
            counter,
        })
        counter += 1
        #logging.debug("source = %s, source_link = %s" %
        #              (news_list[-1]['source'], news_list[-1]['source_link']))
    result.news_list = news_list

    # total news count
    # the page shows: 共<N> 条记录 ("N records in total")
    s = soup.find(text=re_compile(ur"\u5171\d+ \u6761\u8bb0\u5f55"))
    r = re_compile(ur"\u5171(\d+)")
    result.total_records = int(r.search(s).group(1))

    return result
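The page-number logic above leans on one small regex: pull the digits out of the pager's next/previous link. In isolation (the href value is hypothetical):

import re

r = re.compile(r'/p/(\d+)')
next_page_link = '/news/category/1/p/7'  # hypothetical pager href
print(int(r.search(next_page_link).group(1)))  # 7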
Example #22
    def _valid_session_id(self, session_id):
        rx = utils.re_compile('^[0-9a-fA-F]+$')
        return rx.match(session_id)
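Both session-id validators in this listing reduce to the same anchored hex check; this shorter one returns the match object itself, which is truthy on success. Verifying the pattern directly:

import re

rx = re.compile('^[0-9a-fA-F]+$')

print(bool(rx.match('deadBEEF0123')))  # True: hex digits only
print(bool(rx.match('deadbeefg')))     # False: 'g' is not a hex digit
print(bool(rx.match('')))              # False: '+' requires at least one character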
Example #23
__all__ = ["render"]

import re, urlparse, pprint, traceback, sys
from Cheetah.Compiler import Compiler
from Cheetah.Filters import Filter
from utils import re_compile, memoize, dictadd
from net import htmlquote, websafe
from webapi import ctx, header, output, input, cookies

def upvars(level=2):
    """Guido van Rossum sez: don't use this function."""
    return dictadd(
      sys._getframe(level).f_globals,
      sys._getframe(level).f_locals)

r_include = re_compile(r'(?!\\)#include \"(.*?)\"($|#)', re.M)
def __compiletemplate(template, base=None, isString=False):
    if isString: 
        text = template
    else: 
        text = open('templates/'+template).read()
    # implement #include at compile-time
    def do_include(match):
        text = open('templates/'+match.groups()[0]).read()
        return text
    while r_include.findall(text): 
        text = r_include.sub(do_include, text)

    execspace = _compiletemplate.bases.copy()
    tmpl_compiler = Compiler(source=text, mainClassName='GenTemplate')
    tmpl_compiler.addImportedVarNames(execspace.keys())
Example #24
def ssdut_news_parse(raw):
    '''Parse the raw page src and store all results in a Storage object;
    all strings are unicode.

    result.soup
        BeautifulSoup object
    result.raw
        raw page src
    result.hash
        sha1 hash of the page
    result.title
        title
    result.source
        news source (来源)
    result.date_str - date in string
    result.date - date object
    result.body
        html src of the news body
    result.clean_body
        unescaped src of the news body
    result.publisher
        publisher (发表人)
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(
        ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body)  unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (can be found at the bottom of the page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")
    #logging.debug("publisher string = %r " % s)

    try:
        name = r.findall(s.text)[0]
    except IndexError:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher: like this: index.php/News/8692.html
    result.publisher = name.strip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')


    # ensure unicode before hashing (the fields were just encoded to utf-8 above)
    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()
    result.search_text = ''.join([result.title, result.source,
                                  result.clean_body, result.publisher,
                                  result.sha1])
    return result
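Two small pieces of the parser above are worth seeing in isolation: the date unpacking and the sha1 fingerprint used for deduplication. A minimal sketch with made-up values:

import datetime
from hashlib import sha1

date_str = '2013-05-09'  # hypothetical value of result.date_str
print(datetime.date(*[int(n) for n in date_str.split('-')]))  # 2013-05-09

hash_src = u'<div>body</div>' + u'title' + u'publisher'  # body + title + publisher
print(sha1(hash_src.encode('utf-8')).hexdigest())  # stable fingerprint of the news item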
Example #25
    def _vaild_session_id(self, session_id):
        rx = utils.re_compile('^[0-9a-fA-F]+$')
        if rx.match(session_id):
            return True

        return False
Example #26
def ssdut_news_parse(raw):
    '''Parse the raw page src and store all results in a Storage object;
    all strings are unicode.

    result.soup
        BeautifulSoup object
    result.raw
        raw page src
    result.hash
        sha1 hash of the page
    result.title
        title
    result.source
        news source (来源)
    result.date_str - date in string
    result.date - date object
    result.body
        html src of the news body
    result.clean_body
        unescaped src of the news body
    result.publisher
        publisher (发表人)
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body)  unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (can be found at the bottom of the page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")
    #logging.debug("publisher string = %r " % s)

    try:
        name = r.findall(s.text)[0]
    except IndexError:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher: like this: index.php/News/8692.html
    result.publisher = name.strip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')

    # ensure unicode before hashing (the fields were just encoded to utf-8 above)
    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()
    result.search_text = ''.join([
        result.title, result.source, result.clean_body, result.publisher,
        result.sha1
    ])
    return result
Example #27
def ssdut_news_list(page_raw):
    '''Parse the news_list page and return a list of news items,
    in the same sequence as the page.

    result.soup
          .page_no
          .news_list
          .total_records
    '''
    result = Storage()
    soup = bsoup(page_raw)
    result.soup = soup

    # get current page number
    r = soup.find(text=ur"\u4e0b\u4e00\u9875")  # text=u"下一页"
    if r:
        '''not the last page'''
        next_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(next_page_link).group(1)
        page_no = int(page_no)  # - 1
    else:
        ''' the last page'''
        r = soup.find(text=ur'\u4e0a\u4e00\u9875')
        prev_page_link = r.parent.attrs[0][1]
        #logging.debug("r.parent.attrs = %r" % r.parent.attrs)
        r = re_compile(r'/p/(\d+)')
        page_no = r.search(prev_page_link).group(1)
        page_no = int(page_no)  # + 1
    result.page_no = page_no

    # get the news list
    res = soup.findAll(attrs={"bgcolor": "#EEEEEE"})
    news_list = []
    counter = 1
    for r in res:
        a = r.findChildren("a")
        date_str = r.find(text=re_compile(r"\d{4}-\d{2}-\d{2}")).encode("utf-8")
        news_list.append(
            {
                "link": a[0].get("href").encode("utf-8"),
                "title": a[0].text.encode("utf-8"),
                "source": a[1].text.encode("utf-8"),
                "source_link": a[1].get("href").encode("utf-8"),
                "date_str": date_str,
                "date": datetime.date(
                    *[int(n) for n in date_str.split("-")]),
                "no": counter,
            })
        counter += 1
        #logging.debug("source = %s, source_link = %s" %
        #              (news_list[-1]['source'], news_list[-1]['source_link']))
    result.news_list = news_list

    # total news count
    # the page shows: 共<N> 条记录 ("N records in total")
    s = soup.find(text=re_compile(ur"\u5171\d+ \u6761\u8bb0\u5f55"))
    r = re_compile(ur"\u5171(\d+)")
    result.total_records = int(r.search(s).group(1))

    return result