Пример #1
0
def process(line):
   inclass=False
   for tag, patterns in tags.items():
       for key, pattern in patterns:
           if textfilterre(line[key], patterns=[pattern]):
               line['tags'].append(tag)
               break
   return line
Пример #2
0
def process(line):
    # Referer
    # unquote http_referer
    try:
        tmp=uunquote(line['http_referer']).decode('utf8')
    except:
        tmp=uunquote(line['http_referer'])
    # simplify referers
    urlobj=urlparse(tmp)
    if (tmp.startswith('http://www.facebook.com/l.php?u=') or
        tmp.startswith('http://m.facebook.com/l.php?u=')):
        line['http_referer']='http://www.facebook.com/'
    elif urlobj.netloc.startswith('www.google.') and urlobj.path=="/imgres":
        query=parse_qs(urlobj.query)
        line['image']=query.get('imgurl',[''])[0]
        line['imageref']=query.get('imgrefurl',[''])[0]
        line['http_referer']='%s://%s/imgres?imgurl=%s' % (urlobj.scheme,
                                                           urlobj.netloc,
                                                           line['image'])
    elif urlobj.netloc.startswith('www.google.') and urlobj.path=="/url":
        query=parse_qs(urlobj.query)
        line['http_referer']='%s://%s/url=%s' % (urlobj.scheme,
                                                 urlobj.netloc,
                                                 uunquote(query.get('url',[''])[0]))
    elif urlobj.netloc.startswith('webcache.googleusercontent.') and urlobj.path=="/search":
        query=parse_qs(urlobj.query)
        try:
            u=uunquote(query.get('q',[':'])[0]).split(':',1)[1].decode('utf8')
        except:
            u=uunquote(query.get('q',[':'])[0]).split(':',1)[1]
        line['http_referer']='%s://%s/url=%s' % (urlobj.scheme,
                                                 urlobj.netloc,
                                                 u)
    else:
        line['http_referer']=tmp
    # extref?
    if line['http_referer'] not in ['', '-'] and not textfilterre(line['http_referer'], patterns=ownhosts):
        line['tags'].append('extref')
    return line