예제 #1
0
def process(func, *args, **kwargs):
    """Wrap ``func`` so that the page it returns is validated and parsed.

    ``func`` must return a 5-tuple ``(url, urlhash, status, domain, content)``.

    NOTE(review): this looks like a decorator, but the end of the definition
    (what ``wrapper`` does with ``tree`` and whether ``process`` returns
    ``wrapper``) is not visible in this excerpt -- confirm against the full
    file. The outer ``*args``/``**kwargs`` are shadowed by the wrapper's own
    and are not used in the visible code.
    """

    def wrapper(*args, **kwargs):
        # Call the wrapped function and unpack its five results.
        url, urlhash, status, domain, content = func(*args, **kwargs)
        # Only non-empty ``unicode`` content is processed; anything else
        # yields an empty list.
        if not content or not isinstance(content, unicode):
            return []
        # Best-effort conversion of the URL to UTF-8 bytes; on failure the
        # error is printed and the original ``url`` value is kept.
        try:
            url = url.encode('utf8')
        except Exception, e:
            print e
        # Without a root domain the page is dropped.
        rootdomain = tldextracter.extract_rootdomain(url)
        if not rootdomain:
            return []
        # Rewrite links in the markup against ``url`` and parse the result;
        # any failure in either step is printed and yields an empty list.
        try:
            absolute_content = lxml.html.make_links_absolute(content, url)
            tree = lxml.html.fromstring(absolute_content)
        except Exception, e:
            print e
            return []
예제 #2
0
파일: db.py 프로젝트: pombredanne/recrawler
def filter_recent(r, jobs):
    """Keep only the jobs whose root domain has no live status key in Redis.

    Each job is a JSON string containing a ``'url'`` entry. The root domain
    of that URL is used to build the key ``'<rootdomain>_status'``:

    * key absent  -> the key is created with a ``config.EXPIRE`` expiry and
      the job is accepted;
    * key present -> the job is pushed back onto ``config.QUEUE``.

    Jobs that cannot be decoded (bad JSON, missing ``'url'``, root-domain
    extraction raising) are pushed back onto ``config.QUEUE`` and skipped.

    Args:
        r: Redis client; ``exists``, ``set``, ``expire`` and ``lpush`` are
            called on it.
        jobs: iterable of JSON-encoded job strings.

    Returns:
        The list of accepted jobs. If a Redis call in the status check
        raises, the error is printed and the original ``jobs`` is returned
        unchanged.
    """
    filtered_jobs = []
    for job in jobs:
        try:
            url = json.loads(job)['url']
            rootdomain = tldextracter.extract_rootdomain(url)
        except Exception:
            # Undecodable job: re-queue it and move on. Without the
            # ``continue`` the code below would run with ``rootdomain``
            # unbound or left over from the previous iteration.
            r.lpush(config.QUEUE, job)
            continue

        try:
            status_key = '%s_status' % rootdomain
            if not r.exists(status_key):
                # First job for this domain within the expiry window.
                r.set(status_key, '')
                r.expire(status_key, config.EXPIRE)
                filtered_jobs.append(job)
            else:
                # Domain was handled recently; defer the job.
                r.lpush(config.QUEUE, job)
        except Exception as e:
            # Filtering is unavailable: fall back to the unfiltered input.
            print(e)
            return jobs
    return filtered_jobs
예제 #3
0
`domain` VARCHAR(100) NOT NULL DEFAULT '',
`url` VARCHAR(512) NOT NULL DEFAULT ''
) Engine=INNoDB DEFAULT CHARSET=utf8;
    '''
    pass


def insert_site(name, language, url):
    """Insert a row into ``news_sites`` unless its root domain is already there.

    The row is keyed by ``domainhash``, the CityHash64 of the root domain
    extracted from ``url``.

    Args:
        name: value stored in the ``name`` column.
        language: value stored in the ``language`` column.
        url: site URL; a ``unicode`` value is stripped and encoded to UTF-8
            before use.

    Returns:
        True if a new row was inserted and committed. False if a row with
        the same ``domainhash`` already exists, or if no root domain could
        be extracted from ``url``.
    """
    if isinstance(url, unicode):
        try:
            url = url.strip().encode('utf8')
        except Exception as e:
            # Best effort: keep the original value if encoding fails.
            print(e)
    rootdomain = tldextracter.extract_rootdomain(url)
    if not rootdomain:
        # Nothing meaningful to hash or store.
        return False
    domainhash = cityhash.CityHash64(rootdomain)
    query = PySQLPool.getNewQuery(connection)
    # Parameters are always passed as a sequence, matching the INSERT below.
    query.Query('select domainhash from news_sites where domainhash = %s;',
                (domainhash,))
    if query.record:
        return False
    query.Query('insert into news_sites(domainhash, language, name, '
                'domain, url) values(%s, %s, %s, %s, %s);',
                (domainhash, language, name, rootdomain, url))
    query.Pool.Commit()
    return True


if __name__ == '__main__':