def post_json(host='', accept='', accept_language='', accept_encoding='', referer='', cookie='', content_type='', post_dict=None, url='http://www.google.es'):
    """POST *post_dict* (JSON-encoded) to *url* and return the response body.

    Builds the request headers on a fresh slavy() instance, sends the POST
    with urllib2, and gunzips the response.

    NOTE(review): the response is assumed to be gzip-compressed (the script
    always sends 'Accept-Encoding: gzip, deflate'); a plain-text response
    would make gzip.GzipFile raise IOError -- confirm with the target site.
    NOTE(review): *cookie* is accepted but never sent; kept for interface
    compatibility.

    :param post_dict: payload serialized with json.dumps(); defaults to {}.
    :returns: decompressed response body as a str.
    """
    # BUG FIX: mutable default argument ({}) replaced with a None sentinel.
    if post_dict is None:
        post_dict = {}
    sl = slavy()
    user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0"
    sl.headers['Host'] = host
    sl.headers['User-Agent'] = user_agent
    sl.headers['Accept'] = accept
    # BUG FIX: Accept-Language was mistakenly assigned accept_encoding.
    sl.headers['Accept-Language'] = accept_language
    sl.headers['Accept-Encoding'] = accept_encoding
    sl.headers['Referer'] = referer
    sl.headers['Content-Type'] = content_type
    data = json.dumps(post_dict)
    req = urllib2.Request(url=url, data=data, headers=sl.headers)
    response = urllib2.urlopen(req)
    try:
        # Read everything first, then decompress from an in-memory buffer.
        buf = StringIO(response.read())
    finally:
        # BUG FIX: the response object was never closed.
        response.close()
    f = gzip.GzipFile(fileobj=buf)
    html = f.read()
    return html
def empty_extract(url): xtr = '' sl = slavy() sl.start(url) sl.metaExtract = True sl.extract(xtr) print ' # Inicio prueba EXTRACT VACIO\n ## URL:',url print '####################################################################################' sl.printM() print '####################################################################################'
def crawl_pattern(url,frase): check = False xtr = xtr_aux.format(frase,'</body>') try: sl = slavy() sl.start(url) sl.timeout = 10.0 sl.maxConnectRetries = 1 sl.extract("") # Comprobamos 404 if len(sl.M) > 0: for item in sl.M: content = item.get("content", "") if not content: print '[ERROR] comprobacion del 404' sl = slavy() sl.start(url) sl.WR = ["virtual:{0}".format(content)] sl.metaExtract = True sl.extract(xtr) if len(sl.M) > 0: check = True except Exception as e: print "Error en extract:", e check = False check = False if check: print '\n**** Encontrada frase de caducados: "{0}" *\n'.format(frase) else: print '\n[ERROR]: La frase no existe, la oferta no está caducada. *\n'
def make_slavy_post(url,dic_headers,dic_): if dic_headers: if dic_headers['Host']: sl.headers['Host'] = dic_headers['Host'] if dic_headers['User-Agent']: sl.headers['User-Agent'] = dic_headers['User-Agent'] if dic_headers['Accept']: sl.headers['Accept'] = dic_headers['Accept'] if dic_headers['Accept-Language']: sl.headers['Accept-Language'] = dic_headers['Accept-Language'] if dic_headers['Accept-Encoding']: sl.headers['Accept-Encoding'] = dic_headers['Accept-Encoding'] if dic_headers['Referer']: sl.headers['Referer'] = dic_headers['Referer'] if dic_headers['Content-Type']: sl.headers['Content-Type'] = dic_headers['Content-Type'] if dic_headers['Content-Length']: sl.headers['Content-Length'] = dic_headers['Content-Length'] if dic_headers['Connection']: sl.headers['Connection'] = dic_headers['Connection'] sl = slavy() sl.start(url) sl.metaExtract = True sl.extract('',True) content = '' if sl.M: salida_txt = open('post.html', 'a') content = sl.M sl.printWR() sl.printM() salida_txt.write(u'{0}'.format(content)) salida_txt.close() print 'Escribiendo en el archivo "post.html"' else: print '[ERROR] No se ha podido acceder a la url.'
def arane(url,stp,stp2): sl = slavy() sl.start(url) sl.metaExtract = True if stp: print '\nPrimer step' sl.step(stp) sl.printWR() print '=================' if stp2: print '\nSegundo step' sl.step(stp2) sl.printWR() print '=================' print '|---- FIN ----|'
def get_top_searches(url_browsjobs):
    """Crawl *url_browsjobs* and return a list of 'category|:|job' strings.

    Category is derived from the item's URL (path separators and dashes
    collapsed to spaces); the job title has a trailing 'Jobs' stripped.
    Relies on the module-level step pattern `stp` and extractor `xtr_title`.
    """
    browsejobs = []
    sl = slavy()
    sl.start(url_browsjobs)
    sl.metaExtract = True
    sl.step(stp)
    sl.extract(xtr_title)
    for record in sl.M:
        categoria = re.sub('.*\/|-', ' ', record.get('@url', '')).strip()
        trabajo = re.sub('Jobs$', '', record.get('job', '')).strip()
        if not (categoria and trabajo):
            continue
        browsejobs.append(u'{0}|:|{1}'.format(categoria, trabajo))
    return browsejobs
def arane(): sl = slavy() sl.start('http://www.cronoshare.com/entrar') sl.metaExtract = True sl.extract(xtr) ran = sl.M[0].get('ran','') password = hashlib.md5("6183d4f648179e12be33cce6ec6acda5"+ran).hexdigest() url = 'http://www.cronoshare.com/Actions/CuserLogin!email=fernando.jalon%40mon-digital.es&password={0}'.format(password) sl.WR = [url] sl.extract('') sl.M = [] print '=================' sl.WR = ['http://www.cronoshare.com/presupuesto-reformar-piso-en-zamora-zamora-tarea-566693-web'] sl.extract('') sl.printM() #exit(0) #sl.step(stp) #sl.printWR() print '|---- FIN ----|'
#html = response.read() buf = StringIO(response.read()) f = gzip.GzipFile(fileobj=buf) html = f.read() #offer_data = json.loads(buf) return html print 'Starting scripting...' url = 'http://espaciolaboral.org/jm-ajax/get_listings/' host = "espaciolaboral.org" accept = "*/*" accept_language = 'en-US,en;q=0.5' accept_encoding = 'gzip, deflate' referer = 'http://espaciolaboral.org/bolsa-de-trabajo-y-empleo-espacio-laboral/' cookie = '' content_type = 'application/x-www-form-urlencoded; charset=UTF-8' post_dict = 'search_keywords=&search_location=&filter_job_type%5B%5D=empleo-temporal&filter_job_type%5B%5D=medio-tiempo&filter_job_type%5B%5D=tiempo-completo&filter_job_type%5B%5D=&per_page=10&orderby=featured&order=DESC&page=2&show_pagination=false&form_data=search_keywords%3D%26search_location%3D%26filter_job_type%255B%255D%3Dempleo-temporal%26filter_job_type%255B%255D%3Dmedio-tiempo%26filter_job_type%255B%255D%3Dtiempo-completo%26filter_job_type%255B%255D%3D' #post_dict = {} html = post_json(host, accept, accept_language, accept_encoding, referer, cookie, content_type, post_dict, url) print 'html:',html sl = slavy() sl.start(url) sl.metaExtract = True sl.WR = ["virtual:{0}".format(html)] sl.extract('') sl.printM()