def post_json(host='', accept='', accept_language='', accept_encoding='', referer='', cookie='', content_type='', post_dict=None, url='http://www.google.es'):
    """POST *post_dict* (JSON-encoded) to *url* and return the response body.

    Builds the request headers on a fresh slavy() instance, sends the POST
    with urllib2, and gunzips the response.

    NOTE(review): the response is assumed to be gzip-compressed (the script
    always sends 'Accept-Encoding: gzip, deflate'); a plain-text response
    would make gzip.GzipFile raise IOError -- confirm with the target site.
    NOTE(review): *cookie* is accepted but never sent; kept for interface
    compatibility.

    :param post_dict: payload serialized with json.dumps(); defaults to {}.
    :returns: decompressed response body as a str.
    """
    # BUG FIX: mutable default argument ({}) replaced with a None sentinel.
    if post_dict is None:
        post_dict = {}
    sl = slavy()
    user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0"
    sl.headers['Host'] = host
    sl.headers['User-Agent'] = user_agent
    sl.headers['Accept'] = accept
    # BUG FIX: Accept-Language was mistakenly assigned accept_encoding.
    sl.headers['Accept-Language'] = accept_language
    sl.headers['Accept-Encoding'] = accept_encoding
    sl.headers['Referer'] = referer
    sl.headers['Content-Type'] = content_type
    data = json.dumps(post_dict)
    req = urllib2.Request(url=url, data=data, headers=sl.headers)
    response = urllib2.urlopen(req)
    try:
        # Read everything first, then decompress from an in-memory buffer.
        buf = StringIO(response.read())
    finally:
        # BUG FIX: the response object was never closed.
        response.close()
    f = gzip.GzipFile(fileobj=buf)
    html = f.read()
    return html
def empty_extract(url): xtr = '' sl = slavy() sl.start(url) sl.metaExtract = True sl.extract(xtr) print ' # Inicio prueba EXTRACT VACIO\n ## URL:',url print '####################################################################################' sl.printM() print '####################################################################################'
def crawl_pattern(url,frase): check = False xtr = xtr_aux.format(frase,'</body>') try: sl = slavy() sl.start(url) sl.timeout = 10.0 sl.maxConnectRetries = 1 sl.extract("") # Comprobamos 404 if len(sl.M) > 0: for item in sl.M: content = item.get("content", "") if not content: print '[ERROR] comprobacion del 404' sl = slavy() sl.start(url) sl.WR = ["virtual:{0}".format(content)] sl.metaExtract = True sl.extract(xtr) if len(sl.M) > 0: check = True except Exception as e: print "Error en extract:", e check = False check = False if check: print '\n**** Encontrada frase de caducados: "{0}" *\n'.format(frase) else: print '\n[ERROR]: La frase no existe, la oferta no está caducada. *\n'
def make_slavy_post(url,dic_headers,dic_): if dic_headers: if dic_headers['Host']: sl.headers['Host'] = dic_headers['Host'] if dic_headers['User-Agent']: sl.headers['User-Agent'] = dic_headers['User-Agent'] if dic_headers['Accept']: sl.headers['Accept'] = dic_headers['Accept'] if dic_headers['Accept-Language']: sl.headers['Accept-Language'] = dic_headers['Accept-Language'] if dic_headers['Accept-Encoding']: sl.headers['Accept-Encoding'] = dic_headers['Accept-Encoding'] if dic_headers['Referer']: sl.headers['Referer'] = dic_headers['Referer'] if dic_headers['Content-Type']: sl.headers['Content-Type'] = dic_headers['Content-Type'] if dic_headers['Content-Length']: sl.headers['Content-Length'] = dic_headers['Content-Length'] if dic_headers['Connection']: sl.headers['Connection'] = dic_headers['Connection'] sl = slavy() sl.start(url) sl.metaExtract = True sl.extract('',True) content = '' if sl.M: salida_txt = open('post.html', 'a') content = sl.M sl.printWR() sl.printM() salida_txt.write(u'{0}'.format(content)) salida_txt.close() print 'Escribiendo en el archivo "post.html"' else: print '[ERROR] No se ha podido acceder a la url.'
def arane(url,stp,stp2): sl = slavy() sl.start(url) sl.metaExtract = True if stp: print '\nPrimer step' sl.step(stp) sl.printWR() print '=================' if stp2: print '\nSegundo step' sl.step(stp2) sl.printWR() print '=================' print '|---- FIN ----|'
def get_top_searches(url_browsjobs):
    """Crawl *url_browsjobs* and return a list of 'category|:|job' strings.

    Category is derived from the item's URL (path separators and dashes
    collapsed to spaces); the job title has a trailing 'Jobs' stripped.
    Relies on the module-level step pattern `stp` and extractor `xtr_title`.
    """
    browsejobs = []
    sl = slavy()
    sl.start(url_browsjobs)
    sl.metaExtract = True
    sl.step(stp)
    sl.extract(xtr_title)
    for record in sl.M:
        categoria = re.sub('.*\/|-', ' ', record.get('@url', '')).strip()
        trabajo = re.sub('Jobs$', '', record.get('job', '')).strip()
        if not (categoria and trabajo):
            continue
        browsejobs.append(u'{0}|:|{1}'.format(categoria, trabajo))
    return browsejobs
def arane(): sl = slavy() sl.start('http://www.cronoshare.com/entrar') sl.metaExtract = True sl.extract(xtr) ran = sl.M[0].get('ran','') password = hashlib.md5("6183d4f648179e12be33cce6ec6acda5"+ran).hexdigest() url = 'http://www.cronoshare.com/Actions/CuserLogin!email=fernando.jalon%40mon-digital.es&password={0}'.format(password) sl.WR = [url] sl.extract('') sl.M = [] print '=================' sl.WR = ['http://www.cronoshare.com/presupuesto-reformar-piso-en-zamora-zamora-tarea-566693-web'] sl.extract('') sl.printM() #exit(0) #sl.step(stp) #sl.printWR() print '|---- FIN ----|'
#html = response.read() buf = StringIO(response.read()) f = gzip.GzipFile(fileobj=buf) html = f.read() #offer_data = json.loads(buf) return html print 'Starting scripting...' url = 'http://espaciolaboral.org/jm-ajax/get_listings/' host = "espaciolaboral.org" accept = "*/*" accept_language = 'en-US,en;q=0.5' accept_encoding = 'gzip, deflate' referer = 'http://espaciolaboral.org/bolsa-de-trabajo-y-empleo-espacio-laboral/' cookie = '' content_type = 'application/x-www-form-urlencoded; charset=UTF-8' post_dict = 'search_keywords=&search_location=&filter_job_type%5B%5D=empleo-temporal&filter_job_type%5B%5D=medio-tiempo&filter_job_type%5B%5D=tiempo-completo&filter_job_type%5B%5D=&per_page=10&orderby=featured&order=DESC&page=2&show_pagination=false&form_data=search_keywords%3D%26search_location%3D%26filter_job_type%255B%255D%3Dempleo-temporal%26filter_job_type%255B%255D%3Dmedio-tiempo%26filter_job_type%255B%255D%3Dtiempo-completo%26filter_job_type%255B%255D%3D' #post_dict = {} html = post_json(host, accept, accept_language, accept_encoding, referer, cookie, content_type, post_dict, url) print 'html:',html sl = slavy() sl.start(url) sl.metaExtract = True sl.WR = ["virtual:{0}".format(html)] sl.extract('') sl.printM()