def agregarInformacionDocumento(self, url, contenido): """Metodo para obtener diferentes partes del documento""" try: unaUrl = URL(url) if not 'pdf' in extension(unaUrl.page): html = contenido unElemento = Element(self.descargarContenidoHtml(url)) body = self.getBody(unElemento) urlValues = self.getUrlValues(unElemento) titulo = self.getTitulo(unElemento) html = self.verificarContenidoVacio(html) body = self.verificarContenidoVacio(body) urlValues = self.verificarContenidoVacio(urlValues) titulo = self.verificarContenidoVacio(titulo) self.mongodb.setInformacionDocumento(html, url, titulo, urlValues, body) else: html = self.verificarContenidoVacio(contenido) body = "" urlValues = "" titulo = "" self.mongodb.setInformacionDocumento(html, url, titulo, urlValues, body) except Exception as e: print str(e)
def start(self,scraperLinks,progress,directorio,id_request,searchKey): unConfig = config() step=0 progress.set_totalScraping(len(scraperLinks)) progress.set_scrapingState('Ejecutando') # ordenar por el peso de los documentos self.rankear(scraperLinks,searchKey) scraperLinks = sorted(scraperLinks, key=lambda k: k['totalScore']) scraperLinks = self.unificarLista(scraperLinks) self.crearTop50(scraperLinks,directorio,unConfig) progress.totalNodes = len(scraperLinks) for link in scraperLinks: if not progress.get_stop(): step+=1 progress.set_scrapingProgress(step) url=URL(link['link']) fileNameJson = str(step).zfill(2)+"_"+url.domain+'.json' fileNameDocument = str(step).zfill(2)+"_"+url.domain if extension(url.page) == ".pdf": fileNameDocument += ".pdf" else: fileNameDocument += ".html" try: self.fileGenerator.json(link,fileNameJson,fileNameDocument,link,id_request,directorio) except Exception,e: print str(e) pass else: progress.set_scrapingState('Detenido') print 'Detenido' break
def descargarContenidoHtml(self, url): try: unaUrl = URL(url) if "pdf" in extension(unaUrl.page): return self.descargarPDF(unaUrl) else: return unaUrl.download() except Exception as e: try: return self.urlLibDescarga(url) except Exception as e: print "except " + str(e) print url
def descargarContenido(self, url): """Metodo para descargar el contenido de los documentos webs siendo url o pdf""" try: unaUrl = URL(url) if "pdf" in extension(unaUrl.page): return self.descargarPDF(unaUrl) else: return plaintext(unaUrl.download()) except Exception as e: try: return plaintext(self.urlLibDescarga(url)) except Exception as e: print "except " + str(e) print url
def buscar_Flickr(self, texto):
    """Search Flickr for up to 6 images matching *texto* and save them.

    Images are written to imagenes/busqueda/<i><ext>, where <i> is the
    result index and <ext> the extension taken from the result URL.
    self.espera(i) throttles between downloads.
    """
    engine = Flickr(license=None, throttle=0.5, language='es')
    i = 0
    for result in engine.search(texto, count=6, cached=True, copyright=False):
        self.espera(i)
        directorio = os.path.join('imagenes', 'busqueda', str(i) + extension(result.url))
        # FIX: "with" guarantees the handle is closed even when
        # result.download() raises (the original leaked the file object).
        with open(directorio, 'wb') as f:
            f.write(result.download(timeout=10))
        i += 1
# This example downloads an image from Flickr (http://flickr.com). # Acquiring the image data takes three Flickr queries: # - the first query with Flickr.search() retrieves a list of results, # - the second query is executed behind the scenes in the FlickResult.url property, # - the third query downloads the actual image data using this URL. # It is a good idea to cache results from Flickr locally, # which is what the cached=True parameter does. # You should obtain your own license key at: # http://www.flickr.com/services/api/ # Otherwise you will be sharing the default key with all users of this module. engine = Flickr(license=None) q = "duracell bunny" results = engine.search(q, size=MEDIUM, sort=RELEVANCY, cached=True) for img in results: # print img.url # Retrieving the actual image URL executes an additional query. print img.description print img.author print # Download and save the image: img = results[0] data = img.download() path = q.replace(" ", "_") + extension(img.url) f = open(path, "wb") f.write(data) f.close() print "Download:", img.url print "Saved as:", path
def test_extension(self): # Assert filename extension. v = web.extension(os.path.join("pattern", "test", "test-web.py.zip")) self.assertEqual(v, ".zip") print "pattern.web.extension()"
# For macOS SSL issue when downloading file(s) from external sources import ssl ssl._create_default_https_context = ssl._create_unverified_context # ### Accessing Web Pages from pattern.web import download page_html = download('https://en.wikipedia.org/wiki/Artificial_intelligence', unicode=True) from pattern.web import URL, extension page_url = URL( 'https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg') file = open('football' + extension(page_url.page), 'wb') file.write(page_url.download()) file.close() # ### Finding URLs within Text from pattern.web import find_urls print(find_urls('To search anything, go to www.google.com', unique=True)) # ### Making Asynchronous Requests for Webpages from pattern.web import asynchronous, time, Google asyn_req = asynchronous(Google().search, 'artificial intelligence', timeout=4) while not asyn_req.done:
from pattern.web import Flickr, extension from pattern.web import RELEVANCY, LATEST, INTERESTING # Image sort order. from pattern.web import SMALL, MEDIUM, LARGE # Image size. # This example downloads an image from Flickr (http://flickr.com). # Acquiring the image data takes three Flickr queries: # - the first query with Flickr.search() retrieves a list of results, # - the second query is executed behind the scenes in the FlickResult.url property, # - the third query downloads the actual image data using this URL. # It is a good idea to cache results from Flickr locally, # which is what the cached=True parameter does. # You should obtain your own license key at: # http://www.flickr.com/services/api/ # Otherwise you will be sharing the default key with all users of this module. engine = Flickr(license=None) q = "duracell bunny" results = engine.search(q, size=MEDIUM, sort=RELEVANCY, cached=True) for img in results: #print img.url # Retrieving the actual image URL executes an additional query. print img.description print img.author print # Download and save the image: img = results[0] data = img.download() f = open(q.replace(" ","_") + extension(img.url), "w") f.write(data) f.close()
# Flickr image download demo (http://flickr.com).
# Getting the image data involves three Flickr queries:
#  1) Flickr.search() retrieves a list of results,
#  2) FlickrResult.url retrieves the image URL (behind the scenes),
#  3) FlickrResult.download() visits FlickrResult.url and downloads the content.
# It is a good idea to cache results from Flickr locally,
# which is what the cached=True parameter does.
# You should obtain your own license key at:
# http://www.flickr.com/services/api/
# Otherwise you will be sharing the default key with all users of pattern.web.
engine = Flickr(license=None)
q = "duracell bunny"
hits = engine.search(q, size=MEDIUM, sort=RELEVANCY, cached=False)
for hit in hits:
    #print(hit.url) # Retrieving the actual image URL executes a query.
    print(hit.text)
    print(hit.author)
    print("")
# Persist the first result to disk.
img = hits[0]
payload = img.download()
path = q.replace(" ", "_") + extension(img.url)
with open(path, "wb") as out:
    out.write(payload)
print("Download: %s" % img.url)
print("Saved as: %s" % path)
def save_image(url, figure):
    """Download *url* and save it as illustrations/<figure><ext>.

    :param url: address of the image to fetch.
    :param figure: base filename (extension is taken from the URL page).
    """
    # FIX: do not shadow the `url` parameter with the URL object.
    page_url = URL(url)
    path = 'illustrations/' + figure + extension(page_url.page)
    # FIX: "with" closes the handle even if download() raises
    # (the original leaked the file object on error).
    with open(path, 'wb') as f:
        f.write(page_url.download())
from pattern.web import Flickr, extension from pattern.web import RELEVANCY, LATEST, INTERESTING # Image sort order. from pattern.web import SMALL, MEDIUM, LARGE # Image size. # This example downloads an image from Flickr (http://flickr.com). # Acquiring the image data takes three Flickr queries: # - the first query with Flickr.search() retrieves a list of results, # - the second query is executed behind the scenes in the FlickResult.url property, # - the third query downloads the actual image data using this URL. # It is a good idea to cache results from Flickr locally, # which is what the cached=True parameter does. # You should obtain your own license key at: # http://www.flickr.com/services/api/ # Otherwise you will be sharing the default key with all users of this module. engine = Flickr(license=None) q = "duracell bunny" results = engine.search(q, size=MEDIUM, sort=RELEVANCY, cached=True) for img in results: #print img.url # Retrieving the actual image URL executes an additional query. print img.description print img.author print # Download and save the image: img = results[0] data = img.download() f = open(q.replace(" ", "_") + extension(img.url), "w") f.write(data) f.close()
def isPDF(self, param):
    """Return 1 when *param*'s page has a .pdf extension, otherwise 0."""
    page_ext = extension(URL(param).page)
    return 1 if "pdf" in page_ext else 0
from pattern.web import URL, DOM, extension, MIMETYPE_IMAGE
from pattern.web import Element, download
# FIX: on Python 3 a plain "import urllib" does not load the `request`
# submodule, so urllib.request.urlretrieve below would raise AttributeError.
import urllib.request
import datetime
# libraries to check urllib (legacy vs not), pattern, requests

# Scrape the Caltrans District 1 camera page and save every .jpg image.
url = URL("http://www.dot.ca.gov/dist1/d1tmc/allcams.php")
dom = DOM(url.download(cached=True))
i = 0
try:
    for e in dom.by_tag('img'):
        if extension(e.attr['src']) == '.jpg':
            print(e.attr['src'])
            urllib.request.urlretrieve(e.attr['src'], "data/test/urllib{0}.jpg".format(i))
            #image = download(e.attr['src'], unicode= False, timeout= 5)
            #f = open("data/test/pattern{0}.jpg".format(i), 'wb')
            #f.write(image)
            i += 1
except Exception:  # FIX: narrowed from bare "except:" so KeyboardInterrupt/SystemExit propagate
    print("error")

"""
image = "http://www1.dot.ca.gov/cwwp2/data/d1/cctv/image/us101northofcushingcreeklookingsouth/us101northofcushingcreeklookingsouth.jpg"
url = URL(image)
print (url.mimetype in MIMETYPE_IMAGE)
urllib.request.urlretrieve(image, 'data/test2.jpg')
"""