def crawl(request): if request.method == 'POST': form = UrlForm(request.POST) if form.is_valid(): post_url = request.POST.get('url', '') domain = urlparse.urlsplit(post_url) domain_url = domain.scheme + '://' + domain.netloc + '/' try: source = urllib2.urlopen(post_url) except: logger.exception('URLopen can`t open the url') return HttpResponseRedirect('/my_account') logger.debug('Fetching images from url: %s', post_url) soup = BeautifulSoup(source) page_title_fb = soup.find("meta", {"property":"og:title"} ) page_title_tw = soup.find("meta", {"name":"twitter:title"} ) if page_title_fb and not page_title_tw: page_title = page_title_fb['content'] elif page_title_tw and not page_title_fb: page_title = page_title_tw['content'] else: page_title = soup.title.string page_title = page_title.encode('utf-8') try: site = site_url.objects.get( user=request.user, url=post_url ) if site: delete_site_by_id(site.id) site.user = request.user site.title = page_title site.url = post_url try: site.save() except Exception, e: logger.exception('Site url problem') return HttpResponseRedirect('/my_account') except Exception, e: site = site_url( user=request.user, title=page_title, url=post_url ) try: site.save() except Exception, e: logger.exception('Site url problem') return HttpResponseRedirect('/my_account') allowed_exts = ('png', 'jpg', 'gif', 'bmp') images_fb = soup.findAll("meta", {"property":"og:image"}) meta_images_urls = [] if images_fb: for img in images_fb: if img.get('content') is not None: if img.get('content').rsplit('.')[-1] in allowed_exts: db_img = site_image( url=site ) try: img_filename = img.get('content').split('/')[-1] img_temp = NamedTemporaryFile( dir='/media/D/virtual_env/bin/melon/melon/static/', delete=True ) img_temp.write(urllib2.urlopen(urlparse.urljoin( domain_url, img.get('content') )).read()) img_temp.flush() db_img.image_url.save(img_filename, File(img_temp)) db_img.save() except Exception, e: logger.exception('Saving images from facebook') continue meta_images_urls.append( img.get('content') )
img.get('content') ) images_tw = soup.find("meta", {"name":"twitter:image"} ) if images_tw: for img in images_tw: if meta_images_urls and img.get('content'): if any(img.get('content') in s for s in meta_images_urls): continue if img.get('content') is not None: if img.get('content').rsplit('.')[-1] in allowed_exts: db_img = site_image( url=site ) try: img_filename = img.get('content').split('/')[-1] img_temp = NamedTemporaryFile(delete=True) img_temp.write(urllib2.urlopen(urlparse.urljoin( domain_url, img.get('content') )).read()) img_temp.flush() db_img.image_url.save(img_filename, File(img_temp))