def get_search_result_parser(base_url, page_idx):
    page_url = re.sub("PAGEIDX", str(page_idx), base_url)
    opener = proxy_opener()
    html = opener.open(page_url)
    #urllib2.build_opener(urllib2.ProxyHandler({"http": "http://localhost:3128"})).open(page_url)
    search_results_parser = BeautifulSoup(html)
    return search_results_parser
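# proxy_opener() is a project helper that none of these snippets define.
# Judging by the commented-out line above, it is probably close to the
# following sketch (the localhost:3128 address comes from that comment and
# is an assumption, not a verified part of the project):
import urllib2

def proxy_opener():
    return urllib2.build_opener(
        urllib2.ProxyHandler({"http": "http://localhost:3128"}))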
def _get_url(url):
    """ retrieves the page at the given url """
    #if not settings_local.PROXY:
    proxy_url = proxy_opener()
    html = proxy_url.open(url)
    #else:
    #    response = urllib2.urlopen(url)
    #    html = response.read()
    return html
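# Sketch of _get_url with the commented-out branch restored; the name
# _get_url_switched and the meaning of settings_local.PROXY are assumptions.
# Note the two branches return different things: a file-like response in the
# proxy case, a string in the direct case.
def _get_url_switched(url):
    if not settings_local.PROXY:
        proxy_url = proxy_opener()
        return proxy_url.open(url)
    else:
        response = urllib2.urlopen(url)
        return response.read()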
def __getHTMLPage_Containing_SearchResult(url_base, index_offset):
    # set up fields for any type of search
    search_results_per_page = 25
    search_page_num = str(1 + (index_offset / search_results_per_page))
    howFarDownThePage = index_offset % search_results_per_page
    url = url_base + "&page=" + search_page_num
    # use a proxy handler as developing behind firewall
    proxy_url = proxy_opener()
    html = proxy_url.open(url)
    return html, howFarDownThePage
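# Quick check of the paging arithmetic above (the base url is hypothetical):
# with 25 results per page, index_offset 60 maps to page 1 + 60/25 == 3 and
# sits 60 % 25 == 10 entries down that page.
html, down = __getHTMLPage_Containing_SearchResult(
    "http://example.com/search?query=kiwi", 60)   # fetches ...&page=3
assert down == 10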
def __get_image_properties_from_imageSpecific_page(id):
    """ Slower but more thorough method for finding metadata """
    page_url = BASE_IMAGE_PROPERTIES_URL + "?asset=" + id
    proxy_url = proxy_opener()
    html = proxy_url.open(page_url)
    page_html_parser = BeautifulSoup(html)
    # check for style, because there are two div with id info
    containing_div = page_html_parser.find('div', id="info", style=True)
    artist = containing_div.find('dd')  # first dd
    title = artist.findNextSibling('dd').findNextSibling('dd')
    date = title.findNextSibling('dd')  # note, not just numeric
    access = containing_div('dd')[-1]   # last dd in containing_div
    meta = {'artist': artist.renderContents(),
            'title': title.renderContents(),
            'date': date.renderContents(),
            'access': access.renderContents()}
    return (title.renderContents(), meta)
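# The dd-sibling walk above, demonstrated on a made-up fragment using the
# BeautifulSoup 3 API (the real page markup is assumed, not verified):
_demo = BeautifulSoup(
    '<div id="info" style="width:10em"><dd>Artist Name</dd><dd>oil</dd>'
    '<dd>Title Here</dd><dd>1936</dd><dd>open access</dd></div>')
_info = _demo.find('div', id="info", style=True)
print _info.find('dd').renderContents()      # Artist Name
print _info('dd')[-1].renderContents()       # open access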
# excerpt from a job handler in unitedsearch workers.py; the enclosing def is not shown
try:
    jobinfo = JobInfo.objects.get(id=job.arg)
except Exception, ex:
    print 'oh no, i except'
arg = json.loads(jobinfo.arg)
record = Record.objects.get(id=arg['record'], manager='unitedsearch')
try:
    if jobinfo.status.startswith('Complete'):
        return
    url = arg['url']
    print 'ready to download image at: ' + url
    storage = get_storage()
    if storage:
        print 'storage in workers.py is valid at 32'
    else:
        print 'storage is invalid at 32'
    proxy = proxy_opener()  # where you get a url error
    file = proxy.open(url)
    image_data = file.read()
    print 'unitedsearch.workers.py -- image_data: ' + str(len(image_data))
    size = len(image_data)
    image_file = StringIO.StringIO(image_data)
    if image_file:
        print 'have image file'
    else:
        print 'do not have image file'
    #size = file.info().get('content-length')
    #setattr(file, 'size', int(size if size else 0))
    setattr(image_file, 'size', int(size if size else 0))
    mimetype = file.info().get('content-type')
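# The setattr on image_file above exists because downstream code expects a
# file-like object carrying a .size attribute, which a bare StringIO lacks.
# Standalone illustration (not the project's storage API):
import StringIO
buf = StringIO.StringIO('fake image bytes')
setattr(buf, 'size', len('fake image bytes'))
print buf.size   # 16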