def parse_products(self, response):
    """Parse a product-grid page.

    Yields a ShoppingdotcomItem for each single-store product, a Request to
    the /prices comparison page for each multi-store product, and a Request
    for the next results page when a pagination link exists.
    """
    print "parse_products", response.url
    sel = Selector(response)
    breadcrumb = sel.xpath('//div[contains(@class,"breadCrumb")]')
    # Category trail: the itemprop="title" spans minus the leading root entry...
    categories = [span for span in breadcrumb.xpath(".//span[@itemprop='title']/text()").extract()[1:]]
    # ...plus the final plain span (current category has no itemprop).
    categories.append(breadcrumb.xpath(".//span/text()").extract()[-1])
    print categories
    for product in sel.xpath('//div[contains(@id,"quickLookItem")]'):
        # check if it is a multistore product
        if product.xpath('.//span[contains(@id, "numStoresQA")]'):
            print product.xpath(".//a/@href").extract()[0]
            url = product.xpath(".//a/@href").extract()[0]
            # Swap the last path segment for "prices" to reach the
            # store-comparison page of this product.
            url = "/".join(url.split("/")[:-1]) + "/prices"
            yield Request(urlparse.urljoin(response.url, url),
                          callback=self.parse_multiple_store_product)
        else:
            # It is not a multistore product. Parse it.
            item = ShoppingdotcomItem()
            item["categories"] = categories
            item["product_name"] = product.xpath(".//span[contains(@id, 'nameQA')]/@title").extract()[0]
            # Lazily-loaded images keep the real URL inside a placeholder span.
            if product.xpath(".//span[@class='placeholderImg']").extract():
                item["image_urls"] = product.xpath(".//span[@class='placeholderImg']/text()").extract()
            else:
                item["image_urls"] = product.xpath(".//div[@class='gridItemTop']//img/@src").extract()
            item["product_urls"] = [urlparse.urljoin(response.url, product.xpath(".//a/@href").extract()[0])]
            item["stores"] = product.xpath(".//a[@class='newMerchantName']/text()").extract()
            item["prices"] = [price.replace("\n", "") for price in product.xpath(".//span[@class='productPrice']/a/text()").extract()]
            yield item
    # Check if Next page link is there then yield request with next URL
    if sel.xpath("//a[@name='PLN']").extract():
        yield Request(urlparse.urljoin(response.url, sel.xpath("//a[@name='PLN']/@href").extract()[0]), self.parse_products)
    pass
def parse_start_url(self, response): print response.url sel = Selector(response) for url in sel.xpath("//a"): #print url.xpath("@href").extract() href = url.xpath("@href").extract()[0] if url.xpath("@href").extract() else None if href and href.split("/")[-1] == "products": yield Request(urlparse.urljoin(response.url, href), callback=self.parse_products) if href and href.find("xFA-") >= 0: href = href.replace("xFA-", "").split("~")[0]+"/products" yield Request(urlparse.urljoin(response.url, href), callback=self.parse_products) pass
def parse_products(self, response):
    """Parse a product-grid page.

    Yields a ShoppingdotcomItem for each single-store product, a Request to
    the /prices comparison page for each multi-store product, and a Request
    for the next results page when a pagination link exists.
    """
    print "parse_products", response.url
    sel = Selector(response)
    breadcrumb = sel.xpath('//div[contains(@class,"breadCrumb")]')
    # Category trail: the itemprop="title" spans minus the leading root entry...
    categories = [
        span for span in breadcrumb.xpath(
            ".//span[@itemprop='title']/text()").extract()[1:]
    ]
    # ...plus the final plain span (current category has no itemprop).
    categories.append(breadcrumb.xpath(".//span/text()").extract()[-1])
    print categories
    for product in sel.xpath('//div[contains(@id,"quickLookItem")]'):
        # check if it is a multistore product
        if product.xpath('.//span[contains(@id, "numStoresQA")]'):
            print product.xpath(".//a/@href").extract()[0]
            url = product.xpath(".//a/@href").extract()[0]
            # Swap the last path segment for "prices" to reach the
            # store-comparison page of this product.
            url = "/".join(url.split("/")[:-1]) + "/prices"
            yield Request(urlparse.urljoin(response.url, url),
                          callback=self.parse_multiple_store_product)
        else:
            # It is not a multistore product. Parse it.
            item = ShoppingdotcomItem()
            item["categories"] = categories
            item["product_name"] = product.xpath(
                ".//span[contains(@id, 'nameQA')]/@title").extract()[0]
            # Lazily-loaded images keep the real URL inside a placeholder span.
            if product.xpath(".//span[@class='placeholderImg']").extract():
                item["image_urls"] = product.xpath(
                    ".//span[@class='placeholderImg']/text()").extract()
            else:
                item["image_urls"] = product.xpath(
                    ".//div[@class='gridItemTop']//img/@src").extract()
            item["product_urls"] = [
                urlparse.urljoin(response.url,
                                 product.xpath(".//a/@href").extract()[0])
            ]
            item["stores"] = product.xpath(
                ".//a[@class='newMerchantName']/text()").extract()
            item["prices"] = [
                price.replace("\n", "") for price in product.xpath(
                    ".//span[@class='productPrice']/a/text()").extract()
            ]
            yield item
    # Check if Next page link is there then yield request with next URL
    if sel.xpath("//a[@name='PLN']").extract():
        yield Request(
            urlparse.urljoin(
                response.url,
                sel.xpath("//a[@name='PLN']/@href").extract()[0]),
            self.parse_products)
    pass
def main(argv=sys.argv): """ Punto de entrada al programa """ url = "http://www.vientonomade.com.ar/index.php?option=com_content&view=category&" "layout=blog&id=8&Itemid=10" fetcher = httplib2.Http() get = partial(obtener_pagina, fetcher) while url: html = get(url) uri, links = buscar_links(html) for link in links: try: print urlparse.urljoin(url, link) except UnicodeEncodeError: pass url = uri and urlparse.urljoin(url, uri) or None
def parse_start_url(self, response): print response.url sel = Selector(response) for url in sel.xpath("//a"): #print url.xpath("@href").extract() href = url.xpath("@href").extract()[0] if url.xpath( "@href").extract() else None if href and href.split("/")[-1] == "products": yield Request(urlparse.urljoin(response.url, href), callback=self.parse_products) if href and href.find("xFA-") >= 0: href = href.replace("xFA-", "").split("~")[0] + "/products" yield Request(urlparse.urljoin(response.url, href), callback=self.parse_products) pass
def parse(url, body, **kwargs):
    """Extract the first post from a xici.net page's embedded ``docData``
    JavaScript object and re-render it as a minimal standalone HTML document.

    Returns GBK-encoded HTML, or '<html/>' when no ``var docData`` line is
    found in the page body.
    """
    for line in body.decode('gbk', errors='ignore').splitlines():
        if line.lstrip().startswith('var docData'):
            # Slice the outermost JSON object out of the JS assignment.
            l, r = line.find('{'), line.rfind('}')
            obj = json.loads(line[l:r + 1])
            doc = obj['result']['docinfo'][0]['foolrinfo']
            doc['title'] = obj['result']['sDocTitle']
            doc['url'] = urlparse.urljoin('http://www.xici.net',
                                          obj['result']['strPageUrl'])
            # Site stores dates without the century prefix.
            doc['date'] = '20' + doc['LongDate']
            # Strip markup from the post body.
            doc['content'] = html.fromstring(
                doc['floorcontent']).text_content()
            tpl = Template('''
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="content-type">
<title>{{doc['title']}}</title>
</head>
<body>
<a id="title" href="{{doc['url']}}">{{doc['title']}}</a>
<p id="date">{{doc['date']}}</p>
<div id="content">{{doc['content']}}</div>
</body>
</html>''')
            return tpl.render(doc=doc).encode('gbk', errors='ignore')
    else:
        # for/else: loop completed without finding a docData line
        return '<html/>'
def submit(self, opener, res):
    """Select our IdP on the WAYF page and submit the selection form.

    :param opener: the urllib2 opener
    :param res: the response whose URL the form action is resolved against
    :returns: (request, response) for the submitted selection
    """
    log.info('Submitting form to wayf')
    idp = self.idp
    data = self.data
    if idp.get_idp() not in data['origin']:
        raise WAYFException(
            "Can't find IdP '{0}' in WAYF's IdP list".format(
                idp.get_idp()))
    # Build the selection payload from the parsed form fields.
    wayf_data = {
        'origin': data['origin'][idp.get_idp()],
        'shire': data['shire']['value'],
        'providerId': data['providerId']['value'],
        'target': data['target']['value'],
        'time': data['time']['value'],
        'cache': 'false',
        'action': 'selection',
    }
    url = urlparse.urljoin(res.url, data['form']['action'])
    query = urllib.urlencode(wayf_data)
    request = Request(url + '?' + query)
    log.debug("POST: %s" % request.get_full_url())
    response = opener.open(request)
    return request, response
def parse(url, body, **kwargs):
    """Extract the first post from a xici.net page's embedded ``docData``
    JavaScript object and re-render it as a minimal standalone HTML document.

    Returns GBK-encoded HTML, or '<html/>' when no ``var docData`` line is
    found in the page body.
    """
    for line in body.decode('gbk', errors='ignore').splitlines():
        if line.lstrip().startswith('var docData'):
            # Slice the outermost JSON object out of the JS assignment.
            l, r = line.find('{'), line.rfind('}')
            obj = json.loads(line[l:r+1])
            doc = obj['result']['docinfo'][0]['foolrinfo']
            doc['title'] = obj['result']['sDocTitle']
            doc['url'] = urlparse.urljoin('http://www.xici.net', obj['result']['strPageUrl'])
            # Site stores dates without the century prefix.
            doc['date'] = '20'+doc['LongDate']
            # Strip markup from the post body.
            doc['content'] = html.fromstring(doc['floorcontent']).text_content()
            tpl = Template('''
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="content-type">
<title>{{doc['title']}}</title>
</head>
<body>
<a id="title" href="{{doc['url']}}">{{doc['title']}}</a>
<p id="date">{{doc['date']}}</p>
<div id="content">{{doc['content']}}</div>
</body>
</html>''')
            return tpl.render(doc=doc).encode('gbk', errors='ignore')
    else:
        # for/else: loop completed without finding a docData line
        return '<html/>'
def handle_starttag(self, tag, attrs):
    """Collect the absolute URL of every anchor tag's href attribute."""
    if tag != 'a':
        return
    for attribute, value in attrs:
        if attribute == 'href':
            # relative hrefs are resolved against the page's base URL
            self.links.add(urlparse.urljoin(self.base_url, value))
def parse(self, response):
    """Download the delinquent-parcels CSV linked from the page, then yield a
    tax-eligibility lookup request (with a partially-filled ReapItem attached)
    for every data row.
    """
    delinquent_link = Selector(response).xpath(
        '//*[@id="box1"]/td[1]/li/font/i/a/@href').extract()
    urllib.urlretrieve(urlparse.urljoin(response.url, delinquent_link[0]),
                       'delinquent.zip')
    unzip('delinquent.zip', 'delinquent')
    with open(glob.glob('delinquent/*.csv')[0], 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        # Discover column indexes from the (quoted) header row.
        # NOTE(review): if a heading is missing, the corresponding index
        # variable stays unbound and the row loop raises NameError -- this
        # assumes the feed's headers are stable; confirm.
        for idx, column in enumerate(csvreader.next()):
            column = re.sub('["]', "", column).strip()
            if column.startswith("PARCELID"):
                parcelidcol = idx
            if column.startswith("OWNERNAME1"):
                ownernamecol = idx
            if column.startswith("PARCELLOCATION"):
                parcellocationcol = idx
            if column.startswith("CLS"):
                parcelclass = idx
            if column.startswith("ASMTBLDG"):
                buildingvalue = idx
        for row in csvreader:
            item = ReapItem()
            # parcel IDs arrive quoted; strip the quotes
            item['parcel_id'] = re.sub('["]', "", row[parcelidcol]).strip()
            item['parcel_location'] = row[parcellocationcol].strip()
            item['parcel_class'] = row[parcelclass].strip()
            item['building_value'] = row[buildingvalue].strip()
            request = scrapy.Request(
                "http://mctreas.org/master.cfm?parid={0}&taxyr={1}&own1={2}".format(
                    item['parcel_id'], str(YEAR), row[ownernamecol]),
                callback=self.get_tax_eligibility)
            request.meta['item'] = item
            yield request
def install_artifacts(artifacts, dirstruct, installdir, basestaticurl):
    """
    Install the artifacts into *installdir* following *dirstruct*.

    Artifacts not present in *artifacts* are assumed to be static files
    retrievable from *basestaticurl*. Returns the list of installed paths.
    """
    assert basestaticurl.endswith("/"), "Basestaticurl should end with /"
    installed = []
    for reldir, artifactnames in dirstruct.items():
        destdir = os.path.join(installdir, reldir)
        if os.path.exists(destdir):
            assert os.path.isdir(destdir)
        else:
            log.warn(msg="Making install directory %s" % destdir)
            os.makedirs(destdir)
        for artifactname in artifactnames:
            destpath = os.path.abspath(os.path.join(destdir, artifactname))
            if artifactname in artifacts.keys():
                # The artifact must be loaded from jenkins
                theartifact = artifacts[artifactname]
            else:
                # Static file fetched from the static collection instead
                staticurl = urlparse.urljoin(basestaticurl, artifactname)
                theartifact = Artifact(artifactname, staticurl)
            theartifact.save(destpath)
            installed.append(destpath)
    return installed
def transform(row, table):
    """Turn the row's relative "link" into a full URL and split "name" into
    city name plus "state" using the module's city/state regexp."""
    data = row._asdict()
    data['link'] = urlparse.urljoin('https://pt.wikipedia.org', data['link'])
    city, state = regexp_city_state.findall(data['name'])[0]
    data['name'] = city
    data['state'] = state
    return data
def submit(self, opener, res):
    """POST the COSign IdP login form with credentials from the manager.

    :param opener: the urllib2 opener
    :param res: the response whose URL the form action is resolved against
    :returns: (request, response) for the submitted login
    """
    cm = self.cm
    form = self.data
    url = urlparse.urljoin(res.url, form["form"]["action"])
    log.info("Form Authentication from: %s" % url)
    fields = {
        self.username_field: cm.get_username(),
        self.password_field: cm.get_password(),
        "service": form["service"]["value"],
        "ref": form["ref"]["value"],
    }
    request = Request(url, data=urllib.urlencode(fields))
    log.info("Submitting login form")
    log.debug("POST: %s" % request.get_full_url())
    response = opener.open(request)
    return request, response
def submit(self, opener, res):
    """Pick our IdP on the WAYF page and submit the selection form.

    :param opener: the urllib2 opener
    :param res: the response whose URL the form action is resolved against
    :returns: (request, response) for the submitted selection
    """
    log.info("Submitting form to wayf")
    idp = self.idp
    data = self.data
    if idp.get_idp() not in data["origin"]:
        raise WAYFException("Can't find IdP '{0}' in WAYF's IdP list".format(idp.get_idp()))
    # selection payload assembled from the parsed WAYF form
    wayf_data = {
        "origin": data["origin"][idp.get_idp()],
        "shire": data["shire"]["value"],
        "providerId": data["providerId"]["value"],
        "target": data["target"]["value"],
        "time": data["time"]["value"],
        "cache": "false",
        "action": "selection",
    }
    url = urlparse.urljoin(res.url, data["form"]["action"])
    encoded = urllib.urlencode(wayf_data)
    request = Request(url + "?" + encoded)
    log.debug("POST: %s" % request.get_full_url())
    response = opener.open(request)
    return request, response
def submit(self, opener, res):
    """POST the IdP login form: hidden fields are carried over untouched and
    the credential manager supplies username and password.

    :param opener: the urllib2 opener
    :param res: the response whose URL the form action is resolved against
    :returns: (request, response) for the submitted login
    """
    cm = self.cm
    form = self.data
    fields = {}
    # carry over every hidden input exactly as the IdP sent it
    for name, spec in form.items():
        if "type" in spec and "value" in spec:
            if spec.get("type") == "hidden":
                fields[name] = spec.get("value")
    url = urlparse.urljoin(res.url, form["form"]["action"])
    log.info("Form Authentication from: %s" % url)
    fields[self.username_field] = cm.get_username()
    fields[self.password_field] = cm.get_password()
    request = Request(url, data=urllib.urlencode(fields))
    log.info("Submitting login form")
    log.debug("POST: %s" % request.get_full_url())
    response = opener.open(request)
    return request, response
def parse(self, response):
    """Download the delinquent-parcels CSV linked from the page, then yield a
    tax-eligibility lookup request (with a partially-filled ReapItem attached)
    for every data row.
    """
    delinquent_link = Selector(response).xpath(
        '//*[@id="box1"]/td[1]/li/font/i/a/@href').extract()
    urllib.urlretrieve(urlparse.urljoin(response.url, delinquent_link[0]),
                       'delinquent.zip')
    unzip('delinquent.zip', 'delinquent')
    with open(glob.glob('delinquent/*.csv')[0], 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        # Discover column indexes from the (quoted) header row.
        # NOTE(review): if a heading is missing, the corresponding index
        # variable stays unbound and the row loop raises NameError -- this
        # assumes the feed's headers are stable; confirm.
        for idx, column in enumerate(csvreader.next()):
            column = re.sub('["]', "", column).strip()
            if column.startswith("PARCELID"):
                parcelidcol = idx
            if column.startswith("OWNERNAME1"):
                ownernamecol = idx
            if column.startswith("PARCELLOCATION"):
                parcellocationcol = idx
            if column.startswith("CLS"):
                parcelclass = idx
            if column.startswith("ASMTBLDG"):
                buildingvalue = idx
        for row in csvreader:
            item = ReapItem()
            # parcel IDs arrive quoted; strip the quotes
            item['parcel_id'] = re.sub('["]', "", row[parcelidcol]).strip()
            item['parcel_location'] = row[parcellocationcol].strip()
            item['parcel_class'] = row[parcelclass].strip()
            item['building_value'] = row[buildingvalue].strip()
            request = scrapy.Request(
                "http://mctreas.org/master.cfm?parid={0}&taxyr={1}&own1={2}"
                .format(item['parcel_id'], str(YEAR), row[ownernamecol]),
                callback=self.get_tax_eligibility)
            request.meta['item'] = item
            yield request
def parse(self, response):
    """Download the delinquent-parcels CSV linked from the page and, for every
    R72-prefixed parcel, yield a 2014 tax-eligibility lookup request with a
    partially-filled ReapItem attached.
    """
    delinquentLink = Selector(response).xpath(
        '//*[@id="box1"]/td[1]/li/font/i/a/@href').extract()
    urllib.urlretrieve(urlparse.urljoin(response.url, delinquentLink[0]),
                       'delinquent.zip')
    unzip('delinquent.zip', 'delinquent')
    with open(glob.glob('delinquent/*.csv')[0], 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',')
        # Discover column indexes from the (quoted) header row.
        # NOTE(review): a missing heading leaves its index variable unbound
        # and the row loop raises NameError -- assumes stable headers; confirm.
        for idx, column in enumerate(csvreader.next()):
            column = re.sub('["]', "", column).strip()
            if column.startswith("PARCELID"):
                parcelidcol = idx
            if column.startswith("OWNERNAME1"):
                ownernamecol = idx
            if column.startswith("PARCELLOCATION"):
                parcellocationcol = idx
        for row in csvreader:
            item = ReapItem()
            # parcel IDs arrive quoted; strip the quotes
            item['parcelid'] = re.sub('["]', "", row[parcelidcol]).strip()
            item['parcellocation'] = row[parcellocationcol].strip()
            # only residential R72 parcels are of interest here
            if item['parcelid'].startswith('R72'):
                request = scrapy.Request(
                    "http://mctreas.org/master.cfm?parid=" + item['parcelid'] +
                    "&taxyr=2014" + "&own1=" + row[ownernamecol] + '\n',
                    callback=self.getTaxEligibility)
                request.meta['item'] = item
                yield request
def check_config(): """ Check crucial configuration details for existence and workability. Runs checks to see whether bugtracker's URL is reachable, whether backend is available at the right filename, and whether the script has the key arguments it needs to run: URL, backend, and database details. The filename for the backend in the backends/ directory needs to be the same as the configuration argument specifying that backend. For instance, invoking the Launchpad backend uses 'lp', and so the filename is 'lp.py'. """ Config.check_params(['url', 'backend']) if Config.backend + ".py" not in Backend.get_all_backends(): raise InvalidConfig('Backend "' + Config.backend + '" does not exist') url = urlparse.urlparse(Config.url) check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '') print("Checking URL: " + check_url) req = Request(check_url) try: response = urlopen(req) except HTTPError, e: raise InvalidConfig('The server could not fulfill the request ' + str(e.msg) + '(' + str(e.code) + ')')
def check_config(): """ Check crucial configuration details for existence and workability. Runs checks to see whether bugtracker's URL is reachable, whether backend is available at the right filename, and whether the script has the key arguments it needs to run: URL, backend, and database details. The filename for the backend in the backends/ directory needs to be the same as the configuration argument specifying that backend. For instance, invoking the Launchpad backend uses 'lp', and so the filename is 'lp.py'. """ Config.check_params(['url', 'backend']) if Config.backend + ".py" not in Backend.get_all_backends(): raise InvalidConfig('Backend "' + Config.backend + '" does not exist') url = urlparse.urlparse(Config.url) check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '') print("Checking URL: " + check_url) req = Request(check_url) if Config.backend != 'github': try: response = urlopen(req) except HTTPError, e: raise InvalidConfig('The server could not fulfill the request ' + str(e.msg) + '(' + str(e.code) + ')') except URLError, e: raise InvalidConfig('We failed to reach a server. ' + str(e.reason))
def bot_send_video(gesture, video_url, video_preview_img, to_mid="u2ef38a8c1f3f1c2c63bdf9c0a629023c"):
    """Push the gesture's video to a LINE trial-bot user.

    The video URL is rebuilt from the gesture's stored file path, resolved
    against the app's public host; the passed-in video_url is superseded.
    Returns the requests.Response from the LINE events API.
    """
    headers = {
        'Content-type': 'application/json; charset=UTF-8',
        'X-Line-ChannelID': settings.CHANNEL_ID,
        'X-Line-ChannelSecret': settings.CHANNEL_SECRET,
        'X-Line-Trusted-User-With-ACL': settings.CHANNEL_MID,
    }
    api = 'https://trialbot-api.line.me/v1/events'
    myurl = 'https://eldertranslator.herokuapp.com/'
    video_url = urlparse.urljoin(myurl, gesture.video.url)
    body = {
        'to': [to_mid],
        'toChannel': 1383378250,
        'eventType': "138311608800106203",
        'content': {
            "contentType": 3,
            "toType": 1,
            "originalContentUrl": video_url,
            "previewImageUrl": video_preview_img,
        },
    }
    # NOTE(review): verify=False disables TLS certificate checking -- confirm
    # this is intentional for the trialbot endpoint.
    req = requests.post(api, data=json.dumps(body), headers=headers, verify=False)
    return req
def submit(self, opener, res):
    """Submit the WAYF discovery form with our IdP selected.

    IdP entries are grouped into sub-dicts inside data["user_idp"]; they are
    flattened before lookup. When the form action is only a query string it
    is appended to the current URL, otherwise it is resolved relatively.

    :param opener: the urllib2 opener
    :param res: the response whose URL the form action is resolved against
    :returns: (request, response) for the submitted selection
    :raises WAYFException: when our IdP is not in the WAYF's list

    Cleanup: removed a dead urlsplit/urlunsplit computation whose result was
    never used (behavior unchanged).
    """
    log.info("Submitting form to wayf")
    # Set IDP to correct IDP
    wayf_data = {}
    idp = self.idp
    data = self.data
    # flatten the grouped IdP listing into one name -> value mapping
    idps = {}
    for d in data["user_idp"]:
        if isinstance(data["user_idp"][d], dict):
            idps.update(data["user_idp"][d])
    if not idp.get_idp() in idps:
        raise WAYFException("Can't find IdP '%s' in WAYF's IdP list" % idp)
    wayf_data["user_idp"] = idps[idp.get_idp()]
    wayf_data["Select"] = "Select"
    if data["form"]["action"].startswith("?"):
        # action is a bare query string: append to the page URL as-is
        url = res.url + data["form"]["action"]
    else:
        url = urlparse.urljoin(res.url, data["form"]["action"])
    data = urllib.urlencode(wayf_data)
    request = Request(url, data)
    log.debug("POST: %s" % request.get_full_url())
    response = opener.open(request)
    return request, response
def notify(cls, alert, *args, **kwargs):
    """Post the alert as a HipChat room notification.

    Green when the alert is resolved, otherwise red (or a caller-supplied
    color). Optionally appends a link to the ZMON alert details page.
    Returns the repeat interval in seconds (0 = no repeat).
    """
    url = cls._config.get('notifications.hipchat.url')
    token = kwargs.get('token', cls._config.get('notifications.hipchat.token'))
    repeat = kwargs.get('repeat', 0)
    notify = kwargs.get('notify', False)
    color = 'green' if alert and not alert.get('is_alert') else kwargs.get('color', 'red')
    message_text = cls._get_subject(alert, custom_message=kwargs.get('message'))
    if kwargs.get('link', False):
        zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))
        alert_id = alert['alert_def']['id']
        alert_url = urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else ''
        link_text = kwargs.get('link_text', 'go to alert')
        message_text += ' -- <a href="{}" target="_blank">{}</a>'.format(alert_url, link_text)
    payload = {
        'message': message_text,
        'color': color,
        'notify': notify
    }
    room_path = '{}/v2/room/{}/notification'.format(url, urllib.quote(kwargs['room']))
    try:
        logger.info(
            'Sending to: ' + room_path + '?auth_token={}'.format(token) +
            ' ' + json.dumps(payload))
        resp = requests.post(
            room_path,
            json=payload,
            params={'auth_token': token},
            headers={'Content-type': 'application/json'})
        resp.raise_for_status()
    except:
        # best-effort notification: never let a HipChat failure propagate
        logger.exception('Hipchat write failed!')
    return repeat
def _get_department_urls_from_url(url):
    """Return the set of department URLs linked from the page at *url*."""
    soup = retrieve_soup(url)
    candidates = (urlparse.urljoin(BASE_URL, anchor["href"])
                  for anchor in soup.find_all("a", href=True))
    return set(candidate for candidate in candidates
               if _is_department_url(candidate, url))
def notify(cls, alert, *args, **kwargs):
    """Post the alert as a HipChat room notification, recording tracing tags
    on the current span.

    Supports 'html' (default) and plain message formats; optionally appends a
    link to the ZMON alert-details page. Returns the repeat interval in
    seconds (0 = no repeat).
    """
    current_span = extract_span_from_kwargs(**kwargs)
    url = cls._config.get('notifications.hipchat.url')
    token = kwargs.get('token', cls._config.get('notifications.hipchat.token'))
    repeat = kwargs.get('repeat', 0)
    notify = kwargs.get('notify', False)
    alert_def = alert['alert_def']
    message_format = kwargs.get('message_format', 'html')
    # annotate the trace with the alert context for observability
    current_span.set_tag('alert_id', alert_def['id'])
    entity = alert.get('entity')
    is_changed = alert.get('alert_changed', False)
    is_alert = alert.get('is_alert', False)
    current_span.set_tag('entity', entity['id'])
    current_span.set_tag('alert_changed', bool(is_changed))
    current_span.set_tag('is_alert', is_alert)
    current_span.log_kv({'room': kwargs.get('room')})
    # green once resolved, otherwise red (or caller override)
    color = 'green' if alert and not alert.get('is_alert') else kwargs.get('color', 'red')
    message_text = cls._get_subject(alert, custom_message=kwargs.get('message'))
    if kwargs.get('link', False):
        zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))
        alert_id = alert['alert_def']['id']
        alert_url = urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else ''
        link_text = kwargs.get('link_text', 'go to alert')
        # link rendering depends on the room's message format
        if message_format == 'html':
            message_text += ' -- <a href="{}" target="_blank">{}</a>'.format(alert_url, link_text)
        else:
            message_text += ' -- {} - {}'.format(link_text, alert_url)
    message = {
        'message': message_text,
        'color': color,
        'notify': notify,
        'message_format': message_format
    }
    try:
        logger.info(
            'Sending to: ' +
            '{}/v2/room/{}/notification?auth_token={}'.format(url, urllib.quote(kwargs['room']), token) +
            ' ' + json.dumps(message))
        r = requests.post(
            '{}/v2/room/{}/notification'.format(url, urllib.quote(kwargs['room'])),
            json=message,
            params={'auth_token': token},
            headers={'Content-type': 'application/json'})
        r.raise_for_status()
    except Exception:
        # best-effort: record the failure on the span but never propagate
        current_span.set_tag('error', True)
        current_span.log_kv({'exception': traceback.format_exc()})
        logger.exception('Hipchat write failed!')
    return repeat
def get_new_urls(self, new_url, soup):
    """Collect absolute URLs for every '/item/' encyclopedia link in *soup*.

    Relative hrefs (e.g. <a href="/item/...">) are resolved against
    *new_url*. Returns a set of absolute URLs.
    """
    found = set()
    for link in soup.find_all('a', href=re.compile(r"/item/")):
        found.add(urljoin(new_url, link['href']))
    return found
def relative_to_full_url(self, url):
    """Resolve *url* against this object's original_url when it has no host."""
    from urllib2 import urlparse
    parsed = urlparse.urlparse(url)
    if parsed.netloc:
        return url
    # NOTE(review): only the path component is joined here, so any query
    # string or fragment on a relative URL is dropped -- confirm intended.
    return urlparse.urljoin(self.original_url, parsed.path)
def _kick_offline(self, con, condom):
    """Post-back the ASP.NET online-users grid to kick the first session.

    Mimics javascript:__doPostBack('gvOnLineUser','KickOut$0'). Returns True
    when the server navigated back to Navigate.aspx (kick succeeded).
    """
    form = condom.xpath("//form[@id='form1']")[0]
    fields, _ = self.process_form(form)
    fields['__EVENTTARGET'] = 'gvOnLineUser'
    fields['__EVENTARGUMENT'] = 'KickOut$0'
    action = urlparse.urljoin(con.request.url, form.attrib.get('action'))
    con = self.request_url(action, data=fields,
                           headers={'Referer': con.request.url})
    return 'Navigate.aspx' in con.request.url
def relative_to_full_url(original_url, url):
    """Resolve *url* against *original_url* when it has no network location."""
    from urllib2 import urlparse
    pieces = urlparse.urlparse(url)
    if not pieces.netloc:
        # NOTE(review): joining only the path drops any query/fragment on a
        # relative URL -- confirm intended.
        url = urlparse.urljoin(original_url, pieces.path)
    return url
def show(self):
    """Fill the IdP combo box from the SLCS login service (sorted by name),
    pre-select the IdP remembered in settings, and show the window."""
    slcs_login_url = urlparse.urljoin(self.settings.slcs, 'login')
    for name in sorted(list_idps(slcs_login_url).keys()):
        self.idps.append_text(name)
        if name == self.settings.idp:
            # the entry just appended is the last row of the model
            self.idps.set_active(len(self.idps.get_model()) - 1)
    self.window.show_all()
def crawls(url):
    """Fetch *url* and return absolute URLs of every <img> whose src ends in
    .jpg/.jpeg/.png/.gif.

    Bug fix: the previous pattern r'\.(jpe?g)|(png)|(gif)$' bound the $ only
    to the 'gif' branch, so any src merely *containing* 'png' or '.jpg'
    anywhere matched; the alternation is now grouped and anchored.
    """
    urlStream = urlopen(url)
    htmldoc = urlStream.read()
    soup = BeautifulSoup(htmldoc)
    links = []
    # extension must terminate the src string
    images = soup.findAll("img", {"src": re.compile(r'\.(jpe?g|png|gif)$')})
    for img in images:
        links.append(urlparse.urljoin(url, img["src"]))
    return links
def urlIterator(startUrl, nextCssSelector):
    """Yield page URLs, following the "next" link matched by *nextCssSelector*
    until no further page exists.

    Slow by design: every step fetches and parses the page's DOM to locate
    the next link. Matched elements may be the <a> itself or contain one.
    """
    url = startUrl
    while url:
        yield url
        candidates = getElementsFromUrl(url, nextCssSelector)
        url = None
        for node in candidates:
            anchor = node if node.tag == 'a' else node.find('a')
            if anchor is not None:
                # hrefs are resolved against the start URL
                url = urlparse.urljoin(startUrl, anchor.get('href'))
                break
def crawl_courses_from_program_page_url(url, program_code):
    """Crawl every course linked from a program page.

    Returns the crawled course data with falsy (failed) entries filtered out,
    visiting course URLs in sorted, de-duplicated order.
    """
    soup = retrieve_soup(url)
    linked = {urlparse.urljoin(BASE_URL, a["href"])
              for a in soup.find_all("a", href=True)}
    course_urls = sorted(u for u in linked if _is_course_url(u))
    crawled = [_crawl_course_data(course_url, program_code)
               for course_url in course_urls]
    return filter(None, crawled)
def domIterator(startUrl, nextCssSelector):
    """Yield parsed DOMs, following the first <a> matched by *nextCssSelector*
    on each page until none is found."""
    select_next = CSSSelector(nextCssSelector)
    dom = getDOM(startUrl)
    while dom is not None:
        yield dom
        matches = select_next(dom)
        dom = None
        for candidate in matches:
            if candidate.tag == 'a':
                # NOTE(review): hrefs are resolved against startUrl, not the
                # current page's URL -- confirm all links are root-relative.
                next_url = urlparse.urljoin(startUrl, candidate.get('href'))
                dom = getDOM(next_url)
                break
def submit(self, opener, res):
    """Follow the login link on the ESOE chooser page with a plain GET.

    :param opener: the urllib2 opener
    :param res: the response whose URL self.url is resolved against
    :returns: (request, response) for the followed link
    """
    target = urlparse.urljoin(res.url, self.url)
    request = Request(target)
    log.debug("GET: %s" % request.get_full_url())
    response = opener.open(request)
    return request, response
def find_correct_element_url(params, el): els = el.xpath('//div[@class="wx-rb bg-blue wx-rb_v1 _item"]') print '---------------------------' print len(els) print '---------------------------' for cur_el in els: nick_name = cur_el.xpath('//div[@class="txt-box"]/h3/em/text()')[0] print nick_name if params.name == nick_name.encode('utf8'): url = cur_el.xpath('@href')[0] url = urlparse.urljoin(base_url, url) return url return ""
def _get_program_urls_from_department_url(url):
    """Collect program-page URLs reachable from a department page.

    Course links are truncated to their parent directory, since the parent
    of a course URL is its program page.
    """
    soup = retrieve_soup(url)
    program_urls = set()
    for candidate in [urlparse.urljoin(BASE_URL, a["href"])
                      for a in soup.find_all("a", href=True)]:
        if _is_course_url(candidate):
            # drop the trailing course segment to get the program page
            candidate = "/".join(candidate.split("/")[:-1])
        if _is_program_url(candidate, url):
            program_urls.add(candidate)
    return program_urls
def __init__(self, service_url, infrastructure_account, verify=True, oauth2=False):
    """Configure a requests session against the entities service 'api/v1/'
    root, optionally attaching an OAuth2 bearer token.

    :raises ConfigurationError: when service_url is empty/missing
    """
    if not service_url:
        raise ConfigurationError('EntitiesWrapper improperly configured. URL is missing!')
    self.infrastructure_account = infrastructure_account
    self.__service_url = urlparse.urljoin(service_url, 'api/v1/')
    session = requests.Session()
    session.headers.update({'User-Agent': get_user_agent()})
    session.verify = verify
    if oauth2:
        session.headers.update({'Authorization': 'Bearer {}'.format(tokens.get('uid'))})
    self.__session = session
def main(url): # Example URL: http://picturesofshit.com/v/2009/10-15_-_Dudescademy/ img_size_qry_string = '?g2_imageViewsIndex=1' # Go to gallery and grab links to high resolution photos gallery = urlopen(url) soup = BeautifulSoup(gallery.read()) links = [tag.attrMap['href'] + img_size_qry_string for tag in soup.findAll(href=re.compile('JPG.html'))] # Create download directory based on url dir = re.search('[_+]([a-zA-Z0-9]+)/$', url).groups()[0] if not os.path.exists(dir): os.makedirs(dir) # Go to each link, grab the image source, and download links = [urlparse.urljoin(url, link) for link in links] for link in links: gallery_image = urlopen(link) soup = BeautifulSoup(gallery_image.read()) image_url = urlparse.urljoin(url, soup.find('img', 'ImageFrame_none').attrMap['src']) file_name = re.search('([^/]+)$', image_url).groups()[0] file = os.path.join(dir, file_name) print 'Downloading %s' % file_name urlretrieve(image_url, file) print '--- Downloads Complete ---'
def submit(self, opener, res):
    """Relay the hidden SAMLRequest field to the form's action URL.

    :param opener: the urllib2 opener
    :param res: the response whose URL the form action is resolved against
    :returns: (request, response) for the submitted verification form
    """
    log.info("Submitting SAML Verification form")
    form = self.data
    action = urlparse.urljoin(res.url, form["form"]["action"])
    payload = urllib.urlencode({"SAMLRequest": form["SAMLRequest"]["value"]})
    request = Request(action, data=payload)
    log.debug("POST: %s" % request.get_full_url())
    response = opener.open(request)
    return request, response
def submit(self, opener, res):
    """Relay the IdP's SAMLResponse (and RelayState) back to the SP.

    :param opener: the urllib2 opener
    :param res: the response whose URL the form action is resolved against
    :returns: (request, response) for the submitted assertion
    """
    log.info('Submitting IdP SAML form')
    form = self.data
    action = urlparse.urljoin(res.url, form['form']['action'])
    payload = urllib.urlencode({
        'SAMLResponse': form['SAMLResponse']['value'],
        'RelayState': form['RelayState']['value'],
    })
    request = Request(action, data=payload)
    log.debug("POST: %s" % request.get_full_url())
    response = opener.open(request)
    return request, response
def request(self):
    """Poll the real-time quote endpoint for every tracked ticker.

    Yields {ticker: price} per successful fetch, pushes each quote through
    the callback as JSON, and drops tickers that are inactive or whose
    payload cannot be parsed. HTTP failures are logged but the ticker is
    kept for the next round.
    """
    # NOTE(review): mutates self.tickers while iterating .items() -- safe on
    # Python 2 (items() copies); would raise on Python 3.
    for ticker, active in self.tickers.items():
        if not active:
            del self.tickers[ticker]
            continue
        path = self.real_time_path.format(ticker.lower())
        resp = self.sess.get(urlparse.urljoin(self.base_url, path))
        if not resp.ok:
            logging.error(resp.reason)
            continue
        try:
            price = self.parse(resp.text)
            self.callback(json.dumps({ticker.upper(): price}))
            yield {ticker: price}
        except Exception as e:
            # unparseable payload: stop tracking this ticker
            logging.error(e)
            del self.tickers[ticker]
def check_config(): """ """ Config.check_params(['url','backend']) if Config.backend+".py" not in Backend.get_all_backends(): raise InvalidConfig('Backend "'+ Config.backend + '" does not exist') url = urlparse.urlparse(Config.url) check_url = urlparse.urljoin(url.scheme + '://' + url.netloc,'') print("Checking URL: " + check_url) req = Request(check_url) try: response = urlopen(req) except HTTPError, e: raise InvalidConfig('The server could not fulfill the request ' + str(e.msg) + '('+ str(e.code)+')')
def check_config(): """ """ Config.check_params(['url', 'backend']) if Config.backend + ".py" not in Backend.get_all_backends(): raise InvalidConfig('Backend "' + Config.backend + '" does not exist') url = urlparse.urlparse(Config.url) check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '') print("Checking URL: " + check_url) req = Request(check_url) try: response = urlopen(req) except HTTPError, e: raise InvalidConfig('The server could not fulfill the request ' + str(e.msg) + '(' + str(e.code) + ')')
def dump():
    """Fetch the metadata from every URL path in each configured MIME format
    and write one file per (format, path) under DUMP_DIR/<format>/."""
    for fmt, fxt in MIME_TYPES.iteritems():
        dump_path = path.join(DUMP_DIR, path.basename(fmt))
        makedirs(dump_path)
        for target in [urlparse.urljoin(BASE_URL, p) for p in URL_PATHS]:
            logger.info("Request metadata in '%s' from\n %s\n" % (fmt, target))
            # content negotiation: ask the server for this MIME type
            req = Request(target)
            req.add_header('Accept', fmt)
            res = urlopen(req)
            fname = '%s.%s' % (path.basename(urlparse.urlparse(target).path), fxt)
            fname = path.join(dump_path, fname)
            logger.info("Write metadata into file './%s'\n" % fname)
            with open(fname, 'w') as fout:
                fout.write(res.read())
def scrapOreily(indexUrl, outName): '''Generates an html page from the index located at indexUrl''' links = scraptools.getElementsFromUrl(url, '#bodyContent ol a:nth-child(1)') f = open(outName, 'w') f.write(head) f.write(getHTMLContent(indexUrl)) for link in links: relativeLink = link.get('href') print relativeLink absoluteLink = urlparse.urljoin(url, relativeLink) f.write(getHTMLContent(absoluteLink)) f.write('</body></html>') f.close()
def get_user_json(user, profile):
    """Build the network-tree JSON node for *user*.

    Aggregates the user's package investment and transaction figures from the
    epoch until now, and marks the node icon "inactive" when nothing has been
    invested.

    Bug fix: the activity check used ``investment is 0`` (identity), which is
    unreliable for non-int zero values (Decimal/long from the DB aggregate);
    it now uses ``== 0``. Dead commented-out code removed.
    """
    user_investment = User_packages.objects.filter(user=user).annotate(
        investment=Sum('package__price')).values()
    investment = user_investment[0]['investment'] if user_investment else 0
    today = UTC.normalize(UTC.localize(datetime.datetime.utcnow()))
    pkg = get_package(user)
    pkg_dt = pkg.created_at.strftime("%D") if pkg else None
    return dict(
        id=user.id,
        avi_id=profile.user_auto_id,
        relationship=get_relationship(user),
        name="%s %s" % (user.first_name, user.last_name),
        sponsor_id=None if profile.sponser_id is None else profile.sponser_id.id,
        placement_id=None if profile.placement_id is None else profile.placement_id.id,
        placement_position=profile.placement_position,
        image=ICON,
        link=dict(
            href=urlparse.urljoin("https://www.avicrypto.us", "/network") + "#"),
        # equality (not identity) so Decimal/long zero also counts as inactive
        image_Name="inactive" if investment == 0 else "active",
        investment=investment,
        transaction=tot_txn_vol(user),
        binary=binary_txns(user, EPOCH_BEGIN, today),
        direct=direct_txns(user, EPOCH_BEGIN, today),
        roi=roi_txns(user, EPOCH_BEGIN, today),
        direct_left=direct_child(user, EPOCH_BEGIN, today, leg='l'),
        direct_right=direct_child(user, EPOCH_BEGIN, today, leg='r'),
        binary_left=binary_child(user, EPOCH_BEGIN, today, leg='l'),
        binary_right=binary_child(user, EPOCH_BEGIN, today, leg='r'),
        left_members_count=get_user_count(user, 'l'),
        right_members_count=get_user_count(user, 'r'),
        package_active_date=pkg_dt)
def notify(cls, alert, per_entity=False, include_alert=True, message='', repeat=0, **kwargs):
    """Send a trigger/resolve event for *alert* to PagerDuty.

    alert         -- ZMON alert dict (expects 'entity', 'is_alert',
                     'alert_def' keys).
    per_entity    -- when True, the incident key includes the entity id so
                     each entity opens its own incident.
    include_alert -- attach the full alert JSON as incident details.
    message       -- override for the incident description; defaults to the
                     alert subject.
    repeat        -- repeat interval returned to the scheduler.
    kwargs        -- may supply 'service_key' and 'zmon_host' overrides.

    Returns ``repeat``. Raises NotificationError when no service key is
    configured. HTTP failures are logged, not raised.
    """
    url = 'https://events.pagerduty.com/generic/2010-04-15/create_event.json'
    # BUG FIX: the original re-read ``repeat`` from kwargs with a default of
    # 0. Since ``repeat`` is a named parameter it can never appear in
    # **kwargs, so any caller-supplied value was silently reset to 0.
    # The parameter value is used as-is now.

    service_key = kwargs.get('service_key', cls._config.get('notifications.pagerduty.servicekey'))
    zmon_host = kwargs.get('zmon_host', cls._config.get('zmon.host'))
    if not service_key:
        raise NotificationError('Service key is required!')

    entity = alert.get('entity')
    is_alert = alert.get('is_alert')
    event_type = 'trigger' if is_alert else 'resolve'
    alert_id = alert['alert_def']['id']

    # Incident key determines de-duplication on the PagerDuty side.
    key = 'ZMON-{}'.format(alert_id) if not per_entity else 'ZMON-{}-{}'.format(alert_id, entity['id'])
    description = message if message else cls._get_subject(alert)

    message = {
        'service_key': service_key,
        'event_type': event_type,
        'incident_key': key,
        'description': description,
        'client': 'ZMON',
        'client_url': urlparse.urljoin(zmon_host, '/#/alert-details/{}'.format(alert_id)) if zmon_host else '',
        'details': json.dumps(alert, cls=JsonDataEncoder) if include_alert else '',
    }

    try:
        logger.info('Sending to %s %s', url, message)
        headers = {'User-Agent': get_user_agent(), 'Content-type': 'application/json'}
        r = requests.post(url, json=message, headers=headers, timeout=5)
        r.raise_for_status()
    except Exception as ex:
        # Best-effort delivery: a failed notification must not crash the
        # alerting worker, so we log and carry on.
        logger.exception('Notifying Pagerduty failed %s', ex)

    return repeat
def __init__(self, service_url, infrastructure_account, verify=True, oauth2=False):
    """Configure the HTTP session used to talk to the entities service.

    service_url            -- base URL of the service; 'api/v1/' is appended.
    infrastructure_account -- account identifier stored for later queries.
    verify                 -- TLS certificate verification flag.
    oauth2                 -- when True, attach a Bearer token to every call.

    Raises ConfigurationError when service_url is empty.
    """
    if not service_url:
        raise ConfigurationError(
            'EntitiesWrapper improperly configured. URL is missing!')

    self.infrastructure_account = infrastructure_account
    self.__service_url = urlparse.urljoin(service_url, 'api/v1/')

    session = requests.Session()
    session.verify = verify
    session.headers.update({'User-Agent': get_user_agent()})
    if oauth2:
        session.headers.update(
            {'Authorization': 'Bearer {}'.format(tokens.get('uid'))})
    self.__session = session
def _request(self, endpoint, q, method='get'):
    """Issue a request against the entities service and return decoded JSON.

    endpoint -- path fragment joined onto the service base URL.
    q        -- query structure; sent as the JSON body for POST, otherwise
                JSON-encoded into the 'query' URL parameter.
    method   -- HTTP verb name; resolved to the matching session method.

    Raises CheckError on a non-2xx response, HttpError on timeout or
    connection failure.
    """
    try:
        url = urlparse.urljoin(self.__service_url, endpoint)
        # Dispatch to session.get / session.post / ... by name.
        request = getattr(self.__session, method.lower())
        if method.lower() == 'post':
            response = request(url, json=q)
        else:
            # GET-style calls carry the query serialized in a URL parameter.
            response = request(url, params={'query': json.dumps(q)})
        if response.ok:
            return response.json()
        else:
            raise CheckError(
                'EntitiesWrapper query failed: {} with status {}:{}'.format(q, response.status_code, response.text))
    except requests.Timeout:
        # Python 2 three-expression raise: re-raise as HttpError while
        # preserving the original traceback.
        raise HttpError('timeout', self.__service_url), None, sys.exc_info()[2]
    except requests.ConnectionError:
        raise HttpError('connection failed', self.__service_url), None, sys.exc_info()[2]
def parse_multiple_store_product(self, response):
    """Parse a shopping.com "compare prices" page listing one product
    offered by multiple stores, yielding a single ShoppingdotcomItem whose
    store/price/url lists are index-aligned per offer row.
    """
    print "parse_multiple_store_product", response.url
    sel = Selector(response)
    # Category trail from the breadcrumb; the first span is skipped
    # (presumably the "Home" root — confirm against a live page).
    breadcrumb = sel.xpath('//div[contains(@class,"breadCrumb")]')
    categories = [
        span for span in breadcrumb.xpath(
            ".//span[@itemprop='title']/text()").extract()[1:]
    ]
    print categories
    item = ShoppingdotcomItem()
    item["categories"] = categories
    item["product_name"] = sel.xpath(
        "//h1[@class='productTitle']/text()").extract()[0]
    # set() de-duplicates repeated gallery thumbnails; note this loses the
    # original image ordering.
    item["image_urls"] = list(
        set(sel.xpath("//div[@class='imgBorder']//img/@src").extract()))
    item["product_urls"] = []
    item["stores"] = []
    item["prices"] = []
    # One offerItem div per store; the three lists below stay aligned
    # because each iteration appends exactly one entry to each.
    for div in sel.xpath("//div[contains(@id,'offerItem-')]"):
        item["product_urls"].append(
            urlparse.urljoin(
                response.url,
                div.xpath(".//a[@class='visitBtn']/@href").extract()[0]))
        item["stores"].append(
            div.xpath(".//img[contains(@id,'DCTmerchLogo')]/@title").
            extract()[0])
        # Prefer the discounted price when present, else the regular price.
        # The regex grabs a currency-symbol-prefixed decimal, e.g. "$12.99".
        if div.xpath(".//span[contains(@class,'toSalePrice')]"):
            item["prices"].append(
                re.findall(
                    "\S+\d+\.\d+",
                    div.xpath(
                        ".//span[contains(@class,'toSalePrice')]/text()").
                    extract()[0])[0])
        else:
            item["prices"].append(
                re.findall(
                    "\S+\d+\.\d+",
                    div.xpath(".//span[contains(@id,'priceQA')]/text()").
                    extract()[0])[0])
    yield item
    pass
def validate(amt, src_addr, addr, txn_id, coin="btc"):
    """Validate the given address and transaction of the given crypto
    payment type, which can be any one of BTC, ETH or XRP.

    amt      -- expected payment amount.
    src_addr -- sender address.
    addr     -- receiving address.
    txn_id   -- transaction id/hash to verify.
    coin     -- one of "btc", "eth", "xrp".

    Returns the coin-specific is_valid_*_paid() result.
    """
    # BUG FIX: the original asserted ``coin in (...)`` only AFTER already
    # using ``coin`` as a key into COIN, so an unknown coin raised a bare
    # KeyError instead of a clear validation error. Validate first, and use
    # an explicit exception rather than assert (asserts vanish under -O).
    if coin not in ("btc", "eth", "xrp"):
        raise ValueError("unsupported coin: %r" % (coin,))

    # Each COIN entry fetches (transaction lookup, address lookup) responses.
    txn_res, addr_res = COIN[coin](src_addr, txn_id)

    if coin == "btc":
        return is_valid_btc_paid(amt, addr, src_addr, txn_id, addr_res, txn_res)
    elif coin == "eth":
        txn_res = txn_res.json()
        addr_res = addr_res.json()
        # Etherscan-style API: status "1" means the lookup succeeded.
        if (txn_res.get("status", None) == "1"
                and addr_res.get("status", None) == "1"):
            uri = "api?module=account&action=txlist&address=%s&startblock=0&endblock=99999999&sort=asc&apikey=%s" % (
                src_addr, ETHER_KEY)
            res = requests.get(urlparse.urljoin(ETH_HOST, uri)).json()
            return is_valid_eth_paid(amt, src_addr, addr, txn_id, res)
    elif coin == "xrp":
        j_txn = txn_res.json()
        j_addr = addr_res.json()
        if j_addr['result'] == "success" and j_txn['result'] == "success":
            return is_valid_xrp_paid(amt, txn_id, src_addr, addr, j_txn)
    # Reached only when an API lookup reported failure for eth/xrp.
    raise Exception("you should not be here")
def _request(self, endpoint, q, method='get'):
    """Issue a request against the entities service and return decoded JSON.

    endpoint -- path fragment joined onto the service base URL.
    q        -- query structure; sent as the JSON body for POST, otherwise
                JSON-encoded into the 'query' URL parameter.
    method   -- HTTP verb name; resolved to the matching session method.

    Raises CheckError on a non-2xx response, HttpError on timeout or
    connection failure.
    """
    try:
        url = urlparse.urljoin(self.__service_url, endpoint)
        # Dispatch to session.get / session.post / ... by name.
        request = getattr(self.__session, method.lower())
        if method.lower() == 'post':
            response = request(url, json=q)
        else:
            # GET-style calls carry the query serialized in a URL parameter.
            response = request(url, params={'query': json.dumps(q)})
        if response.ok:
            return response.json()
        else:
            raise CheckError(
                'EntitiesWrapper query failed: {} with status {}:{}'.
                format(q, response.status_code, response.text))
    except requests.Timeout:
        # Python 2 three-expression raise: re-raise as HttpError while
        # preserving the original traceback.
        raise HttpError('timeout', self.__service_url), None, sys.exc_info()[2]
    except requests.ConnectionError:
        raise HttpError('connection failed', self.__service_url), None, sys.exc_info()[2]
def parse_book_0(self, response):
    """Parse a book landing page: extract its metadata and chapter list,
    schedule a content request per chapter, then yield the MetaItem.
    """
    sel = Selector(response)
    item = MetaItem()
    item['title'] = sel.xpath('//h1/text()').extract_first()
    item['category'] = sel.xpath('//span[contains(@itemprop,"category")]/text()').extract_first()
    item['author'] = sel.xpath('//span[contains(@itemprop,"author")]/text()').extract_first()
    # Description kept as raw HTML nodes, not plain text.
    item['desc'] = sel.xpath('//div[contains(@itemprop, "description")]/node()').extract()
    # find chapter
    el_chapter = sel.xpath('//li[contains(@itemprop, "itemListElement")]/node()')
    # NOTE(review): this extract() result is discarded — looks like dead
    # code left from debugging; confirm before removing.
    el_chapter.extract()
    array = []
    for index, s in enumerate(el_chapter):
        ch = dict()
        ch['num'] = index + 1  # chapters are 1-based
        content_url = urlparse.urljoin(self.base_domain, s.xpath('@href').extract()[0])
        ch['url'] = content_url
        ch['name'] = s.xpath('span/text()').extract_first()
        array.append(ch)
        # Fetch the chapter body; the chapter dict travels along in meta.
        yield Request(content_url, meta={'chapter': ch}, callback=self.parse_content_0, priority=PRIORITY_MID)
    item['chapter'] = array
    yield item
class Crawl:
    # NOTE(review): everything below runs at class-DEFINITION time (i.e. on
    # import), not when an instance is created — this class is effectively a
    # script wrapped in a class body. Python 2 syntax (print statements,
    # `except Exc, e`).

    # Truncate the output file so a previous run's results are discarded.
    f = open('employee_detail.html', 'w')
    f.truncate()
    f.close()
    seed = 'http://www.reuters.com/finance/markets/indices'
    all_links = set()
    links = list()
    try:
        r = requests.get(seed)
        if r.status_code == 200:
            print('Fetching in page links...')
            content = r.content
            soup = BeautifulSoup(content, "lxml")
            # soup('a') is shorthand for soup.find_all('a').
            tags = soup('a')
            flg = 0  # NOTE(review): appears unused within this block
            for a in tags:
                href = a.get("href")
                if href is not None:
                    # Resolve relative hrefs against the seed URL and keep
                    # only links whose resolved URL mentions "sector".
                    new_url = urlparse.urljoin(seed, href)
                    if new_url.find("sector") != -1:
                        print new_url
                        links.append(
                            new_url)  # 'links' contains URLs of all 10 sectors
        elif r.status_code == 403:
            print "Error: 403 Forbidden url"
        elif r.status_code == 404:
            print "Error: 404 URL not found"
        else:
            print "Make sure you have everything correct."
    except requests.exceptions.ConnectionError, e:
        print "Oops! Connection Error. Try again"