def scrp_craigs(neighbor):
    neighbor = neighbor.replace(' ', '+')
    my_url = ('https://pittsburgh.craigslist.org/search/apa?query=' + neighbor
              + '&availabilityMode=0&sale_date=all+dates')
    request = url(my_url)
    htmlscrap = request.read()
    request.close()
    page_soup = soup(htmlscrap, "html.parser")  # parsing as html
    container = page_soup.find('ul', {'class': 'rows'})
    result_rows = container.findAll('li', {'class': 'result-row'})
    names = [i.find('a', {'class': 'result-title hdrlnk'}) for i in result_rows[0:50]]
    prices = [i.find('span', {'class': 'result-price'}) for i in result_rows[0:50]]
    url_rows = [i.find('a').get('href') for i in result_rows[:50]]
    result_name = []
    result_price = []
    for i in names:
        try:
            result_name.append(i.get_text())
        except AttributeError:
            result_name.append('N/A')
    for i in prices:
        try:
            result_price.append(i.get_text().lstrip())
        except AttributeError:
            result_price.append('N/A')
    result_address = open_url(url_rows)
    result = []
    for i in range(len(result_name)):
        result.append([result_name[i], result_price[i], result_address[i], url_rows[i]])
    return result
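# Usage sketch for scrp_craigs() (assumptions: `url` is urllib.request.urlopen,
# `soup` is bs4.BeautifulSoup, and open_url() from this module is available;
# the neighborhood name is only an illustrative query, not from the source).
if __name__ == '__main__':
    for name, price, address, link in scrp_craigs('squirrel hill'):
        print(name, '|', price, '|', address, '|', link)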
def acceptNavigationRequest(self, frame, request, navigation_type):
    if (navigation_type == QWebPage.NavigationTypeLinkClicked
            and self.linkDelegationPolicy() == QWebPage.DontDelegateLinks
            and request.url().scheme() in ('sip', 'sips')):
        blink = QApplication.instance()
        contact, contact_uri = URIUtils.find_contact(request.url().toString())
        session_manager = SessionManager()
        session_manager.create_session(contact, contact_uri,
                                       [StreamDescription('audio')])
        blink.main_window.raise_()
        blink.main_window.activateWindow()
        return False
    return super(WebPage, self).acceptNavigationRequest(frame, request, navigation_type)
def scraping():
    script_dir = os.path.abspath(os.path.dirname(sys.argv[0]) or '.')
    nutrition_info_path = os.path.join(
        script_dir, './resource/meal_problem/nutrition_info.csv')
    final_rating_data_path = os.path.join(
        script_dir, './resource/meal_problem/final_rating_data.csv')
    all_url = [
        'https://www.epicurious.com/recipes-menus/what-to-cook-this-weekend-february-22-24-gallery',
        'https://www.epicurious.com/recipes-menus/what-to-cook-this-weekend-february-8-10-gallery',
        'https://www.epicurious.com/ingredients/acorn-delicata-kabocha-spaghetti-squash-winter-recipes-gallery',
        'https://www.epicurious.com/recipes-menus/easy-dinner-recipes-for-cook90-gallery',
        'https://www.epicurious.com/recipes-menus/our-favorite-cook90-lunches-gallery',
        'https://www.epicurious.com/recipes-menus/make-ahead-weeknight-dinners-stew-soup-freezer-casserole-quick-easy'
        '-recipes-gallery',
    ]
    # Collect the individual recipe links from each gallery page.
    ep_urls = set()
    for i in all_url:
        initializer = url(i)
        res = bs(initializer.read(), "html.parser")
        for div in res.findAll('div', {'class': 'gallery-slide-caption__dek-container'}):
            ep_urls.update([div.find('a')['href']])
    # Scrape each recipe in parallel and collect the attribute dicts.
    p = multiprocessing.Pool(4)
    output = p.map(Recipe, ep_urls)
    ar = [i.__dict__ for i in output]
    df = pd.DataFrame(ar)
    df = df.dropna(axis=0)
    df = df[df['personal_rating'].map(len) > 9]
    df = df.sort_values(by=['title'])
    df.to_csv(nutrition_info_path)
    # Explode the per-recipe (user, rating) pairs into one row per review.
    p_r = pd.DataFrame(columns=['title', 'user', 'rating'])
    count = 0
    for i in range(df.shape[0]):
        for j in df.iloc[i, 6]:
            p_r.loc[count] = [df.iloc[i, 0], j[0], j[1]]
            count += 1
    user_reviews2 = p_r['user'].value_counts()[p_r['user'].value_counts() > 0].index
    trun_recipes_user_review = p_r[p_r['user'].isin(user_reviews2)]
    trun_recipes_user_review = trun_recipes_user_review.drop_duplicates(['user', 'title'])
    trun_recipes_user_review_matrix = trun_recipes_user_review.pivot(
        index='user', columns='title', values='rating')
    # Average every block of 50 users into one synthetic "user" row.
    final_rating_data = pd.DataFrame(columns=trun_recipes_user_review_matrix.columns)
    for i in range(trun_recipes_user_review_matrix.shape[0] // 50):
        temp = trun_recipes_user_review_matrix.iloc[50 * i:50 * i + 50].mean(
            skipna=True, axis=0)
        temp.name = 'user' + str(i)
        final_rating_data.loc[i] = temp
    final_rating_data.to_csv(final_rating_data_path)
def sound(text):
    # Request a WAV rendering of the text from the Yandex TTS service.
    t = encode((('text', text),
                ('format', 'wav'),
                ('lang', 'ru-RU'),
                ('speaker', 'zahar'),
                ('key', '1e0146ea-0b80-425b-8057-fd2feb8ef590')))
    ans = url('https://tts.voicetech.yandex.net/generate?' + t).read()
    f = open('bereza.wav', 'wb')
    f.write(ans)
    f.close()  # flush the file before handing it to winsound
    play('bereza.wav', fn)
def add_news(wiki_url):
    """
    input  : a Wikipedia URL
    action : add the URL to the database (bdd) along with its computed keywords
    """
    with url(wiki_url) as response:
        html = response.read()
    wiki_keywords = keywords(html).split('\n')
    wiki_keywords = wiki_keywords[:10]
    ID_news = set_news(html, wiki_keywords)
    return ID_news, wiki_keywords
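# Usage sketch for add_news() (assumptions: `url` is urllib.request.urlopen,
# `keywords` is a newline-separated keyword extractor such as gensim's, and
# set_news() writes to the same store used by seed(); the article URL is
# illustrative).
news_id, kw = add_news("https://fr.wikipedia.org/wiki/Python_(langage)")
print(news_id, kw)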
def createRequest(self, op, request, device):
    # prefer valid cache instead of network
    self.reqCount = self.reqCount + 1
    # Replace ad-server requests with a harmless placeholder image.
    if re.match(r".*?pubads\.g\.doubleclick\.net*", request.url().toString()):
        request.setUrl(QUrl("http://img6a.flixcart.com/www/prod/images/flipkart_india-e5f5aa9f.png"))
    request.setAttribute(QNetworkRequest.CacheLoadControlAttribute,
                         QNetworkRequest.PreferCache)
    reply = QNetworkAccessManager.createRequest(self, op, request, device)
    reply.error.connect(self.logNetworkErrors)
    return reply
def seed():
    """
    action : add multiple URLs to the database (bdd) with their computed
    keywords, showing a progress bar
    """
    urls = ["https://fr.wikipedia.org/wiki/Redis",
            "https://fr.wikipedia.org/wiki/C_(langage)",
            "https://fr.wikipedia.org/wiki/Licence_BSD",
            "https://fr.wikipedia.org/wiki/NoSQL"]
    bar = Bar('seed of redis database', max=len(urls))
    for wiki_url in urls:
        with url(wiki_url) as response:
            html = response.read()
        wiki_keywords = keywords(html).split('\n')
        set_news(html, wiki_keywords[:10])
        bar.next()
    bar.finish()
def getAllPages():
    pizza_url = "https://www.yelp.com/search?find_desc=pizza&find_loc=New+York+NY&ns=1"
    request = url(pizza_url)
    htmlscrap = request.read()
    request.close()
    page_soup = soup(htmlscrap, "html.parser")  # parsing as html
    body = page_soup.findAll("h3", {"class": "search-result-title"})
    pageRef = {}
    for i in range(1, len(body)):  # skipping the first record
        body1 = body[i].findAll("a", {"class": "biz-name js-analytics-click"})
        name = body1[0].find("span")
        href = body1[0]["href"]
        name_val = name.string
        pageRef[href] = name_val
    for p in pageRef:
        print(p)
def open_url(args):
    try:
        iter(args)  # raises TypeError if args is not iterable
        result_ = []
        for i in args:
            open_request = url(i)
            html_scrap = open_request.read()
            open_request.close()
            house_page = soup(html_scrap, 'html.parser')
            try:
                address = house_page.find('div', {'class': 'mapaddress'}).get_text()
                result_.append(address)
            except AttributeError:
                result_.append('N/A')
        return result_
    except TypeError:
        return 'no data'
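# Usage sketch for open_url() (assumption: it is fed Craigslist listing URLs
# such as those collected by scrp_craigs(); the URL below is a placeholder).
listing_urls = [
    'https://pittsburgh.craigslist.org/apa/d/example-listing/0000000000.html',
]
print(open_url(listing_urls))  # one address (or 'N/A') per listing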
def parseReviews(review_count):
    x = 0
    filename = "datasets.csv"  # saving data as csv
    f = open(filename, "w")
    # these are the features that are scraped
    headers = "Name,Friend Count,Photo Count,Review Count,Elite Member,Funny Count,Cool Count,Useful Count,Review Length,Checkin Count\n"
    f.write(headers)
    total_rev_rating = 0
    # one request per review index (keeping it simple; regex could have been used here)
    for count in range(review_count):
        my_url = "https://www.yelp.com/biz/julianas-pizza-brooklyn-5?o?sort_by=date_desc"
        request = url(my_url)  # taking url as a parameter
        htmlscrap = request.read()
        request.close()
        page_soup = soup(htmlscrap, "html.parser")  # parsing as html
        # the reviews are embedded in the page as JSON-LD
        body_t = page_soup.findAll("script", {"type": "application/ld+json"})
        body_text = body_t[0].text
        json1_data = json.loads(body_text)
        json_reviews1 = json1_data['review'][count]
        rating = json_reviews1['reviewRating']
        rating_value = rating['ratingValue']
        description = json_reviews1['description']
        date = json_reviews1['datePublished']
        reviewTemp = Review(description, rating_value, date)
        total_rev_rating = total_rev_rating + rating_value
    average = total_rev_rating / review_count
    print(average)
    f.close()
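# Usage sketch for parseReviews() (assumptions: `url`, `soup`, and `json` are
# imported as in the surrounding snippets and a Review(description, rating,
# date) class exists; the review count is illustrative).
if __name__ == '__main__':
    parseReviews(20)  # prints the average rating over the first 20 reviews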
def _Dynamic_Fetch(self, request, response):
    """Trivial implementation of URLFetchService::Fetch().

    Args:
      request: the fetch to perform, a URLFetchRequest
      response: the fetch response, a URLFetchResponse
    """
    if len(request.url()) >= _MAX_URL_LENGTH:
        logging.error('URL is too long: %s...' % request.url()[:50])
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

    (protocol, host, path, query, fragment) = (
        urllib.parse.urlsplit(request.url().decode()))

    payload = None
    if request.method() == urlfetch_service_pb.URLFetchRequest.GET:
        method = 'GET'
    elif request.method() == urlfetch_service_pb.URLFetchRequest.POST:
        method = 'POST'
        payload = request.payload()
    elif request.method() == urlfetch_service_pb.URLFetchRequest.HEAD:
        method = 'HEAD'
    elif request.method() == urlfetch_service_pb.URLFetchRequest.PUT:
        method = 'PUT'
        payload = request.payload()
    elif request.method() == urlfetch_service_pb.URLFetchRequest.DELETE:
        method = 'DELETE'
    elif request.method() == urlfetch_service_pb.URLFetchRequest.PATCH:
        method = 'PATCH'
        payload = request.payload()
    else:
        logging.error('Invalid method: %s', request.method())
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

    if not (protocol == 'http' or protocol == 'https'):
        logging.error('Invalid protocol: %s', protocol)
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

    if not host:
        logging.error('Missing host.')
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

    self._SanitizeHttpHeaders(_UNTRUSTED_REQUEST_HEADERS,
                              request.header_list())

    deadline = _API_CALL_DEADLINE
    if request.has_deadline():
        deadline = request.deadline()

    validate_certificate = _API_CALL_VALIDATE_CERTIFICATE_DEFAULT
    if request.has_mustvalidateservercertificate():
        validate_certificate = request.mustvalidateservercertificate()

    fetch_function = self._GetFetchFunction(request.url())
    fetch_function(request.url().decode(), payload, method,
                   request.header_list(), request, response,
                   follow_redirects=request.followredirects(),
                   deadline=deadline,
                   validate_certificate=validate_certificate)
from urllib.request import urlopen as url  # importing urllib for url request
from bs4 import BeautifulSoup as soup

x = 0
filename = "datasets.csv"  # saving data as csv
f = open(filename, "w")
# these are the features that are scraped
headers = "Name,Friend Count,Photo Count,Review Count,Elite Member,Funny Count,Cool Count,Useful Count,Review Length,Checkin Count\n"
f.write(headers)

# regex could have been used here, but this is to increment the url page (keeping it simple)
for _ in range(4):
    my_url = ("https://www.yelp.ca/search?find_desc=Indian&find_loc=Toronto%2C+ON"
              + str(x))
    request = url(my_url)  # taking url as a parameter
    htmlscrap = request.read()
    request.close()
    page_soup = soup(htmlscrap, "html.parser")  # parsing as html
    # the class name where all the features are contained
    container = page_soup.findAll("div", {"class": "review review--with-sidebar"})
    for i in container:
        friend_counter = i.findAll(
            "li", {"class": "friend-count responsive-small-display-inline-block"})
        friend_count = friend_counter[0].b.text
def _Dynamic_Fetch(self, request, unused_response):
    if request.url() == 'exception':
        raise IOError('the remote error')
    elif request.url() == 'application_error':
        raise apiproxy_errors.ApplicationError(23, 'details')
def check_citation(self, citation):
    # NOTE: when implementing, wrap the method in a try/except and print out
    # any error + the citation status
    try:
        pattern = re.compile("[ ][0-9]{4}")
        result = pattern.search(citation)
        self.year = result.group(0)[1:]
    except:
        raise Exception("Unable to find year in citation.")

    self.citation_status = MLACitationStatus.AUTHOR
    cursor = 0
    while True:
        ascii_value = ord(citation[cursor])
        # check if the current character is not " &-'." or any alphanumeric
        # in English or Latin-1
        if citation[cursor:cursor + 2] != ". " and (
                ascii_value == 32 or ascii_value == 39
                or 44 <= ascii_value <= 46 or 65 <= ascii_value <= 90
                or 97 <= ascii_value <= 122 or 192 <= ascii_value <= 255):
            cursor += 1
        else:
            break

    if cursor != 0:
        author_section = ""
        if citation[cursor:cursor + 2] == ". ":
            author_section = citation[:cursor + 1]
        else:
            raise Exception("Bad formatting in the author section (unknown error).")
        # three or more authors
        if ", et al." in author_section:
            temp = author_section.replace(", et al", "")
            authors = temp.split(", ")
            filteredAuthor = [self.filter_latin(i) for i in authors]
            if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                    and re.match("^[A-Z][A-Za-z-'. ]+[.]$", filteredAuthor[1]) is not None:
                self.authors.append(authors[0] + ", et al.")
            else:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
        # two authors
        elif ", and " in author_section:
            authors = author_section.split(", and ")
            if ", " not in authors[0]:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
            firstAuthor = authors[0].split(", ")
            filteredFirstAuthor = [self.filter_latin(i) for i in firstAuthor]
            if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredFirstAuthor[0]) is not None \
                    and re.match("^[A-Z][A-Za-z-'. ]+$", filteredFirstAuthor[1]) is not None:
                self.authors.append(firstAuthor[0])
            else:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
            if " " not in authors[1]:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
            secondAuthor = authors[1].split(" ", 1)
            filteredSecondAuthor = [self.filter_latin(i) for i in secondAuthor]
            if re.match("^[A-Z][A-Za-z-']+$", filteredSecondAuthor[0]) is not None \
                    and re.match("^[A-Za-z][A-Za-z-'. ]+[.]$", filteredSecondAuthor[1]) is not None:
                self.authors.append(filteredSecondAuthor[1][:-1])
            elif re.match("^[A-Za-z][.]$", filteredSecondAuthor[1]) is not None:
                author_cursor = cursor + 2
                actualSecondAuthor = ""
                while citation[author_cursor:author_cursor + 2] != ". ":
                    actualSecondAuthor += citation[author_cursor]
                    author_cursor += 1
                self.authors.append(actualSecondAuthor)
            else:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
        # one author
        elif ", " in author_section:
            authors = author_section.split(", ")
            filteredAuthor = [self.filter_latin(i) for i in authors]
            if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                    and re.match("^[A-Z][A-Za-z-' ]+[.]$", filteredAuthor[1]) is not None:
                self.authors.append(authors[0])
            else:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
        elif "et. al." in author_section or "et.al." in author_section:
            raise Exception("'Et al.' should not have a period after the 'Et'.")
        # no match; bad formatting
        else:
            raise Exception("Bad formatting in the author section: '" + author_section + "'")

    self.citation_status = MLACitationStatus.TITLE
    cursor += 1
    # check the title section
    if citation[cursor:cursor + 3] == "<i>":
        cursor += 3
    elif citation[cursor + 1:cursor + 4] == "<i>":
        cursor += 4
    elif citation[cursor + 1] == "\"":
        cursor += 2
    elif citation[cursor - 1:cursor + 1] == ".\"":
        raise Exception("Bad formatting in the title section.")
    title = ""
    while citation[cursor] != ".":
        title += citation[cursor]
        cursor += 1
    title = title.replace("\"", "")
    title = title.replace("</i>", "")
    if title[0] == " ":
        title = title[1:]
    if citation[cursor + 1] == "\"":
        cursor += 2
    else:
        cursor += 1
    # now cursor should be at the beginning of the italics
    result = url("https://brettterpstra.com/titlecase/?title=" + title)
    title_cased_title = result.read().decode('utf-8')
    if title != title_cased_title:
        self.warnings.append(
            "the title might contain improper capitalization: '" + title + "'")
    self.title = title

    # check for url
    self.citation_status = MLACitationStatus.URL
    extractor = URLExtract()
    if extractor.has_urls(citation):
        urls = extractor.find_urls(citation)
        self.url = urls[0][:-1]
        if self.url + "." not in citation:
            raise Exception("Bad formatting in the URL section.")
    if citation[cursor:cursor + 3] != "<i>" and citation[cursor + 1:cursor + 4] != "<i>":
        self.warnings.append("the container may not exist or may not be italicized")
    elif citation[cursor:cursor + 3] == "<i>" and citation[cursor + 1:cursor + 4] == "<i>":
        self.warnings.append(
            "the container might exist when not necessary (if the citation is about a book), "
            "or the block immediately following the title may be improperly italicized.")
    if self.url != "":
        # str.replace returns a new string, so the result must be reassigned
        citation = citation.replace(self.url + ".", "")

    # check for other info
    # right now, it's too complex to validate the entire MLA citation without prior
    # knowledge of what type of citation it is, so the other info is just stored
    # without checking
    self.citation_status = MLACitationStatus.OTHER_INFO
    remainingText = citation[cursor:]
    info = remainingText.split(", ")
    self.otherInfo = [i for i in info]
def __init__(self, page):
    print('attempting to build from: ' + page)
    try:
        self.build_recipe(bs(url(page), 'html.parser'))
    except Exception as x:
        print('Could not build from %s, %s' % (page, x))
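# Usage sketch for Recipe.__init__ (assumptions: this is the Recipe class that
# scraping() maps over recipe URLs, `bs` is bs4.BeautifulSoup, and `url` is
# urllib.request.urlopen; the URL below is a placeholder).
recipe = Recipe('https://www.epicurious.com/recipes/food/views/example-recipe')
print(recipe.__dict__)  # scraped attributes, or empty if the build failed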
# Testerino
# Libs
from urllib.request import urlopen as url
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from util import parse_damage_stats
import csv
import MySQLdb

# Loading the page into bs from the weapon page url
page_link = 'http://darksouls.wikidot.com/weapons'
page = url(page_link)
soup = BeautifulSoup(page, 'html.parser')

# Writing to the local mysql db
conn = MySQLdb.connect(host='localhost', user='******', passwd='Fr0ntranger')
cursor = conn.cursor()
cursor.execute('USE dark_souls')

page_content = soup.findAll('div', {'id': 'page-content'})
weapon_table = page_content[0].table.tr.findAll('td')[1]
weapon_list = weapon_table.findAll('a')
print('There are ' + str(len(weapon_list)) + ' weapons in Dark Souls 1: ')

# Write weapon stats to a csv
with open('weapon_data.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    header = ["Name", "Physical", "Magic", "Fire", "Lightning", "Total AR"]
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as url

file = open("comic.txt", "r")
name = input("Enter Name of Comic")
var = bs(file, "html.parser")
print("URL parsed to BS4")

img_list = var.findAll("img", {"rel": "noreferrer"})
index = 1
for images in img_list:
    file_loc = str(images["src"])
    temp = url(file_loc)
    output = open("Comics\\file" + str(index) + ".jpg", "wb")
    output.write(temp.read())
    output.close()
    print("Downloaded file " + str(index) + " of " + str(len(img_list)))
    index = index + 1
    break  # NOTE: stops after the first image; remove to download them all
print("Download Complete")
from winsound import PlaySound as play
from winsound import SND_FILENAME as fn
from urllib.request import urlopen as url
from urllib.parse import urlencode as encode

URL = "http://baneks.ru/"
aneks = []


def sound(text):
    # Request a WAV rendering of the text from the Yandex TTS service.
    t = encode((('text', text),
                ('format', 'wav'),
                ('lang', 'ru-RU'),
                ('speaker', 'zahar'),
                ('key', '1e0146ea-0b80-425b-8057-fd2feb8ef590')))
    ans = url('https://tts.voicetech.yandex.net/generate?' + t).read()
    f = open('bereza.wav', 'wb')
    f.write(ans)
    f.close()  # flush the file before handing it to winsound
    play('bereza.wav', fn)


# Scrape the first N jokes from the site, then read each one aloud.
for i in range(1, int(input()) + 1):
    html = url(URL + str(i)).read().decode("utf8")
    start = html.find('<meta name="description" content="') + len('<meta name="description" content="')
    end = html.find('<meta name="keywords"') - 7
    aneks.append(html[start:end])

for i in range(0, len(aneks)):
    sound(aneks[i])
import cv2
import numpy as n
from ssl import SSLContext, PROTOCOL_TLSv1
from urllib.request import urlopen as url
import datetime

recognize = cv2.cv2.face.LBPHFaceRecognizer_create()
recognize.read('trainer/trainer.yml')
cascade = 'haarcascade_frontalface_default.xml'
faceClassifier = cv2.CascadeClassifier(cascade)
fontStyle = cv2.FONT_HERSHEY_SIMPLEX
webcamServerIP = 'https://192.168.1.93:8080/shot.jpg'

while True:
    now = datetime.datetime.now()
    # Grab a single frame from the phone/IP webcam over TLS.
    contxt = SSLContext(PROTOCOL_TLSv1)
    inf = url(webcamServerIP, context=contxt).read()
    npImg = n.array(bytearray(inf), dtype=n.uint8)
    i = cv2.imdecode(npImg, -1)
    grayscale = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
    face = faceClassifier.detectMultiScale(grayscale, 1.3, 5)
    unknownCounter = 0
    with open("personnel.log", "w") as f:
        for x, y, w, z in face:
            cv2.rectangle(i, (x - 20, y - 20), (x + w + 20, y + z + 20), (0, 255, 0), 4)
            ID, person = recognize.predict(grayscale[y:y + z, x:x + w])
            log = []
def _RetrieveURL(url, payload, method, headers, request, response,
                 follow_redirects=True, deadline=_API_CALL_DEADLINE,
                 validate_certificate=_API_CALL_VALIDATE_CERTIFICATE_DEFAULT):
    """Retrieves a URL over network.

    Args:
      url: String containing the URL to access.
      payload: Request payload to send, if any; None if no payload.
        If the payload is unicode, we assume it is utf-8.
      method: HTTP method to use (e.g., 'GET')
      headers: List of additional header objects to use for the request.
      request: A urlfetch_service_pb.URLFetchRequest proto object from
        original request.
      response: A urlfetch_service_pb.URLFetchResponse proto object to
        populate with the response data.
      follow_redirects: optional setting (defaulting to True) for whether or not
        we should transparently follow redirects (up to MAX_REDIRECTS)
      deadline: Number of seconds to wait for the urlfetch to finish.
      validate_certificate: If true, do not send request to server unless the
        certificate is valid, signed by a trusted CA and the hostname matches
        the certificate.

    Raises:
      Raises an apiproxy_errors.ApplicationError exception with
      INVALID_URL_ERROR in cases where:
        - The protocol of the redirected URL is bad or missing.
        - The port is not in the allowable range of ports.
      Raises an apiproxy_errors.ApplicationError exception with
      TOO_MANY_REDIRECTS in cases when MAX_REDIRECTS is exceeded
    """
    last_protocol = ''
    last_host = ''
    if isinstance(payload, str):
        payload = payload.encode('utf-8')

    for redirect_number in range(MAX_REDIRECTS + 1):
        parsed = urllib.parse.urlsplit(url)
        protocol, host, path, query, fragment = parsed

        port = urllib.parse.splitport(urllib.parse.splituser(host)[1])[1]

        if not _IsAllowedPort(port):
            logging.error(
                'urlfetch received %s ; port %s is not allowed in production!'
                % (url, port))
            raise apiproxy_errors.ApplicationError(
                urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

        if protocol and not host:
            logging.error('Missing host on redirect; target url is %s' % url)
            raise apiproxy_errors.ApplicationError(
                urlfetch_service_pb.URLFetchServiceError.INVALID_URL)

        if not host and not protocol:
            host = last_host
            protocol = last_protocol

        adjusted_headers = {
            'User-Agent':
            'AppEngine-Google; (+http://code.google.com/appengine)',
            'Host': host,
            'Accept-Encoding': 'gzip',
        }
        if payload is not None:
            adjusted_headers['Content-Length'] = str(len(payload))
        if method == 'POST' and payload:
            adjusted_headers['Content-Type'] = 'application/x-www-form-urlencoded'

        passthrough_content_encoding = False
        for header in headers:
            if header.key().decode().title().lower() == 'user-agent':
                adjusted_headers['User-Agent'] = (
                    '%s %s' %
                    (header.value().decode(), adjusted_headers['User-Agent']))
            else:
                if header.key().decode().lower() == 'accept-encoding':
                    passthrough_content_encoding = True
                adjusted_headers[header.key().decode().title()] = (
                    header.value().decode())

        if payload is not None:
            escaped_payload = payload.encode('string_escape')
        else:
            escaped_payload = ''
        logging.debug('Making HTTP request: host = %r, '
                      'url = %r, payload = %.1000r, headers = %r',
                      host, url, escaped_payload, adjusted_headers)
        try:
            if protocol == 'http':
                connection_class = http.client.HTTPConnection
            elif protocol == 'https':
                if (validate_certificate and _CanValidateCerts() and CERT_PATH):
                    connection_class = fancy_urllib.create_fancy_connection(
                        ca_certs=CERT_PATH)
                else:
                    connection_class = http.client.HTTPSConnection
            else:
                error_msg = 'Redirect specified invalid protocol: "%s"' % protocol
                logging.error(error_msg)
                raise apiproxy_errors.ApplicationError(
                    urlfetch_service_pb.URLFetchServiceError.INVALID_URL,
                    error_msg)

            if _CONNECTION_SUPPORTS_TIMEOUT:
                connection = connection_class(host, timeout=deadline)
            else:
                connection = connection_class(host)

            last_protocol = protocol
            last_host = host

            if query != '':
                full_path = path + '?' + query
            else:
                full_path = path

            if not _CONNECTION_SUPPORTS_TIMEOUT:
                orig_timeout = socket.getdefaulttimeout()
            try:
                if not _CONNECTION_SUPPORTS_TIMEOUT:
                    socket.setdefaulttimeout(deadline)
                connection.request(method, full_path, payload, adjusted_headers)
                http_response = connection.getresponse()
                if method == 'HEAD':
                    http_response_data = ''
                else:
                    http_response_data = http_response.read()
            finally:
                if not _CONNECTION_SUPPORTS_TIMEOUT:
                    socket.setdefaulttimeout(orig_timeout)
                connection.close()
        except _fancy_urllib_InvalidCertException as e:
            raise apiproxy_errors.ApplicationError(
                urlfetch_service_pb.URLFetchServiceError.SSL_CERTIFICATE_ERROR,
                str(e))
        except _fancy_urllib_SSLError as e:
            app_error = (
                urlfetch_service_pb.URLFetchServiceError.DEADLINE_EXCEEDED
                if 'timed out' in e.message else
                urlfetch_service_pb.URLFetchServiceError.SSL_CERTIFICATE_ERROR)
            raise apiproxy_errors.ApplicationError(app_error, str(e))
        except socket.timeout as e:
            raise apiproxy_errors.ApplicationError(
                urlfetch_service_pb.URLFetchServiceError.DEADLINE_EXCEEDED, str(e))
        except (http.client.error, socket.error, IOError) as e:
            raise apiproxy_errors.ApplicationError(
                urlfetch_service_pb.URLFetchServiceError.FETCH_ERROR, str(e))

        if http_response.status in REDIRECT_STATUSES and follow_redirects:
            url = http_response.getheader('Location', None)
            if url is None:
                error_msg = 'Redirecting response was missing "Location" header'
                logging.error(error_msg)
                raise apiproxy_errors.ApplicationError(
                    urlfetch_service_pb.URLFetchServiceError.MALFORMED_REPLY,
                    error_msg)
            if (http_response.status != http.client.TEMPORARY_REDIRECT and
                    method not in PRESERVE_ON_REDIRECT):
                logging.warn('Received a %s to a %s. Redirecting with a GET',
                             http_response.status, method)
                method = 'GET'
                payload = None
        else:
            response.set_statuscode(http_response.status)
            if (http_response.getheader('content-encoding') == 'gzip' and
                    not passthrough_content_encoding):
                gzip_stream = io.StringIO(http_response_data)
                gzip_file = gzip.GzipFile(fileobj=gzip_stream)
                http_response_data = gzip_file.read()
            response.set_content(http_response_data[:MAX_RESPONSE_SIZE])
            for header_key in list(http_response.msg.keys()):
                for header_value in http_response.msg.get_all(header_key):
                    if (header_key.lower() == 'content-encoding' and
                            header_value == 'gzip' and
                            not passthrough_content_encoding):
                        continue
                    if header_key.lower() == 'content-length' and method != 'HEAD':
                        header_value = str(len(response.content()))
                    header_proto = response.add_header()
                    header_proto.set_key(header_key.encode())
                    header_proto.set_value(header_value.encode())

            if len(http_response_data) > MAX_RESPONSE_SIZE:
                response.set_contentwastruncated(True)

            if request.url() != url:
                response.set_finalurl(url.encode())

            break
    else:
        error_msg = 'Too many repeated redirects'
        logging.error(error_msg)
        raise apiproxy_errors.ApplicationError(
            urlfetch_service_pb.URLFetchServiceError.TOO_MANY_REDIRECTS,
            error_msg)
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as url

# opening the connection
apple = url('https://www.flipkart.com/search?q=apple+mobiles&as=on&as-show=on&otracker=AS_Query_OrganicAutoSuggest_1_5_na_na_na&otracker1=AS_Query_OrganicAutoSuggest_1_5_na_na_na&as-pos=1&as-type=RECENT&suggestionId=apple+mobiles&requestId=5e21fc5c-dff2-4485-8f4c-657dcba0ce2c&as-searchtext=apple')
mobile_html = apple.read()
apple.close()
mobile_soup = soup(mobile_html, "html.parser")

# grabs the product titles
iphones = mobile_soup.findAll("div", {"class": "_3wU53n"})
print(len(iphones))  # total items
firstproduct = iphones[0]
tenthprdct = iphones[10]
print(firstproduct)
print(tenthprdct)

# number of ratings for each mobile
mob_rating = mobile_soup.findAll("span", {"class": "_38sUEc"})
print(len(mob_rating))
print(mob_rating[10].text)

# specs of mobiles
mob_specs = mobile_soup.findAll("ul", {"class": "vFw0gD"})
from urllib.request import urlopen as url
from pathlib import Path as pth
import re

from scipy.special import spherical_jn as jn, spherical_yn as yn
import numpy as np
import matplotlib.pyplot as plt

variant = 9
C = 300000000        # speed of light, m/s
PI = 3.1415926535

# Download the task file once and cache it locally.
file = pth('./taskfile.txt')
if not file.exists():
    txt = url('https://jenyay.net/uploads/Student/Modelling/task_02.txt').read()
    f0 = file.open('wb')
    f0.write(txt)
    f0.close()

# Parse the numeric parameters for this variant's line of the task file.
if file.exists():
    f1 = file.open()
    lines = [x for x in f1]
    p = re.compile(r'[0-9\.\-e]+')
    m = p.findall(lines[variant - 1])
    print(m[1:])
    f1.close()

D = float(m[1])
fmin = float(m[2])
fmax = float(m[3])
f = np.linspace(fmin, fmax, 400)
r = D / 2
from random import randint
from bs4 import BeautifulSoup
from urllib.request import urlopen as url

# Loading the class listing page into bs from its url
page_link = 'http://dnd5e.wikidot.com/'
page = url(page_link)
soup = BeautifulSoup(page, 'html.parser')
page_content = soup.findAll('div', {'id': 'page-content'})


def scrape_classes():
    classes = {}
    class_links = {}
    class_categories = page_content[0].findAll('div', {'class': 'col-md-7'})
    for c in class_categories:
        single_class = c.find('h1')
        if single_class:
            class_name = single_class.find('a').text
            if c.find('p'):
                sub_classes_ref = c.p.findAll('a')
                sub_classes = []
                for i in sub_classes_ref:
                    sub_classes.append(i.text)
                    class_links[i.text] = i.get('href')
                classes[class_name] = sub_classes