async def parse(url):
    driver = get_driver()
    page_source = await get_page_source(driver, url)
    driver.close()
    parsed_dict = {url: parse_html(page_source)}
    return parsed_dict
def get_source_data(logger):
    source = HtmlSource(logger)
    for station_name in source.get_stations_list():
        try:
            html = source.get_station_html(station_name)
            yield station_name, parse_html(html)
            logger.info('action=[get_source_data] result=[success] '
                        'station=[%s]', station_name)
        except RequestError as error:
            logger.error('action=[get_source_data] result=[fail] station=[%s] '
                         '%s', station_name, make_one_line(error))
        except Exception as error:
            logger.error('action=[get_source_data] result=[error] station=[%s] '
                         'reason=[%s]', station_name, make_one_line(error))
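# Hypothetical usage sketch (not part of the original module): get_source_data()
# is a generator, so stations can be consumed lazily. It assumes a configured
# logging.Logger plus the HtmlSource, parse_html, RequestError and make_one_line
# helpers defined elsewhere in the host module; store() below is a hypothetical sink.
#
#   import logging
#   logger = logging.getLogger("stations")
#   for station_name, parsed in get_source_data(logger):
#       store(station_name, parsed)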
def parse_uri(content):
    from datetime import datetime
    import urllib
    from base64 import b64decode
    start = datetime.now()
    # javascript: scheme
    if content.lower().find('javascript:') == 0:
        parser.process_content(content[len('javascript:'):], contexts.URI_JS)
        content = urllib.unquote(content).decode("utf8")
        js_parser.parse_js(content)
    # data: scheme
    elif content.lower().find('data:') == 0:
        content = content[len('data:'):]
        # placeholder is in content type
        if content.find(",") == -1:
            # invalid format of data: scheme
            parser.process_content(content, contexts.URI_UNKNOWN_DATA)
            return
        parser.process_content(content[0:content.find(",")],
                               contexts.URI_CONTENT_TYPE)
        # extracts content-type, encoding and charset
        # if encoding not found, uses urlencode
        # if encoding is urlencode and charset not found, uses utf8
        enctype = "urlencode"
        if content.find(";") != -1 and content.find(",") > content.find(";"):
            content_type = content[0:content.find(";")]
            encoding = content[content.find(";") + 1:content.find(",")]
            # placeholder is in encoding
            parser.process_content(encoding, contexts.URI_UNKNOWN_DATA)
            if encoding.lower() == "base64":
                enctype = "base64"
            elif encoding.lower().find("charset=") == 0:
                charset = encoding[len("charset="):]
            else:
                charset = "utf8"
        else:
            content_type = content[0:content.find(",")]
            charset = "utf8"
        # decode content
        content = content[content.find(",") + 1:]
        if enctype == "base64":
            content = b64decode(content)
        else:
            content = urllib.unquote(content).decode(charset)
        # subprocess content according to the content type
        if content_type.lower() == "text/html":
            parser.process_content(content, contexts.URI_HTML_DATA)
            html_parser.parse_html(content)
        elif content_type.lower() == "text/css":
            parser.process_content(content, contexts.URI_CSS_DATA)
            css_parser.parse_css_stylesheet(content)
        elif content_type.lower() in ("text/javascript",
                                      "application/x-javascript",
                                      "application/javascript"):
            parser.process_content(content, contexts.URI_JS_DATA)
            js_parser.parse_js(content)
        else:
            parser.process_content(content, contexts.URI_OTHER_DATA)
    # other schemes
    else:
        parser.process_content(content, contexts.URI_URL)
    end = datetime.now()
    library.uri_us += end - start
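# Hypothetical usage sketch (not part of the original module): parse_uri()
# dispatches on the URI scheme, and for data: URIs it decodes the payload
# (base64 or percent-encoding) before handing it to the matching sub-parser.
# The parser, js_parser, html_parser, css_parser, contexts and library objects
# are assumed to be provided at module level by the host code.
#
#   parse_uri("javascript:alert(1)")                  # -> js_parser.parse_js(...)
#   parse_uri("data:text/html;base64,PGI+aGk8L2I+")   # -> html_parser.parse_html("<b>hi</b>")
#   parse_uri("data:text/css,body%7Bcolor%3Ared%7D")  # -> css_parser.parse_css_stylesheet(...)
#   parse_uri("https://example.com/")                 # -> processed as a plain URL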
def __call__(self):
    response = requests.get(DOWNLOAD_URL)
    parse_html()
    self.finished.emit(response.text)
def download_data():
    print('task thread is:', threading.current_thread())
    print('downloading')
    response = requests.get(DOWNLOAD_URL)
    parse_html()
    return response.text
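# Hypothetical usage sketch (assumes DOWNLOAD_URL, parse_html, requests and
# threading are defined/imported at module level as in the snippet above):
# download_data() prints which thread it runs on, so it is typically submitted
# to a worker thread, e.g. via concurrent.futures, to keep a UI responsive.
#
#   from concurrent.futures import ThreadPoolExecutor
#   with ThreadPoolExecutor(max_workers=1) as pool:
#       future = pool.submit(download_data)
#       text = future.result()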
def _button_clicked(self):
    response = requests.get(DOWNLOAD_URL)
    parse_html()
    self._textarea.setPlainText(response.text)
def crawl(crawler_database, start_url):
    crawler_database.connect_db()
    if crawler_database.check_db():
        # Restarting the previous crawling process
        print("Restarting previous crawl process.")
    else:
        # Starting a new crawling process from start_url
        website = wm.extract_root(start_url)
        if len(website) > 1:
            crawler_database.cursor.execute(
                'INSERT OR IGNORE INTO Websites (url) VALUES ( ? )',
                (website, ))
            crawler_database.cursor.execute(
                'INSERT OR IGNORE INTO Pages (url, html, new_rank) '
                'VALUES ( ?, NULL, 1.0 )', (start_url, ))
            crawler_database.connection.commit()

    # Getting the known websites
    crawler_database.cursor.execute('SELECT url FROM Websites')
    for row in crawler_database.cursor:
        websites.append(str(row[0]))
    print(websites)

    crawl_counter = 0
    while True:  # Crawl loop
        if crawl_counter < 1:
            crawl_counter = int(input('How many pages do you want to crawl? '))
            if crawl_counter == 0:
                break
        print("Crawling Iteration:", crawl_counter)
        crawl_counter -= 1

        # Getting a random row where the HTML code and error are NULL
        crawler_database.cursor.execute('SELECT id,url FROM Pages WHERE '
                                        'html is NULL and error is NULL '
                                        'ORDER BY RANDOM() LIMIT 1')
        row = crawler_database.cursor.fetchone()
        if row is None:
            # Breaking the loop when no rows are left
            print("All pages are retrieved.")
            break

        from_id = row[0]
        url = row[1]
        print("Crawling ->", "Page ID =", from_id, "| URL =", url, end=' ')

        # This query deletes all records which the current page points to.
        crawler_database.cursor.execute('DELETE from Links WHERE from_id=?',
                                        (from_id, ))
        try:
            document = wm.open_url(url)
            # Checking the HTTP status code (200 means success)
            if wm.get_http_status_code(document) != 200:
                print("Error occurred while loading the page: ",
                      wm.get_http_status_code(document))
                crawler_database.cursor.execute(
                    'UPDATE Pages SET error=? WHERE url=?',
                    (wm.get_http_status_code(document), url))
            # Checking the Content-Type
            if 'text/html' != wm.get_http_content_type(document):
                print("This URL is not an HTML file.")
                crawler_database.cursor.execute(
                    'DELETE FROM Pages WHERE url=?', (url, ))
                crawler_database.cursor.execute(
                    'UPDATE Pages SET error=0 WHERE url=?', (url, ))
                crawler_database.connection.commit()
                continue
            # Using BeautifulSoup to repair and parse the HTML document
            soup = hp.parse_html(document)
            html_code = str(soup)
        except HTTPError:
            print("Unable to retrieve or parse page")
            crawler_database.cursor.execute(
                'UPDATE Pages SET error=-1 WHERE url=?', (url, ))
            crawler_database.connection.commit()
            continue

        # Inserting the currently crawled page into the Pages table with initial rank 1.0
        crawler_database.cursor.execute(
            'INSERT OR IGNORE INTO Pages (url, html, new_rank)'
            ' VALUES ( ?, NULL, 1.0 )', (url, ))
        # Updating the currently crawled page's html field
        crawler_database.cursor.execute('UPDATE Pages SET html=? WHERE url=?',
                                        (html_code, url))
        crawler_database.connection.commit()

        # Getting all <a> tags because they are the links to other pages
        a_tags = hp.get_tags(soup, "a")
        href_count = 0
        for tag in a_tags:
            href_attribute = hp.get_attribute(tag, "href")
            if not wm.does_exist(href_attribute):
                continue
            if wm.is_image(href_attribute):
                continue
            if href_attribute.endswith('/'):
                href_attribute = href_attribute[:-1]
            if wm.is_relative(href_attribute):
                # Making the absolute URL
                href = wm.make_absolute(url, href_attribute)
            else:
                href = href_attribute
            if href.find("#") > 1:
                pointer = href.find("#")
                href = href[:pointer]  # Cutting the fragment part of the URL

            is_found = False
            for website in websites:
                # If href belongs to one of the known websites, breaking the loop
                if href.startswith(website):
                    is_found = True
                    break
            if not is_found:
                continue

            # Inserting the href into the Pages table with initial rank 1.0
            crawler_database.cursor.execute(
                'INSERT OR IGNORE INTO Pages (url, html, new_rank) '
                'VALUES ( ?, NULL, 1.0 )', (href, ))
            href_count = href_count + 1
            crawler_database.connection.commit()

            # Getting the id of the href
            crawler_database.cursor.execute(
                'SELECT id FROM Pages WHERE url=? LIMIT 1', (href, ))
            row = crawler_database.cursor.fetchone()
            if row is None:
                print("Couldn't retrieve id of href ->", href)
                continue
            to_id = row[0]

            # Inserting from_id (current page's id) and to_id (href's id) into the Links table
            crawler_database.cursor.execute(
                'INSERT OR IGNORE INTO Links (from_id, to_id) '
                'VALUES ( ?, ? )', (from_id, to_id))
        print("| links found =", href_count, "\n")

    crawler_database.disconnect_db()
def process(self):
    try:
        # Step 1: get text content and title
        print("INFO: Getting the text content of web page.")
        title, content = content_extraction.get_text(self.__url)
        if title is None or content is None:
            raise ValueError("Could not retrieve text data.")

        # Step 2: get all title, h1 and meta tags of the web page
        print("INFO: Getting the title, meta and h1 tag content of web page.")
        title_content, meta_content, h1_tag_content = html_parser.parse_html(self.__url)

        # Step 3: get noun words and phrases (from wiki search)
        print("INFO: Extracting single word nouns from text content and "
              "searching related phrases from wikipedia articles.")
        print("INFO: This might take a few seconds.")
        nouns = noun_extractor.get_nouns(title, content)
        is_input_small = False
        if len(nouns) < 15:
            is_input_small = True

        # Step 4: ranking - giving scores to each word based on several factors
        print("INFO: Ranking words and phrases based on factors like occurrence, frequency etc.")
        ranked_words = []
        for w in nouns:
            ranked_word = ranking.do_rank(self.__url, w, content, title_content,
                                          meta_content, h1_tag_content)
            ranked_words.append(ranked_word)

        # Step 5: grouping similar phrases and eliminating repetition for more diverse keywords
        print("INFO: Grouping similar words to clusters and getting the most ranked words from each cluster")
        print("INFO: This might take a few seconds.")
        words, clusters = k_means_grouper.get_clusters(ranked_words)
        final_ranked_words = []
        # Add all keywords that did not have a match in the data set and hence cannot be grouped
        final_ranked_words.extend(words)
        # Sort each cluster by rank and take the highest ranked entry from it
        for data_cluster in clusters:
            data_cluster.sort(key=operator.attrgetter('score'), reverse=True)
            # Taking the most ranked word of the cluster
            final_ranked_words.append(data_cluster[0])

        print("INFO: Sorting all the words and phrases based on the ranking scores and getting Top 15 words.")
        # Sort the final list of ranked words
        final_ranked_words.sort(key=operator.attrgetter('score'), reverse=True)

        count = 0
        key_words = []
        # Will show the first 15 ranked keywords
        for rw in final_ranked_words:
            count += 1
            if len(key_words) == 15:
                break
            if rw.isUpper:
                key_words.append(rw.getword().upper())
            else:
                key_words.append(rw.getword().title())

        print("INFO: Success, check your words in the UI.")
        if count < 15:
            return {'words': key_words, 'is_input_small': is_input_small}
        return {'words': key_words}
    except ValueError as e:
        print(e)
        return {"error": e}
    except Exception as e:
        print("ERROR: Some error occurred.")
        print(e)
        return {"error": "Sorry, something went wrong! Please verify the URL."}
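# Hypothetical usage sketch (the KeywordExtractor owner class and the example
# URL are assumptions; the snippet above only shows the process() method).
# process() returns either {'words': [...]} (optionally with 'is_input_small')
# or {'error': ...} when extraction fails.
#
#   extractor = KeywordExtractor("https://example.com/")
#   result = extractor.process()
#   if "error" not in result:
#       print(result["words"])  # up to 15 ranked keywords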
def download_data():
    response = requests.get(DOWNLOAD_URL)
    parse_html()
    return response.text
def _populate_textarea(self):
    parse_html()
    self._textarea.setPlainText(unicode(self._reply.readAll(), 'utf-8'))
def parse(content):
    content = deduplicate(content)
    patch_parsers()
    html_parser.parse_html(content)
    return check_sequences(content)