def update_DBfollow(self): num_of_articles_per_update = 10 try: num_of_articles_per_update = self.num_of_articles_per_update if (num_of_articles_per_update == None): self.num_of_articles_per_update = 10 num_of_articles_per_update = 10 except Exception: num_of_articles_per_update = 10 # GET OLD ARTICLE KEYS FROM DB self.num_of_update_requests += 1 search_params_object = pickle.loads(str(self.search_params_str)) # GET NEW ARTICLE KEYS FROM QUERY new_resultsList = HTMLparser.getAllResultsFromURLwithProxy(search_params_object) #if (new_resultsList == None): # raise ResearchExceptions.InputError("In update follow", "Function: HTMLparser.getResultsFromURL(self.url) returned None\n") new_resultsKeys = [] for article in new_resultsList: year= article.get_year_from_HTML_author_year_pub() if (year != None): if (year > 2008): new_resultsKeys.append(article.get_key()) # Check if There are new articles diff_list = compareKeysListswithOrder(self.pastResultsKeysList, new_resultsKeys,num_of_articles_per_update ) num_new_articles = len(diff_list) try: self.num_of_new_articles_added_last_update = num_new_articles except Exception: pass self.total_num_of_articles += num_new_articles # Update the user on changes, and update the DB if (num_new_articles != 0): # TODO: add here try and catch on the email sending, and only afterwards update follow and add to DB rc = self.create_email_message(diff_list, new_resultsList, diff_list) if (rc): # Add the new articles to the saved dictionary for key in diff_list: self.pastResultsKeysList.append(key) self.num_of_meaningful_updates += 1 # In any case, update the Follow in DB self.time_last_updated = datetime.datetime.now() try: self.put() except Exception: print "Could not put to DB" # TODO: change this return True
def get_articles_list_with_label_as_HTMLParser(user, label_name):
    """Wrap the articles carrying *label_name* for *user* in an HTMLparser.

    When the underlying lookup fails it returns an int error code, which
    is passed through unchanged.
    """
    articles = get_articles_list_with_label(user, label_name)
    # An int result is an error code from the lookup -- propagate it.
    if type(articles) is int:
        return articles
    parser = HTMLparser.HTMLparser(url=None, html=None)
    parser.results = articles
    parser.numOfResults = len(articles)
    return parser
def url2ArticleDict(url):
    """Fetch *url* (first 100 results) and return the parsed article list."""
    # Download the raw HTML for the URL.
    fetcher = getHTML.getHTML(url)
    fetcher.getHTMLfromURL()
    # Parse the downloaded page into article objects.
    parser = HTMLparser.HTMLparser(url, fetcher.get_html())
    parser.parseHTML()
    return parser.results
def first_follow_query(self):
    """Run the initial query for a new follow and seed its key history.

    Returns:
        The number of articles found by the query.
    """
    articles = HTMLparser.getAllResultsFromURLwithProxy(self.search_params)
    self.first_results = articles
    self.total_num_of_articles = len(articles)
    # Remember every key seen now, so later updates can diff against it.
    self.pastResultsKeysList = [article.get_key() for article in articles]
    return len(articles)
def pending_share_preview_as_HTMLparser(invited_user, pending_id):
    """Build an HTMLparser preview of a pending shared label.

    Error values from the helper calls are returned unchanged.
    """
    pending = get_single_pending(invited_user, pending_id)
    if not isinstance(pending, PendingSharedLabel):
        # get_single_pending signalled an error -- propagate it.
        return pending
    articles = Label.get_articles_list_with_label(
        pending.inviting_user, pending.label_name)
    if articles == -7:
        # -7: the label's article lookup failed.
        return -7
    preview = HTMLparser.HTMLparser(url=None, html=None)
    preview.results = articles
    preview.numOfResults = len(articles)
    return preview
def search_in_labels_return_HTMLparser(user, label_name, search_term):
    """Case-insensitively search *search_term* inside the articles of
    *user*'s label *label_name*; return the matches as an HTMLparser."""
    query = db.GqlQuery(
        "SELECT * FROM Label WHERE users_list = :1 "
        "AND label_name = :2", user, label_name)
    needle = search_term.lower()
    matches = []
    for label_object in query:
        haystack = label_object.article_abstract_title_author.lower()
        if needle in haystack:
            matches.append(pickle.loads(str(label_object.serialized_article)))
    parser = HTMLparser.HTMLparser(url=None, html=None)
    parser.results = matches
    parser.numOfResults = len(matches)
    return parser
def get_list_of_suggested_article_ordered_by_date(user_name):
    """Return an HTMLparser holding *user_name*'s non-removed suggestions,
    ordered by date."""
    # This is supposed to be only one result per user, but who knows...
    query = db.GqlQuery("SELECT * FROM Suggestion WHERE user = :1", user_name)
    # Drop suggestions the user already dismissed.
    active_suggestions = [sugg for sugg in query if not sugg.is_removed]
    articles = [
        pickle.loads(str(sugg.suggested_serialized_article))
        for sugg in sort_list_of_suggestions_by_date(active_suggestions)
    ]
    parser = HTMLparser.HTMLparser(url=None, html=None)
    parser.results = articles
    parser.numOfResults = len(articles)
    return parser
def search_in_labels_return_HTMLparser_JSON(user, label_name, search_term):
    """Like search_in_labels_return_HTMLparser, but return the matches
    JSON-encoded via JSONConvertors.HTMLparserEncoder.

    Returns:
        The JSON string produced by the encoder.
    """
    # (Removed unused local `plain_msg` -- it was assigned but never read.)
    query = db.GqlQuery(
        "SELECT * FROM Label WHERE users_list = :1 "
        + "AND label_name = :2", user, label_name)
    article_objects_list = []
    for label_object in query:
        # Case-insensitive substring match over the concatenated
        # abstract/title/author text.
        if (label_object.article_abstract_title_author.lower().find(
                search_term.lower()) != -1):
            article_objects_list.append(
                pickle.loads(str(label_object.serialized_article)))
    html_parser = HTMLparser.HTMLparser(url=None, html=None)
    html_parser.results = article_objects_list
    html_parser.numOfResults = len(article_objects_list)
    my_htmlparser_encoder = JSONConvertors.HTMLparserEncoder()
    return my_htmlparser_encoder.encode(html_parser)
def update_DBfollow(self): num_of_articles_per_update = 10 try: num_of_articles_per_update = self.num_of_articles_per_update if (num_of_articles_per_update == None): self.num_of_articles_per_update = 10 num_of_articles_per_update = 10 except Exception: num_of_articles_per_update = 10 # GET OLD ARTICLE KEYS FROM DB self.num_of_update_requests += 1 search_params_object = pickle.loads(str(self.search_params_str)) # GET NEW ARTICLE KEYS FROM QUERY new_resultsList = HTMLparser.getAllResultsFromURLwithProxy( search_params_object) #if (new_resultsList == None): # raise ResearchExceptions.InputError("In update follow", "Function: HTMLparser.getResultsFromURL(self.url) returned None\n") new_resultsKeys = [] for article in new_resultsList: year = article.get_year_from_HTML_author_year_pub() if (year != None): if (year > 2008): new_resultsKeys.append(article.get_key()) # Check if There are new articles diff_list = compareKeysListswithOrder(self.pastResultsKeysList, new_resultsKeys, num_of_articles_per_update) num_new_articles = len(diff_list) try: self.num_of_new_articles_added_last_update = num_new_articles except Exception: pass self.total_num_of_articles += num_new_articles # Update the user on changes, and update the DB if (num_new_articles != 0): # TODO: add here try and catch on the email sending, and only afterwards update follow and add to DB rc = self.create_email_message(diff_list, new_resultsList, diff_list) if (rc): # Add the new articles to the saved dictionary for key in diff_list: self.pastResultsKeysList.append(key) self.num_of_meaningful_updates += 1 # In any case, update the Follow in DB self.time_last_updated = datetime.datetime.now() try: self.put() except Exception: print "Could not put to DB" # TODO: change this return True
import HTMLparser

# Demo: parse a local HTML file with HTMLparser.
# (You could also acquire the HTML through urlopen instead of a file.)
# `with` guarantees the handle is closed; avoid shadowing the builtin `file`.
with open('info.php', 'r', encoding='utf-8') as html_file:
    # The parser is the URL/HTML handler.
    parser = HTMLparser.HTMLparser(html_file)
    # Build the DOM tree.
    parser.buildTree()
    # From the root DOM node ('html'), print the content of each tag.
    parser.traverseTree(parser.root)
    # Traverse the DOM tree by tag name.
    parser.attrTraverse('div', parser.root)
    # Find every tag whose 'class' attribute equals 'banner-content'.
    parser.findByAttr('class', 'banner-content', parser.root)