Пример #1
0
    def update_DBfollow(self):
        
        num_of_articles_per_update = 10
        try:
            num_of_articles_per_update = self.num_of_articles_per_update
            if (num_of_articles_per_update == None):
                self.num_of_articles_per_update = 10
                num_of_articles_per_update = 10 
        except Exception:
            num_of_articles_per_update = 10
            
        
        # GET OLD ARTICLE KEYS FROM DB
        self.num_of_update_requests += 1   
        search_params_object = pickle.loads(str(self.search_params_str))
        
        # GET NEW ARTICLE KEYS FROM QUERY
        new_resultsList = HTMLparser.getAllResultsFromURLwithProxy(search_params_object)
        #if (new_resultsList == None):
        #    raise ResearchExceptions.InputError("In update follow", "Function: HTMLparser.getResultsFromURL(self.url) returned None\n")  
        new_resultsKeys = []
        for article in new_resultsList:
            year= article.get_year_from_HTML_author_year_pub()
            if (year != None):
                if (year > 2008):
                    new_resultsKeys.append(article.get_key())
        
        # Check if There are new articles  
        diff_list = compareKeysListswithOrder(self.pastResultsKeysList, new_resultsKeys,num_of_articles_per_update )
        
        num_new_articles = len(diff_list)
        try:
            self.num_of_new_articles_added_last_update = num_new_articles
        except Exception:
            pass
        
        self.total_num_of_articles += num_new_articles

        # Update the user on changes, and update the DB
        if (num_new_articles != 0):
            # TODO: add here try and catch on the email sending, and only afterwards update follow and add to DB
            rc = self.create_email_message(diff_list, new_resultsList, diff_list)
            if (rc):
            
                # Add the new articles to the saved dictionary
                for key in diff_list:
                    self.pastResultsKeysList.append(key)
                    
                self.num_of_meaningful_updates += 1
        
        # In any case, update the Follow in DB
        self.time_last_updated = datetime.datetime.now()
        
        try:
            self.put()
        except Exception:
            print "Could not put to DB"
            # TODO: change this
        
        return True
Пример #2
0
def get_articles_list_with_label_as_HTMLParser(user, label_name):
    """Wrap the articles carrying *label_name* for *user* in an HTMLparser.

    Integer error codes from the underlying lookup are passed straight
    through to the caller.
    """
    articles = get_articles_list_with_label(user, label_name)
    # An int here is an error code from the lookup, not a result list.
    if type(articles) is int:
        return articles

    wrapper = HTMLparser.HTMLparser(url=None, html=None)
    wrapper.results = articles
    wrapper.numOfResults = len(articles)
    return wrapper
Пример #3
0
def url2ArticleDict(url):
    """Fetch *url* (first 100 results) and return the parsed article results."""
    # Download the page content.
    fetcher = getHTML.getHTML(url)
    fetcher.getHTMLfromURL()

    # Parse the downloaded HTML into article result objects.
    parser = HTMLparser.HTMLparser(url, fetcher.get_html())
    parser.parseHTML()
    return parser.results
Пример #4
0
 def first_follow_query(self):
     """Run this follow's query for the first time and cache the results.

     Stores the raw results, the total article count and the list of
     article keys on the instance; returns the number of results.
     """
     results = HTMLparser.getAllResultsFromURLwithProxy(self.search_params)
     self.first_results = results
     self.total_num_of_articles = len(results)

     # Remember every key so later updates can diff against this baseline.
     self.pastResultsKeysList = [article.get_key() for article in results]

     return self.total_num_of_articles
def pending_share_preview_as_HTMLparser(invited_user, pending_id):
    """Build an HTMLparser preview of the articles behind a pending share.

    Returns the lookup's error indicator unchanged when the pending object
    is not a PendingSharedLabel, or -7 when the label lookup fails with
    that code.
    """
    pending = get_single_pending(invited_user, pending_id)
    if not isinstance(pending, PendingSharedLabel):
        # Error indicator from the lookup — hand it back unchanged.
        return pending

    articles = Label.get_articles_list_with_label(pending.inviting_user,
                                                  pending.label_name)
    # -7 is the label lookup's error code; propagate it.
    if articles == -7:
        return -7

    preview = HTMLparser.HTMLparser(url=None, html=None)
    preview.results = articles
    preview.numOfResults = len(articles)
    return preview
Пример #6
0
def search_in_labels_return_HTMLparser(user, label_name, search_term):
    """Case-insensitively search *search_term* within a user's label.

    Returns an HTMLparser holding the deserialized matching articles.
    """
    query = db.GqlQuery(
        "SELECT * FROM Label WHERE users_list = :1 " + "AND label_name = :2",
        user, label_name)

    # Hoist the lowercasing of the needle out of the loop.
    needle = search_term.lower()
    matches = [
        pickle.loads(str(label_object.serialized_article))
        for label_object in query
        if label_object.article_abstract_title_author.lower().find(needle) != -1
    ]

    wrapper = HTMLparser.HTMLparser(url=None, html=None)
    wrapper.results = matches
    wrapper.numOfResults = len(matches)

    return wrapper
Пример #7
0
def get_list_of_suggested_article_ordered_by_date(user_name):
    """Return an HTMLparser with the user's active suggested articles,
    ordered by suggestion date."""
    query = db.GqlQuery("SELECT * FROM Suggestion WHERE user = :1", user_name)
    # results = q.fetch(10)

    # Keep only suggestions the user has not removed.
    # (This is supposed to be only one result but who knows...)
    active_suggestions = [sugg for sugg in query if not sugg.is_removed]

    # Deserialize the articles in date order.
    articles = [
        pickle.loads(str(sugg.suggested_serialized_article))
        for sugg in sort_list_of_suggestions_by_date(active_suggestions)
    ]

    wrapper = HTMLparser.HTMLparser(url=None, html=None)
    wrapper.results = articles
    wrapper.numOfResults = len(articles)
    return wrapper
Пример #8
0
def search_in_labels_return_HTMLparser_JSON(user, label_name, search_term):
    """Case-insensitively search *search_term* within a user's label and
    return the matching articles as a JSON-encoded HTMLparser.

    Returns:
        The JSON string produced by JSONConvertors.HTMLparserEncoder.
    """
    # Removed unused local `plain_msg` (was assigned "" and never read).
    query = db.GqlQuery(
        "SELECT * FROM Label WHERE users_list = :1 " + "AND label_name = :2",
        user, label_name)

    # Hoist the needle lowercasing out of the loop.
    needle = search_term.lower()
    article_objects_list = []
    for label_object in query:
        # Match against the pre-concatenated abstract/title/author text.
        if (label_object.article_abstract_title_author.lower().find(needle)
                != -1):
            article_objects_list.append(
                pickle.loads(str(label_object.serialized_article)))

    html_parser = HTMLparser.HTMLparser(url=None, html=None)
    html_parser.results = article_objects_list
    html_parser.numOfResults = len(article_objects_list)

    my_htmlparser_encoder = JSONConvertors.HTMLparserEncoder()
    return my_htmlparser_encoder.encode(html_parser)
Пример #9
0
    def update_DBfollow(self):

        num_of_articles_per_update = 10
        try:
            num_of_articles_per_update = self.num_of_articles_per_update
            if (num_of_articles_per_update == None):
                self.num_of_articles_per_update = 10
                num_of_articles_per_update = 10
        except Exception:
            num_of_articles_per_update = 10

        # GET OLD ARTICLE KEYS FROM DB
        self.num_of_update_requests += 1
        search_params_object = pickle.loads(str(self.search_params_str))

        # GET NEW ARTICLE KEYS FROM QUERY
        new_resultsList = HTMLparser.getAllResultsFromURLwithProxy(
            search_params_object)
        #if (new_resultsList == None):
        #    raise ResearchExceptions.InputError("In update follow", "Function: HTMLparser.getResultsFromURL(self.url) returned None\n")
        new_resultsKeys = []
        for article in new_resultsList:
            year = article.get_year_from_HTML_author_year_pub()
            if (year != None):
                if (year > 2008):
                    new_resultsKeys.append(article.get_key())

        # Check if There are new articles
        diff_list = compareKeysListswithOrder(self.pastResultsKeysList,
                                              new_resultsKeys,
                                              num_of_articles_per_update)

        num_new_articles = len(diff_list)
        try:
            self.num_of_new_articles_added_last_update = num_new_articles
        except Exception:
            pass

        self.total_num_of_articles += num_new_articles

        # Update the user on changes, and update the DB
        if (num_new_articles != 0):
            # TODO: add here try and catch on the email sending, and only afterwards update follow and add to DB
            rc = self.create_email_message(diff_list, new_resultsList,
                                           diff_list)
            if (rc):

                # Add the new articles to the saved dictionary
                for key in diff_list:
                    self.pastResultsKeysList.append(key)

                self.num_of_meaningful_updates += 1

        # In any case, update the Follow in DB
        self.time_last_updated = datetime.datetime.now()

        try:
            self.put()
        except Exception:
            print "Could not put to DB"
            # TODO: change this

        return True
Пример #10
0
import HTMLparser

# You could also acquire the HTML via urlopen; here we read a local file.
# Fixes: renamed the handle from `file` (which shadows the builtin) and
# wrapped it in a `with` block so it is closed deterministically.
with open('info.php', 'r', encoding='utf-8') as html_file:
    # parser is the url handler
    parser = HTMLparser.HTMLparser(html_file)

    # build the dom tree
    parser.buildTree()

    # from root dom 'html' and print the content of each tag
    parser.traverseTree(parser.root)

    # traverse the dom tree through tag name
    parser.attrTraverse('div', parser.root)

    # find every tag whose attribute is 'class' and whose attribute content is 'banner-content'
    parser.findByAttr('class', 'banner-content', parser.root)