Example #1
def addUserData(username, db):
    """
    Adds a username and all of its ratings to a MalDB.
    
    Arguments:
    username - the username to add
    db       - the MalDB instance to add the data to
    """
    
    animelist = WebGrab.getAnimeList(username)
    userid = WebGrab.getUserId(username)

    addAnimeList(db, userid, username, animelist)
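
A minimal driver for the function above, assuming the project's WebGrab module is importable; MalDB's constructor is not shown in the source, so the no-argument call below is an assumption:

# Hypothetical usage; MalDB() may require connection arguments in practice
db = MalDB()
addUserData("example_user", db)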
Example #2
    def post(self):
        self.response.headers['Content-Type'] = 'text/plain'

        # Fetch the most recently online users to fill the queue
        usernamelist = WebGrab.getRecentOnlineUsernames()

        if not usernamelist:
            self.response.out.write('Webgrab got 0 results\n')
            logging.debug('Webgrab got 0 results')

        # Create a task queue item for each user
        for username in usernamelist:
            taskqueue.add(url='/extract',
                          params={'username': username},
                          name="user_extract-%s-%s" % (username,
                                                       int(time.time())),
                          queue_name="user-extract")
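
One caveat on the task names built above: App Engine task names may only contain letters, digits, underscores, and hyphens, so a username with other characters makes taskqueue.add raise. A small sanitizing helper, sketched here (the name sanitize_task_name is invented for illustration):

import re

def sanitize_task_name(username):
    # Strip anything App Engine rejects in a task name
    return re.sub(r'[^a-zA-Z0-9_-]', '-', username)

The username portion of the name would then become sanitize_task_name(username).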
Example #3
    def post(self):
        self.response.headers["Content-Type"] = "text/html"

        username = self.request.get("username")

        logging.debug("Got request to queue %s" % cgi.escape(username))

        # Verify the user profile is real
        try:
            # The result is unused; the call raises if the user does not exist
            WebGrab.getUserId(username)
        except (urllib2.URLError, WebGrab.UnknownUser):
            self.response.out.write("Could not find user %s" % cgi.escape(username))
            return

        # Enter the user into the taskqueue
        taskqueue.add(
            url="/extract",
            params={"username": username},
            name="user_extract-%s-%s" % (username, int(time.time())),
            queue_name="user-extract",
        )
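
Because the name embeds int(time.time()) with one-second resolution, queueing the same user twice within a second reuses a name, and App Engine raises taskqueue.TaskAlreadyExistsError (or taskqueue.TombstonedTaskError once the task has run). If duplicates should simply be dropped, the call could be wrapped along these lines:

        try:
            taskqueue.add(
                url="/extract",
                params={"username": username},
                name="user_extract-%s-%s" % (username, int(time.time())),
                queue_name="user-extract",
            )
        except (taskqueue.TaskAlreadyExistsError, taskqueue.TombstonedTaskError):
            # Same user queued within the same second; safe to ignore
            pass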
Example #4
    def post(self):
        self.response.headers["Content-Type"] = "text/html"

        username = self.request.get("username")

        self.response.out.write("Getting %s" % username)
        logging.debug("Getting %s" % username)

        # Get the user's anime list
        animelist = WebGrab.getAnimeList(username)

        # Limit the number of animes to use
        if len(animelist) > MAX_ANIMES_TO_USE:
            animelist = random.sample(animelist, MAX_ANIMES_TO_USE)

        # Accumulate rating statistics and map each anime id to its
        # title; ratingMap itself is filled in during normalization below
        ratingMap = {}
        nameMap = {}
        ratingSum = 0.0
        ratingSumSquares = 0.0
        trueCount = 0
        for anime in animelist:
            animeid = anime["id"]
            rating = anime["score"]

            ratingSum += rating
            ratingSumSquares += rating * rating

            nameMap[str(animeid)] = anime["title"]

            if rating != 0:
                trueCount += 1

        if trueCount != 0:
            # Mean and standard deviation from the running sums:
            # var(x) = E[x^2] - E[x]^2
            mean = ratingSum / trueCount
            stddev = math.sqrt((ratingSumSquares / trueCount) - mean * mean)
        else:
            mean = 0
            stddev = 0

        # Normalize all ratings
        if stddev < 0.1:
            # Standard deviation seems to indicate no variance, so set
            # all the animes to the average
            for anime in animelist:
                ratingMap[str(anime["id"])] = 0.0
        else:
            for anime in animelist:
                rating = anime["score"]
                animeid = str(anime["id"])
                if rating == 0:
                    # No rating, default to average
                    ratingMap[animeid] = 0.0
                else:
                    ratingMap[animeid] = (rating - mean) / stddev

        # Get anime objects, creating new ones if necessary
        animes = self.getAnimeObjects(nameMap)

        # Get all topic objects, making new ones as needed
        topics = self.getTopicObjects(ratingMap.keys(), animes)

        # Deserialize the topic maps; each is stored as the repr of an
        # {animeid: weight} dict, so eval reconstructs it
        topicMaps = [eval(str(topic.animes)) for topic in topics]

        # Get the topic weights for this user
        topicWeights = [0.1] * len(topics)
        for i, topic in enumerate(topics):
            for animeid in ratingMap:
                if animeid in topicMaps[i]:
                    topicWeights[i] += topicMaps[i][animeid] * ratingMap[animeid]

        # Normalize by averaging over all ratings (guard against a user
        # with an empty list, which would divide by zero)
        if ratingMap:
            for i in range(len(topicWeights)):
                topicWeights[i] /= len(ratingMap)

        # Using the user's topic weights, compute the prediction error
        # for every rating
        ratingErrors = {}
        for animeid in ratingMap:
            predicted = 0.0
            for i, weight in enumerate(topicWeights):
                if animeid in topicMaps[i]:
                    predicted += weight * topicMaps[i][animeid]

            ratingErrors[animeid] = predicted - ratingMap[animeid]

        # Move the topic->anime weights using gradient descent
        for i, topic in enumerate(topics):

            key_union = set(ratingErrors.keys()) | set(topicMaps[i].keys())
            for animeid in key_union:
                if animeid not in topicMaps[i]:
                    topicMaps[i][animeid] = 0.0

                if animeid not in ratingErrors:
                    ratingErrors[animeid] = 0.0

                topicMaps[i][animeid] -= LEARNING_RATE * (
                    ratingErrors[animeid] * topicWeights[i] + REGULARIZATION_FACTOR * topicMaps[i][animeid]
                )

                # Make sure the weight meets the threshold for keeping it
                if abs(topicMaps[i][animeid]) < THRESHOLD_WEIGHT:
                    del topicMaps[i][animeid]

            # Write the final map
            topic.animes = db.Blob(str(topicMaps[i]))

        # Batch update everything
        db.put(animes + topics)
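
Example #4 compresses the whole per-user update into one request handler, so here is a stripped-down, self-contained sketch of the same three steps — z-score normalization, projecting ratings onto topics, and one regularized gradient step — on invented toy data; the constant values are assumptions, not the project's actual settings:

import math

LEARNING_RATE = 0.01          # assumed value
REGULARIZATION_FACTOR = 0.1   # assumed value

# Toy data: one user's raw 1-10 scores and two topic maps (id -> weight)
scores = {"1": 9.0, "2": 7.0, "3": 4.0}
topic_maps = [{"1": 0.8, "2": 0.5}, {"2": 0.3, "3": 0.9}]

# 1. Z-score normalize the ratings
mean = sum(scores.values()) / len(scores)
var = sum(s * s for s in scores.values()) / len(scores) - mean * mean
stddev = math.sqrt(var)
ratings = dict((k, (s - mean) / stddev) for k, s in scores.items())

# 2. Project the ratings onto each topic to get the user's weights
weights = []
for tmap in topic_maps:
    w = 0.1 + sum(tmap[k] * ratings[k] for k in ratings if k in tmap)
    weights.append(w / len(ratings))

# 3. Prediction errors first, then one gradient step on each topic map
errors = {}
for k in ratings:
    predicted = sum(w * t.get(k, 0.0) for w, t in zip(weights, topic_maps))
    errors[k] = predicted - ratings[k]

for w, tmap in zip(weights, topic_maps):
    for k in errors:
        old = tmap.get(k, 0.0)
        tmap[k] = old - LEARNING_RATE * (errors[k] * w + REGULARIZATION_FACTOR * old)

print(topic_maps)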