Exemplo n.º 1
0
    def updateUbuWebDBOld(self):
        """Scrape a small sample of the UbuWeb film index into the database.

        Fetches the film listing page, takes entries 1-4 of the anchors found
        in the page's second table, and for each of them:

        * inserts a row into ``names`` (unescaped name, hash of the name,
          link) and commits;
        * fetches ``self.BASE + link`` and, for every ``img`` inside the third
          ``font`` element of that page's second table, follows the element
          that comes right after the image;
        * for each ``script`` element of the followed page in which
          ``self.fileRE`` matches, inserts the first match into ``films``;
          the inserts are committed once per followed page.

        Followed pages that answer with an HTTP error are skipped; any other
        error propagates to the caller. The cursor is closed on every exit
        path.
        """

        def readAndClose(response):
            # Release the connection even when the read itself fails.
            try:
                return response.read()
            finally:
                response.close()

        # Open Ubuweb film page
        response = urllib2.urlopen(urllib2.Request("http://www.ubu.com/film/"))
        soup = BS.BeautifulSoup(readAndClose(response))
        links = soup.findAll("table")[1].findAll("a")

        fewerLinks = links[1:5]
        c = self.db.cursor()
        try:
            for link in fewerLinks:
                name = link.text
                nameHash = sha1.sha(name).hexdigest()
                # The first two characters of the href are dropped before it
                # is stored and appended to self.BASE.
                href = link["href"][2:]
                c.execute('insert into names (name, hash, link) values (?,?,?)',
                          (unescape(name), nameHash, href))
                self.db.commit()

                # Open up new requests to load film links
                nameUrl = self.BASE + href
                response = urllib2.urlopen(urllib2.Request(nameUrl))
                nameSoup = BS.BeautifulSoup(readAndClose(response))
                print("Working on %s" % nameUrl)
                potentialFilmLinks = nameSoup.findAll("table")[1].findAll(
                    "font")[2].findAll("img")
                for potentialFilmLink in potentialFilmLinks:
                    a = potentialFilmLink.findNext()
                    # Kept separate from `name` above so the person's name is
                    # not silently replaced by a film title.
                    filmTitle = a.text
                    filmUrl = self.BASE + a["href"]
                    print("Working on %s" % filmUrl)
                    try:
                        filmResponse = urllib2.urlopen(urllib2.Request(filmUrl))
                    except urllib2.HTTPError:
                        # Not every candidate link resolves; skip the bad ones.
                        continue
                    filmSoup = BS.BeautifulSoup(readAndClose(filmResponse))

                    for s in filmSoup.findAll("script"):
                        r = self.fileRE.findall(s.text)
                        if r:
                            c.execute(
                                "insert into films (hash, title, link) values (?,?,?)",
                                (nameHash, unescape(filmTitle), r[0]))

                    self.db.commit()
        finally:
            c.close()
Exemplo n.º 2
0
    def updateUbuWebDBOld(self):
        """Load a four-entry sample of the UbuWeb film index into the database.

        Steps, in order:

        1. Download the film listing page and collect the anchors of its
           second table; only entries 1 through 4 are processed.
        2. For each entry, insert (unescaped name, hash of the name, link)
           into ``names`` and commit.
        3. Download ``self.BASE + link``; for every ``img`` in the third
           ``font`` element of that page's second table, take the element
           immediately following the image and download the page its
           ``href`` points to.
        4. For every ``script`` element of that page where ``self.fileRE``
           finds something, insert the first match into ``films``. Commit
           once per downloaded film page.

        A film page that fails with an HTTP error is skipped. Other errors
        propagate; the cursor is closed either way.
        """

        def fetchBody(response):
            # try/finally so the connection is released even if read() raises.
            try:
                return response.read()
            finally:
                response.close()

        # Open Ubuweb film page
        listing = urllib2.urlopen(urllib2.Request("http://www.ubu.com/film/"))
        soup = BS.BeautifulSoup(fetchBody(listing))
        links = soup.findAll("table")[1].findAll("a")

        fewerLinks = links[1:5]
        c = self.db.cursor()
        try:
            for link in fewerLinks:
                name = link.text
                nameHash = sha1.sha(name).hexdigest()
                # The href is stored and requested without its first two characters.
                href = link["href"][2:]
                c.execute("insert into names (name, hash, link) values (?,?,?)", (unescape(name), nameHash, href))
                self.db.commit()

                # Open up new requests to load film links
                namePage = self.BASE + href
                nameResponse = urllib2.urlopen(urllib2.Request(namePage))
                nameSoup = BS.BeautifulSoup(fetchBody(nameResponse))
                print("Working on %s" % namePage)
                potentialFilmLinks = nameSoup.findAll("table")[1].findAll("font")[2].findAll("img")
                for potentialFilmLink in potentialFilmLinks:
                    a = potentialFilmLink.findNext()
                    # Distinct local so the outer `name` is not overwritten mid-iteration.
                    title = a.text
                    filmPage = self.BASE + a["href"]
                    print("Working on %s" % filmPage)
                    try:
                        filmResponse = urllib2.urlopen(urllib2.Request(filmPage))
                    except urllib2.HTTPError:
                        # Candidate links that do not resolve are simply skipped.
                        continue
                    filmSoup = BS.BeautifulSoup(fetchBody(filmResponse))

                    for s in filmSoup.findAll("script"):
                        r = self.fileRE.findall(s.text)
                        if r:
                            c.execute(
                                "insert into films (hash, title, link) values (?,?,?)",
                                (nameHash, unescape(title), r[0]),
                            )

                    self.db.commit()
        finally:
            c.close()
Exemplo n.º 3
0
 def __init__(self, **kw):
     """
     Populate the instance from keyword arguments.

     ``email``, ``notif_type``, ``lang`` and ``location`` are mandatory and
     raise ``KeyError`` when absent; ``first_name``, ``last_name``,
     ``organisation`` and ``country`` default to ``None``. ``key`` is the
     hex digest of ``sha`` over the current time followed by a random
     integer in [1, 10000). Whatever is still left in ``kw`` is then copied
     onto the instance, and ``datetime`` is set last to the current time.

     @todo: Make a heartbeat to clean-up temporary subscriptions
     """
     # (attribute, mandatory) in assignment order; mandatory entries are
     # removed from kw, optional ones are only read.
     layout = (
         ("email", True),
         ("first_name", False),
         ("last_name", False),
         ("organisation", False),
         ("country", False),
         ("notif_type", True),
         ("lang", True),
         ("location", True),
     )
     for attr, mandatory in layout:
         if mandatory:
             value = kw.pop(attr)
         else:
             value = kw.get(attr)
         setattr(self, attr, value)

     seed = "%s%s" % (time(), random.randrange(1, 10000))
     self.key = sha(seed).hexdigest()
     self.__dict__.update(kw)
     self.datetime = datetime.now()
Exemplo n.º 4
0
    def parseFilmListingPage(self,
                             filmPage="http://www.ubu.com/film",
                             numLinks=10,
                             startLink=1):
        """Scrape the film listing page and store its names and films.

        The anchors of the listing page's second table are sliced according
        to `startLink` / `numLinks`. Each selected anchor is handed to
        ``self.parseNamePage(self.BASE + link)``; when that returns something
        other than ``None`` the name row is inserted into ``names`` and
        committed, and one row per entry of ``result["allFilms"]`` is
        inserted into ``Films``. Film rows are committed once, after the last
        link. Between links the method sleeps 5-9 seconds to limit load on
        the site.

        Parameters:
            filmPage:  URL of the listing page. It used to be ignored in
                       favour of a hard-coded URL; it is now honoured.
            numLinks:  how many links to process, or ``None`` for all links
                       from `startLink` onwards.
            startLink: index of the first link to process.

        Errors from the network, the parser or the database propagate; the
        cursor is closed on every exit path.
        """
        # A bare directory name gets a trailing slash, so the default value
        # still requests exactly the URL that was previously hard-coded.
        if filmPage.rsplit("/", 1)[-1].isalnum():
            filmPage += "/"

        # Open Ubuweb film page
        response = urllib2.urlopen(urllib2.Request(filmPage))
        try:
            result = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        soup = BS.BeautifulSoup(result)
        links = soup.findAll("table")[1].findAll("a")

        # Select a subset (or all)
        if numLinks is not None:
            totalLinks = links[startLink:(startLink + numLinks)]
        else:
            totalLinks = links[startLink:]

        c = self.db.cursor()
        try:
            for currentLink, link in enumerate(totalLinks, startLink):
                print("Working on link %d" % currentLink)
                name = link.text
                nameHash = sha1.sha(name).hexdigest()
                # The href is used without its first two characters.
                nameLink = link["href"][2:]

                result = self.parseNamePage(self.BASE + nameLink)
                if result is not None:
                    c.execute(
                        'insert into names (name, hash, link, comments) values (?,?,?,?)',
                        (unescape(name), nameHash, nameLink,
                         result["comments"]))
                    self.db.commit()

                    for film in result["allFilms"]:
                        c.execute(
                            "insert into Films(hash, title, link, originalLink, comments) values (?, ?, ?, ?, ?)",
                            (nameHash, film["filmName"], film["link"],
                             film["originalLink"], film["comments"]))

                # Sleep for a bit to cutdown on usage
                sleepTime = random.randrange(5, 10)
                print("Sleeping for %d" % sleepTime)
                time.sleep(sleepTime)

            self.db.commit()
        finally:
            c.close()
Exemplo n.º 5
0
 def __init__(self, **kw):
     """
     Fill in the instance attributes from keyword arguments.

     Mandatory keywords (``KeyError`` if missing, removed from ``kw``):
     ``email``, ``notif_type``, ``lang``, ``content_types``, ``location``.
     Optional keywords (``None`` if missing, left in ``kw``):
     ``first_name``, ``last_name``, ``organisation``, ``country``.

     ``key`` is the ``sha`` hex digest of the current time concatenated
     with a random integer in [1, 10000). Everything remaining in ``kw`` is
     then merged into the instance dictionary, and ``datetime`` is stamped
     last with the current time.

     @todo: Make a heartbeat to clean-up temporary subscriptions
     """
     take = kw.pop   # mandatory: consumes the keyword, raises if absent
     peek = kw.get   # optional: leaves the keyword in place

     self.email = take('email')
     self.first_name = peek('first_name')
     self.last_name = peek('last_name')
     self.organisation = peek('organisation')
     self.country = peek('country')
     self.notif_type = take('notif_type')
     self.lang = take('lang')
     self.content_types = take('content_types')
     self.location = take('location')

     stamp = time()
     salt = random.randrange(1, 10000)
     self.key = sha("%s%s" % (stamp, salt)).hexdigest()

     self.__dict__.update(kw)
     self.datetime = datetime.now()
Exemplo n.º 6
0
    def parseFilmListingPage(self, filmPage="http://www.ubu.com/film", numLinks=10, startLink=1):
        """Walk the film listing page and persist the names and films found.

        Anchors are taken from the second table of the listing page and
        sliced with `startLink` / `numLinks`. For each selected anchor,
        ``self.parseNamePage(self.BASE + link)`` is called; a non-``None``
        result yields one committed row in ``names`` plus one row in
        ``Films`` per element of ``result["allFilms"]``. The film rows are
        committed together after the last link. The method pauses 5-9 seconds
        after each link to limit load on the site.

        Parameters:
            filmPage:  URL of the listing page. Previously this argument was
                       ignored and a hard-coded URL was fetched instead.
            numLinks:  number of links to process; ``None`` means every link
                       from `startLink` to the end.
            startLink: index of the first link to process.

        Network, parser and database errors propagate to the caller; the
        cursor is closed regardless.
        """
        # Add the trailing slash to a bare directory name so that the default
        # argument keeps requesting the exact URL that used to be hard-coded.
        if filmPage.rsplit("/", 1)[-1].isalnum():
            filmPage += "/"

        # Open Ubuweb film page
        response = urllib2.urlopen(urllib2.Request(filmPage))
        try:
            result = response.read()
        finally:
            # Close the connection even when read() raises.
            response.close()
        soup = BS.BeautifulSoup(result)
        links = soup.findAll("table")[1].findAll("a")

        # Select a subset (or all)
        if numLinks is not None:
            totalLinks = links[startLink : (startLink + numLinks)]
        else:
            totalLinks = links[startLink:]

        c = self.db.cursor()
        try:
            for currentLink, link in enumerate(totalLinks, startLink):
                print("Working on link %d" % currentLink)
                name = link.text
                nameHash = sha1.sha(name).hexdigest()
                # The href is used without its first two characters.
                nameLink = link["href"][2:]

                result = self.parseNamePage(self.BASE + nameLink)
                if result is not None:
                    c.execute(
                        "insert into names (name, hash, link, comments) values (?,?,?,?)",
                        (unescape(name), nameHash, nameLink, result["comments"]),
                    )
                    self.db.commit()

                    for film in result["allFilms"]:
                        c.execute(
                            "insert into Films(hash, title, link, originalLink, comments) values (?, ?, ?, ?, ?)",
                            (nameHash, film["filmName"], film["link"], film["originalLink"], film["comments"]),
                        )

                # Sleep for a bit to cutdown on usage
                sleepTime = random.randrange(5, 10)
                print("Sleeping for %d" % sleepTime)
                time.sleep(sleepTime)

            self.db.commit()
        finally:
            c.close()
Exemplo n.º 7
0
def hashAndBase64(s):
    """Return `stringToBase64` applied to the raw (binary) SHA digest of `s`."""
    rawDigest = sha.sha(s).digest()
    return stringToBase64(rawDigest)
Exemplo n.º 8
0
def hashAndBase64(s):
    """Hash `s` with `sha.sha` and pass the binary digest to `stringToBase64`."""
    hasher = sha.sha()
    hasher.update(s)
    return stringToBase64(hasher.digest())