Exemplo n.º 1
0
def scrape_ciid():
    """Extract the post text from each saved CIID project page.

    For every ``(pid, source_url)`` pair returned by
    ``DataManager().get_ciid_projects()``:

    * read ``<HOME_DIR>/html/ciid/<pid>.html`` as UTF-8,
    * parse it with BeautifulSoup and join the text of every ``<p>`` inside
      the first ``<div class="post">`` with single spaces,
    * write the result to ``<HOME_DIR>/html/processed/<pid>.txt`` as UTF-8,
    * print the text and sleep for 2 seconds.

    Processing is best-effort per project: any exception is reported on
    stdout ("failed" followed by the exception) and the loop moves on to the
    next project. Returns None.
    """
    # Function-scope import so the block does not depend on file-level
    # imports; closing() only requires that the object has a close() method.
    from contextlib import closing

    dbm = DataManager()
    for pid, source_url in dbm.get_ciid_projects():
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and is also valid on Python 3.
        print('\n-----\n')
        print("working on project %s" % pid)
        print("url: %s" % source_url)
        print("=====")
        try:
            html_path = "%s/html/ciid/%s.html" % (HOME_DIR, pid)
            with closing(codecs_open(html_path, "r", "utf-8")) as fp:
                html = fp.read()

            soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            # Extract the text BEFORE opening the output file: if the page
            # has no div.post, find() returns None and the attribute access
            # raises, so no empty/truncated .txt file is left behind.
            paragraphs = soup.find("div", {'class': "post"}).findAll("p")
            text = " ".join(x.text for x in paragraphs)

            text_path = "%s/html/processed/%s.txt" % (HOME_DIR, pid)
            with closing(codecs_open(text_path, "w", "utf-8")) as fp:
                fp.write(text)

            print(text)
            sleep(2)
        except Exception as e:
            print("failed")
            # Print the exception itself: `e.message` is an attribute, not a
            # method, so calling it raised TypeError inside this handler.
            print(e)