def create_stop_words():
    """Build the stop-word list from the 'new_stop_words' file, one word per line."""
    stop_list = []
    stop_file = open('new_stop_words', 'r')
    for line in stop_file.readlines():
        # strip the trailing newline before storing the word
        line = line.replace('\n', '')
        uummuuWord.append_word(stop_list, line)
    stop_file.close()
    return stop_list
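# A minimal usage sketch, assuming the stop list is used to filter candidate
# words elsewhere in the crawler (word_list and candidate_words below are
# hypothetical names, shown for illustration only):
#
#     stop_words = create_stop_words()
#     for word in candidate_words:
#         if word not in stop_words:
#             uummuuWord.append_word(word_list, word)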
print "found something other than text. Found:", myHTML.content_type.split("/")[0].lower() cursor.execute("UPDATE sites_sitequeue set status=501 WHERE id = %d;" % doc_id) continue try: if myHTML.redirected_url != "": print "got a redirected url:", myHTML.redirected_url redirected_domain = getDomain(myHTML.redirected_url) cursor.execute( "INSERT INTO sites_sitequeue(url,crawled,domain,date_submitted, last_crawl, status) VALUES('%s',0,'%s',now(),now(),200);" % (myHTML.redirected_url, redirected_domain) ) cursor.execute("UPDATE sites_sitequeue set last_crawl = now() where id = %d;" % doc_id) cursor.execute("SELECT id FROM sites_sitequeue WHERE url = '%s';" % myHTML.redirected_url) redirected_to = cursor.fetchall() uummuuWord.append_word(link_list, redirected_to[0][0]) if getDomain(site.url) != site.domain: site.domain = getDomain(site.url) cursor.execute("UPDATE sites_sitequeue SET domain = '%s' where id = %d;" % (site.domain, doc_id)) except Exception, e: print "error 216:", e cursor.execute("UPDATE sites_sitequeue set status=300, crawled=1 where id = %d;" % (doc_id)) conn.commit() continue ##initialize the parser ## parser = myParser(myurl=site.url, domain=site.domain) try: ## run the parser on the html from this page.## parser.parse(myHTML.page)