# Imports inferred from the usage below (BeautifulSoup 4 assumed for bs/Comment).
import os
import time
import codecs
import logging
import traceback
import requests
from optparse import OptionParser
from bs4 import BeautifulSoup as bs, Comment
from src import mysql


def Main():
    parser = OptionParser()
    parser.add_option("--crawl-text", dest="crawl_text", action="store_true", help="crawl text", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='orangegrove')
    parser.add_option("--attachment-table-name", dest="attachment_table", type="string", help="attachment table name", default='attachments')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use firefox", default=True)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    num_of_iframs = 0
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    cursor = db.get_cursor()
    try:
        if options.crawl_text:
            count = 0
            attachments = db.query("select distinct url,attachment_id from %s where file_type = '%s' and landed=0 and url is not NULL" % (options.attachment_table, "URL Link"))
            print "Number of urls to crawl ", len(attachments)
            for (url, attachment_id,) in attachments:
                landing_url = None
                try:
                    count += 1
                    print "source url :", url
                    print "attachment_id :", attachment_id
                    print "count %s" % count
                    res = requests.get(url)
                    time.sleep(3)
                    landing_url = res.url
                    if ".pdf" in landing_url:
                        raise Exception("Format not html")
                    data = res.text
                    soup = bs(data, "html")
                    iframes = soup.findAll("iframe")
                    num_of_iframs = len(iframes)
                    body = soup.find('body')
                    print body.text
                    # Strip scripts, styles and HTML comments so only visible text remains.
                    [e.extract() for e in body.findAll('script')]
                    [e.extract() for e in body.findAll('style')]
                    comments = body.findAll(text=lambda text: isinstance(text, Comment))
                    [e.extract() for e in comments]
                    txt = body.text
                    # Flatten newlines and drop tabs before writing the text file.
                    visible_text = txt.replace('\n', ' ').replace('\t', '')
                    txt_location = "/mnt/data/kendavar/orangegrove/textfile/%s.txt" % attachment_id
                    f = codecs.open(txt_location, "w", "utf-8")
                    f.write(visible_text)
                    f.close()
                    print "txt_location :", txt_location
                    print "landing_url :", landing_url
                    print "num_of_iframs :", num_of_iframs
                    print "landed :", 1
                    cursor.execute("""update %s set txt_location='%s',landing_url='%s',landed=%s,num_of_iframs=%s where attachment_id=%s""" % (options.attachment_table, txt_location, landing_url, 1, num_of_iframs, attachment_id))
                except:
                    traceback.print_exc()
                    logging.exception('Got exception on main handler')
                    # Mark the row as failed; landing_url stays None if the request itself failed.
                    cursor.execute("""update %s set landing_url='%s', landed=%s where attachment_id=%s""" % (options.attachment_table, landing_url, -1, attachment_id))
                    # data = {
                    #     "txt_location": txt_location,
                    #     "landing_url": landing_url,
                    #     "num_of_iframs": num_of_iframs,
                    #     "landed": 1
                    # }
                    # db.update(options.attachment_table, data, "url='%s'" % url)
    except:
        traceback.print_exc()
def Main(): parser = OptionParser() parser.add_option("--crawl", dest="crawl", action="store_true", help="crawllist", default=False) parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.') parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='ted') parser.add_option("--topic-table-name", dest="topic_table_name", type="string", help="topic table name", default='ted_topics') parser.add_option("--ted-table-name", dest="ted_table_name", type="string", help="ted table name", default='ted') parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0) parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True) (options, args) = parser.parse_args() workingdir = options.workingdir.rstrip('/') if not os.path.exists(workingdir): parser.error("workingdir not exists") try: display = None from pyvirtualdisplay import Display display = Display(visible=0, size=(1000,900)) display.start() except: print 'No Xvfb!' db = mysql.DB(db=options.db_name) driver = crawlutils.open_driver(use_firefox=options.use_firefox) try: if options.crawl: topics = db.query("""select * from %s""" %(options.topic_table_name)) print len(topics), 'Topics to be crawled yet' db.set_autocommit(True) count=0 for (topic,topic_url,) in topics: count+=1 print 'topic:', topic print 'topic_url:', topic_url print 'topic count :',count driver.get(topic_url) time.sleep(3) pagination=driver.find_elements_by_class_name("pagination") number=0 if pagination: atag=pagination[0].find_elements_by_tag_name("a") page_numbers=int(atag[-2].text.encode("utf-8")) print "Page numbers ",page_numbers for page_number in range(page_numbers): number+=1 url="https://www.ted.com/talks?page=%s&sort=newest&topics[]=%s"%(str(page_number+1),topic) url=url.replace(" ","+") print "Page url :",url print "page number :",number driver.get(url) time.sleep(3) crawl_data(driver,options,db,topic) else: print "Paginator not found" crawl_data(driver,options,db,topic) except: traceback.print_exc() if driver: driver.save_screenshot(workingdir + '/error.png') print workingdir + '/error.png' finally: if driver: driver.quit() if display: display.stop()
# Imports inferred from the usage below.
import os
import time
import hashlib
import traceback
from optparse import OptionParser
from src import mysql, crawlutils


def Main():
    driver = None
    parser = OptionParser()
    parser.add_option("--crawl-textbook", dest="crawl_textbook", action="store_true", help="crawl textbook", default=False)
    parser.add_option("--crawl-textbook-details", dest="crawl_textbook_details", action="store_true", help="crawl textbook details", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='opentextbooks')
    parser.add_option("--subject-table-name", dest="subject_table_name", type="string", help="subject table name", default='subject')
    parser.add_option("--textbook-table-name", dest="textbook_table_name", type="string", help="textbook table name", default='opentextbooks')
    parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachments')
    parser.add_option("--toc-table-name", dest="toc_table_name", type="string", help="toc table name", default='table_of_content')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use firefox", default=True)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    # Start a virtual display if Xvfb is available.
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    db = mysql.DB(db=options.db_name)
    if options.use_firefox:
        driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl_textbook:
            subjects = db.query("""select * from %s""" % (options.subject_table_name))
            print len(subjects), 'subjects to be crawled yet'
            db.set_autocommit(True)
            count = 0
            for (url, subject_title,) in subjects:
                count += 1
                print 'subject title:', subject_title
                print 'url:', url
                print 'subject count :', count
                driver.get(url)
                time.sleep(3)
                ShortDescription = driver.find_elements_by_class_name("ShortDescription")
                for short in ShortDescription:
                    thrid = short.find_element_by_class_name("thrid")
                    img_url = thrid.find_element_by_tag_name("img").get_attribute("src")
                    h2 = short.find_element_by_tag_name("h2")
                    textbook_title = h2.text.strip()
                    textbook_url = h2.find_element_by_tag_name("a").get_attribute("href")
                    # The document id is an md5 of the title plus the textbook url.
                    m = hashlib.md5()
                    m.update(textbook_title + textbook_url)
                    document_id = m.hexdigest()
                    # The author paragraph holds one "Name, Affiliation" entry per line.
                    string = short.find_element_by_tag_name("p").text
                    l = []
                    if "\n" in string:
                        authors = string.replace("\n", ", ")
                        for a in string.split("\n"):
                            l.append(a.split(",")[0])
                        author = ','.join(l)
                    elif "," in string:
                        authors = string
                        l.append(string.split(",")[0])
                        author = ','.join(l)
                    else:
                        authors = string
                        author = string
                    print 'textbook_url', textbook_url
                    print 'subject_title', subject_title
                    print 'url', url
                    print 'author', author
                    print 'authors', authors
                    print 'document_id', document_id
                    print 'img_url', img_url
                    data = {
                        'textbook_title': textbook_title,
                        'textbook_url': textbook_url,
                        'subject_title': subject_title,
                        'url': url,
                        'author': author,
                        'authors': authors,
                        'document_id': document_id,
                        'img_url': img_url
                    }
                    db.insert(options.textbook_table_name, data)
                    print "db inserted"
        if options.crawl_textbook_details:
            textbook = db.query("""select document_id,textbook_url from %s where crawled=0""" % (options.textbook_table_name))
            print len(textbook), 'textbooks to be crawled yet'
            db.set_autocommit(True)
            count = 0
            for (document_id, textbook_url,) in textbook:
                count += 1
                print 'textbook_url:', textbook_url
                print 'document_id:', document_id
                print 'textbook count :', count
                driver.get(textbook_url)
                time.sleep(3)
                pub_date = None
                isbn_13_string = None
                isbn_13 = None
                third = driver.find_element_by_class_name("twothird")
                para = third.find_elements_by_tag_name("p")
                for p in para:
                    para_text = p.text
                    if para_text.startswith("Pub Date:"):
                        pub_date = para_text.replace("Pub Date:", "")
                        if pub_date:
                            pub_date = pub_date.strip()
                        else:
                            pub_date = None
                    elif para_text.startswith("ISBN 13:"):
                        isbn_13_string = para_text.replace("ISBN 13:", "")
                        if isbn_13_string:
                            isbn_13_string = isbn_13_string.strip()
                            isbn_13 = isbn_13_string.replace("-", "")
                        else:
                            isbn_13_string = None
                            isbn_13 = None
                BookTypes = driver.find_element_by_class_name("BookTypes")
                books = BookTypes.find_elements_by_tag_name("a")
                for book in books:
                    attachment_link = book.get_attribute("href")
                    book_type = book.text.strip()
                    print "attachment_link", attachment_link
                    print "type", book_type
                    data = {
                        "document_id": document_id,
                        "attachment_link": attachment_link,
                        "type": book_type
                    }
                    db.insert(options.attachment_table_name, data)
                    print "attachment table inserted"
                Badge = driver.find_element_by_class_name("Badge-Condition")
                conditions_text = Badge.text
                condition_link = Badge.find_element_by_tag_name("a").get_attribute("href")
                toc = driver.find_element_by_id("TOC")
                # Keep the raw TOC markup; str(toc) would only store the WebElement repr.
                table_of_content = toc.get_attribute("outerHTML")
                list_tags = toc.find_elements_by_tag_name("li")
                for item in list_tags:
                    chapter = item.text.strip()
                    if chapter.startswith("Chapter"):
                        chapter_type = "Chapter"
                    elif chapter.startswith("Part"):
                        chapter_type = "Part"
                    else:
                        chapter_type = None
                    print "title", chapter
                    print "type", chapter_type
                    data = {
                        'document_id': document_id,
                        'title': chapter,
                        'type': chapter_type
                    }
                    db.insert(options.toc_table_name, data)
                    print "toc table inserted"
                AboutBook = driver.find_element_by_id("AboutBook")
                description = AboutBook.text
                links = AboutBook.find_elements_by_tag_name("a")
                for link in links:
                    href = link.get_attribute("href")
                    print "link in books", href
                    data = {
                        "document_id": document_id,
                        "link": href
                    }
                    db.insert("books", data)
                    print "books table inserted"
                AboutAuthors = driver.find_element_by_id("AboutAuthors")
                author_details = AboutAuthors.text
                print 'pub_date', pub_date
                print 'isbn_13_string', isbn_13_string
                print 'isbn_13', isbn_13
                print 'conditions_text', conditions_text
                print 'condition_link', condition_link
                print 'table_of_content', table_of_content
                print 'description', description
                print 'author_details', author_details
                data = {
                    'pub_date': pub_date,
                    'isbn_13_string': isbn_13_string,
                    'isbn_13': isbn_13,
                    'conditions_text': conditions_text,
                    'condition_link': condition_link,
                    'table_of_content': table_of_content,
                    'description': description,
                    'author_details': author_details,
                    'crawled': 1
                }
                db.update(options.textbook_table_name, data, "document_id='%s'" % document_id)
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
def Main(): parser = OptionParser() parser.add_option("--crawl-text", dest="crawl_text", action="store_true", help="crawl text", default=False) parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.') parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='orangegrove') parser.add_option("--attachment-table-name", dest="attachment_table", type="string", help="attachment table name", default='attachments') parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True) (options, args) = parser.parse_args() workingdir = options.workingdir.rstrip('/') num_of_iframs=0 if not os.path.exists(workingdir): parser.error("workingdir not exists") db = mysql.DB(db=options.db_name) db.set_autocommit(True) try: display = None from pyvirtualdisplay import Display display = Display(visible=0, size=(1000,900)) display.start() except: print 'No Xvfb!' driver = crawlutils.open_driver(use_firefox=options.use_firefox) try: if options.crawl_text: count=0 attachments=db.query("select distinct url,attachment_id from %s where file_type = '%s' and landed=0 and url is not NULL"%(options.attachment_table,"URL Link")) print "Number of urls to crawl ",len(attachments) for (url,attachment_id,) in attachments: try: count+=1 print "source url :",url print "attachment_id :",attachment_id print "count %s"%count if "pdf" in url: raise Exception(url) driver.get(url) iframes=driver.find_elements_by_tag_name("iframe") body=driver.find_element_by_tag_name("body") landing_url=driver.current_url if "pdf" in landing_url: raise Exception(landing_url) cursor=db.get_cursor() visible_text=body.text if iframes: num_of_iframs=len(iframes) print "landing_url :",landing_url print"landed :",2 print"num_of_iframs :",num_of_iframs #data={ #"landing_url":landing_url, #"landed":2, #"num_of_iframs":num_of_iframs #} cursor.execute("""update %s set landing_url='%s',landed=%s,num_of_iframs=%s where url='%s'"""%(options.attachment_table,landing_url,2,num_of_iframs,url)) #db.update(options.attachment_table,data,"url='%s'"%url) else: txt_location="/mnt/data/kendavar/orangegrove/textfile/%s.html"%attachment_id f=codecs.open(txt_location,"w","utf-8") f.write(visible_text) f.close() print "txt_location :",txt_location print "landing_url :",landing_url print "num_of_iframs :",num_of_iframs print "landing :",1 cursor.execute("""update %s set txt_location='%s',landing_url='%s',landed=%s,num_of_iframs=%s where url='%s'"""%(options.attachment_table,txt_location,landing_url,1,num_of_iframs,url)) except: traceback.print_exc() logging.exception('Got exception on main handler') cursor.execute("""update %s set landed=%s where url='%s'"""%(options.attachment_table,-1,url)) pass except: traceback.print_exc() if driver: driver.save_screenshot(workingdir + '/error.png') print workingdir + '/error.png' finally: if driver: driver.quit() if display: display.stop()
if __name__ == "__main__":
    Main()


from src import mysql
from src import crawlutils
import shutil
from selenium import webdriver

db = mysql.DB("usgs")
links = db.query("""select document_id,link from %s where crawled=2""" % ("usgs"))
driver = webdriver.Chrome()
driver.set_window_size(1000, 900)
for (document_id, link,) in links:
    txt_location = "/mnt/data/kendavar/usgs/txtfiles/%s_txt" % document_id
    # Load the page before taking the screenshot (assumed; the original snippet
    # saved the screenshot without navigating to the link first).
    driver.get(link)
    filename = "/mnt/data/kendavar/usgs/screenshots/%s.png" % document_id
    driver.save_screenshot(filename)
    crawlutils.resize_png_image(filename)
    img_location = "/mnt/data/kendavar/usgs/screenshot_png/%s_png" % document_id
    shutil.copyfile(filename, img_location)
    data = {
        'screenshot': img_location,
        'txt_location': txt_location,
        'crawled': 1
    }
    # Persist the locations back to the usgs table (assumed, mirroring the
    # update pattern used in the usgs crawler below).
    db.update("usgs", data, "document_id='%s'" % document_id)
def Main(): parser = OptionParser() parser.add_option("--crawl-textbooks", dest="crawl_textbooks", action="store_true", help="crawl textbooks list", default=False) parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.') parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='colorado_1012') parser.add_option("--subject2-table-name", dest="subject2_table_name", type="string", help="subject2 table name", default='colorado_subject2') parser.add_option("--textbook-table-name", dest="textbook_table_name", type="string", help="textbook table name", default='colorado_textbook') parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0) parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True) (options, args) = parser.parse_args() workingdir = options.workingdir.rstrip('/') if not os.path.exists(workingdir): parser.error("workingdir not exists") try: display = None from pyvirtualdisplay import Display display = Display(visible=0, size=(1000, 900)) display.start() except: print 'No Xvfb!' db = mysql.DB(db=options.db_name, port=options.db_port) driver = crawlutils.open_driver(use_firefox=options.use_firefox) try: if options.crawl_textbooks: subject2 = db.query("""select * from %s""" % (options.subject2_table_name)) print len(subject2), 'Textbook to be crawled yet' db.set_autocommit(True) for (subject1_title, subject2_title, subject_url) in subject2: print 'subject1_title:', subject1_title print 'subject2_title:', subject2_title print 'subject_url:', subject_url driver.get(subject_url) time.sleep(3) simulation_link = driver.find_elements_by_class_name( "simulation-link") for link in simulation_link: file_format = None textbook_url = link.get_attribute("href") textbook_image_url = link.find_element_by_tag_name( "img").get_attribute("src") textbook_title = link.find_element_by_tag_name( "strong").text span = link.find_element_by_tag_name('span') badge = span[1].get_attribute("class") if "html" in badge: file_format = "html5" if "java" in badge: file_format = "java applet" if "flash" in badge: file_format = "shockwave flash" print "textbook_title :", textbook_title print "textbook_url :", textbook_url print "textbook_image_url :", textbook_image_url print "file_format :", file_format raise Exception("done") data = { 'subject1_title': subject1_title, 'subject2_title': subject2_title, 'textbook_title': textbook_title, 'textbook_url': textbook_url, 'textbook_image_url': textbook_image_url, 'format': file_format } db.insert(options.textbook_table_name, data) except: traceback.print_exc() if driver: driver.save_screenshot(workingdir + '/error.png') print workingdir + '/error.png' finally: if driver: driver.quit() if display: display.stop()
# Imports inferred from the usage below (BeautifulSoup 4 assumed for bs).
import os
import shutil
import traceback
import requests
from optparse import OptionParser
from bs4 import BeautifulSoup as bs
from src import mysql, crawlutils


def Main():
    driver = None
    parser = OptionParser()
    parser.add_option("--crawl", dest="crawl", action="store_true", help="crawl textbook", default=False)
    parser.add_option("--details", dest="details", action="store_true", help="crawl textbook details", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='usgs')
    parser.add_option("--table-name", dest="table_name", type="string", help="textbook table name", default='usgs')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use firefox", default=True)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    # Start a virtual display if Xvfb is available.
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    db = mysql.DB(db=options.db_name)
    # if options.use_firefox:
    #     driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl:
            res = requests.get("http://education.usgs.gov/undergraduate.html")
            soup = bs(res.text)
            td = soup.findAll("td")
            subject1 = None
            subject2 = None
            for i, tag in enumerate(td):
                # Only process table cells from index 5 onward.
                if i >= 5:
                    if tag.find("h2"):
                        subject1 = tag.text.strip()
                    elif len(tag.findAll("hr")) == 2:
                        if tag.find("div"):
                            tag.div.extract()
                        subject2 = tag.text.strip()
                        print subject2
                    elif tag.find("a"):
                        if tag.find("li"):
                            if not tag.find("strong"):
                                tag.a.extract()
                            description = tag.text.strip()
                            for list1 in tag.findAll("li"):
                                save(list1, subject1, subject2)
                        else:
                            save(tag, subject1, subject2)
        if options.details:
            links = db.query("""select document_id,link from %s where crawled=0""" % (options.table_name))
            print len(links), 'links to be crawled yet'
            db.set_autocommit(True)
            count = 0
            for (document_id, link,) in links:
                count += 1
                print 'link:', link
                print 'document_id:', document_id
                print 'link count :', count
                documents = (document_id, link)
                txt_location, driver = crawl_documents(documents, '/mnt/data/kendavar/usgs')
                driver.set_window_size(1000, 900)
                filename = "/mnt/data/kendavar/usgs/screenshots/%s.png" % document_id
                driver.save_screenshot(filename)
                crawlutils.resize_png_image(filename)
                img_location = "/mnt/data/kendavar/usgs/screenshot_png/%s_png" % document_id
                shutil.copyfile(filename, img_location)
                data = {
                    'screenshot': img_location,
                    'txt_location': txt_location,
                    'crawled': 1
                }
                db.update(options.table_name, data, "document_id='%s'" % document_id)
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
def Main(): parser = OptionParser() parser.add_option("--crawl", dest="crawl", action="store_true", help="crawl url", default=False) parser.add_option("--crawl-landing", dest="crawl_landing", action="store_true", help="crawl url", default=False) parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.') parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='skillscommons') parser.add_option("--table-name", dest="table_name", type="string", help="table name", default='skill') parser.add_option("--main-table-name", dest="main_table_name", type="string", help="main table name", default='skillscommons') parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachment') parser.add_option("--meta-table-name", dest="meta_table_name", type="string", help="meta table name", default='meta_data') parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True) (options, args) = parser.parse_args() workingdir = options.workingdir.rstrip('/') if not os.path.exists(workingdir): parser.error("workingdir not exists") try: display = None from pyvirtualdisplay import Display display = Display(visible=0, size=(1000,900)) display.start() except: print 'No Xvfb!' db = mysql.DB(db=options.db_name) db.set_autocommit(True) driver = crawlutils.open_driver(use_firefox=options.use_firefox) links=["https://www.skillscommons.org/discover?rpp=2000&page=1&group_by=none&etal=0", "https://www.skillscommons.org/discover?rpp=2000&page=2&group_by=none&etal=0", "https://www.skillscommons.org/discover?rpp=2000&page=3&group_by=none&etal=0"] try: if options.crawl: count = 0 for link in links: print "Link :",link driver.get(link) time.sleep(5) medium_results=driver.find_element_by_class_name("medium-results") li=medium_results.find_elements_by_tag_name("li") for tag in li: count+=1 print "Count :",count link_tag=tag.find_element_by_tag_name("a") title=link_tag.text.strip() url=link_tag.get_attribute("href") types=tag.find_elements_by_class_name("type") if len(types)==2: type=types[0].text.strip() institution=types[1].text.strip() else: type=None institution=types[0].text.strip() description=tag.find_element_by_class_name("abstract").text.strip() print "title :", title print "url :",url print "type :",type print "institution :",institution print "description :",description data = { 'title':title, 'institution':institution, 'url':url, 'type':type, 'description':description, } db.insert(options.table_name, data) if options.crawl_landing: count=0 skill=db.query("select distinct url from skill where crawled=0") print "Number of urls to crawl ",len(skill) for (src_url,) in skill: print "source url :",src_url print "count %s"%count count+=1 driver.get(src_url) author=None col=driver.find_element_by_class_name("col-sm-8") title=col.find_element_by_tag_name("h1").text.strip() m = hashlib.md5() m.update(title+src_url) document_id=m.hexdigest() toc_html="/mnt/data/kendavar/skillscommons/%s.html"%document_id file(toc_html,"w","utf8").write(driver.page_source) authors=col.find_element_by_class_name("authors") if not authors.find_elements_by_tag_name("div"): author=authors.text.strip() description=col.find_element_by_class_name("abstract").text files=col.find_element_by_class_name("files") file_information=files.find_elements_by_class_name("file-information") attachment=[] for attach in file_information: 
attachment.append((attach.text.strip(),attach.find_element_by_tag_name("a").get_attribute("href"))) dls=col.find_elements_by_tag_name("dl") meta={} string='' for dl in dls: for div in dl.find_elements_by_tag_name("div"): string='' dd=div.find_element_by_tag_name("dd") if dd.find_elements_by_tag_name("li"): for li in dd.find_elements_by_tag_name("li"): string=string+li.text.strip()+"," elif dd.find_elements_by_tag_name("a"): string=[dd.text.strip()] anchors=[] for anchor in dd.find_elements_by_tag_name("a"): if anchor.get_attribute("href") not in anchors: anchors.append(anchor.get_attribute("href")) string.append(anchor.get_attribute("href")) else: string=dd.text.strip() meta[div.find_element_by_tag_name("dt").text.replace(":","").strip()]=string print "title :",title print "author :",author print "description :",description print "toc_path",toc_html data={ "document_id":document_id, "title":title, "author":author, "description":description, "toc_path":toc_html } db.insert(options.main_table_name, data) for (attachment_title,attachment_url) in attachment: print "document_id":document_id, print "attachment_title":attachment_title, print "attachment_url":attachment_url data={ "document_id":document_id, "attachment_title":attachment_title, "attachment_url":attachment_url } db.insert(options.attachment_table_name, data) for key,value in meta.iteritems(): if value[-1]==",": value=value[:-1] print '%s : %s'%(key,value) if type(value) is list: for val in value: meta_title=key if i%2==0 : meta_value=val else: meta_url=val print "meta_title":meta_title print "meta_value":meta_value print "meta_url":meta_url data={ "document_id":document_id, "meta_title":meta_title, "meta_value":meta_value, "meta_url":meta_url } db.insert(options.meta_table_name, data) else: meta_title=key meta_url=None meta_value=value print "meta_title":meta_title print "meta_value":meta_value print "meta_url":meta_url data={ "document_id":document_id, "meta_title":meta_title, "meta_value":meta_value, "meta_url":meta_url } db.insert(options.meta_table_name, data) data={ "crawled":1 } db.update(options.table_name,data,"url='%s'"%src_url) print "updated the table" except: traceback.print_exc() if driver: driver.save_screenshot(workingdir + '/error.png') print workingdir + '/error.png' finally: if driver: driver.quit() if display: display.stop()