# Module-level imports assumed to be shared by the crawler listings below (each
# Main() originally lived in its own script file):
import os
import time
import codecs
import hashlib
import logging
import traceback
from optparse import OptionParser

import mx.DateTime
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import mysql        # in-house MySQL wrapper used by all of these scripts
import crawlutils   # in-house Selenium helper used by all of these scripts


def Main():
    driver = None
    parser = OptionParser()
    parser.add_option("--crawl-textbook", dest="crawl_textbook", action="store_true", help="crawl textbook", default=False)
    parser.add_option("--crawl-textbook-details", dest="crawl_textbook_details", action="store_true", help="crawl textbook details", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='opentextbooks')
    parser.add_option("--subject-table-name", dest="subject_table_name", type="string", help="subject table name", default='subject')
    parser.add_option("--textbook-table-name", dest="textbook_table_name", type="string", help="textbook table name", default='opentextbooks')
    parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachments')
    parser.add_option("--toc-table-name", dest="toc_table_name", type="string", help="toc table name", default='table_of_content')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    db = mysql.DB(db=options.db_name)
    if options.use_firefox:
        driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl_textbook:
            subjects = db.query("""select * from %s""" % (options.subject_table_name))
            print len(subjects), 'subjects yet to be crawled'
            db.set_autocommit(True)
            count = 0
            for (url, subject_title,) in subjects:
                count += 1
                print 'subject title:', subject_title
                print 'url:', url
                print 'subject count:', count
                driver.get(url)
                time.sleep(3)
                ShortDescription = driver.find_elements_by_class_name("ShortDescription")
                for short in ShortDescription:
                    thrid = short.find_element_by_class_name("thrid")
                    img_url = thrid.find_element_by_tag_name("img").get_attribute("src")
                    h2 = short.find_element_by_tag_name("h2")
                    textbook_title = h2.text.strip()
                    textbook_url = h2.find_element_by_tag_name("a").get_attribute("href")
                    m = hashlib.md5()
                    m.update(textbook_title + textbook_url)
                    document_id = m.hexdigest()
                    string = short.find_element_by_tag_name("p").text
                    l = []
                    if "\n" in string:
                        # one author per line; keep the name, drop the affiliation after the comma
                        authors = string.replace("\n", ", ")
                        for a in string.split("\n"):
                            l.append(a.split(",")[0])
                        author = ','.join(l)
                    elif "," in string:
                        authors = string
                        author = string.split(",")[0]
                    else:
                        authors = string
                        author = string
                    print 'textbook_url', textbook_url
                    print 'subject_title', subject_title
                    print 'url', url
                    print 'author', author
                    print 'authors', authors
                    print 'document_id', document_id
                    print 'img_url', img_url
                    data = {
                        'textbook_title': textbook_title,
                        'textbook_url': textbook_url,
                        'subject_title': subject_title,
                        'url': url,
                        'author': author,
                        'authors': authors,
                        'document_id': document_id,
                        'img_url': img_url
                    }
                    db.insert(options.textbook_table_name, data)
                    print "db inserted"
        if options.crawl_textbook_details:
            textbook = db.query("""select document_id,textbook_url from %s where crawled=0""" % (options.textbook_table_name))
            print len(textbook), 'textbooks yet to be crawled'
            db.set_autocommit(True)
            count = 0
            for (document_id, textbook_url,) in textbook:
                count += 1
                print 'textbook_url:', textbook_url
                print 'document_id:', document_id
                print 'textbook count:', count
                driver.get(textbook_url)
                time.sleep(3)
                pub_date = None
                isbn_13_string = None
                isbn_13 = None
                third = driver.find_element_by_class_name("twothird")
                para = third.find_elements_by_tag_name("p")
                for p in para:
                    para_text = p.text
                    if para_text.startswith("Pub Date:"):
                        pub_date = para_text.replace("Pub Date:", "").strip() or None
                    elif para_text.startswith("ISBN 13:"):
                        isbn_13_string = para_text.replace("ISBN 13:", "").strip() or None
                        if isbn_13_string:
                            isbn_13 = isbn_13_string.replace("-", "")
                BookTypes = driver.find_element_by_class_name("BookTypes")
                books = BookTypes.find_elements_by_tag_name("a")
                for book in books:
                    attachment_link = book.get_attribute("href")
                    type = book.text.strip()
                    print "attachment_link", attachment_link
                    print "type", type
                    data = {
                        "document_id": document_id,
                        "attachment_link": attachment_link,
                        "type": type
                    }
                    db.insert(options.attachment_table_name, data)
                    print "attachment table inserted"
                Badge = driver.find_element_by_class_name("Badge-Condition")
                conditions_text = Badge.text
                condition_link = Badge.find_element_by_tag_name("a").get_attribute("href")
                toc = driver.find_element_by_id("TOC")
                # str(toc) would only store the WebElement repr; keep the TOC markup instead
                table_of_content = toc.get_attribute("outerHTML")
                list_tags = toc.find_elements_by_tag_name("li")
                for li in list_tags:
                    chapter = li.text.strip()
                    if chapter.startswith("Chapter"):
                        chapter_type = "Chapter"
                    elif chapter.startswith("Part"):
                        chapter_type = "Part"
                    else:
                        chapter_type = None
                    print "title", chapter
                    print "type", chapter_type
                    data = {
                        'document_id': document_id,
                        'title': chapter,
                        'type': chapter_type
                    }
                    db.insert(options.toc_table_name, data)
                    print "toc table inserted"
                AboutBook = driver.find_element_by_id("AboutBook")
                description = AboutBook.text
                links = AboutBook.find_elements_by_tag_name("a")
                for link in links:
                    href = link.get_attribute("href")
                    print "link in books", href
                    data = {
                        "document_id": document_id,
                        "link": href
                    }
                    db.insert("books", data)
                    print "books table inserted"
                AboutAuthors = driver.find_element_by_id("AboutAuthors")
                author_details = AboutAuthors.text
                print 'pub_date', pub_date
                print 'isbn_13_string', isbn_13_string
                print 'isbn_13', isbn_13
                print 'conditions_text', conditions_text
                print 'condition_link', condition_link
                print 'table_of_content', table_of_content
                print 'description', description
                print 'author_details', author_details
                data = {
                    'pub_date': pub_date,
                    'isbn_13_string': isbn_13_string,
                    'isbn_13': isbn_13,
                    'conditions_text': conditions_text,
                    'condition_link': condition_link,
                    'table_of_content': table_of_content,
                    'description': description,
                    'author_details': author_details,
                    'crawled': 1
                }
                db.update(options.textbook_table_name, data, "document_id='%s'" % document_id)
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
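
# --- Illustrative helper (not part of the original script) ---------------------
# The author-string handling in the opentextbooks crawler above (newline- vs.
# comma-separated names) was its most error-prone spot. A minimal sketch of the
# same logic pulled into one place, assuming each line/segment looks like
# "Name, affiliation"; the helper name is hypothetical.
def parse_authors(raw):
    if "\n" in raw:
        parts = raw.split("\n")
    elif "," in raw:
        parts = [raw]
    else:
        return raw, raw
    # "authors" keeps the full string, "author" keeps only the names
    authors = raw.replace("\n", ", ")
    author = ','.join(part.split(",")[0].strip() for part in parts)
    return author, authors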
def Main():
    parser = OptionParser()
    parser.add_option("--crawl", dest="crawl", action="store_true", help="crawllist", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='ted')
    parser.add_option("--topic-table-name", dest="topic_table_name", type="string", help="topic table name", default='ted_topics')
    parser.add_option("--ted-table-name", dest="ted_table_name", type="string", help="ted table name", default='ted')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    if not os.path.exists(workingdir):
        parser.error("workingdir not exists")
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    db = mysql.DB(db=options.db_name)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl:
            topics = db.query("""select * from %s""" % (options.topic_table_name))
            print len(topics), 'Topics to be crawled yet'
            db.set_autocommit(True)
            count = 0
            for (topic, topic_url,) in topics:
                count += 1
                print 'topic:', topic
                print 'topic_url:', topic_url
                print 'topic count :', count
                driver.get(topic_url)
                time.sleep(3)
                pagination = driver.find_elements_by_class_name("pagination")
                number = 0
                if pagination:
                    atag = pagination[0].find_elements_by_tag_name("a")
                    page_numbers = int(atag[-2].text.encode("utf-8"))
                    print "Page numbers ", page_numbers
                    for page_number in range(page_numbers):
                        number += 1
                        url = "https://www.ted.com/talks?page=%s&sort=newest&topics[]=%s" % (str(page_number + 1), topic)
                        url = url.replace(" ", "+")
                        print "Page url :", url
                        print "page number :", number
                        driver.get(url)
                        time.sleep(3)
                        crawl_data(driver, options, db, topic)
                else:
                    print "Paginator not found"
                    crawl_data(driver, options, db, topic)
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
def Main():
    parser = OptionParser()
    parser.add_option("--textbook-package", dest="textbook_package", action="store_true", help="textbook package details", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="db name", default='pearsonhighered')
    parser.add_option("--db-port", dest="db_port", type="int", help="db port", default=3306)
    parser.add_option("--textbook-package-table-name", dest="textbook_package_table_name", type="string", help="textbook package table name", default='textbook_package')
    parser.add_option("--pearsonhighered-textbooks-table-name", dest="pearsonhighered_textbooks_table_name", type="string", help="pearsonhighered textbooks table name", default='pearsonhighered_textbooks')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=False)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    db = mysql.DB(db=options.db_name, port=options.db_port)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.textbook_package:
            textbooks = db.query("""select textbook_id,textbook_url from %s where status=0""" % (options.pearsonhighered_textbooks_table_name))
            print len(textbooks), 'textbooks yet to be crawled'
            db.set_autocommit(True)
            count = 0
            for (textbook_id, textbook_url) in textbooks:
                count += 1
                print 'count:', count
                print 'textbook_id:', textbook_id
                print 'textbook_url:', textbook_url
                if not textbook_url:
                    continue
                driver.get(textbook_url)
                time.sleep(3)
                # compound class names do not work with find_elements_by_class_name,
                # so the two-class containers are located with CSS selectors instead
                tab_content_group = driver.find_elements_by_css_selector(".tab-content.group")
                if not tab_content_group:
                    raise Exception("tab-content group not found")
                data_feed_float_right = tab_content_group[0].find_elements_by_css_selector(".data-feed.float-right")
                if not data_feed_float_right:
                    raise Exception("data-feed float-right not found")
                description = data_feed_float_right[0].find_elements_by_id("description")
                if not description:
                    raise Exception("description not found")
                ul = description[0].find_elements_by_tag_name("ul")
                if not ul:
                    raise Exception("ul tag not found")
                while True:
                    a = ul[0].find_elements_by_tag_name("a")
                    if not a:
                        raise Exception("a tag not found")
                    textbook_title = a[0].text.strip()
                    p = ul[0].find_elements_by_tag_name("p")
                    if not p:
                        raise Exception("p tag not found")
                    if not len(p) == 3:
                        raise Exception("all p tags are not found")
                    for tag in p:
                        package_details = tag.text
                        if '©' in package_details:
                            copy_right_year = package_details[package_details.find("©") + 1:package_details.find("•")]
                            copy_right_year = copy_right_year.strip()
                            if not len(copy_right_year) == 4:
                                raise Exception("copyright year is not correct")
                            if 'pp' in package_details:
                                pages = package_details[package_details.find(",") + 1:package_details.find("pp")]
                                pages = pages.strip()
                            print "copyright year", copy_right_year
                            print "Pages", pages
                        if "ISBN" in package_details:
                            if "•" in package_details:
                                isbns = package_details.split("•")
                                for isbn in isbns:
                                    if "ISBN-10:" in isbn:
                                        isbn_10 = isbn.replace("ISBN-10:", "").strip()
                                    if "ISBN-13:" in isbn:
                                        isbn_13 = isbn.replace("ISBN-13:", "").strip()
                                if not len(isbn_10) == 10:
                                    raise Exception("isbn 10 is not correct")
                                if not len(isbn_13) == 13:
                                    raise Exception("isbn 13 is not correct")
                                print "isbn_10 :", isbn_10
                                print "isbn_13 :", isbn_13
                        author = package_details.strip()
                    print "author :", author
                    data = {
                        'textbook_title': textbook_title,
                        'textbook_isbn_10': isbn_10,
                        'textbook_isbn_13': isbn_13,
                        'textbook_author': author,
                        'textbook_copyright_year': copy_right_year,
                        'pages': pages,
                        'status': 1
                    }
                    db.update(options.pearsonhighered_textbooks_table_name, data, "textbook_url='%s'" % textbook_url)
                    time.sleep(3)
                    # move on to the next <ul> sibling if there is one
                    next_ul = description[0].find_elements_by_xpath(".//ul/following-sibling::ul")
                    if next_ul:
                        print "Next sibling found"
                        ul = next_ul
                    else:
                        print "next sibling not found"
                        break
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
def Main():
    parser = OptionParser()
    parser.add_option("--crawl-text", dest="crawl_text", action="store_true", help="crawl text", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='orangegrove')
    parser.add_option("--attachment-table-name", dest="attachment_table", type="string", help="attachment table name", default='attachments')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    num_of_iframs = 0
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl_text:
            count = 0
            attachments = db.query("select distinct url,attachment_id from %s where file_type = '%s' and landed=0 and url is not NULL" % (options.attachment_table, "URL Link"))
            print "Number of urls to crawl ", len(attachments)
            for (url, attachment_id,) in attachments:
                # get the cursor before anything that can raise, so the except
                # branch below can still mark the row as failed
                cursor = db.get_cursor()
                try:
                    count += 1
                    print "source url :", url
                    print "attachment_id :", attachment_id
                    print "count %s" % count
                    if "pdf" in url:
                        raise Exception(url)
                    driver.get(url)
                    iframes = driver.find_elements_by_tag_name("iframe")
                    body = driver.find_element_by_tag_name("body")
                    landing_url = driver.current_url
                    if "pdf" in landing_url:
                        raise Exception(landing_url)
                    visible_text = body.text
                    if iframes:
                        num_of_iframs = len(iframes)
                        print "landing_url :", landing_url
                        print "landed :", 2
                        print "num_of_iframs :", num_of_iframs
                        # data = {"landing_url": landing_url, "landed": 2, "num_of_iframs": num_of_iframs}
                        # db.update(options.attachment_table, data, "url='%s'" % url)
                        cursor.execute("""update %s set landing_url='%s',landed=%s,num_of_iframs=%s where url='%s'""" % (options.attachment_table, landing_url, 2, num_of_iframs, url))
                    else:
                        txt_location = "/mnt/data/kendavar/orangegrove/textfile/%s.html" % attachment_id
                        f = codecs.open(txt_location, "w", "utf-8")
                        f.write(visible_text)
                        f.close()
                        print "txt_location :", txt_location
                        print "landing_url :", landing_url
                        print "num_of_iframs :", num_of_iframs
                        print "landed :", 1
                        cursor.execute("""update %s set txt_location='%s',landing_url='%s',landed=%s,num_of_iframs=%s where url='%s'""" % (options.attachment_table, txt_location, landing_url, 1, num_of_iframs, url))
                except:
                    traceback.print_exc()
                    logging.exception('Got exception on main handler')
                    cursor.execute("""update %s set landed=%s where url='%s'""" % (options.attachment_table, -1, url))
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
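
# --- Illustrative helper (not part of the original scripts) --------------------
# crawlutils.open_driver is used by every crawler in this listing but its source
# is not shown. A minimal sketch of what such a helper could look like with plain
# Selenium, following the --use-firefox flag the scripts pass in; this is an
# assumption about the helper, not its actual implementation.
from selenium import webdriver

def open_driver(use_firefox=False):
    # fall back to Chrome when Firefox is not requested
    if use_firefox:
        driver = webdriver.Firefox()
    else:
        driver = webdriver.Chrome()
    driver.set_page_load_timeout(60)
    return driver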
def Main():
    parser = OptionParser()
    parser.add_option("--crawl-subjects1", dest="crawl_subjects1", action="store_true", help="crawl first level subjects", default=False)
    parser.add_option("--crawl-subjects2", dest="crawl_subjects2", action="store_true", help="crawl second level subjects", default=False)
    parser.add_option("--crawl-subjects3", dest="crawl_subjects3", action="store_true", help="crawl third level subjects", default=False)
    parser.add_option("--crawl-subjects4", dest="crawl_subjects4", action="store_true", help="crawl fourth level subjects", default=False)
    parser.add_option("--crawl-textbooks", dest="crawl_textbooks", action="store_true", help="crawl textbooks list", default=False)
    parser.add_option("--crawl-textbook-details", dest="crawl_textbook_details", action="store_true", help="crawl textbooks details", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="db name", default='textbook_0915')
    parser.add_option("--db-port", dest="db_port", type="int", help="db port", default=6606)
    parser.add_option("--subject1-table-name", dest="subject1_table_name", type="string", help="subject1 table name", default='wiley_subject1')
    parser.add_option("--subject2-table-name", dest="subject2_table_name", type="string", help="subject2 table name", default='wiley_subject2')
    parser.add_option("--subject3-table-name", dest="subject3_table_name", type="string", help="subject3 table name", default='wiley_subject3')
    parser.add_option("--subject4-table-name", dest="subject4_table_name", type="string", help="subject4 table name", default='wiley_subject4')
    parser.add_option("--textbook-table-name", dest="textbook_table_name", type="string", help="textbook table name", default='wiley_textbook')
    parser.add_option("--journal-table-name", dest="journal_table_name", type="string", help="journal table name", default='wiley_journal')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=False)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    db = mysql.DB(db=options.db_name, port=options.db_port)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl_subjects1:
            url = "http://as.wiley.com/WileyCDA/Section/index.html"
            driver.get(url)
            hoverlist = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "subjects-hoverlist")))
            db.set_autocommit(False)
            index = 0
            for a in hoverlist.find_elements_by_xpath("./li/a"):
                index += 1
                print 'index:', index
                subject1_title = a.text.strip()
                print 'subject1_title:', subject1_title
                subject1_url = a.get_attribute('href')
                print 'subject1_url:', subject1_url
                data = {
                    'subject1_title': subject1_title,
                    'subject1_url': subject1_url
                }
                db.insert(options.subject1_table_name, data)
            db.commit()
        if options.crawl_subjects2:
            subjects1 = db.query(
                """select subject1_title, subject1_url from %s
                   where subject1_title not in (select distinct subject1_title from %s)"""
                % (options.subject1_table_name, options.subject2_table_name))
            print len(subjects1), 'subjects1 yet to be crawled'
            db.set_autocommit(True)
            for (subject1_title, subject1_url) in subjects1:
                print 'subject1_title:', subject1_title
                print 'subject1_url:', subject1_url
                driver.get(subject1_url)
                hoverlist = WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "subjects")))
                index = 0
                for a in hoverlist.find_elements_by_xpath("./li/a"):
                    index += 1
                    print 'index:', index
                    subject2_title = a.text.strip()
                    print 'subject2_title:', subject2_title
                    subject2_url = a.get_attribute('href')
                    print 'subject2_url:', subject2_url
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject2_url': subject2_url
                    }
                    db.insert(options.subject2_table_name, data)
                db.commit()
                time.sleep(3)
        if options.crawl_subjects3:
            subjects2 = db.query(
                """select a.subject1_title, a.subject2_title, a.subject2_url from %s a
                   left join %s b on a.subject1_title=b.subject1_title and a.subject2_title=b.subject2_title
                   where b.subject1_title is null"""
                % (options.subject2_table_name, options.subject3_table_name))
            print len(subjects2), 'subjects2 yet to be crawled'
            db.set_autocommit(False)
            for (subject1_title, subject2_title, subject2_url) in subjects2:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject2_url:', subject2_url
                driver.get(subject2_url)
                time.sleep(3)
                try:
                    hoverlist = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "subjectsbox")))
                    # a missing subjects <ul> raises here and falls through to the listing crawl below
                    hoverlist = hoverlist.find_element_by_xpath("./ul[@class='subjects']")
                except:
                    print "subjects not found. so crawling textbook listing"
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': '',
                        'subject3_url': subject2_url
                    }
                    db.insert(options.subject3_table_name, data)
                    db.commit()
                    crawl_textbook_listing(driver, db, workingdir, options, subject1_title, subject2_title, '', '')
                    continue
                index = 0
                for a in hoverlist.find_elements_by_xpath("./li/a"):
                    index += 1
                    print 'index:', index
                    subject3_title = a.text.strip()
                    print 'subject3_title:', subject3_title
                    subject3_url = a.get_attribute('href')
                    print 'subject3_url:', subject3_url
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': subject3_title,
                        'subject3_url': subject3_url
                    }
                    db.insert(options.subject3_table_name, data)
                db.commit()
                time.sleep(3)
        if options.crawl_subjects4:
            subjects3 = db.query(
                """select a.subject1_title, a.subject2_title, a.subject3_title, a.subject3_url from %s a
                   left join %s b on a.subject1_title=b.subject1_title and a.subject2_title=b.subject2_title
                   and a.subject3_title=b.subject3_title
                   where b.subject1_title is null"""
                % (options.subject3_table_name, options.subject4_table_name))
            print len(subjects3), 'subjects3 yet to be crawled'
            db.set_autocommit(False)
            for (subject1_title, subject2_title, subject3_title, subject3_url) in subjects3:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject3_title:', subject3_title
                print 'subject3_url:', subject3_url
                driver.get(subject3_url)
                time.sleep(3)
                if not subject3_title:
                    print "subject3_title is empty"
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': subject3_title,
                        'subject4_title': '',
                        'subject4_url': subject3_url
                    }
                    db.insert(options.subject4_table_name, data)
                    db.commit()
                    continue
                try:
                    hoverlist = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "subjectsbox")))
                    hoverlist = hoverlist.find_element_by_xpath("./ul[@class='subjects']")
                except:
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': subject3_title,
                        'subject4_title': '',
                        'subject4_url': subject3_url
                    }
                    db.insert(options.subject4_table_name, data)
                    db.commit()
                    crawl_textbook_listing(driver, db, workingdir, options, subject1_title, subject2_title, subject3_title, '')
                    continue
                index = 0
                for a in hoverlist.find_elements_by_xpath("./li/a"):
                    index += 1
                    print 'index:', index
                    subject4_title = a.text.strip()
                    print 'subject4_title:', subject4_title
                    subject4_url = a.get_attribute('href')
                    print 'subject4_url:', subject4_url
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'subject3_title': subject3_title,
                        'subject4_title': subject4_title,
                        'subject4_url': subject4_url
                    }
                    db.insert(options.subject4_table_name, data)
                db.commit()
                time.sleep(3)
        if options.crawl_textbooks:
            # join on all four subject levels
            subjects4 = db.query(
                """select a.subject1_title, a.subject2_title, a.subject3_title, a.subject4_title, a.subject4_url from %s a
                   left join %s b on a.subject1_title=b.subject1_title and a.subject2_title=b.subject2_title
                   and a.subject3_title=b.subject3_title and a.subject4_title=b.subject4_title
                   where b.subject1_title is null"""
                % (options.subject4_table_name, options.textbook_table_name))
            print len(subjects4), 'subjects4 yet to be crawled'
            db.set_autocommit(False)
            for (subject1_title, subject2_title, subject3_title, subject4_title, subject4_url) in subjects4:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject3_title:', subject3_title
                print 'subject4_title:', subject4_title
                print 'subject4_url:', subject4_url
                driver.get(subject4_url)
                time.sleep(3)
                crawl_textbook_listing(driver, db, workingdir, options, subject1_title, subject2_title, subject3_title, subject4_title)
        if options.crawl_textbook_details:
            textbooks = db.query(
                """select distinct subject1_title, subject2_title, subject3_title, subject4_title,textbook_title,textbook_url
                   from %s where crawled=0""" % (options.textbook_table_name))
            print len(textbooks), 'textbooks yet to be crawled'
            db.set_autocommit(True)
            count = 0
            for (subject1_title, subject2_title, subject3_title, subject4_title, textbook_title, textbook_url) in textbooks:
                count += 1
                print 'count:', count
                print 'textbook_title:', textbook_title
                print 'textbook_url:', textbook_url
                if not textbook_url:
                    continue
                driver.get(textbook_url)
                time.sleep(3)
                format_journal = driver.find_elements_by_class_name("format-journal")
                if format_journal:
                    data = {'crawled': None}
                    db.update(options.textbook_table_name, data, "textbook_url='%s'" % textbook_url)
                    time.sleep(3)
                    continue
                    # crawl_journal(driver, options, db, subject1_title, subject2_title, subject3_title, subject4_title, textbook_title, textbook_url)
                    # continue
                product_main = driver.find_elements_by_class_name("product-main")
                if not product_main:
                    data = {'crawled': None}
                    db.update(options.textbook_table_name, data, "textbook_url='%s'" % textbook_url)
                    time.sleep(3)
                    continue
                productDetail_largeCover = product_main[0].find_element_by_class_name("productDetail-largeCover")
                coverImage = productDetail_largeCover.find_elements_by_tag_name('img')
                textbook_image_url = coverImage[0].get_attribute('src')
                print 'textbook_image_url:', textbook_image_url
                product_biblio = driver.find_element_by_class_name("product-biblio")
                productDetail_authorsMain = product_biblio.find_elements_by_class_name("productDetail-authorsMain")
                textbook_author = None
                if productDetail_authorsMain:
                    textbook_author = productDetail_authorsMain[0].text.strip()
                    if textbook_author.startswith('By '):
                        textbook_author = textbook_author[3:].strip()
                print 'textbook_author:', textbook_author
                textbook_publish_date = None
                textbook_copyright_year = None
                if product_biblio.find_elements_by_class_name("productDetail-dateImprint"):
                    productDetail_dateImprint = product_biblio.find_element_by_class_name("productDetail-dateImprint").text
                    # take the part before the first comma as the publish date
                    textbook_publish_date = productDetail_dateImprint.split(",")[0].strip()
                    textbook_publish_date = int(mx.DateTime.DateTimeFrom(textbook_publish_date))
                    if '©' in productDetail_dateImprint:
                        textbook_copyright_year = productDetail_dateImprint[(productDetail_dateImprint.find('©') + 1):].strip()
                print 'textbook_publish_date:', textbook_publish_date
                print 'textbook_copyright_year:', textbook_copyright_year
                textbook_isbn = textbook_url[(textbook_url.find('-') + 1):].replace(".html", "").strip()
                if len(textbook_isbn) > 12:
                    textbook_isbn = textbook_isbn.replace(textbook_isbn[textbook_isbn.find(','):], '').strip()
                textbook_isbn_10 = textbook_isbn
                print 'textbook_isbn_10:', textbook_isbn_10
                productDetail_productCode = product_biblio.find_elements_by_class_name("productDetail-productCode")
                textbook_isbn_13 = None
                if productDetail_productCode:
                    textbook_isbn_13 = productDetail_productCode[0].text.replace('-', '')
                    textbook_isbn_13 = textbook_isbn_13.replace("ISBN:", "").strip()
                print 'textbook_isbn_13:', textbook_isbn_13
                toc = 0
                toc_html = ''
                textbook_description = None
                textbook_publisher = "Wiley"
                print 'textbook_publisher:', textbook_publisher
                infoDescription = driver.find_elements_by_id("infoDescription")
                if infoDescription:
                    # productDetail_richDataText = driver.find_elements_by_class_name("showMore")
                    # if productDetail_richDataText:
                    #     if productDetail_richDataText[0].text.strip() == 'See More':
                    #         productDetail_richDataText[0].click()
                    textbook_description = infoDescription[0].find_element_by_class_name("productDetail-richDataText")
                    textbook_description = textbook_description.get_attribute('innerText').strip()
                print 'textbook_description:', textbook_description
                # ribbon_tab_navigation = driver.find_element_by_class_name("ribbon-tab-navigation")
                # a = ribbon_tab_navigation.find_elements_by_xpath(".//li[@class = 'toc-tab']")
                # if a:
                #     toc = 1
                #     print 'toc available'
                #     # a[0].click()
                #     # time.sleep(3)
                infoTableof = driver.find_elements_by_id("infoTableof")
                if infoTableof:
                    # if infoTableof[0].text.strip() == 'See More':
                    #     infoTableof[0].click()
                    content = infoTableof[0].find_element_by_class_name('productDetail-richDataText')
                    toc_html = content.get_attribute('innerHTML').strip()
                    m = hashlib.md5()
                    m.update(textbook_url)
                    url_md5 = m.hexdigest()
                    file = codecs.open(workingdir + '/wiley_toc_html/' + url_md5 + '.html', "w", "utf-8")
                    file.write(toc_html)
                    file.close()
                    print 'TOC:'
                    print toc_html
                    print 'toc_html_file :', url_md5 + '.html'
                    toc = 1
                data = {
                    'textbook_isbn': textbook_isbn_13,
                    'textbook_isbn_10': textbook_isbn_10,
                    'textbook_isbn_13': textbook_isbn_13,
                    'textbook_author': textbook_author,
                    'textbook_copyright_year': textbook_copyright_year,
                    'textbook_publish_date': textbook_publish_date,
                    'textbook_description': textbook_description,
                    'textbook_publisher': textbook_publisher,
                    'textbook_image_url': textbook_image_url,
                    'crawled': 1,
                    'toc': toc,
                    'toc_html': toc_html
                }
                db.update(options.textbook_table_name, data, "textbook_url='%s'" % textbook_url)
                time.sleep(3)
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
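
# --- Illustrative sketch (not part of the original codebase) -------------------
# The mysql.DB wrapper is used throughout these scripts (query/insert/update/
# set_autocommit/commit/get_cursor) but its source is not part of this listing.
# A minimal sketch of the interface the crawlers rely on, assuming MySQLdb
# underneath; connection defaults, quoting and error handling here are
# assumptions, not the real implementation.
import MySQLdb

class DB(object):
    def __init__(self, db, host='localhost', user='root', passwd='', port=3306):
        self.conn = MySQLdb.connect(host=host, user=user, passwd=passwd,
                                    db=db, port=port, charset='utf8')

    def set_autocommit(self, flag):
        self.conn.autocommit(flag)

    def commit(self):
        self.conn.commit()

    def get_cursor(self):
        return self.conn.cursor()

    def query(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return cursor.fetchall()

    def insert(self, table, data):
        # parameterized values avoid the quoting problems of plain string interpolation
        columns = ', '.join(data.keys())
        placeholders = ', '.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (table, columns, placeholders)
        self.get_cursor().execute(sql, data.values())

    def update(self, table, data, where):
        assignments = ', '.join('%s=%%s' % column for column in data.keys())
        sql = 'update %s set %s where %s' % (table, assignments, where)
        self.get_cursor().execute(sql, data.values())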
def Main():
    parser = OptionParser()
    parser.add_option("--crawl-textbooks", dest="crawl_textbooks", action="store_true", help="crawl textbooks list", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='colorado_1012')
    parser.add_option("--subject2-table-name", dest="subject2_table_name", type="string", help="subject2 table name", default='colorado_subject2')
    parser.add_option("--textbook-table-name", dest="textbook_table_name", type="string", help="textbook table name", default='colorado_textbook')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    # this script defines no --db-port option, so connect with the default port
    db = mysql.DB(db=options.db_name)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.crawl_textbooks:
            subject2 = db.query("""select * from %s""" % (options.subject2_table_name))
            print len(subject2), 'subjects yet to be crawled'
            db.set_autocommit(True)
            for (subject1_title, subject2_title, subject_url) in subject2:
                print 'subject1_title:', subject1_title
                print 'subject2_title:', subject2_title
                print 'subject_url:', subject_url
                driver.get(subject_url)
                time.sleep(3)
                simulation_link = driver.find_elements_by_class_name("simulation-link")
                for link in simulation_link:
                    file_format = None
                    textbook_url = link.get_attribute("href")
                    textbook_image_url = link.find_element_by_tag_name("img").get_attribute("src")
                    textbook_title = link.find_element_by_tag_name("strong").text
                    # the format badge is carried by the second <span> inside the link
                    span = link.find_elements_by_tag_name('span')
                    badge = span[1].get_attribute("class")
                    if "html" in badge:
                        file_format = "html5"
                    if "java" in badge:
                        file_format = "java applet"
                    if "flash" in badge:
                        file_format = "shockwave flash"
                    print "textbook_title :", textbook_title
                    print "textbook_url :", textbook_url
                    print "textbook_image_url :", textbook_image_url
                    print "file_format :", file_format
                    # raise Exception("done")  # debugging stop left in the original; disabled so the insert below runs
                    data = {
                        'subject1_title': subject1_title,
                        'subject2_title': subject2_title,
                        'textbook_title': textbook_title,
                        'textbook_url': textbook_url,
                        'textbook_image_url': textbook_image_url,
                        'format': file_format
                    }
                    db.insert(options.textbook_table_name, data)
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
def Main():
    parser = OptionParser()
    parser.add_option("--textbook-package", dest="textbook_package", action="store_true", help="textbook package details", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="db name", default='pearsonhighered')
    parser.add_option("--db-port", dest="db_port", type="int", help="db port", default=3306)
    parser.add_option("--textbook-package-table-name", dest="textbook_package_table_name", type="string", help="textbook package table name", default='textbook_package')
    parser.add_option("--skip", dest="skip", type=int, help="integer value", default=0)
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=False)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    db = mysql.DB(db=options.db_name, port=options.db_port)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    try:
        if options.textbook_package:
            textbooks = db.query(
                """select textbook_id,textbook_url from %s where status=%s"""
                % (options.textbook_package_table_name, 0))
            print len(textbooks), 'textbooks yet to be crawled'
            db.set_autocommit(True)
            count = 0
            for (textbook_id, textbook_url) in textbooks:
                count += 1
                print 'count:', count
                print 'textbook_id:', textbook_id
                print 'textbook_url:', textbook_url
                if not textbook_url:
                    continue
                driver.get(textbook_url)
                time.sleep(3)
                tab_about_this_product = driver.find_elements_by_id("tab-about-this-product")
                if not tab_about_this_product:
                    data = {'status': 2}
                    db.update(options.textbook_package_table_name, data, "textbook_url='%s'" % textbook_url)
                    continue
                    # raise Exception("tab about this product not found")
                # data_feed_float_right = tab_content_group[0].find_elements_by_class_name("data-feed.float-right")
                # if data_feed_float_right:
                #     raise Exception("data-feed float-right not found")
                description = tab_about_this_product[0].find_elements_by_id("description")
                if not description:
                    data = {'status': -1}
                    db.update(options.textbook_package_table_name, data, "textbook_url='%s'" % textbook_url)
                    continue
                    # raise Exception("description not found")
                ul = description[0].find_elements_by_tag_name("ul")
                if not ul:
                    data = {'status': -1}
                    db.update(options.textbook_package_table_name, data, "textbook_url='%s'" % textbook_url)
                    continue
                    # raise Exception("ul tag not found")
                li_tag = ul[0].find_elements_by_tag_name("li")
                if not li_tag:
                    raise Exception("li tag not found")
                a = li_tag[0].find_elements_by_tag_name("a")
                if not a:
                    data = {'status': -1}
                    db.update(options.textbook_package_table_name, data, "textbook_url='%s'" % textbook_url)
                    continue
                    # raise Exception("a tag not found")
                for li in li_tag:
                    a = li.find_elements_by_tag_name("a")
                    if not a:
                        raise Exception("a tag not found")
                    title = a[0].text.strip()
                    print "textbook title", title
                    p = li.find_elements_by_tag_name("p")
                    if not p:
                        raise Exception("p tag not found")
                    if not len(p) == 3:
                        raise Exception("all p tags are not found")
                    for tag in p:
                        package_details = tag.text
                        if '©' in package_details:
                            copy_right_year = package_details[package_details.find("©") + 1:package_details.find("•")]
                            copy_right_year = copy_right_year.strip()
                            if not len(copy_right_year) == 4:
                                raise Exception("copyright year is not correct")
                            if 'pp' in package_details:
                                pages = package_details[package_details.find(",") + 1:package_details.find("pp")]
                                pages = pages.strip()
                            continue
                        if "ISBN" in package_details:
                            if "•" in package_details:
                                isbns = package_details.split("•")
                                for isbn in isbns:
                                    if "ISBN-10:" in isbn:
                                        isbn_10 = isbn.replace("ISBN-10:", "").strip()
                                        continue
                                    if "ISBN-13:" in isbn:
                                        isbn_13 = isbn.replace("ISBN-13:", "").strip()
                                        continue
                            continue
                        author = package_details.strip()
                    if not len(isbn_10) == 10:
                        raise Exception("isbn 10 is not correct")
                    if not len(isbn_13) == 13:
                        raise Exception("isbn 13 is not correct")
                    print "author :", author
                    print "copyright year", copy_right_year
                    print "Pages", pages
                    print "isbn_10 :", isbn_10
                    print "isbn_13 :", isbn_13
                    data = {
                        'textbook_id': textbook_id,
                        'textbook_url': textbook_url,
                        'textbook_title': title,
                        'isbn10': isbn_10,
                        'isbn13': isbn_13,
                        'author': author,
                        'copyright_year': copy_right_year,
                        'pages': pages,
                        'status': 1
                    }
                    db.insert('package_textbook2', data)
                data = {'status': 1}
                db.update(options.textbook_package_table_name, data, "textbook_url='%s'" % textbook_url)
                print "Crawling done"
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
def Main():
    parser = OptionParser()
    parser.add_option("--crawl", dest="crawl", action="store_true", help="crawl url", default=False)
    parser.add_option("--crawl-landing", dest="crawl_landing", action="store_true", help="crawl url", default=False)
    parser.add_option("--working-dir", dest="workingdir", type="string", help="working directory", default='.')
    parser.add_option("--db-name", dest="db_name", type="string", help="database name", default='skillscommons')
    parser.add_option("--table-name", dest="table_name", type="string", help="table name", default='skill')
    parser.add_option("--main-table-name", dest="main_table_name", type="string", help="main table name", default='skillscommons')
    parser.add_option("--attachment-table-name", dest="attachment_table_name", type="string", help="attachment table name", default='attachment')
    parser.add_option("--meta-table-name", dest="meta_table_name", type="string", help="meta table name", default='meta_data')
    parser.add_option("--use-firefox", dest="use_firefox", action="store_true", help="use-firefox", default=True)
    (options, args) = parser.parse_args()
    workingdir = options.workingdir.rstrip('/')
    if not os.path.exists(workingdir):
        parser.error("workingdir does not exist")
    display = None
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1000, 900))
        display.start()
    except:
        print 'No Xvfb!'
    db = mysql.DB(db=options.db_name)
    db.set_autocommit(True)
    driver = crawlutils.open_driver(use_firefox=options.use_firefox)
    links = ["https://www.skillscommons.org/discover?rpp=2000&page=1&group_by=none&etal=0",
             "https://www.skillscommons.org/discover?rpp=2000&page=2&group_by=none&etal=0",
             "https://www.skillscommons.org/discover?rpp=2000&page=3&group_by=none&etal=0"]
    try:
        if options.crawl:
            count = 0
            for link in links:
                print "Link :", link
                driver.get(link)
                time.sleep(5)
                medium_results = driver.find_element_by_class_name("medium-results")
                li = medium_results.find_elements_by_tag_name("li")
                for tag in li:
                    count += 1
                    print "Count :", count
                    link_tag = tag.find_element_by_tag_name("a")
                    title = link_tag.text.strip()
                    url = link_tag.get_attribute("href")
                    types = tag.find_elements_by_class_name("type")
                    if len(types) == 2:
                        type = types[0].text.strip()
                        institution = types[1].text.strip()
                    else:
                        type = None
                        institution = types[0].text.strip()
                    description = tag.find_element_by_class_name("abstract").text.strip()
                    print "title :", title
                    print "url :", url
                    print "type :", type
                    print "institution :", institution
                    print "description :", description
                    data = {
                        'title': title,
                        'institution': institution,
                        'url': url,
                        'type': type,
                        'description': description,
                    }
                    db.insert(options.table_name, data)
        if options.crawl_landing:
            count = 0
            skill = db.query("select distinct url from skill where crawled=0")
            print "Number of urls to crawl ", len(skill)
            for (src_url,) in skill:
                print "source url :", src_url
                print "count %s" % count
                count += 1
                driver.get(src_url)
                author = None
                col = driver.find_element_by_class_name("col-sm-8")
                title = col.find_element_by_tag_name("h1").text.strip()
                m = hashlib.md5()
                m.update(title + src_url)
                document_id = m.hexdigest()
                toc_html = "/mnt/data/kendavar/skillscommons/%s.html" % document_id
                codecs.open(toc_html, "w", "utf-8").write(driver.page_source)
                authors = col.find_element_by_class_name("authors")
                if not authors.find_elements_by_tag_name("div"):
                    author = authors.text.strip()
                description = col.find_element_by_class_name("abstract").text
                files = col.find_element_by_class_name("files")
                file_information = files.find_elements_by_class_name("file-information")
                attachment = []
                for attach in file_information:
                    attachment.append((attach.text.strip(), attach.find_element_by_tag_name("a").get_attribute("href")))
                dls = col.find_elements_by_tag_name("dl")
                meta = {}
                for dl in dls:
                    for div in dl.find_elements_by_tag_name("div"):
                        string = ''
                        dd = div.find_element_by_tag_name("dd")
                        if dd.find_elements_by_tag_name("li"):
                            for li in dd.find_elements_by_tag_name("li"):
                                string = string + li.text.strip() + ","
                        elif dd.find_elements_by_tag_name("a"):
                            # keep the text followed by each distinct link as a list
                            string = [dd.text.strip()]
                            anchors = []
                            for anchor in dd.find_elements_by_tag_name("a"):
                                if anchor.get_attribute("href") not in anchors:
                                    anchors.append(anchor.get_attribute("href"))
                                    string.append(anchor.get_attribute("href"))
                        else:
                            string = dd.text.strip()
                        meta[div.find_element_by_tag_name("dt").text.replace(":", "").strip()] = string
                print "title :", title
                print "author :", author
                print "description :", description
                print "toc_path", toc_html
                data = {
                    "document_id": document_id,
                    "title": title,
                    "author": author,
                    "description": description,
                    "toc_path": toc_html
                }
                db.insert(options.main_table_name, data)
                for (attachment_title, attachment_url) in attachment:
                    print "document_id :", document_id
                    print "attachment_title :", attachment_title
                    print "attachment_url :", attachment_url
                    data = {
                        "document_id": document_id,
                        "attachment_title": attachment_title,
                        "attachment_url": attachment_url
                    }
                    db.insert(options.attachment_table_name, data)
                for key, value in meta.iteritems():
                    if value[-1] == ",":
                        value = value[:-1]
                    print '%s : %s' % (key, value)
                    if isinstance(value, list):
                        meta_value = None
                        meta_url = None
                        # list entries alternate between a value and the link that goes with it
                        for i, val in enumerate(value):
                            meta_title = key
                            if i % 2 == 0:
                                meta_value = val
                            else:
                                meta_url = val
                            print "meta_title :", meta_title
                            print "meta_value :", meta_value
                            print "meta_url :", meta_url
                            data = {
                                "document_id": document_id,
                                "meta_title": meta_title,
                                "meta_value": meta_value,
                                "meta_url": meta_url
                            }
                            db.insert(options.meta_table_name, data)
                    else:
                        meta_title = key
                        meta_url = None
                        meta_value = value
                        print "meta_title :", meta_title
                        print "meta_value :", meta_value
                        print "meta_url :", meta_url
                        data = {
                            "document_id": document_id,
                            "meta_title": meta_title,
                            "meta_value": meta_value,
                            "meta_url": meta_url
                        }
                        db.insert(options.meta_table_name, data)
                data = {"crawled": 1}
                db.update(options.table_name, data, "url='%s'" % src_url)
                print "updated the table"
    except:
        traceback.print_exc()
        if driver:
            driver.save_screenshot(workingdir + '/error.png')
            print workingdir + '/error.png'
    finally:
        if driver:
            driver.quit()
        if display:
            display.stop()
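
# --- Assumed module footer (not shown in any of the listings above) ------------
# Each script presumably ends with the usual entry-point guard; shown here as an
# assumption, together with an example invocation for the skillscommons crawler
# (the file name is hypothetical):
#
#   python skillscommons_crawler.py --crawl --working-dir /mnt/data/kendavar
if __name__ == '__main__':
    Main()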