def main():
    """Resize every PNG in ./downloads/ and copy it into ./thumbnails_png/
    with the ".png" extension rewritten to the "_png" suffix convention
    used by the rest of this script.
    """
    folder = "./downloads/"
    # Use the documented glob API instead of the undocumented glob.glob1 helper;
    # basename() recovers the bare filenames glob1 used to return.
    files = [os.path.basename(p) for p in glob.glob(os.path.join(folder, "*.png"))]
    for file1 in files:
        src = folder + file1
        crawlutils.resize_png_image(src)
        # BUG FIX: the original file1.replace(".png", "_png") rewrote EVERY
        # occurrence of ".png" in the name (e.g. "a.png.png" -> "a_png_png");
        # splitext touches only the trailing extension.
        base, _ext = os.path.splitext(file1)
        shutil.copyfile(src, "./thumbnails_png/" + base + "_png")
def main():
    """Screenshot the 'video' element of each Pearson media-player page.

    For every URL in `urls`, loads the page in Chrome, takes a full-window
    screenshot, crops it down to the element with id 'video' using its
    location/size reported by Selenium, resizes the PNG, and copies it to
    /home/vulcantech/kendavar/Ace/<doc_id>_png.
    """
    # Hard-coded single URL / doc id pair; the file-driven loading below is
    # commented out (kept as a dead triple-quoted string).
    urls=["http://mediaplayer.pearsoncmg.com/assets/_embed.true/_S4d9GbwzNVKJsky71elZaFdEEyLCW69"]
    filenames=[]
    doc_id=["6fdc7009d956eb2b84fab4af2af6d234"]
    """for line in open("vi_dr12.txt","r") :
        urls.append(line.rstrip('\n'))
    for line in open("vi_doc.txt","r") :
        doc_id.append(line.rstrip('\n'))"""
    driver =webdriver.Chrome()
    driver.set_window_size(1280, 1024)
    print len(urls)
    try:
        for number,url in enumerate(urls):
            print url
            driver.get(url)
            # Fixed wait for the page/player to render before measuring.
            time.sleep(5)
            filename="/home/vulcantech/kendavar/Ace/"+doc_id[number]+".png"
            print doc_id[number]
            print filename
            element = driver.find_element_by_id('video')
            print element
            location = element.location
            print location
            size = element.size
            print size
            driver.save_screenshot(filename)
            # Crop the full-window screenshot down to the video element's box.
            im = Image.open(filename)
            left = location['x']
            print left
            top = location['y']
            print top
            right = location['x'] + size['width']
            print right
            bottom = location['y'] + size['height']
            print bottom
            # NOTE(review): only left/right are rounded to int here; top/bottom
            # are passed to crop() as-is — presumably they were already ints.
            left=int(round(left))
            right=int(round(right))
            print left,right
            im = im.crop((left, top, right, bottom))
            im.save(filename)
            filenames.append(filename)
            #driver.save_screenshot(filename)
            crawlutils.resize_png_image(filename)
            time.sleep(2)
            print number
            # Copy out under the "<doc_id>_png" naming convention.
            shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/"+doc_id[number]+"_png")
            #sys.exit()
        driver.quit()
    except Exception as e:
        print " error",e
        driver.close()
def main(): urls = [] filenames = [] doc_id = [] for line in open("pearson_url.txt", "r"): urls.append(line.rstrip('\n')) #for line in open("vi_doc.txt","r") : # doc_id.append(line.rstrip('\n')) #driver =webdriver.Chrome() #driver.set_window_size(1280, 1024) print len(urls) try: for number, url in enumerate(urls): print url res = urllib2.urlopen(url) data = res.read() time.sleep(2) url = re.findall(r'<script src="(.*)"></script>', data) res = urllib2.urlopen(url[0]) time.sleep(2) soup = res.read() jsonValue = '{%s}' % (soup.split('{', 1)[1].rsplit('}', 1)[0], ) #print jsonValue tmp = jsonValue[jsonValue.find("'html5', config: {'file':"):] print tmp tmp = tmp.replace("'html5', config: {'file':", '') tmp = tmp[:tmp.find("', 'provider': 'video'}")] tmp = tmp.replace("'", "").strip().replace("\\", "") print tmp filename = "/home/vulcantech/kendavar/Ace/vi_screenshot/%s.png" % str( number) os.system( """ffmpeg -i %s -ss 00:00:00.435 -vframes 1 /home/vulcantech/kendavar/Ace/vi_screenshot/%s.png""" % (tmp, number)) #sys.exit() #im = im.crop((left, top, right, bottom)) #im.save(filename) print filename filenames.append(filename) #driver.save_screenshot(filename) crawlutils.resize_png_image(filename) time.sleep(2) print number shutil.copyfile( filename, "/home/vulcantech/kendavar/Ace/vi_screenshot_png/%s_png" % str(number)) #sys.exit() driver.quit() except Exception as e: print " error", e
def main():
    """Take a full-window Chrome screenshot for each URL in `urls`, resize
    it, and copy it into the vi_screenshot_png folder as "<doc_id>_png".

    NOTE(review): as committed, both the url.txt loader and the ucertify.csv
    loader are commented out, so `urls`/`doc_id` stay empty and the crawl
    loop body never runs.
    """
    urls=[]
    doc_id=[]
    doc_type=[]
    filenames=[]
    doc_not=[]
    #for line in open("url.txt","r") :
    #    urls.append(line.rstrip('\n'))
    folder="./thumbnails/"
    # NOTE(review): the append is commented out, so doc_not is never
    # populated; this loop only re-prints len(doc_not) per existing file.
    for i,f in enumerate(glob.glob1(folder, "*.png")):
        #doc_not.append(f.replace("*.png",""))
        print len(doc_not)
    """with open('ucertify.csv','rb') as csvfile:
        reader=csv.reader(csvfile,delimiter='\t',quotechar='|')
        for row in reader:
            doc_id.append(row[0])
            urls.append(row[1])
            #doc_type.append(row[2])"""
    driver =webdriver.Chrome()
    driver.set_window_size(800,1000)
    print len(urls)
    try:
        for number,url in enumerate(urls):
            driver.get(url)
            # Long fixed wait for slow-loading pages before the screenshot.
            time.sleep(25)
            filename="/home/vulcantech/kendavar/Ace/screenshot/"+doc_id[number]+".png"
            print doc_id[number]
            #print doc_type[number]
            print filename
            filenames.append(filename)
            driver.save_screenshot(filename)
            crawlutils.resize_png_image(filename)
            time.sleep(2)
            shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/vi_screenshot_png/"+doc_id[number]+"_png")
            print number
            # Dead code kept as a string literal: an earlier per-doc_type
            # branch (webview screenshot vs. audio placeholder image).
            """ if doc_type[number] in "webview":
                if "purdueowl" not in url:
                    driver.get(url)
                    time.sleep(5)
                    filename="/home/vulcantech/kendavar/Ace/screenshot/"+doc_id[number]+".png"
                    print doc_id[number]
                    print doc_type[number]
                    print filename
                    filenames.append(filename)
                    driver.save_screenshot(filename)
                    crawlutils.resize_png_image(filename)
                    time.sleep(2)
                    shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/vi_screenshot_png/"+doc_id[number]+"_png")
                    print number
            elif doc_type[number] in "audio":
                url="http://staging.ace.app.writer.pearsonhighered.com/get/thumbnail/b0a4649daeecd82338c45b54a5f20200"
                filename="/home/vulcantech/kendavar/Ace/screenshot/"+doc_id[number]+".png"
                print filename
                filenames.append(filename)
                file = cStringIO.StringIO(urllib.urlopen(url).read())
                img = Image.open(file)
                img.save(filename)
                shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/vi_screenshot_png/"+doc_id[number]+"_png")
                print number"""
        print len(filenames)
        driver.close()
    except Exception as e:
        print " error",e
        print len(filenames)
        driver.close()


if __name__ == '__main__':
    main()
def create_thumbnail(driver, workingdir, document_id, document_url, s3bucket=None):
    """Capture, resize, and (optionally) upload a thumbnail screenshot for one document.

    Skips work when a local copy already exists (normal or "doubts" folder).
    Pages whose title looks like a login/404/expired page are screenshotted
    into the doubts folder instead; normal pages are retried up to 4 times
    until the screenshot exceeds 10 KB. A good thumbnail is resized and, at
    the end, pushed to S3 via upload_thumbnail_to_s3.
    """
    print 'document_id:', document_id
    print document_url
    thumbnail_name = "%s_png" % (document_id)
    thumbnail_local_filepath = workingdir + "/thumbnails/" + thumbnail_name
    thumbnail_local_filepath_doubts = workingdir + "/thumbnails_doubts/" + thumbnail_name
    if os.path.exists(thumbnail_local_filepath):
        # Already captured previously — nothing to do but the resize/upload below.
        print 'local copy of the thumbnail exists'
    elif os.path.exists(thumbnail_local_filepath_doubts):
        print 'local copy of the thumbnail exists on doubts folder'
        # None signals "no good thumbnail" to the resize/upload steps below.
        thumbnail_local_filepath = None
    else:
        # Site-specific window sizes before loading the page.
        if "media.pearsoncmg.com" in document_url:
            driver.set_window_size(900, 800)
        elif "khanacademy.org" in document_url:
            driver.set_window_size(1280, 1024)
        driver.get(document_url)
        time.sleep(3)
        crawlutils.handle_alert(driver)
        title = driver.title
        print 'Title:', title
        # NOTE(review): file(...).write leaves the handle to the GC; a
        # `with open(...)` would be the safe py2/py3 form.
        file(workingdir + '/document_tiles.txt', 'a').write("%s\t%s\n" % (document_id, title))
        title = title.lower()
        if 'log in' in title or 'sign in' in title or '404' in title or 'expired' in title or 'not found' in title:
            # Looks like a login wall / error page: record it and screenshot
            # into the doubts folder for manual review.
            print '404 page:', document_id, ">>", driver.current_url
            # NOTE(review): unlike the write above, this line has no trailing
            # "\n", so successive entries run together — confirm intent.
            file(workingdir + '/loaded_different_url.txt', 'a').write("%s\t%s" % (document_id, document_url))
            driver.switch_to_window(driver.window_handles[-1])
            driver.save_screenshot(thumbnail_local_filepath_doubts)
            close_all_new_windows(driver)
            print 'screenshot done'
            time.sleep(1)
            if not os.path.exists(thumbnail_local_filepath_doubts):
                thumbnail_local_filepath_doubts = None
            thumbnail_local_filepath = None
        else:
            # Retry the screenshot until it is big enough (>= 10 KB) or the
            # retry budget (4 attempts) is spent.
            retry = 0
            while True:
                if retry > 3:
                    break
                time.sleep(30)
                driver.switch_to_window(driver.window_handles[0])
                driver.save_screenshot(thumbnail_local_filepath)
                close_all_new_windows(driver)
                print 'screenshot done'
                if os.path.exists(thumbnail_local_filepath):
                    thumbnail_size = os.stat(thumbnail_local_filepath).st_size
                    if thumbnail_size < 10000:
                        #print 'file size is small.(%s Bytes) so moving to doubts folder' %thumbnail_size
                        #shutil.move(thumbnail_local_filepath, thumbnail_local_filepath_doubts)
                        #thumbnail_local_filepath = None
                        print 'file size is small.(%s Bytes) so retrying' % thumbnail_size
                        retry += 1
                        continue
                    break
                else:
                    # NOTE(review): reconstructed from collapsed source — if the
                    # screenshot file never appears, this sets the path to None
                    # without breaking, so the next iteration would call
                    # save_screenshot(None); confirm against the original layout.
                    thumbnail_local_filepath = None
    # Resize whichever capture(s) exist and are above the size threshold.
    if thumbnail_local_filepath:
        if os.path.exists(thumbnail_local_filepath):
            thumbnail_size = os.stat(thumbnail_local_filepath).st_size
            if thumbnail_size > 10000:
                crawlutils.resize_png_image(thumbnail_local_filepath)
                print 'thumbnail resized'
    if thumbnail_local_filepath_doubts:
        if os.path.exists(thumbnail_local_filepath_doubts):
            thumbnail_size = os.stat(thumbnail_local_filepath_doubts).st_size
            if thumbnail_size > 10000:
                crawlutils.resize_png_image(thumbnail_local_filepath_doubts)
                print 'thumbnail resized'
    # Only good (non-doubts) thumbnails are uploaded.
    if thumbnail_local_filepath:
        upload_thumbnail_to_s3(s3bucket, thumbnail_name, thumbnail_local_filepath)
def main():
    """Log in to mypearson, then fetch a learningace thumbnail page per
    doc id from mylab.csv, crop the embedded <object> element out of a
    screenshot, resize it, and copy it to vi_screenshot_png.

    NOTE(review): `url` is read inside the loop but never assigned — the
    csv column that fed `urls` is commented out — so the first iteration
    raises NameError, which the except at the bottom swallows.
    """
    urls=[]
    filenames=[]
    doc_id=[]
    duplicates=[]
    doc_not=[]
    i=0
    # First column of mylab.csv is the document id.
    with open('mylab.csv','rb') as csvfile:
        reader=csv.reader(csvfile,delimiter=',',quotechar='"')
        for row in reader:
            doc_id.append(row[0])
            #urls.append(row[1])
    # Collect ids that already have a "<id>_png" thumbnail on disk.
    folder="/home/vulcantech/kendavar/Ace/vi_screenshot_png/"
    for i,f in enumerate(glob.glob1(folder, "*_png")):
        doc_not.append(f.replace("_png",""))
    print len(doc_not)
    print doc_not[1]
    driver =webdriver.Chrome()
    driver.set_window_size(1000, 900)
    # Interactive login before crawling (credentials are hard-coded).
    loginurl = "https://portal.mypearson.com/login"
    driver.get(loginurl)
    time.sleep(5)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, "username"))).send_keys("ace_learning")
    driver.find_element_by_id('password').send_keys("ACE15Pearson")
    driver.find_element_by_id('mainButton').click()
    time.sleep(10)
    #print len(urls)
    try:
        for number,doc_ids in enumerate(doc_id):
            filename="/home/vulcantech/kendavar/Ace/vi_screenshot/%s.png"%(doc_ids)
            driver.get("http://www.learningace.com/get/thumbnail/"+doc_ids)
            time.sleep(10)
            #print doc_id[number]
            #print filename
            #print url
            #if doc_id[number] in doc_not:
            #    continue
            if filename in filenames:
                # NOTE(review): appends the whole doc_id list, not doc_ids —
                # presumably a typo; only len(duplicates) is used below.
                duplicates.append(doc_id)
                #continue
            # NOTE(review): indentation of this skip block reconstructed from
            # collapsed source — confirm the nesting of the two continues.
            if ".pptx" in url:
                print doc_id[number]
                print url
                if ".ppt" in url:
                    print "skiped",i
                    continue
                continue
            # Site-specific window sizes before the screenshot.
            if "triolafc" in url:
                driver.set_window_size(1000, 900)
            elif "mediaplayer" in url:
                driver.set_window_size(1000, 900)
            else:
                driver.set_window_size(800, 900)
            driver.get(url)
            time.sleep(20)
            if ".html" not in url: ##main code for getting the thumbnail
                # Crop the full-window screenshot down to the <object> element.
                element = driver.find_element_by_tag_name('object')
                print element
                location = element.location
                print location
                size = element.size
                print size
                driver.save_screenshot(filename)
                im = Image.open(filename)
                left = location['x']
                print left
                top = location['y']
                print top
                right = location['x'] + size['width']
                print right
                bottom = location['y'] + size['height']
                print bottom
                left=int(round(left))
                right=int(round(right))
                print left,right
                im = im.crop((left, top, right, bottom))
                im.save(filename)
            else:
                # Plain HTML pages get the full-window screenshot, uncropped.
                driver.save_screenshot(filename)
            filenames.append(filename)
            #driver.save_screenshot(filename)
            crawlutils.resize_png_image(filename)
            time.sleep(2)
            print number
            shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/vi_screenshot_png/%s_png"%(doc_id[number]))
            #sys.exit()"""
        print "duplicates :",len(duplicates)
        driver.quit()
    except Exception as e:
        print " error",e
        driver.close()