Example #1
import glob
import shutil

import crawlutils  # assumed project-local helper providing resize_png_image()

def main():
    folder = "./downloads/"
    files = glob.glob1(folder, "*.png")
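    # resize each downloaded PNG in place, then copy it to thumbnails_png under the "<name>_png" naming used throughout these scripts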
    for file1 in files:
        crawlutils.resize_png_image(folder + file1)
        shutil.copyfile(folder + file1,
                        "./thumbnails_png/" + file1.replace(".png", "_png"))
Example #2
import shutil
import time

from PIL import Image
from selenium import webdriver

import crawlutils  # assumed project-local helper providing resize_png_image()

def main():
    urls = ["http://mediaplayer.pearsoncmg.com/assets/_embed.true/_S4d9GbwzNVKJsky71elZaFdEEyLCW69"]
    filenames = []
    doc_id = ["6fdc7009d956eb2b84fab4af2af6d234"]
    """for line in open("vi_dr12.txt","r") :
        urls.append(line.rstrip('\n'))
    for line in open("vi_doc.txt","r") :
        doc_id.append(line.rstrip('\n'))"""
    driver = webdriver.Chrome()
    driver.set_window_size(1280, 1024)
    print len(urls)
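    # for each player URL: load the page, screenshot it, crop the image down to
    # the <video> element, and save the result as that document's thumbnail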
    try:
        for number,url in enumerate(urls):
            print url
            driver.get(url)
            time.sleep(5)
            filename="/home/vulcantech/kendavar/Ace/"+doc_id[number]+".png"   
            print doc_id[number]
            print filename
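            # the <video> element's location and size give the bounding box to crop to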
            element = driver.find_element_by_id('video')
            print element
            location = element.location
            print location
            size = element.size
            print size
            driver.save_screenshot(filename)
            im = Image.open(filename)
            left = location['x']
            print left
            top = location['y']
            print top
            right = location['x'] + size['width']
            print right
            bottom = location['y'] + size['height']
            print bottom
            left = int(round(left))
            right = int(round(right))
            print left, right
            im = im.crop((left, top, right, bottom))
            im.save(filename)
            filenames.append(filename)
            #driver.save_screenshot(filename)
            crawlutils.resize_png_image(filename)
            time.sleep(2)
            print number
            shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/"+doc_id[number]+"_png")
            #sys.exit()
        driver.quit()
    except Exception as e:
        print "error:", e
        driver.close()
Example #3
import glob
import os
import re
import shutil
import time
import urllib2

from selenium import webdriver

import crawlutils  # assumed project-local helper providing resize_png_image()

def main():
    urls = []
    filenames = []
    doc_id = []
    for line in open("pearson_url.txt", "r"):
        urls.append(line.rstrip('\n'))
    #for line in open("vi_doc.txt","r") :
    #    doc_id.append(line.rstrip('\n'))
    #driver =webdriver.Chrome()
    #driver.set_window_size(1280, 1024)
    print len(urls)
    try:
        for number, url in enumerate(urls):
            print url
            res = urllib2.urlopen(url)
            data = res.read()
            time.sleep(2)
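            # the page pulls its player configuration from an external <script>; extract that script's URL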
            url = re.findall(r'<script src="(.*)"></script>', data)
            res = urllib2.urlopen(url[0])
            time.sleep(2)
            soup = res.read()
            jsonValue = '{%s}' % (soup.split('{', 1)[1].rsplit('}', 1)[0], )
            #print jsonValue
            tmp = jsonValue[jsonValue.find("'html5', config: {'file':"):]
            print tmp
            tmp = tmp.replace("'html5', config: {'file':", '')
            tmp = tmp[:tmp.find("', 'provider': 'video'}")]
            tmp = tmp.replace("'", "").strip().replace("\\", "")
            print tmp
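            # tmp now holds the raw media URL; use ffmpeg to grab a single early frame as the thumbnail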
            filename = "/home/vulcantech/kendavar/Ace/vi_screenshot/%s.png" % str(
                number)
            os.system(
                """ffmpeg -i "%s" -ss 00:00:00.435 -vframes 1 /home/vulcantech/kendavar/Ace/vi_screenshot/%s.png"""
                % (tmp, number))
            #sys.exit()
            #im = im.crop((left, top, right, bottom))
            #im.save(filename)
            print filename
            filenames.append(filename)
            #driver.save_screenshot(filename)
            crawlutils.resize_png_image(filename)
            time.sleep(2)
            print number
            shutil.copyfile(
                filename,
                "/home/vulcantech/kendavar/Ace/vi_screenshot_png/%s_png" %
                str(number))
            #sys.exit()
        #driver.quit()  # no webdriver is created in this function (see the commented-out lines above)
    except Exception as e:
        print "error:", e
def main():
    urls = []
    doc_id = []
    doc_type = []
    filenames = []
    doc_not = []
    #for line in open("url.txt","r") :
    #    urls.append(line.rstrip('\n'))
    folder="./thumbnails/"
    for i, f in enumerate(glob.glob1(folder, "*.png")):
        doc_not.append(f.replace(".png", ""))
    print len(doc_not)
    """with open('ucertify.csv','rb') as csvfile:
        reader=csv.reader(csvfile,delimiter='\t',quotechar='|')
        for row in reader:
            doc_id.append(row[0])
            urls.append(row[1])
            #doc_type.append(row[2])"""
    driver = webdriver.Chrome()
    driver.set_window_size(800, 1000)

    print len(urls)
    try:
        for number,url in enumerate(urls):
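            # load each URL, screenshot it, resize in place, and copy it out under the <doc_id>_png name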
            driver.get(url)
            time.sleep(25)
            filename="/home/vulcantech/kendavar/Ace/screenshot/"+doc_id[number]+".png"
            print doc_id[number]
            #print doc_type[number]
            print filename
            filenames.append(filename)
            driver.save_screenshot(filename)
            crawlutils.resize_png_image(filename)
            time.sleep(2)
            shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/vi_screenshot_png/"+doc_id[number]+"_png")
            print number
            """
            if doc_type[number] in "webview":
                if "purdueowl" not in url: 
                   driver.get(url)
                   time.sleep(5)
                   filename="/home/vulcantech/kendavar/Ace/screenshot/"+doc_id[number]+".png"
                   print doc_id[number]
                   print doc_type[number]
                   print filename
                   filenames.append(filename)
                   driver.save_screenshot(filename)
                   crawlutils.resize_png_image(filename)
                   time.sleep(2)
                   shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/vi_screenshot_png/"+doc_id[number]+"_png")
                   print number 
            elif doc_type[number] in "audio":
                url="http://staging.ace.app.writer.pearsonhighered.com/get/thumbnail/b0a4649daeecd82338c45b54a5f20200"
                filename="/home/vulcantech/kendavar/Ace/screenshot/"+doc_id[number]+".png"
                print filename
                filenames.append(filename)
                file = cStringIO.StringIO(urllib.urlopen(url).read())
                img = Image.open(file)
                img.save(filename)
                shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/vi_screenshot_png/"+doc_id[number]+"_png")
                print number"""
            
        print len(filenames) 
        driver.close()
    except Exception as e:
        print "error:", e
        print len(filenames)
        driver.close()

if __name__ == '__main__':
    main()
Example #5
import csv
import glob
import os
import shutil
import time

from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import crawlutils  # assumed project-local helper providing resize_png_image() and handle_alert()

# close_all_new_windows() and upload_thumbnail_to_s3() are defined elsewhere in the same module

def create_thumbnail(driver,
                     workingdir,
                     document_id,
                     document_url,
                     s3bucket=None):
    print 'document_id:', document_id
    print document_url

    thumbnail_name = "%s_png" % (document_id)
    thumbnail_local_filepath = workingdir + "/thumbnails/" + thumbnail_name
    thumbnail_local_filepath_doubts = workingdir + "/thumbnails_doubts/" + thumbnail_name
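    # the "doubts" folder holds screenshots that look like login/404/error pages and need manual review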

    if os.path.exists(thumbnail_local_filepath):
        print 'local copy of the thumbnail exists'
    elif os.path.exists(thumbnail_local_filepath_doubts):
        print 'local copy of the thumbnail exists on doubts folder'
        thumbnail_local_filepath = None
    else:
        if "media.pearsoncmg.com" in document_url:
            driver.set_window_size(900, 800)
        elif "khanacademy.org" in document_url:
            driver.set_window_size(1280, 1024)
        driver.get(document_url)
        time.sleep(3)
        crawlutils.handle_alert(driver)
        title = driver.title
        print 'Title:', title
        file(workingdir + '/document_tiles.txt',
             'a').write("%s\t%s\n" % (document_id, title))
        title = title.lower()
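        # heuristic: if the title looks like a login / 404 / expired page, the real document did not load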
        if 'log in' in title or 'sign in' in title or '404' in title or 'expired' in title or 'not found' in title:
            print '404 page:', document_id, ">>", driver.current_url
            file(workingdir + '/loaded_different_url.txt',
                 'a').write("%s\t%s" % (document_id, document_url))
            driver.switch_to_window(driver.window_handles[-1])
            driver.save_screenshot(thumbnail_local_filepath_doubts)
            close_all_new_windows(driver)
            print 'screenshot done'
            time.sleep(1)
            if not os.path.exists(thumbnail_local_filepath_doubts):
                thumbnail_local_filepath_doubts = None
            thumbnail_local_filepath = None
        else:
            retry = 0
            while retry <= 3:
                time.sleep(30)
                driver.switch_to_window(driver.window_handles[0])
                driver.save_screenshot(thumbnail_local_filepath)
                close_all_new_windows(driver)
                print 'screenshot done'
                if os.path.exists(thumbnail_local_filepath):
                    thumbnail_size = os.stat(thumbnail_local_filepath).st_size
                    if thumbnail_size < 10000:
                        #print 'file size is small.(%s Bytes) so moving to doubts folder' %thumbnail_size
                        #shutil.move(thumbnail_local_filepath, thumbnail_local_filepath_doubts)
                        #thumbnail_local_filepath = None
                        print 'file size is small.(%s Bytes) so retrying' % thumbnail_size
                        retry += 1
                        continue
                break
            else:
                # every retry produced a too-small screenshot, so treat the thumbnail as missing
                thumbnail_local_filepath = None

    if thumbnail_local_filepath:
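        # resize only if the screenshot looks like real content (>10 KB)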
        if os.path.exists(thumbnail_local_filepath):
            thumbnail_size = os.stat(thumbnail_local_filepath).st_size
            if thumbnail_size > 10000:
                crawlutils.resize_png_image(thumbnail_local_filepath)
                print 'thumbnail resized'

    if thumbnail_local_filepath_doubts:
        if os.path.exists(thumbnail_local_filepath_doubts):
            thumbnail_size = os.stat(thumbnail_local_filepath_doubts).st_size
            if thumbnail_size > 10000:
                crawlutils.resize_png_image(thumbnail_local_filepath_doubts)
                print 'thumbnail resized'

    if thumbnail_local_filepath:
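        # push the finished thumbnail up to S3 when a bucket was supplied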
        upload_thumbnail_to_s3(s3bucket, thumbnail_name,
                               thumbnail_local_filepath)
def main():
    urls = []
    filenames = []
    doc_id = []
    duplicates = []
    doc_not = []
    i = 0
    with open('mylab.csv','rb') as csvfile:
        reader=csv.reader(csvfile,delimiter=',',quotechar='"')
        for row in reader:
            doc_id.append(row[0])
            urls.append(row[1])
    folder="/home/vulcantech/kendavar/Ace/vi_screenshot_png/"
    for i,f in enumerate(glob.glob1(folder, "*_png")):
        doc_not.append(f.replace("_png",""))
        
    print len(doc_not)
    print doc_not[1]
    driver = webdriver.Chrome()
    driver.set_window_size(1000, 900)
    loginurl = "https://portal.mypearson.com/login"
    driver.get(loginurl)
    time.sleep(5)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, "username"))).send_keys("ace_learning")
    driver.find_element_by_id('password').send_keys("ACE15Pearson")
    driver.find_element_by_id('mainButton').click()
    time.sleep(10)
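    # the session created by this login is reused for every page load in the loop below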

    #print len(urls)
    try:
        for number, (doc_ids, url) in enumerate(zip(doc_id, urls)):
            filename = "/home/vulcantech/kendavar/Ace/vi_screenshot/%s.png" % (doc_ids)
            driver.get("http://www.learningace.com/get/thumbnail/"+doc_ids)
            time.sleep(10)  
            #print doc_id[number]
            #print filename
            #print url
            #if doc_id[number] in doc_not:
            #    continue
            if filename in filenames:
                duplicates.append(doc_ids)
                #continue
            if ".pptx" in url:
                print doc_id[number]
                print url
            if ".ppt" in url:
                print "skiped",i
                continue
            continue
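            # NOTE: the unconditional continue above short-circuits the loop, so the
            # element-cropping code below never runs as the script currently stands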
            if "triolafc" in url:
                driver.set_window_size(1000, 900)
            elif "mediaplayer" in url:
                driver.set_window_size(1000, 900)
            else:
                driver.set_window_size(800, 900)
            driver.get(url)
            time.sleep(20)    
            if ".html" not in url:
                ##main code for getting the thumbnail
                element = driver.find_element_by_tag_name('object')
                print element
                location = element.location
                print location
                size = element.size
                print size
                driver.save_screenshot(filename)
                im = Image.open(filename)
                left = location['x']
                print left
                top = location['y']
                print top
                right = location['x'] + size['width']
                print right
                bottom = location['y'] + size['height']
                print bottom
                left = int(round(left))
                right = int(round(right))
                print left, right
                im = im.crop((left, top, right, bottom))
                im.save(filename)
            else:
                driver.save_screenshot(filename)
            filenames.append(filename)
            
            #driver.save_screenshot(filename)
            crawlutils.resize_png_image(filename)
            time.sleep(2)
            print number
            shutil.copyfile(filename, "/home/vulcantech/kendavar/Ace/vi_screenshot_png/%s_png"%(doc_id[number]))
            #sys.exit()
        print "duplicates :",len(duplicates)
        driver.quit()           
    except Exception as e:
        print "error:", e
        driver.close()