def __init__(self, url, referer, id, time, p, format, cw=None, isVideo=False, try_n=4, n_thread=1):
    self._url = url
    self.referer = referer
    self.id = int(id)
    self.time = time
    self.p = p
    self.n_thread = n_thread
    if not isVideo:
        url_alter = Url_alter(url)
    else:
        url_alter = None
    if isVideo and get_ext(url).lower() not in ['.mp4', '.m3u8']:
        get = self.get
    else:
        get = lambda _: self._url
    self.url = LazyUrl_twitter(referer, get, self, url_alter)
    self.format = format
    self.cw = cw
    self.isVideo = isVideo
    self.try_n = try_n
    ## time_ms = (int(id) >> 22) + 1288834974657
    ## time = time_ms / 1000 # GMT+0
    date = datetime.fromtimestamp(float(time))
    timeStamp = date.strftime(format).replace(':', '\uff1a') # local time
    ext = '.mp4' if isVideo else get_ext(url)
    self.filename = timeStamp.replace('id', str(id)).replace('page', str(p)) + ext
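# A minimal sketch (not part of the class above) of the derivation hinted at by its
# commented-out lines: Twitter snowflake IDs carry a millisecond timestamp in their upper
# bits, offset by the snowflake epoch 1288834974657 ms (2010-11-04 01:42:54.657 UTC).
from datetime import datetime, timezone

def snowflake_to_datetime(tweet_id):
    """Recover the UTC creation time encoded in a Twitter snowflake ID."""
    time_ms = (int(tweet_id) >> 22) + 1288834974657
    return datetime.fromtimestamp(time_ms / 1000, tz=timezone.utc)

# Example: snowflake_to_datetime(0) -> 2010-11-04 01:42:54.657000+00:00 (the epoch itself).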
def run(self):
    response = gtk.FileChooserDialog.run(self)
    self.set_modal(True)
    if response == gtk.RESPONSE_OK:
        filename = self.get_filename()
        if utils.get_ext(filename) in [".m3u", ".pls", ".xspf"]:
            utils.export_playlist(self.songs, filename, utils.get_ext(filename, False))
        else:
            pl_type = "m3u"
            filename = filename + "." + pl_type
            utils.export_playlist(self.songs, filename, pl_type)
    self.destroy()
def run(self):
    Dispatcher.emit("dialog-run")
    response = gtk.FileChooserDialog.run(self)
    self.set_modal(True)
    if response == gtk.RESPONSE_OK:
        filename = self.get_filename()
        if utils.get_ext(filename) in [".m3u", ".pls", ".xspf"]:
            utils.export_playlist(self.songs, filename, utils.get_ext(filename, False))
        else:
            pl_type = "m3u"
            filename = filename + "." + pl_type
            utils.export_playlist(self.songs, filename, pl_type)
    self.destroy()
    Dispatcher.emit("dialog-close")
def get(self, _):
    if self._url_cache:
        return self._url_cache
    print_ = get_print(self.cw)
    for try_ in range(self.try_n):
        try:
            d = ytdl.YoutubeDL()
            info = d.extract_info(self._url)
            url = info['url']
            ext = get_ext(url)
            self.ext = ext
            print_('get_video: {} {}'.format(url, ext))
            if ext.lower() == '.m3u8':
                url = M3u8_stream(url, n_thread=self.n_thread, post_processing=True)
            self._url_cache = url
            return url
        except Exception as e:
            e_ = e
            msg = print_error(e)[-1]
            print_('\nTwitter video Error:\n{}'.format(msg))
            if try_ < self.try_n - 1:
                sleep(10, self.cw)
            else:
                raise e_
def fetch_hubble_image(image_id):
    hubble_api = f"http://hubblesite.org/api/v3/image/{image_id}"
    response = requests.get(hubble_api)
    best_image = response.json()['image_files'][-1]
    image_ext = get_ext(best_image['file_url'])
    image_url = best_image.get('file_url')
    load_image(image_url, f"{image_id}{image_ext}")
def handler(event, context):
    aws_event = aws_events.event_parser.EventParser(event)
    logger.info(aws_event.event_types())
    event = True
    while event:
        event = aws_event.next_event('s3')
        logger.info(event)
        if event is not None:
            event_klass = factory.instantiate('s3', event)
            new_file = event_klass.change_file_extension('txt')
            logger.info(new_file)
            ext = get_ext(event_klass.key)
            logger.info(ext)
            try:
                status, text = ocr_extractor.extract(event_handler=event_klass, context=context, ext=ext)
                if status == 'ok':
                    event_klass.put_content(new_file, text)
                elif status == 'ocr':
                    logger.info('>>> Sending to OCR lambda')
            except Exception as e:
                logger.exception('Extraction exception for <{}>'.format(event_klass.key))
def get(self, referer):
    soup = downloader.read_soup(self._url, referer, session=self.session)
    div = soup.find('div', id='display_image_detail')
    url = urljoin(self._url, div.find('img').parent['href'])
    ext = get_ext(url)
    self.filename = '{:04}{}'.format(self._p, ext)
    return url, self._url
def read(self):
    qs = query_url(self.url)
    for key in qs:
        if key.lower() in ('file', 'filename'):
            name = qs[key][-1]
            break
    else:
        name = os.path.basename(self.url)
        for esc in ['?', '#']:
            name = name.split(esc)[0]
    ext = get_ext(name)
    if not ext:
        try:
            ext = downloader.get_ext(self.url)
        except:
            ext = ''
    name = os.path.splitext(name)[0]
    self.urls.append(self.url)
    id_ = md5(self.url.encode('utf8')).hexdigest()[:8]
    tail = ' ({}){}'.format(id_, ext)
    filename = clean_title(name, n=-len(tail)) + tail
    self.filenames[self.url] = filename
    self.title = filename
def get_output_path(output_dir, input_name, idx, x):
    main = get_main(input_name)
    ext = get_ext(input_name)
    start = str(x['start_time']).replace('.', 'p')
    end = str(x['end_time']).replace('.', 'p')
    output_name = '{}_{}_{}_to_{}{}'.format(main, idx, start, end, ext)
    return path.join(output_dir, output_name)
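# Hypothetical usage of get_output_path above, assuming get_main() returns the base name
# without its extension and get_ext() returns the extension with its dot: for
# input_name='clip.mp4', idx=3 and x={'start_time': 1.5, 'end_time': 4.0}, the result
# would be <output_dir>/clip_3_1p5_to_4p0.mp4 (dots in the times replaced by 'p').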
def get(self, url):
    soup = read_soup(url, self._cw)
    ori = soup.find('li', id='post-option-view-original')
    if ori:
        img = ori.find('a')['href']
    else:
        img = soup.find('li', id='post-info-size').find('a')['href']
    if get_ext(img) == '.zip': #4635
        img = soup.find('section', id='content').find('video')['src']
    img = urljoin(url, img)
    ext = get_ext(img)
    self.filename = '{}{}'.format(self.id, ext)
    return img
def get(self, url):
    print_ = get_print(self.cw)
    if self._url:
        return self._url
    ydl = ytdl.YoutubeDL(cw=self.cw)
    try:
        info = ydl.extract_info(url)
    except Exception as e:
        ex = type(ytdl.get_extractor(url))(ydl)
        _download_info = getattr(ex, '_download_info', None)
        if _download_info is not None:
            vod_id = ex._match_id(url)
            info = _download_info(vod_id)
            print_(info)
        raise
    video_best = info['formats'][-1]
    video = video_best['url']
    ext = get_ext(video)
    self.title = info['title']
    id = info['display_id']
    if ext.lower() == '.m3u8':
        video = M3u8_stream(video, n_thread=4, alter=alter)
        ext = '.mp4'
    self.filename = format_filename(self.title, id, ext)
    self.url_thumb = info['thumbnail']
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)
    self._url = video
    return self._url
def get_ext_(url, session, referer):
    try:
        ext = downloader.get_ext(url, session, referer)
    except Exception as e:
        print(e)
        ext = get_ext(url)
    return ext
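# Most of the snippets in this file assume a URL/path-based get_ext() that returns the
# extension with its leading dot (e.g. '.mp4', '.m3u8') or an empty string when none is
# found. The helper below is only a sketch of that assumed behaviour, not the original
# implementation; query strings and fragments are stripped before splitting.
import os
try:
    from urllib.parse import urlparse   # Python 3
except ImportError:
    from urlparse import urlparse       # Python 2 fallback

def get_ext_sketch(url):
    """Return the lowercased extension (with dot) of a URL or file path."""
    path = urlparse(url).path           # drop '?query' and '#fragment'
    return os.path.splitext(path)[1].lower()

# get_ext_sketch('https://example.com/video.MP4?sig=abc')  ->  '.mp4'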
def __init__(self, url, p, page, cw):
    self.cw = cw
    ext = get_ext(url)
    self.filename = '{:04}{}'.format(p, ext)
    if page.title is not None:
        self.filename = '{}/{}'.format(page.title, self.filename)
    self._url = url
    self.url = LazyUrl(page.url, self.get, self)
def __init__(self, src, referer, title, session):
    ext = get_ext(src)
    if ext == '.m3u8':
        _src = src
        src = M3u8_stream(_src, referer=referer, session=session)
        ext = '.mp4'
    self.url = LazyUrl(referer, lambda _: src, self)
    self.filename = '{}{}'.format(clean_title(title), ext)
def __init__(self, url, page, p):
    ext = get_ext(url)
    if ext.lower()[1:] not in ['jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp']:
        ext = '.jpg'
    self.filename = '{}/{:04}{}'.format(page.title, p, ext)
    self.url = LazyUrl(page.url, lambda _: url, self)
def guardar(self, nombre):
    if not nombre:
        return
    ext = utils.get_fileext()
    if not ext:
        ext = utils.get_ext()
    nombre = nombre.replace(" ", "_")
    text = utils.get_text()
    text = text.replace("$", "\\$")
    utils.file_write(TEMPLATES_PATH + nombre + "." + ext, text)
def __init__(self, id, url, p, lazy=True, img=None):
    self.id = id
    self.p = p
    if lazy:
        self.url = LazyUrl(url, self.get_single, self)
    else:
        self.url = LazyUrl(url, lambda _: img, self)
    ext = get_ext(img)
    self.filename = '{}_p{}{}'.format(id, p, ext)
def get(self, url):
    print_ = get_print(self.cw)
    if self._url:
        return self._url
    ydl = ytdl.YoutubeDL(cw=self.cw)
    try:
        info = ydl.extract_info(url)
    except Exception as e:
        ex = type(ytdl.get_extractor(url))(ydl)
        _download_info = getattr(ex, '_download_info', None)
        if _download_info is not None:
            vod_id = ex._match_id(url)
            info = _download_info(vod_id)
            print_(info)
        if 'HTTPError 403' in str(e):
            raise errors.LoginRequired()
        raise

    def print_video(video):
        print_('[{}] [{}] [{}] {}'.format(video['format_id'], video.get('height'), video.get('tbr'), video['url']))

    videos = [video for video in info['formats'] if video.get('height')]
    videos = sorted(videos, key=lambda video: (video.get('height', 0), video.get('tbr', 0)), reverse=True)
    for video in videos:
        print_video(video)
    for video in videos:
        if video.get('height', 0) <= get_resolution(): #3723
            video_best = video
            break
    else:
        video_best = videos[-1]
    print_video(video)

    video = video_best['url']
    ext = get_ext(video)
    self.title = info['title']
    id = info['display_id']
    if ext.lower() == '.m3u8':
        video = M3u8_stream(video, n_thread=4, alter=alter)
        ext = '.mp4'
    self.filename = format_filename(self.title, id, ext)
    self.url_thumb = info['thumbnail']
    self.thumb = BytesIO()
    downloader.download(self.url_thumb, buffer=self.thumb)
    self._url = video
    return self._url
def get(self, referer):
    # https://j.nozomi.la/nozomi.js
    s_id = str(self._id)
    url_post = 'https://j.nozomi.la/post/{}/{}/{}.json'.format(s_id[-1], s_id[-3:-1], self._id)
    j = downloader.read_json(url_post, referer)
    url = urljoin(referer, j['imageurl'])
    ext = get_ext(url)
    self.filename = '{}{}'.format(self._id, ext)
    return url
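# Worked example for the sharded post URL built above: for _id = 26905532,
# s_id[-1] == '2' and s_id[-3:-1] == '53', so url_post becomes
# 'https://j.nozomi.la/post/2/53/26905532.json'.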
def get_rep_and_ext(path, model_choice):
    rep = rep_base(path, model_choice)
    ext = get_ext(path)
    if ext is None:
        rep['error'] = True
        rep['error_reason'] = "Cannot determine extension"
        rep['ext'] = ''
        return rep
    rep['ext'] = ext
    return rep
def _wrapper(file_path, verbose):
    ext = get_ext(file_path)
    if ext == '.json':
        return parse_json(file_path, verbose)
    elif ext == '.ndjson':
        return parse_ndjson(file_path, verbose)
    else:
        raise AssertionError('File must be in JSON format and could not be parsed.')
def fetch_hubble_image(image_id, dir_path):
    os.makedirs(dir_path, exist_ok=True)
    response = requests.request('GET', 'http://hubblesite.org/api/v3/image/{}'.format(image_id))
    file_url = response.json()['image_files'][-1]['file_url']
    ext = get_ext(file_url)
    path = os.path.join(dir_path, 'hubble_img_{}{}'.format(image_id, ext))
    download_file(file_url, path)
def upload_photo_to_instagram(img_dir, username, password):
    files = get_files_path_from_dir(img_dir)
    image_files = filter(lambda path: get_ext(path).lower() in IMG_EXT_SET, files)
    bot = Bot()
    bot.login(username, password)
    for img in image_files:
        bot.upload_photo(img)
async def load_audio(self):
    idx = self.results_box.index(tk.ACTIVE)
    if not self.audio[idx]:
        r = self.results[idx]
        start, end, video = r.start, r.end, r.video
        self.audio[idx], _ = await ffmpeg_helper.trim(
            start, end, file_input=video, format=get_ext(video), format_out='matroska')
        print('loaded ' + video)
def get(self, _):
    print_ = get_print(self.cw)
    url = self._url
    ext = get_ext(url)
    if ext.lower() == '.gif':
        print_('get_ext: {}, {}'.format(self.id_, url))
        try:
            ext = downloader.get_ext(url)
        except Exception as e: #3235
            print_('Err: {}, {}\n'.format(self.id_, url) + print_error(e)[0])
    self.filename = '{}_p{}{}'.format(self.id_, self.p, ext)
    return url
def extract(event_handler, ext=None, disable_ocr=True):
    logger.info("\n\n\nStarting Document parse\n" + event_handler.key + "\n\n\n")

    # AWS Lambda auto-retries errors for 3x. This should make it disable retrying...kinda.
    # See https://stackoverflow.com/questions/32064038/aws-lambda-function-triggering-multiple-times-for-a-single-event
    #aws_context_retry_uri = os.path.join(temp_uri_prefix, 'aws_lambda_request_ids', context.aws_request_id)
    #if uri_exists(aws_context_retry_uri):
    #    return
    #uri_dump(aws_context_retry_uri, '', mode='w')

    # logger.info('{} invoked with event {}.'.format(os.environ['AWS_LAMBDA_FUNCTION_NAME'], json.dumps(event)))

    if ext is None:
        ext = get_ext(event_handler.key)
    logger.info("file ext " + ext)

    extract_func = PARSE_FUNCS.get(ext)
    if extract_func is None:
        raise ValueError('<{}> has unsupported extension "{}".'.format(event_handler.key, ext))

    fallback_to_ocr = False
    # textractor_results = {}

    if extract_func is False:
        fallback_to_ocr = True
        logger.info('Fallback to OCR for <{}>.'.format(event_handler.key))
        return ('ocr', None)
    else:
        with NamedTemporaryFile(mode='wb', suffix=ext, delete=False) as f:
            document_path = f.name
            f.write(event_handler.get_content().read())
            logger.info('Downloaded <{}> to <{}>.'.format(event_handler.key, document_path))
        #end with

        text = extract_func(document_path, event_handler)
        if extract_func is pdf_to_text and len(text) < 512 and not disable_ocr:
            return ('ocr', None)
        else:
            if len(text) == 0:
                logger.warning('<{}> does not contain any content.'.format(event_handler.key))
            return ('ok', text)
        #end if
    #end if

    return ('ocr', None)
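# The extract() function above reads PARSE_FUNCS three ways: a callable extracts text
# in-process, an explicit False forces the OCR fallback, and a missing key raises
# ValueError. A hypothetical mapping consistent with that contract (the concrete keys and
# extractor names here are assumptions, not the original table) might look like:
#
# PARSE_FUNCS = {
#     '.pdf': pdf_to_text,    # extracted in-process; short results may still go to OCR
#     '.docx': docx_to_text,  # extracted in-process
#     '.png': False,          # image formats: hand off to the OCR lambda
#     '.jpg': False,
# }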
def get(self, url):
    html = downloader.read_html(url)
    soup = Soup(html)
    for li in soup.findAll('li'):
        if li.text.strip() == 'Original image':
            break
    else:
        raise Exception('no Original image')
    url = li.find('a')['href']
    ext = get_ext(url)
    self.filename = u'{}{}'.format(self.id_, ext)
    return url
def fetch_spacex_last_launch(dir_path):
    os.makedirs(dir_path, exist_ok=True)
    response = requests.request('GET', 'https://api.spacexdata.com/v3/launches/latest')
    images_urls = response.json()['links']['flickr_images']
    for num, url in enumerate(images_urls):
        ext = get_ext(url)
        path = os.path.join(dir_path, 'spacex_{}{}'.format(num, ext))
        download_file(url, path)
async def post(self):
    user_id = id_validator(self.request.match_info['user_id'], 'User')
    if self.request.content_type != 'multipart/form-data' or self.request.content_length == 0:
        return web.json_response(data=[])
    user_table = get_model_by_name('user')
    file_table = get_model_by_name('file')
    user_exists = await self.request.app['pg'].fetchval(
        select([exists().where(user_table.c.user_id == user_id)]))
    if not user_exists:
        await self.request.app['pg'].fetchrow(
            user_table.insert().values(**{'user_id': user_id}))
    reader = await self.request.multipart()
    upload_folder = self.request.app['config']['UPLOAD_FOLDER']
    data = []
    while not reader.at_eof():
        image = await reader.next()
        if not image:
            break
        file_name, ext = get_ext(image.filename)
        generated_file_name = '{}.{}'.format(uuid.uuid4(), ext)
        full_path = os.path.abspath(os.path.join(upload_folder, generated_file_name))
        size = 0
        with open(full_path, 'wb') as f:
            while True:
                chunk = await image.read_chunk()
                if not chunk:
                    break
                size += len(chunk)
                f.write(chunk)
        body = {
            'user_id': user_id,
            'name': image.filename,
            'path': full_path,
            'size': size
        }
        file = await self.request.app['pg'].fetchrow(
            file_table.insert().values(**body).returning(literal_column('*')))
        file = row_to_dict(file, 'file')
        data.append(dict(file))
    return web.json_response(data=data)
def get(self, url):
    if self._url:
        return self._url
    self.info = get_info(url)
    self.title = self.info['title']
    id = self.info['id']
    video_best = self.info['formats'][-1]
    self._url = video_best['url']
    ext = get_ext(self._url)
    self.filename = format_filename(self.title, id, ext)
    return self._url
def get(self, _):
    print_ = get_print(self.cw)
    url = self._url
    ext = get_ext(url)
    if ext.lower()[1:] not in ['jpg', 'png', 'mp4']: #4645
        print_('get_ext: {}, {}'.format(self.id_, url))
        try:
            ext = downloader.get_ext(url, referer=_)
        except Exception as e: #3235
            print_('Err: {}, {}\n'.format(self.id_, url) + print_error(e)[0])
    self.filename = '{}_p{}{}'.format(self.id_, self.p, ext)
    return url
def get(self, referer):
    ext = get_ext(self._url)
    name = self.format_.replace('id', '###id*').replace('page', '###page*').replace('artist', '###artist*').replace('title', '###title*')
    name = name.replace('###id*', str(self.id_)).replace('###page*', str(self.p)).replace('###artist*', self.artist).replace('###title*', self.title)
    self.filename = clean_title(name.strip(), allow_dot=True, n=-len(ext)) + ext
    return self._url
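# Note on the two-pass replacement above: the placeholders are first rewritten to the
# sentinel forms '###id*', '###page*', etc. before the actual values are substituted,
# presumably so that a substituted value (for example an artist or title that itself
# contains the word 'id' or 'page') cannot be mangled by a later replace() pass.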
def get_ext(self, complete=True):
    return utils.get_ext(self.get("uri"), complete)
def Download_COGCC_Data(self, seqNum, fClass, outputDir):
    dlFile = False
    # Search list for logs to download
    log_list = ['mud', 'core', 'cores']
    # List variable to count the number of pages on the current COGCC web page
    pageLinks = ['1']
    # Counter - some of the COGCC files have the same name. This variable allows us to create an 'index' number for every file
    fileCount = 1
    # Download COGCC files
    try:
        # Create a mechanize browser
        br = mechanize.Browser()
        # Include some headers that are recognized by the Microsoft server software
        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        # Get site response
        response = br.open("%s%s" % ("http://ogccweblink.state.co.us/results.aspx?id=", seqNum))
        # Read site page html
        html = response.read()
        # Send html data to BeautifulSoup for collection
        soup = BeautifulSoup(html)
        # Cycle through the anchor elements. Identify javascript links so we can count the number of pages that we need to cycle through
        for a1 in soup.find_all('a'):
            if "javascript:__doPostBack('WQResultGridView','Page$" in (a1.get('href')):
                # Append javascript text to the pageLinks list
                pageLinks.append(a1.get_text())
        # Cycle through the index numbers starting from 0 up to the number of pages
        for ix in range(1, (len(pageLinks) + 1)):
            # Collect all anchor elements
            anchors = soup.find_all('a')
            # Cycle from 1 up to the number of anchor elements
            for jx in range(0, len(anchors)):
                # Determine if the current anchor is a javascript header link
                if "javascript:__doPostBack" in (anchors[jx].get("href")):
                    # Determine if the current anchor is a javascript page link
                    if "javascript:__doPostBack('WQResultGridView','Page$" in (anchors[jx].get("href")):
                        # Determine if the current anchor is a javascript page link that we want to click on
                        if "javascript:__doPostBack('WQResultGridView','Page$%s" % (ix + 1) in (anchors[jx].get("href")):
                            # Select the javascript form
                            br.select_form(nr=0)
                            # Set the form to modifiable
                            br.set_all_readonly(False)
                            # Pass over necessary arguments to the javascript __doPostBack function
                            br["__EVENTTARGET"] = 'WQResultGridView'
                            br["__EVENTARGUMENT"] = 'Page$%s' % (ix + 1)
                            # Update the response variable with the site response from br.submit
                            response = br.submit()
                            # Update html data
                            html = response.read()
                            # Update the soup object
                            soup = BeautifulSoup(html)
                        else:
                            pass
                # We need to document the current class value of the current well. This will tell us if we should download the well based on the provided parameters
                elif (jx - 1) % 5 == 0:
                    dlFile = False
                    if fClass == "all":
                        dlFile = True
                        if anchors[jx].get_text() == "Wells" or anchors[jx].get_text() == "Facilities" or anchors[jx].get_text() == "Operator":
                            self.__current_file = 'whf'
                        else:
                            self.__current_file = 'log'
                    elif fClass == "whfs" and (anchors[jx].get_text() == "Wells" or anchors[jx].get_text() == "Facilities" or anchors[jx].get_text() == "Operator"):
                        dlFile = True
                        self.__current_file = 'whf'
                    # This method is really designed for the mud log and core search, not really logs specifically. Need to reinsert the following comment into the fClass == conditional statement:
                    # """(anchors[jx].get_text() == "Wells" or anchors[jx].get_text() == "Well Logs" or anchors[jx].get_text() == "Projects") and"""
                    elif fClass == "logs" and utils.find_substring(anchors[jx + 2].get_text(), log_list):
                        dlFile = True
                        self.__current_file = 'log'
                    else:
                        dlFile = False
                        self.__current_file = ''
                # We want to download from every 4th link. The 4th link contains the name of the file,
                # so download the file if the current (index-3) % 5 == 0
                elif (jx - 3) % 5 == 0 and dlFile == True:
                    # Set the file name
                    fileName = "05" + seqNum + "0000_" + str(fileCount) + "_" + anchors[jx].get_text()
                    # Increment file count
                    fileCount = fileCount + 1
                    # Remove invalid filename characters
                    fileName = re.sub('[\\\/:*?\'\"<>\|\\r\\n]', '', fileName)
                    fileName = re.sub(' ', '_', fileName)
                    # Set the url using the current anchor element href
                    url = "%s" % ("http://ogccweblink.state.co.us/" + anchors[jx].get("href"))
                    # Download the file
                    r = requests.get(url)
                    # Set the output file path; need to extend the extDict to include other mime types (i.e. xml, word docs...)
                    # Use try block because we might encounter unknown media type
                    filePath = os.path.join(outputDir, fileName)
                    if r.status_code == 200:
                        with open(filePath, "wb") as image:
                            image.write(r.content)
                        # Retrieve the file extension from the downloaded file
                        ext = utils.get_ext(filePath)
                        # Set the extension on the current download
                        try:
                            if ext == 'pdf' or self.__current_file == 'log':
                                utils.set_file_ext(filePath, ext)
                            else:
                                pass
                                # Removing the tiff2pdf function. Most Colorado files seem to be PDF files now. (Except logs)
                                #utils.tiff2pdf(filePath)
                        except:
                            pass
                    else:
                        pass
        # Clear out all variables
        br = None
        soup = None
        html = None
        response = None
        fileName = None
        filePath = None
        url = None
        r = None
        return True
    except requests.ConnectionError as e:
        # Clear out all variables
        br = None
        soup = None
        html = None
        response = None
        fileName = None
        filePath = None
        url = None
        r = None
        return False