def process_imgur_url(url):
    """
    Given an imgur URL, determine if it's a direct link to an image or an
    album. If the latter, attempt to determine all images within the album.

    Returns:
        list of imgur URLs
    """
    if 'imgur.com/a/' in url or 'imgur.com/gallery/' in url:
        return extract_imgur_album_urls(url)

    # use beautifulsoup4 to find real link
    # find vid url only
    '''
    try:
        print("TRYING AT %s" % url)
        from bs4 import BeautifulSoup
        html = urlopen(url).read()
        soup = BeautifulSoup(html, 'lxml')
        vid = soup.find('div', {'class': 'video-container'})
        vid_type = 'video/webm'  # or 'video/mp4'
        vid_url = vid.find('source', {'type': vid_type}).get('src')
        if vid_url.startswith('//'):
            vid_url = 'http:' + vid_url
        return vid_url
    except Exception:
        # do nothing for awhile
        pass
    '''

    # Change .png to .jpg for imgur urls.
    if url.endswith('.png'):
        url = url.replace('.png', '.jpg')
    else:
        # Extract the file extension
        ext = pathsplitext(pathbasename(url))[1]
        if ext == '.gifv':
            url = url.replace('.gifv', '.gif')
        if not ext:
            # Append a default
            url += '.jpg'
    return [url]
def process_imgur_url(url):
    """
    Given an imgur URL, determine if it's a direct link to an image or an
    album. If the latter, attempt to determine all images within the album.

    Returns:
        list of imgur URLs
    """
    if 'imgur.com/a/' in url:
        return extract_imgur_album_urls(url)

    # Change .png to .jpg for imgur urls.
    if url.endswith('.png'):
        url = url.replace('.png', '.jpg')
    else:
        # Extract the file extension
        ext = pathsplitext(pathbasename(url))[1]
        if not ext:
            # Append a default
            url += '.jpg'
    return [url]
def process_imgur_url(url):
    """
    Given an imgur URL, determine if it's a direct link to an image or an
    album. If the latter, attempt to determine all images within the album.

    Returns:
        list of imgur URLs
    """
    if 'imgur.com/a/' in url or 'imgur.com/gallery/' in url:
        return extract_imgur_album_urls(url)

    # use beautifulsoup4 to find the real link (video url only)
    try:
        from bs4 import BeautifulSoup
        html = urlopen(url).read()
        soup = BeautifulSoup(html, 'lxml')
        vid = soup.find('div', {'class': 'video-container'})
        vid_type = 'video/webm'  # or 'video/mp4'
        vid_url = vid.find('source', {'type': vid_type}).get('src')
        if vid_url.startswith('//'):
            vid_url = 'http:' + vid_url
        # Return as a list, matching the documented contract and the callers.
        return [vid_url]
    except Exception:
        # No video container found; fall back to the plain-image handling below.
        pass

    # Change .png to .jpg for imgur urls.
    if url.endswith('.png'):
        url = url.replace('.png', '.jpg')
    else:
        # Extract the file extension
        ext = pathsplitext(pathbasename(url))[1]
        if ext == '.gifv':
            url = url.replace('.gifv', '.gif')
        if not ext:
            # Append a default
            url += '.jpg'
    return [url]
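The three process_imgur_url variants above all normalize a direct imgur link into a list of downloadable URLs. Below is a minimal, hypothetical harness for exercising that normalization; it assumes the pathsplitext/pathbasename names are the usual os.path aliases used by this script, stubs out extract_imgur_album_urls (defined elsewhere in the downloader), and imports urlopen so the optional BeautifulSoup branch can run if bs4 is installed.

# Hypothetical test harness (not part of the original script).
from os.path import splitext as pathsplitext, basename as pathbasename
from urllib.request import urlopen

def extract_imgur_album_urls(url):
    # Stub: the real helper scrapes the album/gallery page for image links.
    return []

if __name__ == '__main__':
    for test_url in ('https://i.imgur.com/abc123.png',
                     'https://i.imgur.com/abc123.gifv',
                     'https://i.imgur.com/abc123'):
        print(test_url, '->', process_imgur_url(test_url))
    # With the first and third variants: .png becomes .jpg, .gifv becomes .gif,
    # and a bare id gets a .jpg default appended.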
def main(args=None): ARGS = parse_args(args if len(args) > 0 else sys.argv[1:]) logging.basicConfig(level=logging.INFO) # value at first index is of current subreddit, second index is total TOTAL, DOWNLOADED, ERRORS, SKIPPED, FAILED = [0, 0], [0, 0], [0, 0], [0, 0], [0, 0] PROG_REPORT = [TOTAL, DOWNLOADED, ERRORS, SKIPPED, FAILED] # Create the specified directory if it doesn't already exist. if not pathexists(ARGS.dir): mkdir(ARGS.dir) # If a regex has been specified, compile the rule (once) RE_RULE = None if ARGS.regex: RE_RULE = re.compile(ARGS.regex) # compile reddit comment url to check if url is one of them reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments') LAST = ARGS.last start_time = None ITEM = None sort_type = ARGS.sort_type if sort_type: sort_type = sort_type.lower() # check to see if ARGS.subreddit is subreddit or subreddit-list if os.path.isfile( ARGS.subreddit) and os.path.splitext(ARGS.subreddit)[1] != '': ARGS.subreddit_list = ARGS.subreddit if ARGS.subreddit_list: # ARGS.subreddit_list = ARGS.subreddit_list[0] # can't remember why I did this -jtara1 subreddit_file = ARGS.subreddit_list subreddit_list = parse_subreddit_list(subreddit_file, ARGS.dir) if ARGS.verbose: print('subreddit_list = %s' % subreddit_list) elif not ARGS.subreddit_list: subreddit_list = [(ARGS.subreddit, ARGS.dir)] # file used to store last reddit id log_file = '._history.txt' # iterate through subreddit(s) for index, section in enumerate(subreddit_list): (ARGS.subreddit, ARGS.dir) = section FINISHED = False if ARGS.verbose: print('index: %s, %s, %s' % (index, ARGS.subreddit, ARGS.dir)) # load last_id or create new entry for last_id in log_data log_data, last_id = process_subreddit_last_id(ARGS.subreddit, ARGS.sort_type, ARGS.dir, log_file, ARGS.dir) if ARGS.restart: last_id = '' TOTAL[0], DOWNLOADED[0], ERRORS[0], SKIPPED[0], FAILED[ 0], FILECOUNT = 0, 0, 0, 0, 0, 0 # ITEMS loop - begin the loop to get reddit submissions & download media from them while not FINISHED: if ARGS.verbose: print() ITEMS = getitems(ARGS.subreddit, multireddit=ARGS.multireddit, previd=last_id, reddit_sort=sort_type) # debug ITEMS variable value # if ARGS.verbose: # history_log(os.getcwd(), 'ITEMS.txt', 'write', ITEMS) # measure time and set the program to wait 4 second between request # as per reddit api guidelines end_time = time.process_time() if start_time is not None: elapsed_time = end_time - start_time if elapsed_time <= 4: # throttling time.sleep(4 - elapsed_time) start_time = time.process_time() # No more items to process if not ITEMS: if ARGS.verbose: print('No more ITEMS for %s %s' % (ARGS.subreddit, ARGS.sort_type)) break for ITEM in ITEMS: TOTAL[0] += 1 if ('reddit.com/r/' + ARGS.subreddit + '/comments/' in ITEM['url'] or re.match(reddit_comment_regex, ITEM['url']) is not None): # hotfix for when last item is comment submission which caused infinite looping last_id = ITEM['id'] if ITEM is not None else None if last_id: log_data[ARGS.subreddit][ ARGS.sort_type]['last-id'] = last_id history_log(ARGS.dir, log_file, mode='write', write_data=log_data) continue # don't download if url is reddit metrics url if 'redditmetrics.com' in ITEM['url']: if ARGS.verbose: print('\t%s was skipped.' 
% ITEM['url']) SKIPPED[0] += 1 continue if ITEM['score'] < ARGS.score: if ARGS.verbose: print(' SCORE: {} has score of {}'.format( ITEM['id'], ITEM['score'])) 'which is lower than required score of {}.'.format( ARGS.score) SKIPPED[0] += 1 continue elif ARGS.sfw and ITEM['over_18']: if ARGS.verbose: print(' NSFW: %s is marked as NSFW.' % (ITEM['id'])) SKIPPED[0] += 1 continue elif ARGS.nsfw and not ITEM['over_18']: if ARGS.verbose: print(' Not NSFW, skipping %s' % (ITEM['id'])) SKIPPED[0] += 1 continue elif ARGS.regex and not re.match(RE_RULE, ITEM['title']): if ARGS.verbose: print(' Regex match failed') SKIPPED[0] += 1 continue elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']: if ARGS.verbose: print(' Album found, skipping %s' % (ITEM['id'])) SKIPPED[0] += 1 continue if ARGS.title_contain and ARGS.title_contain.lower( ) not in ITEM['title'].lower(): if ARGS.verbose: print(' Title not contain "{}",'.format( ARGS.title_contain)) 'skipping {}'.format(ITEM['id']) SKIPPED[0] += 1 continue try: URLS = extract_urls(ITEM['url']) except URLError as e: print('URLError %s' % e) continue except Exception as e: _log.exception("%s", e) continue for URL in URLS: try: # Find gfycat if requested if URL.endswith('gif') and ARGS.mirror_gfycat: check = gfycat().check(URL) if check.get("urlKnown"): URL = check.get('webmUrl') # Trim any http query off end of file extension. FILEEXT = pathsplitext(URL)[1] if '?' in FILEEXT: FILEEXT = FILEEXT[:FILEEXT.index('?')] # Only append numbers if more than one file FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '') # create filename based on given input from user if ARGS.filename_format == 'url': FILENAME = '%s%s%s' % (pathsplitext( pathbasename(URL))[0], '', FILEEXT) elif ARGS.filename_format == 'title': FILENAME = '%s%s%s' % (slugify( ITEM['title']), FILENUM, FILEEXT) if len(FILENAME) >= 256: shortened_item_title = slugify( ITEM['title'])[:256 - len(FILENAME)] FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT) else: FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT) # join file with directory FILEPATH = pathjoin(ARGS.dir, FILENAME) # Improve debuggability list URL before download too. # url may be wrong so skip that if URL.encode('utf-8') == 'http://': raise URLError('Url is empty') # Download the image try: dl = skp = 0 if 'imgur.com' in URL: fname = os.path.splitext(FILENAME)[0] save_path = os.path.join(os.getcwd(), ARGS.dir) downloader = ImgurDownloader(URL, save_path, fname, delete_dne=True, debug=False) (dl, skp) = downloader.save_images() else: download_from_url(URL, FILEPATH) dl = 1 # Image downloaded successfully! 
if ARGS.verbose: print('Saved %s as %s' % (URL, FILENAME)) DOWNLOADED[0] += 1 SKIPPED[0] += skp FILECOUNT += 1 except URLError: print('We do not support reddituploads links yet' ' skipping....') except FileExistsException as ERROR: ERRORS[0] += 1 if ARGS.verbose: print(ERROR.message) if ARGS.update: print(' Update complete, exiting.') FINISHED = True break except ImgurException as e: ERRORS[0] += 1 except Exception as e: print(e) ERRORS[0] += 1 if ARGS.num and (DOWNLOADED[0]) >= ARGS.num: print(' Download num limit reached, exiting.') FINISHED = True break except WrongFileTypeException as ERROR: _log_wrongtype(url=URL, target_dir=ARGS.dir, filecount=FILECOUNT, _downloaded=DOWNLOADED[0], filename=FILENAME) SKIPPED[0] += 1 except HTTPError as ERROR: FAILED[0] += 1 except URLError as ERROR: FAILED[0] += 1 except InvalidURL as ERROR: FAILED[0] += 1 except Exception as exc: FAILED[0] += 1 # keep track of last_id id downloaded last_id = ITEM['id'] if ITEM is not None else None if last_id: log_data[ARGS.subreddit][ ARGS.sort_type]['last-id'] = last_id history_log(ARGS.dir, log_file, mode='write', write_data=log_data) # break out of URL loop to end of ITEMS loop if FINISHED: break # update variables in PROG_REPORT in SUBREDDIT loop for var in PROG_REPORT: var[1] += var[0] print('Downloaded from %i reddit submissions' % (DOWNLOADED[1])) print('(Processed %i, Skipped %i, Errors %i)' % (TOTAL[1], SKIPPED[1], ERRORS[1])) return DOWNLOADED[1]
def main(): ARGS = parse_args(sys.argv[1:]) logging.basicConfig(level=logging.INFO) print parse_reddit_argument(ARGS.reddit) TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0 FINISHED = False # Create the specified directory if it doesn't already exist. if not pathexists(ARGS.dir): mkdir(ARGS.dir) # If a regex has been specified, compile the rule (once) RE_RULE = None if ARGS.regex: RE_RULE = re.compile(ARGS.regex) # compile reddit comment url to check if url is one of them reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments') LAST = ARGS.last start_time = None ITEM = None sort_type = ARGS.sort_type if sort_type: sort_type = sort_type.lower() while not FINISHED: ITEMS = getitems(ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST, reddit_sort=sort_type) # measure time and set the program to wait 4 second between request # as per reddit api guidelines end_time = time.clock() if start_time is not None: elapsed_time = end_time - start_time if elapsed_time <= 4: # throttling time.sleep(4 - elapsed_time) start_time = time.clock() if not ITEMS: # No more items to process break for ITEM in ITEMS: TOTAL += 1 # not downloading if url is reddit comment if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or re.match(reddit_comment_regex, ITEM['url']) is not None): print ' Skip:[{}]'.format(ITEM['url']) continue if ITEM['score'] < ARGS.score: if ARGS.verbose: print ' SCORE: {} has score of {}'.format( ITEM['id'], ITEM['score']) 'which is lower than required score of {}.'.format( ARGS.score) SKIPPED += 1 continue elif ARGS.sfw and ITEM['over_18']: if ARGS.verbose: print ' NSFW: %s is marked as NSFW.' % (ITEM['id']) SKIPPED += 1 continue elif ARGS.nsfw and not ITEM['over_18']: if ARGS.verbose: print ' Not NSFW, skipping %s' % (ITEM['id']) SKIPPED += 1 continue elif ARGS.regex and not re.match(RE_RULE, ITEM['title']): if ARGS.verbose: print ' Regex match failed' SKIPPED += 1 continue elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']: if ARGS.verbose: print ' Album found, skipping %s' % (ITEM['id']) SKIPPED += 1 continue if ARGS.title_contain and ARGS.title_contain.lower( ) not in ITEM['title'].lower(): if ARGS.verbose: print ' Title not contain "{}",'.format( ARGS.title_contain) 'skipping {}'.format(ITEM['id']) SKIPPED += 1 continue FILECOUNT = 0 try: URLS = extract_urls(ITEM['url']) except Exception: _log.exception("Failed to extract urls for %r", URLS) continue for URL in URLS: try: # Find gfycat if requested if URL.endswith('gif') and ARGS.mirror_gfycat: check = gfycat().check(URL) if check.get("urlKnown"): URL = check.get('webmUrl') # Trim any http query off end of file extension. FILEEXT = pathsplitext(URL)[1] if '?' in FILEEXT: FILEEXT = FILEEXT[:FILEEXT.index('?')] # Only append numbers if more than one file FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '') # create filename based on given input from user if ARGS.filename_format == 'url': FILENAME = '%s%s%s' % (pathsplitext( pathbasename(URL))[0], '', FILEEXT) elif ARGS.filename_format == 'title': FILENAME = '%s%s%s' % (slugify( ITEM['title']), FILENUM, FILEEXT) if len(FILENAME) >= 256: shortened_item_title = slugify( ITEM['title'])[:256 - len(FILENAME)] FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT) else: FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT) # join file with directory FILEPATH = pathjoin(ARGS.dir, FILENAME) # Improve debuggability list URL before download too. 
# url may be wrong so skip that if URL.encode('utf-8') == 'http://': raise URLError('Url is empty') else: text_templ = ' Attempting to download URL[{}] as [{}].' print text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8')) # Download the image try: download_from_url(URL, FILEPATH) # Image downloaded successfully! print ' Sucessfully downloaded URL [%s] as [%s].' % ( URL, FILENAME) DOWNLOADED += 1 FILECOUNT += 1 except Exception, e: print ' %s' % str(e) ERRORS += 1 if ARGS.num and DOWNLOADED >= ARGS.num: FINISHED = True break except WrongFileTypeException as ERROR: print ' %s' % (ERROR) _log_wrongtype(url=URL, target_dir=ARGS.dir, filecount=FILECOUNT, _downloaded=DOWNLOADED, filename=FILENAME) SKIPPED += 1 except FileExistsException as ERROR: print ' %s' % (ERROR) ERRORS += 1 if ARGS.update: print ' Update complete, exiting.' FINISHED = True break except HTTPError as ERROR: print ' HTTP ERROR: Code %s for %s.' % (ERROR.code, URL) FAILED += 1
def main(): ARGS = parse_args(sys.argv[1:]) logging.basicConfig(level=logging.INFO) print(parse_reddit_argument(ARGS.reddit)) global lock lock.acquire(1) global TOTAL, DOWNLOADED, ERRORS, SKIPPED, FAILED, FILECOUNT FINISHED = False lock.release() threadList = [] # Create the specified directory if it doesn't already exist. if not pathexists(ARGS.dir): mkdir(ARGS.dir) # If a regex has been specified, compile the rule (once) RE_RULE = None if ARGS.regex: RE_RULE = re.compile(ARGS.regex) # compile reddit comment url to check if url is one of them reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments') LAST = ARGS.last ITEM = None sort_type = ARGS.sort_type if sort_type: sort_type = sort_type.lower() while not FINISHED: ITEMS = getitems( ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST, reddit_sort=sort_type) if not ITEMS: # No more items to process break for ITEM in ITEMS: TOTAL += 1 # not downloading if url is reddit comment if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or re.match(reddit_comment_regex, ITEM['url']) is not None): print(' Skip:[{}]'.format(ITEM['url'])) continue if ITEM['score'] < ARGS.score: if ARGS.verbose: print(' SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']), 'which is lower than required score of {}.'.format(ARGS.score)) SKIPPED += 1 continue elif ARGS.sfw and ITEM['over_18']: if ARGS.verbose: print(' NSFW: %s is marked as NSFW.' % (ITEM['id'])) SKIPPED += 1 continue elif ARGS.nsfw and not ITEM['over_18']: if ARGS.verbose: print(' Not NSFW, skipping %s' % (ITEM['id'])) SKIPPED += 1 continue elif ARGS.regex and not re.match(RE_RULE, ITEM['title']): if ARGS.verbose: print(' Regex not matched') SKIPPED += 1 continue elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']: if ARGS.verbose: print(' Album found, skipping %s' % (ITEM['id'])) SKIPPED += 1 continue if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower(): if ARGS.verbose: print(' Title does not contain "{}",'.format(ARGS.title_contain), 'skipping {}'.format(ITEM['id'])) SKIPPED += 1 continue lock.acquire(1) FILECOUNT = 0 lock.release() try: URLS = extract_urls(ITEM['url']) except Exception: _log.exception("Failed to extract urls for %r", URLS) continue for URL in URLS: try: # Find gfycat if requested if URL.endswith('gif') and ARGS.mirror_gfycat: check = gfycat().check(URL) if check.get("urlKnown"): URL = check.get('webmUrl') FILEEXT = pathsplitext(URL)[1] # Trim any http query off end of file extension. FILEEXT = re.sub(r'\?.*$', '', FILEEXT) if not FILEEXT: # A more usable option that empty. # The extension can be fixed after downloading, but then the 'already downloaded' check will be harder. FILEEXT = '.jpg' # Only append numbers if more than one file lock.acquire(1) FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '') lock.release() # create filename based on given input from user if ARGS.filename_format == 'url': FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT) elif ARGS.filename_format == 'title': FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT) if len(FILENAME) >= 256: shortened_item_title = slugify(ITEM['title'])[:256-len(FILENAME)] FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT) else: FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT) # join file with directory FILEPATH = pathjoin(ARGS.dir, FILENAME) # Improve debuggability list URL before download too. 
# url may be wrong so skip that if URL.encode('utf-8') == 'http://': raise URLError('Url is empty') else: text_templ = ' Attempting to download URL[{}] as [{}].' print(text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8'))) # Download the image while(threading.active_count() > 5): time.sleep(5) lock.acquire(1) urlCopy = URL filepathCopy = FILEPATH t = threading.Thread(target=download_threaded, args=([urlCopy, filepathCopy, DOWNLOADED, FILECOUNT, ERRORS], )) t.start() lock.release() threadList.append(t) lock.acquire(1) if ARGS.num and DOWNLOADED >= ARGS.num: FINISHED = True lock.release() break lock.release() except WrongFileTypeException as ERROR: print(' %s' % (ERROR,)) lock.acquire(1) _log_wrongtype(url=URL, target_dir=ARGS.dir, filecount=FILECOUNT, _downloaded=DOWNLOADED, filename=FILENAME) lock.release() SKIPPED += 1 except FileExistsException as ERROR: print(' %s' % (ERROR,)) ERRORS += 1 if ARGS.update: print(' Update complete, exiting.') FINISHED = True break except HTTPError as ERROR: print(' HTTP ERROR: Code %s for %s.' % (ERROR.code, URL)) FAILED += 1 except URLError as ERROR: print(' URL ERROR: %s!' % (URL,)) FAILED += 1 except InvalidURL as ERROR: print(' Invalid URL: %s!' % (URL,)) FAILED += 1 except Exception as exc: _log.exception("Problem with %r: %r", URL, exc) FAILED += 1 if FINISHED: break LAST = ITEM['id'] if ITEM is not None else None # Wait for each thread to finish downloading for t in threadList: t.join() print('Downloaded {} files'.format(DOWNLOADED), '(Processed {}, Skipped {}, Exists {})'.format(TOTAL, SKIPPED, ERRORS))
def main(): ARGS = parse_args(sys.argv[1:]) logging.basicConfig(level=logging.INFO) print(parse_reddit_argument(ARGS.reddit)) TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0 FINISHED = False # Create the specified directory if it doesn't already exist. if not pathexists(ARGS.dir): mkdir(ARGS.dir) # If a regex has been specified, compile the rule (once) RE_RULE = None if ARGS.regex: RE_RULE = re.compile(ARGS.regex) # compile reddit comment url to check if url is one of them reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments') LAST = ARGS.last start_time = None ITEM = None sort_type = ARGS.sort_type if sort_type: sort_type = sort_type.lower() #for downloading comments reddit = praw.Reddit('bot1') redanno = dict() capdict = dict() captions = [] #for downloading comments and saving in pre-format cap_l = [] cap_5 = [] while not FINISHED: ITEMS = getitems( ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST, reddit_sort=sort_type) # measure time and set the program to wait 4 second between request # as per reddit api guidelines end_time = time.clock() if start_time is not None: elapsed_time = end_time - start_time if elapsed_time <= 4: # throttling time.sleep(4 - elapsed_time) start_time = time.clock() if not ITEMS: # No more items to process break for ITEM in ITEMS: TOTAL += 1 #print("This is ITEM['id'] : ", ITEM['id']) #print("This is ITEM : ", ITEM) # not downloading if url is reddit comment if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or re.match(reddit_comment_regex, ITEM['url']) is not None): print(' Skip:[{}]'.format(ITEM['url'])) continue if ITEM['score'] < ARGS.score: if ARGS.verbose: print(' SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']), 'which is lower than required score of {}.'.format(ARGS.score)) SKIPPED += 1 continue elif ARGS.sfw and ITEM['over_18']: if ARGS.verbose: print(' NSFW: %s is marked as NSFW.' 
% (ITEM['id'])) SKIPPED += 1 continue elif ARGS.nsfw and not ITEM['over_18']: if ARGS.verbose: print(' Not NSFW, skipping %s' % (ITEM['id'])) SKIPPED += 1 continue elif ARGS.regex and not re.match(RE_RULE, ITEM['title']): if ARGS.verbose: print(' Regex not matched') SKIPPED += 1 continue elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']: if ARGS.verbose: print(' Album found, skipping %s' % (ITEM['id'])) SKIPPED += 1 continue if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower(): if ARGS.verbose: print(' Title does not contain "{}",'.format(ARGS.title_contain), 'skipping {}'.format(ITEM['id'])) SKIPPED += 1 continue FILECOUNT = 0 try: URLS = extract_urls(ITEM['url']) #print("This is URLS :", URLS) #download_comments(ITEM['id']) #Using the def function does not save all the comments from each submission comm = reddit.submission(id=ITEM['id']) comm.comment_sort = 'best' comm.comments.replace_more(limit=0) # replace "more comments" from commentforest comm.comment_limit = 5 comm_list = list(comm.comments) if len(comm_list) < 5: continue #print("This is a list: ", comm_list) #print("This is len : ", len(comm_list)) #don't empty the cap_5 because reading become difficult #cap_5 = [] for i in range(5): #print("This is comment: ", comm_list[i].body) capdict["image_id"] = ITEM['id'] capdict["title"] = ITEM['title'] capdict["post_upvotes"] = ITEM['ups'] capdict["comment"] = comm_list[i].body capdict["comment_score"] = comm_list[i].score captions.append(capdict) #print(captions) capdict = dict() filename = ITEM['id'] + '.jpg#' + str(i) caption = comm_list[i].body caption = ' '.join(caption.split()) cap_5.append(filename + '\t' + caption) if len(cap_5)%50 == 0: print("\n") print("{} images have been downloaded.".format((len(cap_5)//50) * 10)) print("-"*50) #cap_l.append(cap_5) except Exception: _log.exception("Failed to extract urls for %r", URLS) continue for URL in URLS: try: # Find gfycat if requested if URL.endswith('gif') and ARGS.mirror_gfycat: check = gfycat().check(URL) if check.get("urlKnown"): URL = check.get('webmUrl') FILEEXT = pathsplitext(URL)[1] # Trim any http query off end of file extension. FILEEXT = re.sub(r'\?.*$', '', FILEEXT) if not FILEEXT: # A more usable option that empty. # The extension can be fixed after downloading, but then the 'already downloaded' check will be harder. FILEEXT = '.jpg' # Only append numbers if more than one file FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '') # create filename based on given input from user if ARGS.filename_format == 'url': FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT) elif ARGS.filename_format == 'title': FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT) if len(FILENAME) >= 256: shortened_item_title = slugify(ITEM['title'])[:256-len(FILENAME)] FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT) else: FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT) # join file with directory FILEPATH = pathjoin(ARGS.dir, FILENAME) # Improve debuggability list URL before download too. # url may be wrong so skip that if URL.encode('utf-8') == 'http://': raise URLError('Url is empty') else: text_templ = ' Attempting to download URL[{}] as [{}].' #print(text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8'))) # Download the image try: download_from_url(URL, FILEPATH) #download_comments(ITEM[id]) # Image downloaded successfully print(' Successfully downloaded URL [%s] as [%s].' 
% (URL, FILENAME)) DOWNLOADED += 1 FILECOUNT += 1 except Exception as exc: print(' %s' % (exc,)) ERRORS += 1 if ARGS.num and DOWNLOADED >= ARGS.num: FINISHED = True break except WrongFileTypeException as ERROR: print(' %s' % (ERROR,)) _log_wrongtype(url=URL, target_dir=ARGS.dir, filecount=FILECOUNT, _downloaded=DOWNLOADED, filename=FILENAME) SKIPPED += 1 except FileExistsException as ERROR: print(' %s' % (ERROR,)) ERRORS += 1 if ARGS.update: print(' Update complete, exiting.') FINISHED = True break except HTTPError as ERROR: print(' HTTP ERROR: Code %s for %s.' % (ERROR.code, URL)) FAILED += 1 except URLError as ERROR: print(' URL ERROR: %s!' % (URL,)) FAILED += 1 except InvalidURL as ERROR: print(' Invalid URL: %s!' % (URL,)) FAILED += 1 except Exception as exc: _log.exception("Problem with %r: %r", URL, exc) FAILED += 1 if FINISHED: break LAST = ITEM['id'] if ITEM is not None else None """ #finally saving the comments json redanno['pics'] = captions with open ("reddit_captions.json", "w") as f: json.dump(redanno, f, indent=4) #saving image id and their captions sequentially with open("annotations.txt", "w") as f: for s in cap_5: f.write(str(s) + "\n") """ print('Downloaded {} files'.format(DOWNLOADED), '(Processed {}, Skipped {}, Exists {})'.format(TOTAL, SKIPPED, ERRORS)) #finally saving the comments json redanno['pics'] = captions with open ("reddit_captions.json", "w") as f: json.dump(redanno, f, indent=4) #saving image id and their captions sequentially with open("annotations.txt", "w") as f: for s in cap_5: f.write(str(s) + "\n") print("annotations.txt has been saved.")
def main(): ARGS = parse_args(sys.argv[1:]) logging.basicConfig(level=logging.INFO) print parse_reddit_argument(ARGS.reddit) TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0 FINISHED = False # Create the specified directory if it doesn't already exist. if not pathexists(ARGS.dir): mkdir(ARGS.dir) # If a regex has been specified, compile the rule (once) RE_RULE = None if ARGS.regex: RE_RULE = re.compile(ARGS.regex, re.UNICODE) # compile reddit comment url to check if url is one of them reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments') LAST = ARGS.last start_time = None ITEM = None while not FINISHED: ITEMS = getitems( ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST, reddit_sort=ARGS.sort_type) # measure time and set the program to wait 4 second between request # as per reddit api guidelines end_time = time.clock() if start_time is not None: elapsed_time = end_time - start_time if elapsed_time <= 4: # throttling time.sleep(4 - elapsed_time) start_time = time.clock() if not ITEMS: # No more items to process break for ITEM in ITEMS: TOTAL += 1 # not downloading if url is reddit comment if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or re.match(reddit_comment_regex, ITEM['url']) is not None): print ' Skip:[{}]'.format(ITEM['url']) continue # verifies dimensions # title of submission must contain image dimensions in format [ WIDTH x HEIGHT ] # brackets can also be parenthesis and the x may also be special character × ITEM['title'] = ITEM['title'].replace(u'×', u'x') dim_pattern = re.compile(ur'[\[|\(][0-9]+[ ]*[x|X][ ]*[0-9]+[\]|\)]', re.UNICODE) dim_regex = dim_pattern.search(ITEM['title']) if dim_regex: dimension = dim_regex.group(0).replace('[','').replace(']','').replace('(','').replace(')','').replace(' ','') dimension = re.split('x|×|X',dimension) if len(dimension) == 2: if int(dimension[0]) < ARGS.width or int(dimension[1]) < ARGS.height: if ARGS.verbose: print ' DIMENSION: {} is smaller than {}x{}.'.format(ITEM['title'], ARGS.width, ARGS.height) SKIPPED += 1 continue if ITEM['score'] >= ARGS.maxscore: if ARGS.verbose: print ' SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']) 'which is higher than maximum score of {}.'.format(ARGS.maxscore) SKIPPED += 1 continue if ITEM['score'] < ARGS.score: if ARGS.verbose: print ' SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']) 'which is lower than required score of {}.'.format(ARGS.score) SKIPPED += 1 continue elif ARGS.sfw and ITEM['over_18']: if ARGS.verbose: print ' NSFW: %s is marked as NSFW.' 
% (ITEM['id']) SKIPPED += 1 continue elif ARGS.nsfw and not ITEM['over_18']: if ARGS.verbose: print ' Not NSFW, skipping %s' % (ITEM['id']) SKIPPED += 1 continue elif ARGS.regex and not re.match(RE_RULE, ITEM['title']): if ARGS.verbose: print ' Regex match failed' SKIPPED += 1 continue elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']: if ARGS.verbose: print ' Album found, skipping %s' % (ITEM['id']) SKIPPED += 1 continue if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower(): if ARGS.verbose: print ' Title not contain "{}",'.format(ARGS.title_contain) 'skipping {}'.format(ITEM['id']) SKIPPED += 1 continue FILECOUNT = 0 try: URLS = extract_urls(ITEM['url']) except Exception: _log.exception("Failed to extract urls for %r", URLS) continue for URL in URLS: try: # Find gfycat if requested if URL.endswith('gif') and ARGS.mirror_gfycat: check = gfycat().check(URL) if check.get("urlKnown"): URL = check.get('webmUrl') # Trim any http query off end of file extension. FILEEXT = pathsplitext(URL)[1] if '?' in FILEEXT: FILEEXT = FILEEXT[:FILEEXT.index('?')] # Only append numbers if more than one file FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '') # create filename based on given input from user if ARGS.filename_format == 'url': FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT) elif ARGS.filename_format == 'title': FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT) if len(FILENAME) >= 256: shortened_item_title = slugify(ITEM['title'])[:256-len(FILENAME)] FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT) else: FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT) # join file with directory FILEPATH = pathjoin(ARGS.dir, FILENAME) # Improve debuggability list URL before download too. # url may be wrong so skip that if URL.encode('utf-8') == 'http://': raise URLError('Url is empty') else: text_templ = ' Attempting to download URL[{}] as [{}].' print text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8')) # Download the image try: download_from_url(URL, FILEPATH) # Image downloaded successfully! print ' Sucessfully downloaded URL [%s] as [%s].' % (URL, FILENAME) DOWNLOADED += 1 FILECOUNT += 1 except Exception,e: print ' %s' % str(e) ERRORS += 1 if ARGS.num and DOWNLOADED >= ARGS.num: FINISHED = True break except WrongFileTypeException as ERROR: print ' %s' % (ERROR) _log_wrongtype(url=URL, target_dir=ARGS.dir, filecount=FILECOUNT, _downloaded=DOWNLOADED, filename=FILENAME) SKIPPED += 1 except FileExistsException as ERROR: print ' %s' % (ERROR) ERRORS += 1 if ARGS.update: print ' Update complete, exiting.' FINISHED = True break except HTTPError as ERROR: print ' HTTP ERROR: Code %s for %s.' % (ERROR.code, URL) FAILED += 1
def main(): ARGS = parse_args(sys.argv[1:]) logging.basicConfig(level=logging.INFO) print(parse_reddit_argument(ARGS.reddit)) TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0 FINISHED = False # Create the specified directory if it doesn't already exist. if not pathexists(ARGS.dir): mkdir(ARGS.dir) # If a regex has been specified, compile the rule (once) RE_RULE = None if ARGS.regex: RE_RULE = re.compile(ARGS.regex) # compile reddit comment url to check if url is one of them reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments') LAST = ARGS.last start_time = None ITEM = None sort_type = ARGS.sort_type if sort_type: sort_type = sort_type.lower() while not FINISHED: ITEMS = getitems(ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST, reddit_sort=sort_type, user=ARGS.user) # measure time and set the program to wait 4 second between request # as per reddit api guidelines end_time = time.clock() if start_time is not None: elapsed_time = end_time - start_time if elapsed_time <= 4: # throttling time.sleep(4 - elapsed_time) start_time = time.clock() if not ITEMS: # No more items to process break for ITEM in ITEMS: TOTAL += 1 # not downloading if url is reddit comment if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or re.match(reddit_comment_regex, ITEM['url']) is not None): print(' Skip:[{}]'.format(ITEM['url'])) continue if ITEM['score'] < ARGS.score: if ARGS.verbose: print( ' SCORE: {} has score of {}'.format( ITEM['id'], ITEM['score']), 'which is lower than required score of {}.'.format( ARGS.score)) SKIPPED += 1 continue elif ARGS.sfw and ITEM['over_18']: if ARGS.verbose: print(' NSFW: %s is marked as NSFW.' % (ITEM['id'])) SKIPPED += 1 continue elif ARGS.nsfw and not ITEM['over_18']: if ARGS.verbose: print(' Not NSFW, skipping %s' % (ITEM['id'])) SKIPPED += 1 continue elif ARGS.regex and not re.match(RE_RULE, ITEM['title']): if ARGS.verbose: print(' Regex not matched') SKIPPED += 1 continue elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']: if ARGS.verbose: print(' Album found, skipping %s' % (ITEM['id'])) SKIPPED += 1 continue if ARGS.title_contain and ARGS.title_contain.lower( ) not in ITEM['title'].lower(): if ARGS.verbose: print( ' Title does not contain "{}",'.format( ARGS.title_contain), 'skipping {}'.format(ITEM['id'])) SKIPPED += 1 continue FILECOUNT = 0 COMMENTS_ALBUM = False try: URLS = extract_urls(ITEM['url']) if ARGS.comment_album: if re.search("album.+?comment", ITEM['title'], re.IGNORECASE): comments_url = "https://www.reddit.com" + ITEM[ 'permalink'] + ".json" print( ' Album in comments appears to be available for %s. 
Attempting to find URL in top comment: %s' % (ITEM['title'], comments_url)) comment_album_urls = [] try: time.sleep(4) req = Request(comments_url) json = urlopen(req).read() data = JSONDecoder().decode(json) comments = [ x['data'] for x in data[1]['data']['children'] ] print(' First comment text: %s' % (comments[int( ARGS.comment_album_offset)]['body'])) comment_urls = re.finditer( r"[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?", comments[int( ARGS.comment_album_offset)]['body']) for comment_url in comment_urls: comment_url = comment_url.group() comment_url = extract_urls(comment_url) comment_album_urls += comment_url if len(comment_album_urls) == 0: print( ' Failed to retrieve album from comments' ) else: URLS = URLS + comment_album_urls COMMENTS_ALBUM = True except HTTPError as ERROR: error_message = '\tHTTP ERROR: Code %s for %s' % ( ERROR.code, comments_url) sys.exit(error_message) except ValueError as ERROR: if ERROR.args[ 0] == 'No JSON object could be decoded': error_message = 'ERROR: subreddit "%s" does not exist' % ( subreddit) sys.exit(error_message) raise ERROR except Exception: _log.exception("Failed to extract urls for %r", ITEM['url']) continue for URL in URLS: try: # Find gfycat if requested if URL.endswith('gif') and ARGS.mirror_gfycat: check = gfycat().check(URL) if check.get("urlKnown"): URL = check.get('webmUrl') FILEEXT = pathsplitext(URL)[1] # Trim any http query off end of file extension. FILEEXT = re.sub(r'\?.*$', '', FILEEXT) if not FILEEXT: # A more usable option that empty. # The extension can be fixed after downloading, but then the 'already downloaded' check will be harder. FILEEXT = '.jpg' # Only append numbers if more than one file FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '') # create filename based on given input from user if ARGS.filename_format == 'url': FILENAME = '%s%s%s' % (pathsplitext( pathbasename(URL))[0], '', FILEEXT) elif ARGS.filename_format == 'title': FILENAME = '%s%s%s' % (slugify( ITEM['title']), FILENUM, FILEEXT) if len(FILENAME) >= 256: shortened_item_title = slugify( ITEM['title'])[:256 - len(FILENAME)] FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT) elif ARGS.filename_format == 'title-id': FILENAME = '%s%s (%s)%s' % (slugify( ITEM['title']), FILENUM, ITEM['id'], FILEEXT) if len(FILENAME) >= 256: shortened_item_title = slugify( ITEM['title'])[:256 - len(FILENAME)] FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT) else: FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT) # join file with directory FILEPATH = pathjoin(ARGS.dir, FILENAME) # Improve debuggability list URL before download too. # url may be wrong so skip that if URL.encode('utf-8') == 'http://': raise URLError('Url is empty') else: text_templ = ' Attempting to download URL[{}] as [{}].' print( text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8'))) # Download the image download_from_url(URL, FILEPATH) # Image downloaded successfully! print(' Sucessfully downloaded URL [%s] as [%s].' 
% (URL, FILENAME)) DOWNLOADED += 1 FILECOUNT += 1 if ARGS.num and DOWNLOADED >= ARGS.num: FINISHED = True break except WrongFileTypeException as ERROR: print(' %s' % (ERROR, )) _log_wrongtype(url=URL, target_dir=ARGS.dir, filecount=FILECOUNT, _downloaded=DOWNLOADED, filename=FILENAME) SKIPPED += 1 except FileExistsException as ERROR: print(' %s' % (ERROR, )) ERRORS += 1 FILECOUNT += 1 if ARGS.update: print(' Update complete, exiting.') FINISHED = True break except HTTPError as ERROR: print(' HTTP ERROR: Code %s for %s.' % (ERROR.code, URL)) FAILED += 1 except URLError as ERROR: print(' URL ERROR: %s!' % (URL, )) FAILED += 1 except InvalidURL as ERROR: print(' Invalid URL: %s!' % (URL, )) FAILED += 1 except Exception as exc: _log.exception("Problem with %r: %r", URL, exc) FAILED += 1 if FINISHED: break LAST = ITEM['id'] if ITEM is not None else None print( 'Downloaded {} files'.format(DOWNLOADED), '(Processed {}, Skipped {}, Exists {})'.format(TOTAL, SKIPPED, ERRORS))
def main(): ARGS = parse_args(sys.argv[1:]) logging.basicConfig(level=logging.INFO) print(parse_reddit_argument(ARGS.reddit)) TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0 FINISHED = False # Create the specified directory if it doesn't already exist. if not pathexists(ARGS.dir): mkdir(ARGS.dir) # If a regex has been specified, compile the rule (once) RE_RULE = None if ARGS.regex: RE_RULE = re.compile(ARGS.regex) # compile reddit comment url to check if url is one of them reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments') LAST = ARGS.last start_time = None ITEM = None sort_type = ARGS.sort_type if sort_type: sort_type = sort_type.lower() while not FINISHED: ITEMS = getitems( ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST, reddit_sort=sort_type) # measure time and set the program to wait 4 second between request # as per reddit api guidelines end_time = time.clock() if start_time is not None: elapsed_time = end_time - start_time if elapsed_time <= 4: # throttling time.sleep(4 - elapsed_time) start_time = time.clock() if not ITEMS: # No more items to process break for ITEM in ITEMS: TOTAL += 1 # not downloading if url is reddit comment if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or re.match(reddit_comment_regex, ITEM['url']) is not None): print(' Skip:[{}]'.format(ITEM['url'])) continue if ITEM['score'] < ARGS.score: if ARGS.verbose: print(' SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']), 'which is lower than required score of {}.'.format(ARGS.score)) SKIPPED += 1 continue elif ARGS.sfw and ITEM['over_18']: if ARGS.verbose: print(' NSFW: %s is marked as NSFW.' % (ITEM['id'])) SKIPPED += 1 continue elif ARGS.nsfw and not ITEM['over_18']: if ARGS.verbose: print(' Not NSFW, skipping %s' % (ITEM['id'])) SKIPPED += 1 continue elif ARGS.regex and not re.match(RE_RULE, ITEM['title']): if ARGS.verbose: print(' Regex not matched') SKIPPED += 1 continue elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']: if ARGS.verbose: print(' Album found, skipping %s' % (ITEM['id'])) SKIPPED += 1 continue if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower(): if ARGS.verbose: print(' Title does not contain "{}",'.format(ARGS.title_contain), 'skipping {}'.format(ITEM['id'])) SKIPPED += 1 continue FILECOUNT = 0 try: URLS = extract_urls(ITEM['url']) except Exception: _log.exception("Failed to extract urls for %r", URLS) continue for URL in URLS: try: # Find gfycat if requested if URL.endswith('gif') and ARGS.mirror_gfycat: check = gfycat().check(URL) if check.get("urlKnown"): URL = check.get('webmUrl') FILEEXT = pathsplitext(URL)[1] # Trim any http query off end of file extension. FILEEXT = re.sub(r'\?.*$', '', FILEEXT) if not FILEEXT: # A more usable option that empty. # The extension can be fixed after downloading, but then the 'already downloaded' check will be harder. 
FILEEXT = '.jpg' # Only append numbers if more than one file FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '') # create filename based on given input from user if ARGS.filename_format == 'url': FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT) elif ARGS.filename_format == 'title': FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT) if len(FILENAME) >= 256: shortened_item_title = slugify(ITEM['title'])[:256-len(FILENAME)] FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT) else: FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT) # join file with directory FILEPATH = pathjoin(ARGS.dir, FILENAME) # Improve debuggability list URL before download too. # url may be wrong so skip that if URL.encode('utf-8') == 'http://': raise URLError('Url is empty') else: text_templ = ' Attempting to download URL[{}] as [{}].' print(text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8'))) # Download the image try: download_from_url(URL, FILEPATH) # Image downloaded successfully! print(' Sucessfully downloaded URL [%s] as [%s].' % (URL, FILENAME)) DOWNLOADED += 1 FILECOUNT += 1 except Exception as exc: print(' %s' % (exc,)) ERRORS += 1 if ARGS.num and DOWNLOADED >= ARGS.num: FINISHED = True break except WrongFileTypeException as ERROR: print(' %s' % (ERROR,)) _log_wrongtype(url=URL, target_dir=ARGS.dir, filecount=FILECOUNT, _downloaded=DOWNLOADED, filename=FILENAME) SKIPPED += 1 except FileExistsException as ERROR: print(' %s' % (ERROR,)) ERRORS += 1 if ARGS.update: print(' Update complete, exiting.') FINISHED = True break except HTTPError as ERROR: print(' HTTP ERROR: Code %s for %s.' % (ERROR.code, URL)) FAILED += 1 except URLError as ERROR: print(' URL ERROR: %s!' % (URL,)) FAILED += 1 except InvalidURL as ERROR: print(' Invalid URL: %s!' % (URL,)) FAILED += 1 except Exception as exc: _log.exception("Problem with %r: %r", URL, exc) FAILED += 1 if FINISHED: break LAST = ITEM['id'] if ITEM is not None else None print('Downloaded {} files'.format(DOWNLOADED), '(Processed {}, Skipped {}, Exists {})'.format(TOTAL, SKIPPED, ERRORS))
def old_tcweb_fnames_remove_duplicates(fname, mins_to_remove=10, remove_files=False):
    # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
    # 20201010.222325.WP162020.gmi.GPM.89H.40kts.14p2.1p0.jpg.yaml
    matching_fnames = []
    removed_fnames = []
    saved_fnames = []
    ext1 = pathsplitext(fname)[-1]
    ext2 = pathsplitext(pathsplitext(fname)[0])[-1]
    ext3 = pathsplitext(pathsplitext(pathsplitext(fname)[0])[0])[-1]
    if (ext1 == '.jpg') or (ext1 == '.yaml' and ext2 == '.jpg'):
        LOG.info('MATCHES EXT FORMAT. jpg or jpg.yaml. '
                 'Attempting to remove old_tcweb duplicates')
    else:
        LOG.info('NOT REMOVING DUPLICATES. Not old_tcweb filename, '
                 'not jpg or jpg.yaml.')
        return [], []

    dirname = pathdirname(fname)
    basename = pathbasename(fname)
    # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
    parts = basename.split('.')
    # Expect 9 fields for .jpg, 10 for .jpg.yaml.
    if not ((len(parts) == 10 and ext1 == '.yaml')
            or (len(parts) == 9 and ext1 == '.jpg')):
        LOG.info('NOT REMOVING DUPLICATES. Not old_tcweb filename, '
                 'does not contain 9 or 10 fields.')
        return [], []

    try:
        # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
        yyyymmdd = parts[0]
        hhmnss = parts[1]
        stormname = parts[2]
        sensor = parts[3]
        platform = parts[4]
        product = parts[5]
        intensity = parts[6]
        coverage = parts[7]
        res = parts[8]
        if 'p' not in coverage or 'p' not in res:
            LOG.info('NOT REMOVING DUPLICATES. Not old_tcweb filename, '
                     'coverage or res not "NNpN".')
            return [], []
        if 'kts' not in intensity:
            LOG.info('NOT REMOVING DUPLICATES. Not old_tcweb filename, '
                     'intensity does not contain "kts".')
            return [], []
    except IndexError:
        LOG.info('NOT REMOVING DUPLICATES. Unmatched filename format, '
                 'incorrect number of . delimited fields')
        return [], []

    try:
        fname_dt = datetime.strptime(yyyymmdd + hhmnss, '%Y%m%d%H%M%S')
    except ValueError:
        LOG.info('NOT REMOVING DUPLICATES. Unmatched old_tcweb filename format, '
                 'incorrect date time string.')
        return [], []

    timediff = timedelta(minutes=mins_to_remove)
    for currdt in minrange(fname_dt - timediff, fname_dt + timediff):
        # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
        # 20201010.222325.WP162020.gmi.GPM.37V.*.*.1p0.jpg*
        dtstr = currdt.strftime(
            '{0}/%Y%m%d.%H%M*.{1}.{2}.{3}.{4}.*.*.{5}.jpg*'.format(
                dirname, stormname, sensor, platform, product, res))
        # print(dtstr)
        matching_fnames += glob(dtstr)

    max_coverage = 0
    for matching_fname in matching_fnames:
        # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
        parts = pathbasename(matching_fname).split('.')
        coverage = float(parts[7].replace('p', '.'))
        max_coverage = max(coverage, max_coverage)

    gotone = False
    LOG.info('CHECKING DUPLICATE FILES')
    for matching_fname in list(set(matching_fnames)):
        # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
        parts = pathbasename(matching_fname).split('.')
        coverage = float(parts[7].replace('p', '.'))
        if coverage < max_coverage or gotone is True:
            removed_fnames += [matching_fname]
            # Test it out for a bit first
            if remove_files is True:
                LOG.info('DELETING DUPLICATE FILE with less coverage %s < %s %s',
                         coverage, max_coverage, matching_fname)
                osunlink(matching_fname)
            else:
                LOG.info('TEST DELETING DUPLICATE FILE with less coverage %s < %s %s',
                         coverage, max_coverage, matching_fname)
        else:
            if len(matching_fnames) == 1:
                LOG.info('SAVING DUPLICATE FILE (only one!) with max coverage %s %s',
                         max_coverage, matching_fname)
            else:
                LOG.info('SAVING DUPLICATE FILE with max coverage %s %s',
                         max_coverage, matching_fname)
            saved_fnames += [matching_fname]
            gotone = True

    return removed_fnames, saved_fnames
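Both duplicate-removal routines iterate over minrange(...), which is not defined in this section. A minimal sketch of what it presumably does, assuming it simply yields one datetime per minute between the two endpoints so the minute-resolution glob patterns above can be built, is:

from datetime import datetime, timedelta

def minrange(start_dt, end_dt):
    """Yield one datetime per minute from start_dt up to end_dt.

    Assumed behavior only; the real helper is defined elsewhere in this package.
    """
    currdt = start_dt
    while currdt < end_dt:
        yield currdt
        currdt += timedelta(minutes=1)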
def update_fields(atcf_stormfilename, cc, conn, process=False):
    # Must be of form similar to
    # Gal912016.dat
    import re
    from datetime import datetime, timedelta
    from os.path import basename as pathbasename
    from os import stat as osstat

    updated_files = []
    LOG.info('Checking ' + atcf_stormfilename + ' ... process ' + str(process))

    # Check if we match Gxxdddddd.dat filename format. If not just return and don't do anything.
    if not re.compile(r'G\D\D\d\d\d\d\d\d\.\d\d\d\d\d\d\d\d\d\d.dat').match(pathbasename(atcf_stormfilename)) and \
       not re.compile(r'G\D\D\d\d\d\d\d\d\.dat').match(pathbasename(atcf_stormfilename)):
        LOG.info('')
        LOG.warning('    DID NOT MATCH REQUIRED FILENAME FORMAT, SKIPPING: ' + atcf_stormfilename)
        return []

    # Get all fields for the database entry for the current filename
    cc.execute("SELECT * FROM atcf_deck_stormfiles WHERE filename = ?", (atcf_stormfilename,))
    data = cc.fetchone()

    file_timestamp = datetime.fromtimestamp(osstat(atcf_stormfilename).st_mtime)
    # Reads timestamp out as string - convert to datetime object.
    # Check if timestamp on file is newer than timestamp in database - if not, just return and don't do anything.
    if data:
        database_timestamp = datetime.strptime(
            cc.execute("SELECT last_updated from atcf_deck_stormfiles WHERE filename = ?",
                       (atcf_stormfilename,)).fetchone()[0],
            '%Y-%m-%d %H:%M:%S.%f')
        if file_timestamp < database_timestamp:
            LOG.info('')
            LOG.info(atcf_stormfilename + ' already in ' + ATCF_DECKS_DB + ' and up to date, not doing anything.')
            return []

    lines = open(atcf_stormfilename, 'r').readlines()
    start_line = lines[0].split(',')
    # Start 24h prior to start in sectorfile, for initial processing
    # storm_start_datetime = datetime.strptime(start_line[2], '%Y%m%d%H')
    start_datetime = datetime.strptime(start_line[2], '%Y%m%d%H') - timedelta(hours=24)
    end_datetime = datetime.strptime(lines[-1].split(',')[2], '%Y%m%d%H')
    start_vmax = start_line[8]
    vmax = 0
    for line in lines:
        currv = line.split(',')[8]
        track = line.split(',')[4]
        if currv and track == 'BEST' and float(currv) > vmax:
            vmax = float(currv)

    if data and database_timestamp < file_timestamp:
        LOG.info('')
        LOG.info('Updating start/end datetime and last_updated fields for ' + atcf_stormfilename + ' in ' + ATCF_DECKS_DB)
        old_start_datetime, old_end_datetime, old_vmax = cc.execute(
            "SELECT start_datetime,end_datetime,vmax from atcf_deck_stormfiles WHERE filename = ?",
            (atcf_stormfilename,)).fetchone()
        # Eventually add in storm_start_datetime
        # old_storm_start_datetime,old_start_datetime,old_end_datetime,old_vmax = cc.execute("SELECT storm_start_datetime,start_datetime,end_datetime,vmax from atcf_deck_stormfiles WHERE filename = ?", (atcf_stormfilename,)).fetchone()
        if old_start_datetime == start_datetime.strftime('%Y-%m-%d %H:%M:%S'):
            LOG.info('    UNCHANGED start_datetime: ' + old_start_datetime)
        else:
            LOG.info('    Old start_datetime: ' + old_start_datetime + ' to new: ' + start_datetime.strftime('%Y-%m-%d %H:%M:%S'))
            updated_files += [atcf_stormfilename]
        # if old_storm_start_datetime == storm_start_datetime.strftime('%Y-%m-%d %H:%M:%S'):
        #     LOG.info('    UNCHANGED storm_start_datetime: ' + old_storm_start_datetime)
        # else:
        #     LOG.info('    Old storm_start_datetime: ' + old_storm_start_datetime + ' to new: ' + storm_start_datetime.strftime('%Y-%m-%d %H:%M:%S'))
        if old_end_datetime == end_datetime.strftime('%Y-%m-%d %H:%M:%S'):
            LOG.info('    UNCHANGED end_datetime: ' + old_end_datetime)
        else:
            LOG.info('    Old end_datetime: ' + old_end_datetime + ' to new: ' + end_datetime.strftime('%Y-%m-%d %H:%M:%S'))
            updated_files += [atcf_stormfilename]
        if database_timestamp == file_timestamp:
            LOG.info('    UNCHANGED last_updated: ' + database_timestamp.strftime('%Y-%m-%d %H:%M:%S'))
        else:
            LOG.info('    Old last_updated: ' + database_timestamp.strftime('%Y-%m-%d %H:%M:%S') + ' to new: ' + file_timestamp.strftime('%Y-%m-%d %H:%M:%S'))
            updated_files += [atcf_stormfilename]
        if old_vmax == vmax:
            LOG.info('    UNCHANGED vmax: ' + str(old_vmax))
        else:
            LOG.info('    Old vmax: ' + str(old_vmax) + ' to new: ' + str(vmax))
            updated_files += [atcf_stormfilename]
        cc.execute('''UPDATE atcf_deck_stormfiles SET
                          last_updated=?,
                          start_datetime=?,
                          end_datetime=?,
                          vmax=?
                      WHERE filename = ?''',
                   # Eventually add in ?
                   # storm_start_datetime=?,
                   (file_timestamp,
                    # storm_start_datetime,
                    start_datetime,
                    end_datetime,
                    str(vmax),
                    atcf_stormfilename,))
        conn.commit()
        return updated_files

    start_lat = start_line[6]
    start_lon = start_line[7]
    storm_basin = start_line[0]
    storm_num = start_line[1]
    try:
        start_name = start_line[48] + start_line[49]
    except IndexError:
        start_name = start_line[41]

    if data is None:
        # print('    Adding ' + atcf_stormfilename + ' to ' + ATCF_DECKS_DB)
        cc.execute('''insert into atcf_deck_stormfiles(
                          filename,
                          last_updated,
                          vmax,
                          storm_num,
                          storm_basin,
                          start_datetime,
                          start_lat,
                          start_lon,
                          start_vmax,
                          start_name,
                          end_datetime) values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                   # Eventually add in ?
                   # storm_start_datetime,
                   (atcf_stormfilename,
                    file_timestamp,
                    str(vmax),
                    storm_num,
                    storm_basin,
                    # storm_start_datetime,
                    start_datetime,
                    start_lat,
                    start_lon,
                    start_vmax,
                    start_name,
                    end_datetime,))
        LOG.info('')
        LOG.info('    Adding ' + atcf_stormfilename + ' to ' + ATCF_DECKS_DB)
        updated_files += [atcf_stormfilename]
        conn.commit()

    # This ONLY runs if it is a brand new storm file and we requested
    # processing.
    if process:
        reprocess_storm(atcf_stormfilename)
    return updated_files
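update_fields expects an sqlite3 cursor/connection pair, a module-level LOG logger, and an ATCF_DECKS_DB path backing an atcf_deck_stormfiles table. A hedged sketch of how a caller might wire that up is below; the table columns are inferred from the INSERT statement above, and the database name and storm filename are placeholders, not the project's actual values.

import sqlite3

# Hypothetical wiring; ATCF_DECKS_DB and the schema are assumptions inferred
# from the INSERT in update_fields, not the real deck-database definition.
ATCF_DECKS_DB = 'atcf_decks.db'
conn = sqlite3.connect(ATCF_DECKS_DB)
cc = conn.cursor()
cc.execute('''CREATE TABLE IF NOT EXISTS atcf_deck_stormfiles (
                  filename text, last_updated timestamp, vmax text,
                  storm_num text, storm_basin text,
                  start_datetime timestamp, start_lat text, start_lon text,
                  start_vmax text, start_name text, end_datetime timestamp)''')
updated = update_fields('Gal912020.dat', cc, conn, process=False)
conn.close()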
def metoctiff_filename_remove_duplicates(fname, mins_to_remove=10, remove_files=False):
    # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz
    # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz.yaml
    matching_fnames = []
    removed_fnames = []
    saved_fnames = []
    ext1 = pathsplitext(fname)[-1]
    ext2 = pathsplitext(pathsplitext(fname)[0])[-1]
    ext3 = pathsplitext(pathsplitext(pathsplitext(fname)[0])[0])[-1]
    if (ext1 == '.gz' and ext2 == '.jif') or (ext1 == '.yaml' and ext2 == '.gz' and ext3 == '.jif'):
        LOG.info('MATCHES EXT FORMAT. .jif.gz or .jif.gz.yaml. '
                 'Attempting to remove metoctiff duplicates')
    else:
        LOG.info('NOT REMOVING DUPLICATES. Not metoctiff filename, '
                 'not .jif.gz or .jif.gz.yaml.')
        return [], []

    dirname = pathdirname(fname)
    basename = pathbasename(fname)
    parts = basename.split('.')
    if (len(parts) == 10 and ext1 == '.yaml') or (len(parts) == 9 and ext1 == '.gz'):
        LOG.info('MATCHES NUMBER FIELDS. 9 or 10 fields. '
                 'Attempting to remove metoctiff duplicates')
    else:
        LOG.info('NOT REMOVING DUPLICATES. Not metoctiff filename, '
                 'does not contain 9 or 10 fields.')
        return [], []

    try:
        # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz
        yyyymmdd = parts[0]
        hhmnss = parts[1]
        platform = parts[2]
        sensor = parts[3]
        product = parts[4]
        stormname = parts[5]
        coverage = parts[6]
        if 'pc' not in coverage:
            LOG.info('NOT REMOVING DUPLICATES. Not metoctiff filename, '
                     'coverage not "NNpc".')
            return [], []
    except IndexError:
        LOG.info('NOT REMOVING DUPLICATES. Unmatched metoctiff filename format, '
                 'incorrect number of . delimited fields')
        return [], []

    try:
        fname_dt = datetime.strptime(yyyymmdd + hhmnss, '%Y%m%d%H%M%S')
    except ValueError:
        LOG.info('NOT REMOVING DUPLICATES. Unmatched metoctiff filename format, '
                 'incorrect date time string.')
        return [], []

    timediff = timedelta(minutes=mins_to_remove)
    for currdt in minrange(fname_dt - timediff, fname_dt + timediff):
        # 20201010.222325.GPM.gmi.19H.WP162020.14pc.jif.gz
        # Matches
        # 20201010.222325.GPM.gmi.19H.WP162020.*.jif.gz*
        dtstr = currdt.strftime(
            '{0}/%Y%m%d.%H%M*.{1}.{2}.{3}.{4}.*.jif.gz*'.format(
                dirname, platform, sensor, product, stormname))
        # print(dtstr)
        matching_fnames += glob(dtstr)

    max_coverage = 0
    for matching_fname in matching_fnames:
        # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz
        parts = pathbasename(matching_fname).split('.')
        coverage = float(parts[6].replace('pc', ''))
        max_coverage = max(coverage, max_coverage)

    gotone = False
    LOG.info('CHECKING DUPLICATE FILES')
    for matching_fname in list(set(matching_fnames)):
        # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz
        parts = pathbasename(matching_fname).split('.')
        coverage = float(parts[6].replace('pc', ''))
        if coverage < max_coverage or gotone is True:
            removed_fnames += [matching_fname]
            # Test it out for a bit first
            if remove_files is True:
                LOG.info('DELETING DUPLICATE FILE with less coverage %s < %s %s',
                         coverage, max_coverage, matching_fname)
                osunlink(matching_fname)
            else:
                LOG.info('TEST DELETING DUPLICATE FILE with less coverage %s < %s %s',
                         coverage, max_coverage, matching_fname)
        else:
            saved_fnames += [matching_fname]
            if len(matching_fnames) == 1:
                LOG.info('SAVING DUPLICATE FILE (only one!) with max coverage %s %s',
                         max_coverage, matching_fname)
            else:
                LOG.info('SAVING DUPLICATE FILE with max coverage %s %s',
                         max_coverage, matching_fname)
            gotone = True

    return removed_fnames, saved_fnames