Example #1
def process_imgur_url(url):
	"""
	Given an imgur URL, determine if it's a direct link to an image or an
	album.  If the latter, attempt to determine all images within the album

	Returns:
		list of imgur URLs
	"""
	if 'imgur.com/a/' in url or 'imgur.com/gallery/' in url:
		return extract_imgur_album_urls(url)

	# use beautifulsoup4 to find real link
	# find vid url only
	'''
	try:
		print("TRYING AT %s" % url)
		from bs4 import BeautifulSoup
		html = urlopen(url).read()
		soup = BeautifulSoup(html, 'lxml')
		vid = soup.find('div', {'class': 'video-container'})
		vid_type = 'video/webm'  # or 'video/mp4'
		vid_url = vid.find('source', {'type': vid_type}).get('src')
		if vid_url.startswith('//'):
			vid_url = 'http:' + vid_url
		return vid_url

	except Exception:
		# do nothing for awhile
		pass
	'''

	# Change .png to .jpg for imgur urls.
	if url.endswith('.png'):
		url = url.replace('.png', '.jpg')
	else:
		# Extract the file extension
		ext = pathsplitext(pathbasename(url))[1]
		if ext == '.gifv':
			url = url.replace('.gifv', '.gif')
		if not ext:
			# Append a default
			url += '.jpg'
	return [url]
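
Note: these snippets call module-level helpers that are not shown in the excerpts (pathsplitext, pathbasename, pathexists, pathjoin, urlopen, extract_imgur_album_urls, and several exception classes). A plausible set of aliases for the os.path and urllib pieces, offered as an assumption rather than the projects' actual import block, would be:

# A plausible set of the module-level imports these examples assume
# (an assumption, not copied from the original projects).
from os.path import (
    exists as pathexists,
    join as pathjoin,
    basename as pathbasename,
    splitext as pathsplitext,
)
from urllib.request import urlopen
from urllib.error import HTTPError, URLError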
Example #2
def process_imgur_url(url):
    """
    Given an imgur URL, determine if it's a direct link to an image or an
    album.  If the latter, attempt to determine all images within the album

    Returns:
        list of imgur URLs
    """
    if 'imgur.com/a/' in url:
        return extract_imgur_album_urls(url)

    # Change .png to .jpg for imgur urls.
    if url.endswith('.png'):
        url = url.replace('.png', '.jpg')
    else:
        # Extract the file extension
        ext = pathsplitext(pathbasename(url))[1]
        if not ext:
            # Append a default
            url += '.jpg'

    return [url]
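
For direct links this variant only normalizes the file extension; a couple of hypothetical calls (no album lookup or network access involved) illustrate the result:

# Hypothetical direct-link calls; album URLs ('imgur.com/a/...') are delegated
# to extract_imgur_album_urls instead.
process_imgur_url('https://i.imgur.com/abc123.png')  # -> ['https://i.imgur.com/abc123.jpg']
process_imgur_url('https://i.imgur.com/abc123')      # -> ['https://i.imgur.com/abc123.jpg']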
Example #3
def process_imgur_url(url):
    """
    Given an imgur URL, determine if it's a direct link to an image or an
    album.  If the latter, attempt to determine all images within the album

    Returns:
        list of imgur URLs
    """
    if 'imgur.com/a/' in url or 'imgur.com/gallery/' in url:
        return extract_imgur_album_urls(url)

    # use beautifulsoup4 to find real link
    # find vid url only
    try:
        from bs4 import BeautifulSoup
        html = urlopen(url).read()
        soup = BeautifulSoup(html, 'lxml')
        vid = soup.find('div', {'class': 'video-container'})
        vid_type = 'video/webm'  # or 'video/mp4'
        vid_url = vid.find('source', {'type': vid_type}).get('src')
        if vid_url.startswith('//'):
            vid_url = 'http:' + vid_url
        return [vid_url]  # wrap in a list to match the documented return type

    except Exception:
        # ignore scraping failures and fall through to the extension fixups below
        pass
    # Change .png to .jpg for imgur urls.
    if url.endswith('.png'):
        url = url.replace('.png', '.jpg')
    else:
        # Extract the file extension
        ext = pathsplitext(pathbasename(url))[1]
        if ext == '.gifv':
            url = url.replace('.gifv', '.gif')
        if not ext:
            # Append a default
            url += '.jpg'
    return [url]
Example #4
def process_imgur_url(url):
    """
    Given an imgur URL, determine if it's a direct link to an image or an
    album.  If the latter, attempt to determine all images within the album

    Returns:
        list of imgur URLs
    """
    if 'imgur.com/a/' in url:
        return extract_imgur_album_urls(url)

    # Change .png to .jpg for imgur urls.
    if url.endswith('.png'):
        url = url.replace('.png', '.jpg')
    else:
        # Extract the file extension
        ext = pathsplitext(pathbasename(url))[1]
        if not ext:
            # Append a default
            url += '.jpg'

    return [url]
Example #5
def main(args=None):
    # fall back to the command-line arguments when none are passed in
    ARGS = parse_args(args if args else sys.argv[1:])

    logging.basicConfig(level=logging.INFO)

    # value at the first index is for the current subreddit; the value at the
    # second index is the running total across all subreddits
    TOTAL, DOWNLOADED, ERRORS, SKIPPED, FAILED = (
        [0, 0], [0, 0], [0, 0], [0, 0], [0, 0])
    PROG_REPORT = [TOTAL, DOWNLOADED, ERRORS, SKIPPED, FAILED]

    # Create the specified directory if it doesn't already exist.
    if not pathexists(ARGS.dir):
        mkdir(ARGS.dir)

    # If a regex has been specified, compile the rule (once)
    RE_RULE = None
    if ARGS.regex:
        RE_RULE = re.compile(ARGS.regex)

    # compile reddit comment url to check if url is one of them
    reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments')

    LAST = ARGS.last

    start_time = None
    ITEM = None

    sort_type = ARGS.sort_type
    if sort_type:
        sort_type = sort_type.lower()

    # check to see if ARGS.subreddit is subreddit or subreddit-list
    if os.path.isfile(
            ARGS.subreddit) and os.path.splitext(ARGS.subreddit)[1] != '':
        ARGS.subreddit_list = ARGS.subreddit

    if ARGS.subreddit_list:
        # ARGS.subreddit_list = ARGS.subreddit_list[0] # can't remember why I did this -jtara1
        subreddit_file = ARGS.subreddit_list
        subreddit_list = parse_subreddit_list(subreddit_file, ARGS.dir)
        if ARGS.verbose:
            print('subreddit_list = %s' % subreddit_list)
    elif not ARGS.subreddit_list:
        subreddit_list = [(ARGS.subreddit, ARGS.dir)]

    # file used to store last reddit id
    log_file = '._history.txt'

    # iterate through subreddit(s)
    for index, section in enumerate(subreddit_list):
        (ARGS.subreddit, ARGS.dir) = section
        FINISHED = False

        if ARGS.verbose:
            print('index: %s, %s, %s' % (index, ARGS.subreddit, ARGS.dir))

        # load last_id or create new entry for last_id in log_data
        log_data, last_id = process_subreddit_last_id(ARGS.subreddit,
                                                      ARGS.sort_type, ARGS.dir,
                                                      log_file, ARGS.dir)

        if ARGS.restart:
            last_id = ''

        TOTAL[0] = DOWNLOADED[0] = ERRORS[0] = SKIPPED[0] = FAILED[0] = 0
        FILECOUNT = 0
        # ITEMS loop - begin the loop to get reddit submissions & download media from them
        while not FINISHED:
            if ARGS.verbose:
                print()
            ITEMS = getitems(ARGS.subreddit,
                             multireddit=ARGS.multireddit,
                             previd=last_id,
                             reddit_sort=sort_type)
            # debug ITEMS variable value
            # if ARGS.verbose:
            #    history_log(os.getcwd(), 'ITEMS.txt', 'write', ITEMS)

            # measure elapsed time and make the program wait 4 seconds between
            # requests, as per the reddit api guidelines
            end_time = time.process_time()

            if start_time is not None:
                elapsed_time = end_time - start_time

                if elapsed_time <= 4:  # throttling
                    time.sleep(4 - elapsed_time)

            start_time = time.process_time()

            # No more items to process
            if not ITEMS:
                if ARGS.verbose:
                    print('No more ITEMS for %s %s' %
                          (ARGS.subreddit, ARGS.sort_type))
                break

            for ITEM in ITEMS:
                TOTAL[0] += 1

                if ('reddit.com/r/' + ARGS.subreddit + '/comments/'
                        in ITEM['url'] or re.match(reddit_comment_regex,
                                                   ITEM['url']) is not None):
                    # hotfix for when last item is comment submission which caused infinite looping
                    last_id = ITEM['id'] if ITEM is not None else None
                    if last_id:
                        log_data[ARGS.subreddit][
                            ARGS.sort_type]['last-id'] = last_id
                        history_log(ARGS.dir,
                                    log_file,
                                    mode='write',
                                    write_data=log_data)
                    continue

                # don't download if url is reddit metrics url
                if 'redditmetrics.com' in ITEM['url']:
                    if ARGS.verbose:
                        print('\t%s was skipped.' % ITEM['url'])

                    SKIPPED[0] += 1
                    continue

                if ITEM['score'] < ARGS.score:
                    if ARGS.verbose:
                        print('    SCORE: {} has score of {}, which is lower '
                              'than the required score of {}.'.format(
                                  ITEM['id'], ITEM['score'], ARGS.score))

                    SKIPPED[0] += 1
                    continue
                elif ARGS.sfw and ITEM['over_18']:
                    if ARGS.verbose:
                        print('    NSFW: %s is marked as NSFW.' % (ITEM['id']))

                    SKIPPED[0] += 1
                    continue
                elif ARGS.nsfw and not ITEM['over_18']:
                    if ARGS.verbose:
                        print('    Not NSFW, skipping %s' % (ITEM['id']))

                    SKIPPED[0] += 1
                    continue
                elif ARGS.regex and not re.match(RE_RULE, ITEM['title']):
                    if ARGS.verbose:
                        print('    Regex match failed')

                    SKIPPED[0] += 1
                    continue
                elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']:
                    if ARGS.verbose:
                        print('    Album found, skipping %s' % (ITEM['id']))

                    SKIPPED[0] += 1
                    continue

                if (ARGS.title_contain and
                        ARGS.title_contain.lower() not in ITEM['title'].lower()):
                    if ARGS.verbose:
                        print('    Title does not contain "{}", '
                              'skipping {}.'.format(ARGS.title_contain,
                                                    ITEM['id']))

                    SKIPPED[0] += 1
                    continue

                try:
                    URLS = extract_urls(ITEM['url'])
                except URLError as e:
                    print('URLError %s' % e)
                    continue
                except Exception as e:
                    _log.exception("%s", e)
                    continue
                for URL in URLS:
                    try:
                        # Find gfycat if requested
                        if URL.endswith('gif') and ARGS.mirror_gfycat:
                            check = gfycat().check(URL)
                            if check.get("urlKnown"):
                                URL = check.get('webmUrl')

                        # Trim any http query off end of file extension.
                        FILEEXT = pathsplitext(URL)[1]
                        if '?' in FILEEXT:
                            FILEEXT = FILEEXT[:FILEEXT.index('?')]

                        # Only append numbers if more than one file
                        FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '')

                        # create filename based on given input from user
                        if ARGS.filename_format == 'url':
                            FILENAME = '%s%s%s' % (pathsplitext(
                                pathbasename(URL))[0], '', FILEEXT)
                        elif ARGS.filename_format == 'title':
                            FILENAME = '%s%s%s' % (slugify(
                                ITEM['title']), FILENUM, FILEEXT)

                            if len(FILENAME) >= 256:
                                shortened_item_title = slugify(
                                    ITEM['title'])[:256 - len(FILENAME)]
                                FILENAME = '%s%s%s' % (shortened_item_title,
                                                       FILENUM, FILEEXT)
                        else:
                            FILENAME = '%s%s%s' % (ITEM['id'], FILENUM,
                                                   FILEEXT)

                        # join file with directory
                        FILEPATH = pathjoin(ARGS.dir, FILENAME)

                        # The url may be wrong or empty, so skip it early.
                        if URL == 'http://':
                            raise URLError('Url is empty')

                        # Download the image
                        try:
                            dl = skp = 0
                            if 'imgur.com' in URL:
                                fname = os.path.splitext(FILENAME)[0]
                                save_path = os.path.join(os.getcwd(), ARGS.dir)
                                downloader = ImgurDownloader(URL,
                                                             save_path,
                                                             fname,
                                                             delete_dne=True,
                                                             debug=False)
                                (dl, skp) = downloader.save_images()
                            else:
                                download_from_url(URL, FILEPATH)
                                dl = 1
                            # Image downloaded successfully!
                            if ARGS.verbose:
                                print('Saved %s as %s' % (URL, FILENAME))
                            DOWNLOADED[0] += 1
                            SKIPPED[0] += skp
                            FILECOUNT += 1
                        except URLError:
                            print('We do not support reddituploads links yet'
                                  ' skipping....')
                        except FileExistsException as ERROR:
                            ERRORS[0] += 1
                            if ARGS.verbose:
                                print(ERROR.message)
                            if ARGS.update:
                                print('    Update complete, exiting.')
                                FINISHED = True
                                break
                        except ImgurException as e:
                            ERRORS[0] += 1
                        except Exception as e:
                            print(e)
                            ERRORS[0] += 1

                        if ARGS.num and (DOWNLOADED[0]) >= ARGS.num:
                            print('    Download num limit reached, exiting.')
                            FINISHED = True
                            break

                    except WrongFileTypeException as ERROR:
                        _log_wrongtype(url=URL,
                                       target_dir=ARGS.dir,
                                       filecount=FILECOUNT,
                                       _downloaded=DOWNLOADED[0],
                                       filename=FILENAME)
                        SKIPPED[0] += 1
                    except HTTPError as ERROR:
                        FAILED[0] += 1
                    except URLError as ERROR:
                        FAILED[0] += 1
                    except InvalidURL as ERROR:
                        FAILED[0] += 1
                    except Exception as exc:
                        FAILED[0] += 1

                # keep track of last_id id downloaded
                last_id = ITEM['id'] if ITEM is not None else None
                if last_id:
                    log_data[ARGS.subreddit][
                        ARGS.sort_type]['last-id'] = last_id
                    history_log(ARGS.dir,
                                log_file,
                                mode='write',
                                write_data=log_data)

                # break out of URL loop to end of ITEMS loop
                if FINISHED:
                    break

            # update variables in PROG_REPORT in SUBREDDIT loop
            for var in PROG_REPORT:
                var[1] += var[0]

    print('Downloaded from %i reddit submissions' % (DOWNLOADED[1]))
    print('(Processed %i, Skipped %i, Errors %i)' %
          (TOTAL[1], SKIPPED[1], ERRORS[1]))

    return DOWNLOADED[1]
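
The main variants in these examples all repeat the same throttling pattern: sleep until at least 4 seconds have passed since the previous getitems call, per the reddit API guidelines. A minimal wall-clock sketch of that pattern as a reusable helper (a hypothetical refactor; the examples inline it with time.process_time() or time.clock(), which measure CPU time rather than elapsed time) could look like:

import time


class RequestThrottle:
    """Sleep so consecutive wait() calls are at least `interval` seconds apart.

    Uses wall-clock time via time.monotonic(); the examples inline the same
    idea with CPU-time clocks.
    """

    def __init__(self, interval=4.0):
        self.interval = interval
        self._last = None

    def wait(self):
        now = time.monotonic()
        if self._last is not None:
            remaining = self.interval - (now - self._last)
            if remaining > 0:
                time.sleep(remaining)
        self._last = time.monotonic()

A loop would then simply call throttle.wait() once before each getitems request.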
Example #6
def main():
    ARGS = parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)
    print parse_reddit_argument(ARGS.reddit)

    TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0
    FINISHED = False

    # Create the specified directory if it doesn't already exist.
    if not pathexists(ARGS.dir):
        mkdir(ARGS.dir)

    # If a regex has been specified, compile the rule (once)
    RE_RULE = None
    if ARGS.regex:
        RE_RULE = re.compile(ARGS.regex)

    # compile reddit comment url to check if url is one of them
    reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments')

    LAST = ARGS.last

    start_time = None
    ITEM = None

    sort_type = ARGS.sort_type
    if sort_type:
        sort_type = sort_type.lower()

    while not FINISHED:
        ITEMS = getitems(ARGS.reddit,
                         multireddit=ARGS.multireddit,
                         previd=LAST,
                         reddit_sort=sort_type)

        # measure elapsed time and make the program wait 4 seconds between
        # requests, as per the reddit api guidelines
        end_time = time.clock()

        if start_time is not None:
            elapsed_time = end_time - start_time

            if elapsed_time <= 4:  # throttling
                time.sleep(4 - elapsed_time)

        start_time = time.clock()

        if not ITEMS:
            # No more items to process
            break

        for ITEM in ITEMS:
            TOTAL += 1

            # not downloading if url is reddit comment
            if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or
                    re.match(reddit_comment_regex, ITEM['url']) is not None):
                print '    Skip:[{}]'.format(ITEM['url'])
                continue

            if ITEM['score'] < ARGS.score:
                if ARGS.verbose:
                    print ('    SCORE: {} has score of {}, which is lower '
                           'than the required score of {}.'.format(
                               ITEM['id'], ITEM['score'], ARGS.score))

                SKIPPED += 1
                continue
            elif ARGS.sfw and ITEM['over_18']:
                if ARGS.verbose:
                    print '    NSFW: %s is marked as NSFW.' % (ITEM['id'])

                SKIPPED += 1
                continue
            elif ARGS.nsfw and not ITEM['over_18']:
                if ARGS.verbose:
                    print '    Not NSFW, skipping %s' % (ITEM['id'])

                SKIPPED += 1
                continue
            elif ARGS.regex and not re.match(RE_RULE, ITEM['title']):
                if ARGS.verbose:
                    print '    Regex match failed'

                SKIPPED += 1
                continue
            elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']:
                if ARGS.verbose:
                    print '    Album found, skipping %s' % (ITEM['id'])

                SKIPPED += 1
                continue

            if (ARGS.title_contain and
                    ARGS.title_contain.lower() not in ITEM['title'].lower()):
                if ARGS.verbose:
                    print ('    Title does not contain "{}", '
                           'skipping {}.'.format(ARGS.title_contain,
                                                 ITEM['id']))

                SKIPPED += 1
                continue

            FILECOUNT = 0
            try:
                URLS = extract_urls(ITEM['url'])
            except Exception:
                _log.exception("Failed to extract urls for %r", URLS)
                continue
            for URL in URLS:
                try:
                    # Find gfycat if requested
                    if URL.endswith('gif') and ARGS.mirror_gfycat:
                        check = gfycat().check(URL)
                        if check.get("urlKnown"):
                            URL = check.get('webmUrl')

                    # Trim any http query off end of file extension.
                    FILEEXT = pathsplitext(URL)[1]
                    if '?' in FILEEXT:
                        FILEEXT = FILEEXT[:FILEEXT.index('?')]

                    # Only append numbers if more than one file
                    FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '')

                    # create filename based on given input from user
                    if ARGS.filename_format == 'url':
                        FILENAME = '%s%s%s' % (pathsplitext(
                            pathbasename(URL))[0], '', FILEEXT)
                    elif ARGS.filename_format == 'title':
                        FILENAME = '%s%s%s' % (slugify(
                            ITEM['title']), FILENUM, FILEEXT)
                        if len(FILENAME) >= 256:
                            shortened_item_title = slugify(
                                ITEM['title'])[:256 - len(FILENAME)]
                            FILENAME = '%s%s%s' % (shortened_item_title,
                                                   FILENUM, FILEEXT)
                    else:
                        FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT)
                    # join file with directory
                    FILEPATH = pathjoin(ARGS.dir, FILENAME)

                    # List the URL before downloading to improve debuggability;
                    # the url may be wrong, so skip obviously bad ones.
                    if URL.encode('utf-8') == 'http://':
                        raise URLError('Url is empty')
                    else:
                        text_templ = '    Attempting to download URL[{}] as [{}].'
                        print text_templ.format(URL.encode('utf-8'),
                                                FILENAME.encode('utf-8'))

                    # Download the image
                    try:
                        download_from_url(URL, FILEPATH)
                        # Image downloaded successfully!
                        print '    Successfully downloaded URL [%s] as [%s].' % (
                            URL, FILENAME)
                        DOWNLOADED += 1
                        FILECOUNT += 1

                    except Exception, e:
                        print '    %s' % str(e)
                        ERRORS += 1

                    if ARGS.num and DOWNLOADED >= ARGS.num:
                        FINISHED = True
                        break
                except WrongFileTypeException as ERROR:
                    print '    %s' % (ERROR)
                    _log_wrongtype(url=URL,
                                   target_dir=ARGS.dir,
                                   filecount=FILECOUNT,
                                   _downloaded=DOWNLOADED,
                                   filename=FILENAME)
                    SKIPPED += 1
                except FileExistsException as ERROR:
                    print '    %s' % (ERROR)
                    ERRORS += 1
                    if ARGS.update:
                        print '    Update complete, exiting.'
                        FINISHED = True
                        break
                except HTTPError as ERROR:
                    print '    HTTP ERROR: Code %s for %s.' % (ERROR.code, URL)
                    FAILED += 1
Example #7
def main():
    ARGS = parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)
    print(parse_reddit_argument(ARGS.reddit))

    global lock
    lock.acquire(1)
    global TOTAL, DOWNLOADED, ERRORS, SKIPPED, FAILED, FILECOUNT
    FINISHED = False
    lock.release()
    threadList = []

    # Create the specified directory if it doesn't already exist.
    if not pathexists(ARGS.dir):
        mkdir(ARGS.dir)

    # If a regex has been specified, compile the rule (once)
    RE_RULE = None
    if ARGS.regex:
        RE_RULE = re.compile(ARGS.regex)

    # compile reddit comment url to check if url is one of them
    reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments')

    LAST = ARGS.last

    ITEM = None

    sort_type = ARGS.sort_type
    if sort_type:
        sort_type = sort_type.lower()

    while not FINISHED:
        ITEMS = getitems(
            ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST,
            reddit_sort=sort_type)

        if not ITEMS:
            # No more items to process
            break

        for ITEM in ITEMS:
            TOTAL += 1

            # not downloading if url is reddit comment
            if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or
                    re.match(reddit_comment_regex, ITEM['url']) is not None):
                print('    Skip:[{}]'.format(ITEM['url']))
                continue

            if ITEM['score'] < ARGS.score:
                if ARGS.verbose:
                    print('    SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']),
                          'which is lower than required score of {}.'.format(ARGS.score))

                SKIPPED += 1
                continue
            elif ARGS.sfw and ITEM['over_18']:
                if ARGS.verbose:
                    print('    NSFW: %s is marked as NSFW.' % (ITEM['id']))

                SKIPPED += 1
                continue
            elif ARGS.nsfw and not ITEM['over_18']:
                if ARGS.verbose:
                    print('    Not NSFW, skipping %s' % (ITEM['id']))

                SKIPPED += 1
                continue
            elif ARGS.regex and not re.match(RE_RULE, ITEM['title']):
                if ARGS.verbose:
                    print('    Regex not matched')

                SKIPPED += 1
                continue
            elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']:
                if ARGS.verbose:
                    print('    Album found, skipping %s' % (ITEM['id']))

                SKIPPED += 1
                continue

            if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower():
                if ARGS.verbose:
                    print('    Title does not contain "{}",'.format(ARGS.title_contain),
                          'skipping {}'.format(ITEM['id']))

                SKIPPED += 1
                continue
            lock.acquire(1)
            FILECOUNT = 0
            lock.release()
            try:
                URLS = extract_urls(ITEM['url'])
            except Exception:
                _log.exception("Failed to extract urls for %r", URLS)
                continue
            for URL in URLS:
                try:
                    # Find gfycat if requested
                    if URL.endswith('gif') and ARGS.mirror_gfycat:
                        check = gfycat().check(URL)
                        if check.get("urlKnown"):
                            URL = check.get('webmUrl')

                    FILEEXT = pathsplitext(URL)[1]
                    # Trim any http query off end of file extension.
                    FILEEXT = re.sub(r'\?.*$', '', FILEEXT)
                    if not FILEEXT:
                        # A more usable default than an empty extension.
                        # The extension can be fixed after downloading, but then
                        # the 'already downloaded' check would be harder.
                        FILEEXT = '.jpg'

                    # Only append numbers if more than one file
                    lock.acquire(1)
                    FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '')
                    lock.release()

                    # create filename based on given input from user
                    if ARGS.filename_format == 'url':
                        FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT)
                    elif ARGS.filename_format == 'title':
                        FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT)
                        if len(FILENAME) >= 256:
                            shortened_item_title = slugify(ITEM['title'])[:256-len(FILENAME)]
                            FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT)
                    else:
                        FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT)
                    # join file with directory
                    FILEPATH = pathjoin(ARGS.dir, FILENAME)

                    # List the URL before downloading to improve debuggability;
                    # the url may be wrong, so skip obviously bad ones.
                    if URL == 'http://':
                        raise URLError('Url is empty')
                    else:
                        text_templ = '    Attempting to download URL[{}] as [{}].'
                        print(text_templ.format(URL, FILENAME))

                    # Download the image
                    while threading.active_count() > 5:
                        time.sleep(5)

                    lock.acquire(1)
                    urlCopy = URL
                    filepathCopy = FILEPATH
                    t = threading.Thread(target=download_threaded, args=([urlCopy, filepathCopy, DOWNLOADED, FILECOUNT, ERRORS], ))
                    t.start()
                    lock.release()
                    threadList.append(t)

                    lock.acquire(1)
                    if ARGS.num and DOWNLOADED >= ARGS.num:
                        FINISHED = True
                        lock.release()
                        break
                    lock.release()
                except WrongFileTypeException as ERROR:
                    print('    %s' % (ERROR,))
                    lock.acquire(1)
                    _log_wrongtype(url=URL, target_dir=ARGS.dir,
                                   filecount=FILECOUNT, _downloaded=DOWNLOADED,
                                   filename=FILENAME)
                    lock.release()
                    SKIPPED += 1
                except FileExistsException as ERROR:
                    print('    %s' % (ERROR,))
                    ERRORS += 1
                    if ARGS.update:
                        print('    Update complete, exiting.')
                        FINISHED = True
                        break
                except HTTPError as ERROR:
                    print('    HTTP ERROR: Code %s for %s.' % (ERROR.code, URL))
                    FAILED += 1
                except URLError as ERROR:
                    print('    URL ERROR: %s!' % (URL,))
                    FAILED += 1
                except InvalidURL as ERROR:
                    print('    Invalid URL: %s!' % (URL,))
                    FAILED += 1
                except Exception as exc:
                    _log.exception("Problem with %r: %r", URL, exc)
                    FAILED += 1

            if FINISHED:
                break

        LAST = ITEM['id'] if ITEM is not None else None

    # Wait for each thread to finish downloading
    for t in threadList:
        t.join()

    print('Downloaded {} files'.format(DOWNLOADED),
          '(Processed {}, Skipped {}, Exists {})'.format(TOTAL, SKIPPED, ERRORS))
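
Note that the threaded variant above hands plain integers (DOWNLOADED, FILECOUNT, ERRORS) to download_threaded, so increments made inside the worker thread cannot propagate back to the caller's counters. A minimal sketch of a lock-protected counter that could be passed instead (an illustrative assumption; download_threaded itself is not shown in the excerpt) is:

import threading


class Counter:
    """Thread-safe counter.

    Passing one instance per statistic (downloads, errors, ...) to a worker
    thread lets the worker update it in place, which a plain int cannot do.
    """

    def __init__(self):
        self._value = 0
        self._lock = threading.Lock()

    def increment(self, amount=1):
        with self._lock:
            self._value += amount

    @property
    def value(self):
        with self._lock:
            return self._value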
Example #8
def main():
    ARGS = parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)
    print(parse_reddit_argument(ARGS.reddit))

    TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0
    FINISHED = False

    # Create the specified directory if it doesn't already exist.
    if not pathexists(ARGS.dir):
        mkdir(ARGS.dir)

    # If a regex has been specified, compile the rule (once)
    RE_RULE = None
    if ARGS.regex:
        RE_RULE = re.compile(ARGS.regex)

    # compile reddit comment url to check if url is one of them
    reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments')

    LAST = ARGS.last

    start_time = None
    ITEM = None

    sort_type = ARGS.sort_type
    if sort_type:
        sort_type = sort_type.lower()

    #for downloading comments
    reddit = praw.Reddit('bot1')
    redanno = dict()
    capdict = dict()
    captions = []

    #for downloading comments and saving in pre-format
    cap_l = []
    cap_5 = []

    while not FINISHED:
        ITEMS = getitems(
            ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST,
            reddit_sort=sort_type)

        # measure elapsed time and make the program wait 4 seconds between
        # requests, as per the reddit api guidelines
        end_time = time.clock()

        if start_time is not None:
            elapsed_time = end_time - start_time

            if elapsed_time <= 4:  # throttling
                time.sleep(4 - elapsed_time)

        start_time = time.clock()

        if not ITEMS:
            # No more items to process
            break

        for ITEM in ITEMS:
            TOTAL += 1
            #print("This is ITEM['id'] : ", ITEM['id'])
            #print("This is ITEM : ", ITEM)

            # not downloading if url is reddit comment
            if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or
                    re.match(reddit_comment_regex, ITEM['url']) is not None):
                print('    Skip:[{}]'.format(ITEM['url']))
                continue

            if ITEM['score'] < ARGS.score:
                if ARGS.verbose:
                    print('    SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']),
                          'which is lower than required score of {}.'.format(ARGS.score))

                SKIPPED += 1
                continue
            elif ARGS.sfw and ITEM['over_18']:
                if ARGS.verbose:
                    print('    NSFW: %s is marked as NSFW.' % (ITEM['id']))

                SKIPPED += 1
                continue
            elif ARGS.nsfw and not ITEM['over_18']:
                if ARGS.verbose:
                    print('    Not NSFW, skipping %s' % (ITEM['id']))

                SKIPPED += 1
                continue
            elif ARGS.regex and not re.match(RE_RULE, ITEM['title']):
                if ARGS.verbose:
                    print('    Regex not matched')

                SKIPPED += 1
                continue
            elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']:
                if ARGS.verbose:
                    print('    Album found, skipping %s' % (ITEM['id']))

                SKIPPED += 1
                continue

            if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower():
                if ARGS.verbose:
                    print('    Title does not contain "{}",'.format(ARGS.title_contain),
                          'skipping {}'.format(ITEM['id']))

                SKIPPED += 1
                continue

            FILECOUNT = 0
            try:
                URLS = extract_urls(ITEM['url'])
                #print("This is URLS :", URLS)
                #download_comments(ITEM['id'])
                #Using the def function does not save all the comments from each submission
                comm = reddit.submission(id=ITEM['id'])
                comm.comment_sort = 'best'
                comm.comments.replace_more(limit=0)     # replace "more comments" from commentforest
                comm.comment_limit = 5
                comm_list = list(comm.comments)
                if len(comm_list) < 5:
                    continue
                #print("This is a list: ", comm_list)
                #print("This is len   : ", len(comm_list))
                #don't empty the cap_5 because reading become difficult
                #cap_5 = []
                for i in range(5):
                    #print("This is comment: ", comm_list[i].body)
                    capdict["image_id"] = ITEM['id']
                    capdict["title"] = ITEM['title']
                    capdict["post_upvotes"] = ITEM['ups']
                    capdict["comment"] = comm_list[i].body
                    capdict["comment_score"] = comm_list[i].score

                    captions.append(capdict)
                    #print(captions)
                    capdict = dict()

                    filename = ITEM['id'] + '.jpg#' + str(i)
                    caption = comm_list[i].body
                    caption = ' '.join(caption.split())
                    cap_5.append(filename + '\t' + caption)

                    if len(cap_5)%50 == 0:
                        print("\n")
                        print("{} images have been downloaded.".format((len(cap_5)//50) * 10))
                        print("-"*50)

                #cap_l.append(cap_5)


            except Exception:
                _log.exception("Failed to extract urls for %r", URLS)
                continue
            for URL in URLS:
                try:
                    # Find gfycat if requested
                    if URL.endswith('gif') and ARGS.mirror_gfycat:
                        check = gfycat().check(URL)
                        if check.get("urlKnown"):
                            URL = check.get('webmUrl')

                    FILEEXT = pathsplitext(URL)[1]
                    # Trim any http query off end of file extension.
                    FILEEXT = re.sub(r'\?.*$', '', FILEEXT)
                    if not FILEEXT:
                        # A more usable default than an empty extension.
                        # The extension can be fixed after downloading, but then
                        # the 'already downloaded' check would be harder.
                        FILEEXT = '.jpg'

                    # Only append numbers if more than one file
                    FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '')

                    # create filename based on given input from user
                    if ARGS.filename_format == 'url':
                        FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT)
                    elif ARGS.filename_format == 'title':
                        FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT)
                        if len(FILENAME) >= 256:
                            shortened_item_title = slugify(ITEM['title'])[:256-len(FILENAME)]
                            FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT)
                    else:
                        FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT)
                    # join file with directory
                    FILEPATH = pathjoin(ARGS.dir, FILENAME)

                    # The url may be wrong or empty, so skip it early.
                    if URL == 'http://':
                        raise URLError('Url is empty')
                    else:
                        text_templ = '    Attempting to download URL[{}] as [{}].'
                        #print(text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8')))

                    # Download the image
                    try:
                        download_from_url(URL, FILEPATH)
                        #download_comments(ITEM[id])
                        # Image downloaded successfully
                        print('    Successfully downloaded URL [%s] as [%s].' % (URL, FILENAME))
                        DOWNLOADED += 1
                        FILECOUNT += 1


                    except Exception as exc:
                        print('    %s' % (exc,))
                        ERRORS += 1

                    if ARGS.num and DOWNLOADED >= ARGS.num:
                        FINISHED = True
                        break
                except WrongFileTypeException as ERROR:
                    print('    %s' % (ERROR,))
                    _log_wrongtype(url=URL, target_dir=ARGS.dir,
                                   filecount=FILECOUNT, _downloaded=DOWNLOADED,
                                   filename=FILENAME)
                    SKIPPED += 1
                except FileExistsException as ERROR:
                    print('    %s' % (ERROR,))
                    ERRORS += 1
                    if ARGS.update:
                        print('    Update complete, exiting.')
                        FINISHED = True
                        break
                except HTTPError as ERROR:
                    print('    HTTP ERROR: Code %s for %s.' % (ERROR.code, URL))
                    FAILED += 1
                except URLError as ERROR:
                    print('    URL ERROR: %s!' % (URL,))
                    FAILED += 1
                except InvalidURL as ERROR:
                    print('    Invalid URL: %s!' % (URL,))
                    FAILED += 1
                except Exception as exc:
                    _log.exception("Problem with %r: %r", URL, exc)
                    FAILED += 1

            if FINISHED:
                break

        LAST = ITEM['id'] if ITEM is not None else None
        """
        #finally saving the comments json
        redanno['pics'] = captions
        with open ("reddit_captions.json", "w") as f:
            json.dump(redanno, f, indent=4)

        #saving image id and their captions sequentially
        with open("annotations.txt", "w") as f:
            for s in cap_5:
                f.write(str(s) + "\n")
        """

    print('Downloaded {} files'.format(DOWNLOADED),
          '(Processed {}, Skipped {}, Exists {})'.format(TOTAL, SKIPPED, ERRORS))

    #finally saving the comments json
    redanno['pics'] = captions
    with open ("reddit_captions.json", "w") as f:
        json.dump(redanno, f, indent=4)

    #saving image id and their captions sequentially
    with open("annotations.txt", "w") as f:
        for s in cap_5:
            f.write(str(s) + "\n")
    print("annotations.txt has been saved.")
Example #9
def main():
    ARGS = parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)
    print parse_reddit_argument(ARGS.reddit)

    TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0
    FINISHED = False

    # Create the specified directory if it doesn't already exist.
    if not pathexists(ARGS.dir):
        mkdir(ARGS.dir)

    # If a regex has been specified, compile the rule (once)
    RE_RULE = None
    if ARGS.regex:
        RE_RULE = re.compile(ARGS.regex, re.UNICODE)

    # compile reddit comment url to check if url is one of them
    reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments')

    LAST = ARGS.last

    start_time = None
    ITEM = None

    while not FINISHED:
        ITEMS = getitems(
            ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST,
            reddit_sort=ARGS.sort_type)

        # measure elapsed time and make the program wait 4 seconds between
        # requests, as per the reddit api guidelines
        end_time = time.clock()

        if start_time is not None:
            elapsed_time = end_time - start_time

            if elapsed_time <= 4:  # throttling
                time.sleep(4 - elapsed_time)

        start_time = time.clock()

        if not ITEMS:
            # No more items to process
            break

        for ITEM in ITEMS:
            TOTAL += 1

            # not downloading if url is reddit comment
            if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or
                    re.match(reddit_comment_regex, ITEM['url']) is not None):
                print '    Skip:[{}]'.format(ITEM['url'])
                continue

            # Verify dimensions: the submission title must contain the image
            # dimensions in the format [ WIDTH x HEIGHT ]; the brackets may also
            # be parentheses and the x may be the special character ×.
            ITEM['title'] = ITEM['title'].replace(u'×', u'x')
            dim_pattern = re.compile(ur'[\[(][0-9]+ *[xX] *[0-9]+[\])]',
                                     re.UNICODE)
            dim_regex = dim_pattern.search(ITEM['title'])
            if dim_regex:
                dimension = (dim_regex.group(0).replace('[', '').replace(']', '')
                             .replace('(', '').replace(')', '').replace(' ', ''))
                dimension = re.split('x|X', dimension)
                if len(dimension) == 2:
                    if (int(dimension[0]) < ARGS.width or
                            int(dimension[1]) < ARGS.height):
                        if ARGS.verbose:
                            print ('    DIMENSION: {} is smaller than {}x{}.'
                                   .format(ITEM['title'], ARGS.width,
                                           ARGS.height))
                        SKIPPED += 1
                        continue
                
            if ITEM['score'] >= ARGS.maxscore:
                if ARGS.verbose:
                    print ('    SCORE: {} has score of {}, which is higher '
                           'than the maximum score of {}.'.format(
                               ITEM['id'], ITEM['score'], ARGS.maxscore))

                SKIPPED += 1
                continue
                
            if ITEM['score'] < ARGS.score:
                if ARGS.verbose:
                    print ('    SCORE: {} has score of {}, which is lower '
                           'than the required score of {}.'.format(
                               ITEM['id'], ITEM['score'], ARGS.score))

                SKIPPED += 1
                continue
            elif ARGS.sfw and ITEM['over_18']:
                if ARGS.verbose:
                    print '    NSFW: %s is marked as NSFW.' % (ITEM['id'])

                SKIPPED += 1
                continue
            elif ARGS.nsfw and not ITEM['over_18']:
                if ARGS.verbose:
                    print '    Not NSFW, skipping %s' % (ITEM['id'])

                SKIPPED += 1
                continue
            elif ARGS.regex and not re.match(RE_RULE, ITEM['title']):
                if ARGS.verbose:
                    print '    Regex match failed'

                SKIPPED += 1
                continue
            elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']:
                if ARGS.verbose:
                    print '    Album found, skipping %s' % (ITEM['id'])

                SKIPPED += 1
                continue

            if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower():
                if ARGS.verbose:
                    print ('    Title does not contain "{}", '
                           'skipping {}.'.format(ARGS.title_contain,
                                                 ITEM['id']))

                SKIPPED += 1
                continue

            FILECOUNT = 0
            try:
                URLS = extract_urls(ITEM['url'])
            except Exception:
                _log.exception("Failed to extract urls for %r", URLS)
                continue
            for URL in URLS:
                try:
                    # Find gfycat if requested
                    if URL.endswith('gif') and ARGS.mirror_gfycat:
                        check = gfycat().check(URL)
                        if check.get("urlKnown"):
                            URL = check.get('webmUrl')

                    # Trim any http query off end of file extension.
                    FILEEXT = pathsplitext(URL)[1]
                    if '?' in FILEEXT:
                        FILEEXT = FILEEXT[:FILEEXT.index('?')]

                    # Only append numbers if more than one file
                    FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '')

                    # create filename based on given input from user
                    if ARGS.filename_format == 'url':
                        FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT)
                    elif ARGS.filename_format == 'title':
                        FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT)
                        if len(FILENAME) >= 256:
                            shortened_item_title = slugify(ITEM['title'])[:256-len(FILENAME)]
                            FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT)
                    else:
                        FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT)
                    # join file with directory
                    FILEPATH = pathjoin(ARGS.dir, FILENAME)

                    # List the URL before downloading to improve debuggability;
                    # the url may be wrong, so skip obviously bad ones.
                    if URL.encode('utf-8') == 'http://':
                        raise URLError('Url is empty')
                    else:
                        text_templ = '    Attempting to download URL[{}] as [{}].'
                        print text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8'))

                    # Download the image
                    try:
                        download_from_url(URL, FILEPATH)
                        # Image downloaded successfully!
                        print '    Successfully downloaded URL [%s] as [%s].' % (URL, FILENAME)
                        DOWNLOADED += 1
                        FILECOUNT += 1

                    except Exception,e:
                        print '    %s' % str(e)
                        ERRORS += 1

                    if ARGS.num and DOWNLOADED >= ARGS.num:
                        FINISHED = True
                        break
                except WrongFileTypeException as ERROR:
                    print '    %s' % (ERROR)
                    _log_wrongtype(url=URL, target_dir=ARGS.dir,
                                   filecount=FILECOUNT, _downloaded=DOWNLOADED,
                                   filename=FILENAME)
                    SKIPPED += 1
                except FileExistsException as ERROR:
                    print '    %s' % (ERROR)
                    ERRORS += 1
                    if ARGS.update:
                        print '    Update complete, exiting.'
                        FINISHED = True
                        break
                except HTTPError as ERROR:
                    print '    HTTP ERROR: Code %s for %s.' % (ERROR.code, URL)
                    FAILED += 1
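
The distinctive step in this example is filtering by the dimensions embedded in the submission title, e.g. '[1920 x 1080]' or '(1920×1080)'. A self-contained Python 3 restatement of that parsing (the original uses a Python 2 ur'' literal) is:

import re

# Matches '[1920 x 1080]', '(1920×1080)', '[800X600]' and similar.
DIM_PATTERN = re.compile(r'[\[(]\s*(\d+)\s*[x×X]\s*(\d+)\s*[\])]')


def title_dimensions(title):
    """Return (width, height) parsed from a submission title, or None."""
    match = DIM_PATTERN.search(title)
    if match:
        return int(match.group(1)), int(match.group(2))
    return None

For instance, title_dimensions('Forest walk [2560 x 1440]') returns (2560, 1440).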
Example #10
def main():
    ARGS = parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)
    print(parse_reddit_argument(ARGS.reddit))

    TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0
    FINISHED = False

    # Create the specified directory if it doesn't already exist.
    if not pathexists(ARGS.dir):
        mkdir(ARGS.dir)

    # If a regex has been specified, compile the rule (once)
    RE_RULE = None
    if ARGS.regex:
        RE_RULE = re.compile(ARGS.regex)

    # compile reddit comment url to check if url is one of them
    reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments')

    LAST = ARGS.last

    start_time = None
    ITEM = None

    sort_type = ARGS.sort_type
    if sort_type:
        sort_type = sort_type.lower()

    while not FINISHED:
        ITEMS = getitems(ARGS.reddit,
                         multireddit=ARGS.multireddit,
                         previd=LAST,
                         reddit_sort=sort_type,
                         user=ARGS.user)

        # measure elapsed time and make the program wait 4 seconds between
        # requests, as per the reddit api guidelines
        end_time = time.clock()

        if start_time is not None:
            elapsed_time = end_time - start_time

            if elapsed_time <= 4:  # throttling
                time.sleep(4 - elapsed_time)

        start_time = time.clock()

        if not ITEMS:
            # No more items to process
            break

        for ITEM in ITEMS:
            TOTAL += 1

            # not downloading if url is reddit comment
            if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or
                    re.match(reddit_comment_regex, ITEM['url']) is not None):
                print('    Skip:[{}]'.format(ITEM['url']))
                continue

            if ITEM['score'] < ARGS.score:
                if ARGS.verbose:
                    print(
                        '    SCORE: {} has score of {}'.format(
                            ITEM['id'], ITEM['score']),
                        'which is lower than required score of {}.'.format(
                            ARGS.score))

                SKIPPED += 1
                continue
            elif ARGS.sfw and ITEM['over_18']:
                if ARGS.verbose:
                    print('    NSFW: %s is marked as NSFW.' % (ITEM['id']))

                SKIPPED += 1
                continue
            elif ARGS.nsfw and not ITEM['over_18']:
                if ARGS.verbose:
                    print('    Not NSFW, skipping %s' % (ITEM['id']))

                SKIPPED += 1
                continue
            elif ARGS.regex and not re.match(RE_RULE, ITEM['title']):
                if ARGS.verbose:
                    print('    Regex not matched')

                SKIPPED += 1
                continue
            elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']:
                if ARGS.verbose:
                    print('    Album found, skipping %s' % (ITEM['id']))

                SKIPPED += 1
                continue

            if ARGS.title_contain and ARGS.title_contain.lower(
            ) not in ITEM['title'].lower():
                if ARGS.verbose:
                    print(
                        '    Title does not contain "{}",'.format(
                            ARGS.title_contain),
                        'skipping {}'.format(ITEM['id']))

                SKIPPED += 1
                continue

            FILECOUNT = 0
            COMMENTS_ALBUM = False
            try:
                URLS = extract_urls(ITEM['url'])

                if ARGS.comment_album:
                    if re.search("album.+?comment", ITEM['title'],
                                 re.IGNORECASE):
                        comments_url = "https://www.reddit.com" + ITEM[
                            'permalink'] + ".json"
                        print(
                            '    Album in comments appears to be available for %s. Attempting to find URL in top comment: %s'
                            % (ITEM['title'], comments_url))
                        comment_album_urls = []

                        try:
                            time.sleep(4)
                            req = Request(comments_url)
                            json = urlopen(req).read()
                            data = JSONDecoder().decode(json)
                            comments = [
                                x['data'] for x in data[1]['data']['children']
                            ]
                            print('    First comment text: %s' % (comments[int(
                                ARGS.comment_album_offset)]['body']))
                            comment_urls = re.finditer(
                                r"[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?",
                                comments[int(
                                    ARGS.comment_album_offset)]['body'])
                            for comment_url in comment_urls:
                                comment_url = comment_url.group()
                                comment_url = extract_urls(comment_url)
                                comment_album_urls += comment_url

                            if len(comment_album_urls) == 0:
                                print(
                                    '    Failed to retrieve album from comments'
                                )
                            else:
                                URLS = URLS + comment_album_urls
                                COMMENTS_ALBUM = True
                        except HTTPError as ERROR:
                            error_message = '\tHTTP ERROR: Code %s for %s' % (
                                ERROR.code, comments_url)
                            sys.exit(error_message)
                        except ValueError as ERROR:
                            if ERROR.args[
                                    0] == 'No JSON object could be decoded':
                                error_message = ('ERROR: subreddit "%s" does '
                                                 'not exist' % ARGS.reddit)
                                sys.exit(error_message)
                            raise ERROR
            except Exception:
                _log.exception("Failed to extract urls for %r", ITEM['url'])
                continue
            for URL in URLS:
                try:
                    # Find gfycat if requested
                    if URL.endswith('gif') and ARGS.mirror_gfycat:
                        check = gfycat().check(URL)
                        if check.get("urlKnown"):
                            URL = check.get('webmUrl')

                    FILEEXT = pathsplitext(URL)[1]
                    # Trim any http query off end of file extension.
                    FILEEXT = re.sub(r'\?.*$', '', FILEEXT)
                    if not FILEEXT:
                        # A more usable default than an empty extension.
                        # The extension can be fixed after downloading, but then
                        # the 'already downloaded' check would be harder.
                        FILEEXT = '.jpg'

                    # Only append numbers if more than one file
                    FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '')

                    # create filename based on given input from user
                    if ARGS.filename_format == 'url':
                        FILENAME = '%s%s%s' % (pathsplitext(
                            pathbasename(URL))[0], '', FILEEXT)
                    elif ARGS.filename_format == 'title':
                        FILENAME = '%s%s%s' % (slugify(
                            ITEM['title']), FILENUM, FILEEXT)
                        if len(FILENAME) >= 256:
                            shortened_item_title = slugify(
                                ITEM['title'])[:256 - len(FILENAME)]
                            FILENAME = '%s%s%s' % (shortened_item_title,
                                                   FILENUM, FILEEXT)
                    elif ARGS.filename_format == 'title-id':
                        FILENAME = '%s%s (%s)%s' % (slugify(
                            ITEM['title']), FILENUM, ITEM['id'], FILEEXT)
                        if len(FILENAME) >= 256:
                            shortened_item_title = slugify(
                                ITEM['title'])[:256 - len(FILENAME)]
                            FILENAME = '%s%s%s' % (shortened_item_title,
                                                   FILENUM, FILEEXT)
                    else:
                        FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT)
                    # join file with directory
                    FILEPATH = pathjoin(ARGS.dir, FILENAME)

                    # To improve debuggability, list the URL before downloading too.
                    # The url may be malformed, so skip obviously empty ones.
                    if URL == 'http://':
                        raise URLError('Url is empty')
                    text_templ = '    Attempting to download URL [{}] as [{}].'
                    print(text_templ.format(URL, FILENAME))

                    # Download the image
                    download_from_url(URL, FILEPATH)
                    # Image downloaded successfully!
                    print('    Successfully downloaded URL [%s] as [%s].' %
                          (URL, FILENAME))
                    DOWNLOADED += 1
                    FILECOUNT += 1

                    if ARGS.num and DOWNLOADED >= ARGS.num:
                        FINISHED = True
                        break
                except WrongFileTypeException as ERROR:
                    print('    %s' % (ERROR, ))
                    _log_wrongtype(url=URL,
                                   target_dir=ARGS.dir,
                                   filecount=FILECOUNT,
                                   _downloaded=DOWNLOADED,
                                   filename=FILENAME)
                    SKIPPED += 1
                except FileExistsException as ERROR:
                    print('    %s' % (ERROR, ))
                    ERRORS += 1
                    FILECOUNT += 1
                    if ARGS.update:
                        print('    Update complete, exiting.')
                        FINISHED = True
                        break
                except HTTPError as ERROR:
                    print('    HTTP ERROR: Code %s for %s.' %
                          (ERROR.code, URL))
                    FAILED += 1
                except URLError as ERROR:
                    print('    URL ERROR: %s!' % (URL, ))
                    FAILED += 1
                except InvalidURL as ERROR:
                    print('    Invalid URL: %s!' % (URL, ))
                    FAILED += 1
                except Exception as exc:
                    _log.exception("Problem with %r: %r", URL, exc)
                    FAILED += 1

            if FINISHED:
                break

        LAST = ITEM['id'] if ITEM is not None else None

    print(
        'Downloaded {} files'.format(DOWNLOADED),
        '(Processed {}, Skipped {}, Exists {})'.format(TOTAL, SKIPPED, ERRORS))
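The 'title' and 'title-id' branches above keep filenames under 256 characters with a negative slice: 256 - len(FILENAME) is negative once the name is too long, so slicing the slugified title with it drops exactly the overflow. A standalone sketch with made-up values (slugify replaced by a plain string, names hypothetical) illustrates this:

# Hypothetical values only; no reddit data involved.
title = 'x' * 300                                  # stands in for slugify(ITEM['title'])
filenum, fileext = '_3', '.jpg'
filename = '%s%s%s' % (title, filenum, fileext)    # 306 characters, too long
if len(filename) >= 256:
    # 256 - 306 == -50, so the slice drops the trailing 50 title characters.
    shortened_title = title[:256 - len(filename)]
    filename = '%s%s%s' % (shortened_title, filenum, fileext)
assert len(filename) == 256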
Example #11
0
def main():
    ARGS = parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)
    print(parse_reddit_argument(ARGS.reddit))

    TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0
    FINISHED = False

    # Create the specified directory if it doesn't already exist.
    if not pathexists(ARGS.dir):
        mkdir(ARGS.dir)

    # If a regex has been specified, compile the rule (once)
    RE_RULE = None
    if ARGS.regex:
        RE_RULE = re.compile(ARGS.regex)

    # Compile the reddit comment URL pattern to check whether a url is a comment thread
    reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments')

    LAST = ARGS.last

    start_time = None
    ITEM = None

    sort_type = ARGS.sort_type
    if sort_type:
        sort_type = sort_type.lower()

    while not FINISHED:
        ITEMS = getitems(
            ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST,
            reddit_sort=sort_type)

        # Measure time so the program waits 4 seconds between requests,
        # as per the reddit API guidelines.
        end_time = time.perf_counter()

        if start_time is not None:
            elapsed_time = end_time - start_time

            if elapsed_time <= 4:  # throttling
                time.sleep(4 - elapsed_time)

        start_time = time.perf_counter()

        if not ITEMS:
            # No more items to process
            break

        for ITEM in ITEMS:
            TOTAL += 1

            # Skip downloading if the url is a reddit comment thread
            if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or
                    re.match(reddit_comment_regex, ITEM['url']) is not None):
                print('    Skip:[{}]'.format(ITEM['url']))
                continue

            if ITEM['score'] < ARGS.score:
                if ARGS.verbose:
                    print('    SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']),
                          'which is lower than required score of {}.'.format(ARGS.score))

                SKIPPED += 1
                continue
            elif ARGS.sfw and ITEM['over_18']:
                if ARGS.verbose:
                    print('    NSFW: %s is marked as NSFW.' % (ITEM['id']))

                SKIPPED += 1
                continue
            elif ARGS.nsfw and not ITEM['over_18']:
                if ARGS.verbose:
                    print('    Not NSFW, skipping %s' % (ITEM['id']))

                SKIPPED += 1
                continue
            elif ARGS.regex and not re.match(RE_RULE, ITEM['title']):
                if ARGS.verbose:
                    print('    Regex not matched')

                SKIPPED += 1
                continue
            elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']:
                if ARGS.verbose:
                    print('    Album found, skipping %s' % (ITEM['id']))

                SKIPPED += 1
                continue

            if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower():
                if ARGS.verbose:
                    print('    Title does not contain "{}",'.format(ARGS.title_contain),
                          'skipping {}'.format(ITEM['id']))

                SKIPPED += 1
                continue

            FILECOUNT = 0
            try:
                URLS = extract_urls(ITEM['url'])
            except Exception:
                _log.exception("Failed to extract urls for %r", URLS)
                continue
            for URL in URLS:
                try:
                    # Find gfycat if requested
                    if URL.endswith('gif') and ARGS.mirror_gfycat:
                        check = gfycat().check(URL)
                        if check.get("urlKnown"):
                            URL = check.get('webmUrl')

                    FILEEXT = pathsplitext(URL)[1]
                    # Trim any http query off end of file extension.
                    FILEEXT = re.sub(r'\?.*$', '', FILEEXT)
                    if not FILEEXT:
                        # A more usable default than an empty extension.
                        # The extension can be fixed after downloading, but then the 'already downloaded' check would be harder.
                        FILEEXT = '.jpg'

                    # Only append numbers if more than one file
                    FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '')

                    # create filename based on given input from user
                    if ARGS.filename_format == 'url':
                        FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT)
                    elif ARGS.filename_format == 'title':
                        FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT)
                        if len(FILENAME) >= 256:
                            shortened_item_title = slugify(ITEM['title'])[:256-len(FILENAME)]
                            FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT)
                    else:
                        FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT)
                    # join file with directory
                    FILEPATH = pathjoin(ARGS.dir, FILENAME)

                    # To improve debuggability, list the URL before downloading too.
                    # The url may be malformed, so skip obviously empty ones.
                    if URL == 'http://':
                        raise URLError('Url is empty')
                    text_templ = '    Attempting to download URL [{}] as [{}].'
                    print(text_templ.format(URL, FILENAME))

                    # Download the image
                    try:
                        download_from_url(URL, FILEPATH)
                        # Image downloaded successfully!
                        print('    Successfully downloaded URL [%s] as [%s].' % (URL, FILENAME))
                        DOWNLOADED += 1
                        FILECOUNT += 1

                    except Exception as exc:
                        print('    %s' % (exc,))
                        ERRORS += 1

                    if ARGS.num and DOWNLOADED >= ARGS.num:
                        FINISHED = True
                        break
                except WrongFileTypeException as ERROR:
                    print('    %s' % (ERROR,))
                    _log_wrongtype(url=URL, target_dir=ARGS.dir,
                                   filecount=FILECOUNT, _downloaded=DOWNLOADED,
                                   filename=FILENAME)
                    SKIPPED += 1
                except FileExistsException as ERROR:
                    print('    %s' % (ERROR,))
                    ERRORS += 1
                    if ARGS.update:
                        print('    Update complete, exiting.')
                        FINISHED = True
                        break
                except HTTPError as ERROR:
                    print('    HTTP ERROR: Code %s for %s.' % (ERROR.code, URL))
                    FAILED += 1
                except URLError as ERROR:
                    print('    URL ERROR: %s!' % (URL,))
                    FAILED += 1
                except InvalidURL as ERROR:
                    print('    Invalid URL: %s!' % (URL,))
                    FAILED += 1
                except Exception as exc:
                    _log.exception("Problem with %r: %r", URL, exc)
                    FAILED += 1

            if FINISHED:
                break

        LAST = ITEM['id'] if ITEM is not None else None

    print('Downloaded {} files'.format(DOWNLOADED),
          '(Processed {}, Skipped {}, Exists {})'.format(TOTAL, SKIPPED, ERRORS))
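As a quick sanity check of the comment-thread filter in main() above, the compiled reddit_comment_regex matches permalinks into a subreddit's /comments/ path but not direct media links. A small standalone sketch with made-up URLs:

import re

# Same pattern as compiled in main(); both URLs below are hypothetical.
reddit_comment_regex = re.compile(r'.*reddit\.com\/r\/(.*?)\/comments')
print(bool(reddit_comment_regex.match(
    'https://www.reddit.com/r/pics/comments/abc123/some_title/')))  # True -> skipped
print(bool(reddit_comment_regex.match(
    'https://i.imgur.com/abc123.jpg')))                             # False -> downloaded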
def old_tcweb_fnames_remove_duplicates(fname,
                                       mins_to_remove=10,
                                       remove_files=False):
    # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
    # 20201010.222325.WP162020.gmi.GPM.89H.40kts.14p2.1p0.jpg.yaml
    matching_fnames = []
    removed_fnames = []
    saved_fnames = []
    ext1 = pathsplitext(fname)[-1]
    ext2 = pathsplitext(pathsplitext(fname)[0])[-1]
    ext3 = pathsplitext(pathsplitext(pathsplitext(fname)[0])[0])[-1]
    if (ext1 == '.jpg') or (ext1 == '.yaml' and ext2 == '.jpg'):
        LOG.info(
            'MATCHES EXT FORMAT. jpg or jpg.yaml. Attempting to remove old_tcweb duplicates'
        )
    else:
        LOG.info(
            'NOT REMOVING DUPLICATES. Not old_tcweb filename, not jpg or jpg.yaml.'
        )
        return [], []

    dirname = pathdirname(fname)
    basename = pathbasename(fname)
    # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
    parts = basename.split('.')
    if (len(parts) == 10 and ext1 == '.yaml') or (len(parts) == 9
                                                  and ext1 == '.jpg'):
        LOG.info(
            'MATCHES NUMBER FIELDS. 9 or 10 fields. Attempting to remove old_tcweb duplicates'
        )
    else:
        LOG.info(
            'NOT REMOVING DUPLICATES. Not old_tcweb filename, does not contain 9 or 10 fields.'
        )
        return [], []

    try:
        # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
        yyyymmdd = parts[0]
        hhmnss = parts[1]
        stormname = parts[2]
        sensor = parts[3]
        platform = parts[4]
        product = parts[5]
        intensity = parts[6]
        coverage = parts[7]
        res = parts[8]
        if 'p' not in coverage or 'p' not in res:
            LOG.info(
                'NOT REMOVING DUPLICATES. Not old_tcweb filename, coverage or res not "NNpN".'
            )
            return [], []
        if 'kts' not in intensity:
            LOG.info(
                'NOT REMOVING DUPLICATES. Not old_tcweb filename, intensity does not contain "kts".'
            )
            return [], []
    except IndexError:
        LOG.info(
            'NOT REMOVING DUPLICATES. Unmatched filename format, incorrect number of . delimited fields'
        )
        return [], []
    try:
        fname_dt = datetime.strptime(yyyymmdd + hhmnss, '%Y%m%d%H%M%S')
    except ValueError:
        LOG.info(
            'NOT REMOVING DUPLICATES. Unmatched old_tcweb filename format, incorrect date time string.'
        )
        return [], []
    timediff = timedelta(minutes=mins_to_remove)
    for currdt in minrange(fname_dt - timediff, fname_dt + timediff):
        # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
        # 20201010.222325.WP162020.gmi.GPM.37V.*.*.1p0.jpg*
        dtstr = currdt.strftime(
            '{0}/%Y%m%d.%H%M*.{1}.{2}.{3}.{4}.*.*.{5}.jpg*'.format(
                dirname, stormname, sensor, platform, product, res))
        # print(dtstr)
        matching_fnames += glob(dtstr)
    max_coverage = 0
    for matching_fname in matching_fnames:
        # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
        parts = pathbasename(matching_fname).split('.')
        coverage = float(parts[7].replace('p', '.'))
        max_coverage = max(coverage, max_coverage)

    gotone = False
    LOG.info('CHECKING DUPLICATE FILES')
    for matching_fname in list(set(matching_fnames)):
        # 20201010.222325.WP162020.gmi.GPM.37V.40kts.14p2.1p0.jpg
        parts = pathbasename(matching_fname).split('.')
        coverage = float(parts[7].replace('p', '.'))
        if coverage < max_coverage or gotone is True:
            removed_fnames += [matching_fname]
            # Test it out for a bit first
            if remove_files is True:
                LOG.info(
                    'DELETING DUPLICATE FILE with less coverage %s < %s %s',
                    coverage, max_coverage, matching_fname)
                osunlink(matching_fname)
            else:
                LOG.info(
                    'TEST DELETING DUPLICATE FILE with less coverage %s < %s %s',
                    coverage, max_coverage, matching_fname)
        else:
            if len(matching_fnames) == 1:
                LOG.info(
                    'SAVING DUPLICATE FILE (only one!) with max coverage %s %s',
                    max_coverage, matching_fname)
            else:
                LOG.info('SAVING DUPLICATE FILE with max coverage %s %s',
                         max_coverage, matching_fname)
            saved_fnames += [matching_fname]
            gotone = True
    return removed_fnames, saved_fnames
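minrange is called above (and in the metoctiff variant later) but is not defined in these examples. A minimal stand-in, assuming it simply yields one datetime per minute over the half-open interval, might look like the following; the real helper may differ:

from datetime import timedelta

def minrange(start_dt, end_dt):
    # Assumed behaviour: yield one datetime per minute in [start_dt, end_dt).
    curr = start_dt
    while curr < end_dt:
        yield curr
        curr += timedelta(minutes=1)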
Example #13
0
def update_fields(atcf_stormfilename, cc, conn, process=False):
    # Must be of form similar to 
    # Gal912016.dat

    import re
    from datetime import datetime, timedelta
    from os.path import basename as pathbasename
    from os import stat as osstat

    updated_files = []

    LOG.info('Checking '+atcf_stormfilename+' ... process '+str(process))

    # Check if we match Gxxdddddd.dat filename format. If not just return and don't do anything.
    if not re.compile(r'G\D\D\d\d\d\d\d\d\.\d\d\d\d\d\d\d\d\d\d.dat').match(pathbasename(atcf_stormfilename)) and \
       not re.compile(r'G\D\D\d\d\d\d\d\d\.dat').match(pathbasename(atcf_stormfilename)):
        LOG.info('')
        LOG.warning('    DID NOT MATCH REQUIRED FILENAME FORMAT, SKIPPING: '+atcf_stormfilename)
        return []

    # Get all fields for the database entry for the current filename
    cc.execute("SELECT * FROM atcf_deck_stormfiles WHERE filename = ?", (atcf_stormfilename,))
    data = cc.fetchone()

    file_timestamp = datetime.fromtimestamp(osstat(atcf_stormfilename).st_mtime)
    # Reads timestamp out as string - convert to datetime object.
    # Check if timestamp on file is newer than timestamp in database - if not, just return and don't do anything.
    if data: 
        database_timestamp = datetime.strptime(cc.execute("SELECT last_updated from atcf_deck_stormfiles WHERE filename = ?", (atcf_stormfilename,)).fetchone()[0],'%Y-%m-%d %H:%M:%S.%f')
        if file_timestamp < database_timestamp:
            LOG.info('')
            LOG.info(atcf_stormfilename+' already in '+ATCF_DECKS_DB+' and up to date, not doing anything.')
            return []

    with open(atcf_stormfilename, 'r') as stormfile:
        lines = stormfile.readlines()
    start_line = lines[0].split(',')
    # Start 24h prior to start in sectorfile, for initial processing
    #storm_start_datetime = datetime.strptime(start_line[2],'%Y%m%d%H')
    start_datetime = datetime.strptime(start_line[2],'%Y%m%d%H') - timedelta(hours=24)
    end_datetime = datetime.strptime(lines[-1].split(',')[2],'%Y%m%d%H')
    start_vmax = start_line[8]
    vmax = 0
    for line in lines:
        currv = line.split(',')[8]
        track = line.split(',')[4]
        if currv and track == 'BEST' and float(currv) > vmax:
            vmax = float(currv)

    if data and database_timestamp < file_timestamp:
        LOG.info('')
        LOG.info('Updating start/end datetime and last_updated fields for '+atcf_stormfilename+' in '+ATCF_DECKS_DB)
        old_start_datetime,old_end_datetime,old_vmax = cc.execute("SELECT start_datetime,end_datetime,vmax from atcf_deck_stormfiles WHERE filename = ?", (atcf_stormfilename,)).fetchone()
        # Eventually add in storm_start_datetime
        #old_storm_start_datetime,old_start_datetime,old_end_datetime,old_vmax = cc.execute("SELECT storm_start_datetime,start_datetime,end_datetime,vmax from atcf_deck_stormfiles WHERE filename = ?", (atcf_stormfilename,)).fetchone()
        if old_start_datetime == start_datetime.strftime('%Y-%m-%d %H:%M:%S'):
            LOG.info('    UNCHANGED start_datetime: '+old_start_datetime)
        else:
            LOG.info('    Old start_datetime: '+old_start_datetime+' to new: '+start_datetime.strftime('%Y-%m-%d %H:%M:%S'))
            updated_files += [atcf_stormfilename]
        #if old_storm_start_datetime == storm_start_datetime.strftime('%Y-%m-%d %H:%M:%S'):
        #    LOG.info('    UNCHANGED storm_start_datetime: '+old_storm_start_datetime)
        #else:
        #    LOG.info('    Old storm_start_datetime: '+old_storm_start_datetime+' to new: '+storm_start_datetime.strftime('%Y-%m-%d %H:%M:%S'))
        if old_end_datetime == end_datetime.strftime('%Y-%m-%d %H:%M:%S'):
            LOG.info('    UNCHANGED end_datetime: '+old_end_datetime)
        else:
            LOG.info('    Old end_datetime: '+old_end_datetime+' to new: '+end_datetime.strftime('%Y-%m-%d %H:%M:%S'))
            updated_files += [atcf_stormfilename]
        if database_timestamp == file_timestamp:
            LOG.info('    UNCHANGED last_updated: '+database_timestamp.strftime('%Y-%m-%d %H:%M:%S'))
        else:
            LOG.info('    Old last_updated: '+database_timestamp.strftime('%Y-%m-%d %H:%M:%S')+' to new: '+file_timestamp.strftime('%Y-%m-%d %H:%M:%S'))
            updated_files += [atcf_stormfilename]
        if old_vmax == vmax:
            LOG.info('    UNCHANGED vmax: '+str(old_vmax))
        else:
            LOG.info('    Old vmax: '+str(old_vmax)+' to new: '+str(vmax))
            updated_files += [atcf_stormfilename]
        cc.execute('''UPDATE atcf_deck_stormfiles SET 
                        last_updated=?,
                        start_datetime=?,
                        end_datetime=?,
                        vmax=? 
                      WHERE filename = ?''', 
                      #Eventually add in ?
                      #storm_start_datetime=?,
                        (file_timestamp,
                        #storm_start_datetime,
                        start_datetime,
                        end_datetime,
                        str(vmax),
                        atcf_stormfilename,))
        conn.commit()
        return updated_files

    start_lat = start_line[6]
    start_lon = start_line[7]
    storm_basin = start_line[0]
    storm_num = start_line[1]
    try:
        start_name = start_line[48] + start_line[49]
    except IndexError:
        start_name = start_line[41]

    if data is None:
        #print '    Adding '+atcf_stormfilename+' to '+ATCF_DECKS_DB
        cc.execute('''insert into atcf_deck_stormfiles(
                        filename,
                        last_updated,
                        vmax,
                        storm_num,
                        storm_basin,
                        start_datetime,
                        start_lat,
                        start_lon,
                        start_vmax,
                        start_name,
                        end_datetime) values(?, ?,?, ?,?,?,?,?,?,?,?)''', 
                        # Eventually add in ?
                        #end_datetime) values(?, ?, ?,?, ?,?,?,?,?,?,?,?)''', 
                        #storm_start_datetime,
                        (atcf_stormfilename,
                            file_timestamp,
                            str(vmax),
                            storm_num,
                            storm_basin,
                            #storm_start_datetime,
                            start_datetime,
                            start_lat,
                            start_lon,
                            start_vmax,
                            start_name,
                            end_datetime,))
        LOG.info('')
        LOG.info('    Adding '+atcf_stormfilename+' to '+ATCF_DECKS_DB) 
        updated_files += [atcf_stormfilename]
        conn.commit()

        # This ONLY runs if it is a brand new storm file and we requested 
        # processing.
        if process:
            reprocess_storm(atcf_stormfilename)
    return updated_files
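A hedged usage sketch for update_fields: it assumes an existing SQLite database containing an atcf_deck_stormfiles table, plus module-level names such as LOG, ATCF_DECKS_DB and reprocess_storm; the database path and deck filename below are hypothetical.

import sqlite3

conn = sqlite3.connect('/path/to/atcf_decks.db')   # hypothetical ATCF_DECKS_DB location
cc = conn.cursor()
# process=False: only refresh the database fields, never trigger reprocessing.
updated = update_fields('/decks/Gal912016.dat', cc, conn, process=False)
print('Updated files:', updated)
conn.close()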
Example #14
0
def metoctiff_filename_remove_duplicates(fname,
                                         mins_to_remove=10,
                                         remove_files=False):
    # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz
    # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz.yaml
    matching_fnames = []
    removed_fnames = []
    saved_fnames = []
    ext1 = pathsplitext(fname)[-1]
    ext2 = pathsplitext(pathsplitext(fname)[0])[-1]
    ext3 = pathsplitext(pathsplitext(pathsplitext(fname)[0])[0])[-1]
    if (ext1 == '.gz' and ext2 == '.jif') or (ext1 == '.yaml' and ext2 == '.gz'
                                              and ext3 == '.jif'):
        LOG.info(
            'MATCHES EXT FORMAT. .jif.gz or .jif.gz.yaml. Attempting to remove metoctiff duplicates'
        )
    else:
        LOG.info(
            'NOT REMOVING DUPLICATES. Not metoctiff filename, not .jif.gz or .jif.gz.yaml.'
        )
        return [], []
    dirname = pathdirname(fname)
    basename = pathbasename(fname)
    parts = basename.split('.')
    if (len(parts) == 10 and ext1 == '.yaml') or (len(parts) == 9
                                                  and ext1 == '.gz'):
        LOG.info(
            'MATCHES NUMBER FIELDS. 9 or 10 fields. Attempting to remove metoctiff duplicates'
        )
    else:
        LOG.info(
            'NOT REMOVING DUPLICATES. Not metoctiff filename, does not contain 9 or 10 fields.'
        )
        return [], []

    try:
        # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz
        yyyymmdd = parts[0]
        hhmnss = parts[1]
        platform = parts[2]
        sensor = parts[3]
        product = parts[4]
        stormname = parts[5]
        coverage = parts[6]
        if 'pc' not in coverage:
            LOG.info(
                'NOT REMOVING DUPLICATES. Not metoctiff filename, coverage not "NNpc".'
            )
            return [], []
    except IndexError:
        LOG.info(
            'NOT REMOVING DUPLICATES. Unmatched metoctiff filename format, incorrect number of . delimited fields'
        )
        return [], []
    try:
        fname_dt = datetime.strptime(yyyymmdd + hhmnss, '%Y%m%d%H%M%S')
    except ValueError:
        LOG.info(
            'NOT REMOVING DUPLICATES. Unmatched metoctiff filename format, incorrect date time string.'
        )
        return [], []
    timediff = timedelta(minutes=mins_to_remove)
    for currdt in minrange(fname_dt - timediff, fname_dt + timediff):
        # 20201010.222325.GPM.gmi.19H.WP162020.14pc.jif.gz
        # Matches
        # 20201010.222325.GPM.gmi.19H.WP162020.*.jif.gz*
        dtstr = currdt.strftime(
            '{0}/%Y%m%d.%H%M*.{1}.{2}.{3}.{4}.*.jif.gz*'.format(
                dirname, platform, sensor, product, stormname))
        # print(dtstr)
        matching_fnames += glob(dtstr)
    max_coverage = 0
    for matching_fname in matching_fnames:
        # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz
        parts = pathbasename(matching_fname).split('.')
        coverage = float(parts[6].replace('pc', ''))
        max_coverage = max(coverage, max_coverage)

    gotone = False
    LOG.info('CHECKING DUPLICATE FILES')
    for matching_fname in list(set(matching_fnames)):
        # 20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz
        parts = pathbasename(matching_fname).split('.')
        coverage = float(parts[6].replace('pc', ''))
        if coverage < max_coverage or gotone is True:
            removed_fnames += [matching_fname]
            # Test it out for a bit first
            if remove_files is True:
                LOG.info(
                    'DELETING DUPLICATE FILE with less coverage %s < %s %s',
                    coverage, max_coverage, matching_fname)
                osunlink(matching_fname)
            else:
                LOG.info(
                    'TEST DELETING DUPLICATE FILE with less coverage %s < %s %s',
                    coverage, max_coverage, matching_fname)
        else:
            saved_fnames += [matching_fname]
            if len(matching_fnames) == 1:
                LOG.info(
                    'SAVING DUPLICATE FILE (only one!) with max coverage %s %s',
                    max_coverage, matching_fname)
            else:
                LOG.info('SAVING DUPLICATE FILE with max coverage %s %s',
                         max_coverage, matching_fname)
            gotone = True

    return removed_fnames, saved_fnames
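A dry-run sketch of the metoctiff duplicate scan with a hypothetical filename; with remove_files=False the function only logs what it would delete and removes nothing:

removed, saved = metoctiff_filename_remove_duplicates(
    '/outdir/20201010.222325.GPM.gmi.89H.WP162020.14pc.jif.gz',
    mins_to_remove=10,
    remove_files=False)
print('Would remove:', removed)
print('Would keep:  ', saved)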