示例#1
0
def remap_filepath(old_path, new_filepath):
	"""Point every stored URL record at a new file location.

	Called when a better version of a file is found; rewrites all rows in
	the 'urls' table whose file_path matches the old location.
	"""
	ofp = stringutil.normalize_file(old_path)
	nfp = stringutil.normalize_file(new_filepath)
	with lock('w'), closing(conn.cursor()) as cur:  # !cover
		cur.execute(
			'UPDATE urls SET file_path=:nfp WHERE file_path = :ofp',
			{'nfp': nfp, 'ofp': ofp},
		)
		conn.commit()
    def process_url(self, url, info):
        """ Accepts a URL and the array of file info generated for it by this class,
			and then attempts to download it using any possible handler.
			Returns whatever the handlers do, which should be a path to the file itself or the containing directory for an album.
				+Also returns False or None if no appropriate handler was found, or if the handler told us not to download anything.
		"""
        ret_val = False  # Default to 'False', meaning no file was located by a handler.
        for h in handlers.sorted_list:
            self.log.out(
                1,
                stringutil.color("Checking handler: %s" % h.tag,
                                 stringutil.Fore.CYAN))
            ret = False

            # noinspection PyBroadException
            try:
                ret = h.handle(url, info, self.handler_log)
            except Exception as ex:  # There are too many possible exceptions between all handlers to catch properly.
                stringutil.error(ex)
                pass  # Maybe consider stopping thread. I want to see errors reported, but don't want to interrupt users.

            if ret is None:  # cover
                # None is returned when the handler specifically wants this URL to be "finished", but not added to the files list.
                ret_val = None
                break
            if ret:
                # The handler will return a file/directory name if it worked properly.
                ret_val = stringutil.normalize_file(ret)
                break
        return ret_val
	def build_file_info(self, reddit_element):
		""" Generates a dict of file locations and element data that is passed down to every handler,
		so they can choose where best to save for themselves.

		Returns None if a usable file path could not be generated for this element.
		Thread-safe: all shared state is touched under HandlerThread.ele_lock.
		"""
		with HandlerThread.ele_lock:
			# Patterns come from user settings; insert_vars fills in per-post values.
			dir_pattern = './%s' % settings.save_subdir()
			file_pattern = '%s/%s' % (dir_pattern, settings.save_filename())

			basedir = stringutil.insert_vars(dir_pattern, reddit_element)
			basefile = stringutil.insert_vars(file_pattern, reddit_element)

			if basedir is None or basefile is None:
				# Cannot download this file, because the file path generated for it is too long
				return None  # !cover

			# De-duplicate the base name: append ' . 2', ' . 3', ... until the name is
			# neither reserved by another in-flight download nor present in the manifest.
			og = basefile
			i = 2
			while basefile in HandlerThread.used_files or manifest.get_file_matching(basefile):
				# Use local list of filenames used here, since used filenames won't be updated until done otherwise.
				basefile = og+' . '+str(i)
				basefile = stringutil.normalize_file(basefile)
				i += 1
			HandlerThread.used_files.append(basefile)  # blacklist this base name while we download.
			# Remember what we reserved so process_ele can release it when done.
			self.release_filenames.append(basefile)

			# Build an array of pre-generated possible locations & important data for handlers to have access to.
			return {
				'parent_dir'	: basedir,  # Some handlers will need to build the parent directory for their single file first.
				'single_file'	: basefile+"%s",  # If this handler can output a single file, it will use this path (handler supplies the extension via %s).
				'multi_dir' 	: basefile+"/",	 # Save directory for multi-file downloads.
				'post_title'	: reddit_element.title,			# The title of the Reddit post.
				'post_subreddit': reddit_element.subreddit,		# The subreddit this post came from.
				'user_agent'	: settings.get('auth.user_agent'),
			}
示例#4
0
def put_file_hash(f_path, f_hash, f_lastmtime):
    """ Adds the given hash data for the given filename. """
    normalized = stringutil.normalize_file(f_path)
    query = 'INSERT OR REPLACE INTO hashes (file_path, lastmtime, hash) VALUES (?,?,?)'
    with lock('w'), closing(conn.cursor()) as cur:
        cur.execute(query, (normalized, f_lastmtime, f_hash))
        conn.commit()
	def process_ele(self, reddit_element):
		""" Accepts a RedditElement of Post/Comment details, then runs through the Handlers loaded from the other directory,
		 attempting to download the url.

		Side effects: updates the manifest, hash jar, and this thread's
		new/failed URL and post counters; releases filenames reserved by
		build_file_info when finished.
		"""
		self.log.out(0, 'Processing new ele...')
		self.handler_log.clear()
		# print('\n\n\nProcessing ele: %s' % reddit_element.to_obj())
		self.log.out(0,
					 stringutil.out(
						 "[%s](%s): %s" % (reddit_element.type, reddit_element.subreddit, reddit_element.title),
						 False,
						 stringutil.Fore.LIGHTYELLOW_EX
					 )
		)
		was_new_ele = False
		for url in reddit_element.get_urls():
			was_new_url = True
			url_info = manifest.get_url_info(url)
			if url_info:
				was_new_url = False  # The manifest has seen this URL before. It may have failed last time, though.
				file = url_info['file_path']
				if file and os.path.exists(file):
					#  This URL has already been handled, and its file still exists.
					reddit_element.add_file(url, file)
					hashjar.add_hash(file)  # Update hash, just in case it doesn't have this file. (from legacy)
					continue

			# Reaching here means at least one URL needs downloading, so the post counts as "new".
			was_new_ele = True
			# This URL hasn't been handled yet! Time to download it:
			file_info = self.build_file_info(reddit_element)  # Build the file information dict using this RedditElement
			if file_info is None:
				reddit_element.add_file(url, False)  # This mostly happens if the filename can't be generated.
			else:
				# Download file from new url, using the loaded Handlers:
				file_path = self.process_url(url, file_info)  # The important bit is here, & doesn't need the Lock.
				if file_path:
					file_path = stringutil.normalize_file(file_path)  # Normalize for all DB storage.
					if was_new_url:
						self.total_new_urls += 1
				else:
					self.total_failed_urls += 1
				if not self.keep_running:
					return  # Kill the thread after a potentially long-running download if the program has terminated. !cover
				# NOTE(review): file_path may be False/None here on failure; assumes
				# check_duplicates tolerates falsy input — confirm against its definition.
				reddit_element.add_file(url, self.check_duplicates(file_path))

		manifest.insert_post(reddit_element)  # Update Manifest with completed ele.
		if was_new_ele:
			self.total_new_posts += 1

		with HandlerThread.ele_lock:
			# Clear blacklisted filename list, just to release the memory.
			for r in self.release_filenames:
				HandlerThread.used_files.remove(r)
			self.release_filenames = []
示例#6
0
def add_hash(filename):
    """
	Add the given file to the Hash jar.
	:param filename: The path to the file to add.
	:return: ([Is New File], existing_file_path)
	"""
    if filename:
        filename = stringutil.normalize_file(filename)  # Normalize for safety.

    if not filename or not os.path.exists(filename) or os.path.isdir(filename):
        # Skip directories (and missing/empty paths); treat as "new" with no match.
        return True, None

    pre = manifest.get_file_hash(
        filename
    )  # Start with a simple lookup to see if this path's hash is stored already.
    lmt = os.path.getmtime(filename)
    if pre:
        if lmt == pre['lastmtime']:
            # Hash already exists and file hasn't changed since its last storage.
            return False, filename

    # manifest.set_metadata(filename, 'hashed') # Debugging only.
    _, final_hash = _get_best_hash(
        filename
    )  # If we didn't find the hash, or this file has been modified, re-hash.
    if not final_hash:  # !cover
        stringutil.error(
            "HashCheck :: Error hit hashing file, passing file as new.")
        return True, None

    manifest.put_file_hash(filename, final_hash,
                           lmt)  # Store the hash of every file processed.
    # NOTE: Now that this file is stored, it's up to anything that deletes an archived file to also remove the hash.

    # Scan stored hashes for a near-duplicate of this file's hash.
    _it = manifest.hash_iterator(len(final_hash))
    for h in _it:
        if h['file_path'] == filename:
            continue  # Since we've just added this file's hash, we don't want to match with it!
        dist = _hamming_distance(h['hash'], final_hash)
        if dist < 4:
            # Hamming distance under 4 counts as a duplicate of the existing file.
            # print('\tHashCheck :: Distance matches existing file (%s,%s): %s' % (final_hash, h, dist))
            _it.send(True)  # Release DB generator.
            return False, h['file_path']
    # print('\tHashCheck :: File is unique. Saved successfully.')
    return True, None
示例#7
0
def create(file):
    """Open (and, if needed, build) the SQLite database at the given path.

    Sets the module-level `conn`. A path of ':memory:' or a nonexistent
    file triggers creation of the full schema plus version metadata.
    """
    global conn, version
    with lock('w'):
        file = stringutil.normalize_file(file)
        # Build the schema only for in-memory DBs or brand-new files.
        build = file == ':memory:' or not os.path.isfile(file)
        # check_same_thread=False: connection is shared across worker threads.
        conn = sqlite3.connect(file, check_same_thread=False)
        if build:
            with closing(conn.cursor()) as cur:
                cur.execute('''CREATE TABLE posts (
					id text PRIMARY KEY,
					author text COLLATE NOCASE,
					source_alias text COLLATE NOCASE,
					subreddit text COLLATE NOCASE,
					title text COLLATE NOCASE,
					type text COLLATE NOCASE,
					parent text COLLATE NOCASE,
					body text COLLATE NOCASE
				)''')
                cur.execute('''CREATE TABLE urls (
					post_id text, url text, file_path text COLLATE nocase
				)''')
                cur.execute('''CREATE TABLE hashes (
					file_path text PRIMARY KEY COLLATE nocase, lastmtime int, hash text
				)''')
                cur.execute('''CREATE TABLE metadata (
					meta_key text PRIMARY KEY, meta_val text
				)''')
                cur.execute('''CREATE INDEX url_index ON urls(post_id)''')
                cur.execute('''CREATE INDEX hash_index ON hashes(hash)''')
                conn.commit()
            # Record DB provenance so future versions can detect/upgrade the schema.
            with closing(conn.cursor()) as cur:
                cur.execute('INSERT INTO metadata VALUES (?,?)',
                            ('version', version))
                cur.execute('INSERT INTO metadata VALUES (?,?)',
                            ('author', 'ShadowMoose'))
                cur.execute('INSERT INTO metadata VALUES (?,?)',
                            ('website', 'https://goo.gl/hgBxN4'))
                conn.commit()
            print("Built DB.")
    print('Connected to DB.')
示例#8
0
def remove_file_hash(f_path):
    """ Remove any hashes for the given path. """
    target = stringutil.normalize_file(f_path)
    with lock('w'), closing(conn.cursor()) as cur:
        cur.execute('DELETE FROM hashes WHERE file_path=:fp', {'fp': target})
        conn.commit()
示例#9
0
def get_file_hash(f_path):
    """ Returns a dictionary of the given Hash info for the file, or None. """
    normalized = stringutil.normalize_file(f_path)
    wanted_columns = ['lastmtime', 'hash']
    return _select_fancy(
        'hashes', wanted_columns, 'file_path = :fname', {'fname': normalized}
    )