def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
    """get new links from file and optionally append them to links in existing archive"""
    all_links = []
    if import_path:
        # parse and validate the import file
        raw_links = parse_links(import_path)
        all_links = validate_links(raw_links)

    # merge existing links in archive_path and new links
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        all_links = validate_links(existing_links + all_links)

    num_new_links = len(all_links) - len(existing_links)
    if num_new_links and not only_new:
        print('[{green}+{reset}] [{}] Adding {} new links from {} to {}/index.json'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            num_new_links,
            pretty_path(import_path),
            pretty_path(archive_path),
            **ANSI,
        ))
    # else:
    #     print('[*] [{}] No new links added to {}/index.json{}'.format(
    #         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    #         archive_path,
    #         ' from {}'.format(import_path) if import_path else '',
    #         **ANSI,
    #     ))

    if only_new:
        return new_links(all_links, existing_links)

    return all_links
def parse_path(self, path=''):
    # no path? go to our last working dir
    if not len(path):
        return self._env['last_cwd']
    # make sure the path is formatted pretty
    path = util.pretty_path(path)
    # parse the dir path for relative portions
    if path.startswith('/'):
        cur_dirs = ['/']
    else:
        cur_dirs = self.env('cwd').split('/')
    dirs = path.split('/')
    for dir in dirs:
        if dir == '' or dir == '.':
            # blanks can creep in on absolute paths, no worries
            continue
        if dir == '..':
            if not len(cur_dirs):
                raise Exception("URI is out of bounds: \"%s\"." % path)
            cur_dirs.pop()
        else:
            cur_dirs.append(dir)
    # always end up with an absolute path
    return util.pretty_path('/'.join(cur_dirs) + '/', True)
def parse_path(self, path=''):
    '''Returns a path that may contain relative references (e.g. "../foo")
    based on our current path.'''
    # no path? go to our last working dir
    if not len(path) or path == '-':
        return self._env['last_cwd']
    # make sure the path is formatted pretty
    path = util.pretty_path(path, False, False)
    # parse the dir path for relative portions
    trailing = path.endswith('/')
    if path.startswith('/'):
        cur_dirs = ['/']
    else:
        cur_dirs = self.env('cwd').split('/')
    dirs = path.split('/')
    for dir in dirs:
        if dir == '' or dir == '.':
            # blanks can creep in on absolute paths, no worries
            continue
        if dir == '..':
            if not len(cur_dirs):
                raise Exception("URI is out of bounds: \"%s\"." % path)
            cur_dirs.pop()
        else:
            cur_dirs.append(dir)
    # always end up with an absolute path
    final_path = util.pretty_path('/'.join(cur_dirs), True, False)
    if trailing:
        final_path = final_path + '/'
    return final_path
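# The two parse_path() variants above share the same core loop. Below is a
# minimal, self-contained sketch of that resolution logic for illustration;
# resolve() and start_cwd are hypothetical stand-ins for the self.env('cwd')
# lookup and the util.pretty_path() normalization, which are not shown in
# this section. (Trailing-slash handling differs between the two variants;
# this sketch always appends one, like the first variant.)
import re

def resolve(path, start_cwd='/projects/demo'):
    cur_dirs = ['/'] if path.startswith('/') else start_cwd.split('/')
    for part in path.split('/'):
        if part in ('', '.'):
            continue  # blanks and self-references are no-ops
        if part == '..':
            if not cur_dirs:
                raise Exception('URI is out of bounds: "%s".' % path)
            cur_dirs.pop()  # step up one directory
        else:
            cur_dirs.append(part)
    # collapse duplicate slashes roughly the way pretty_path() would
    return re.sub(r'/+', '/', '/'.join(cur_dirs) + '/')

# resolve('../api/users')  -> '/projects/api/users/'
# resolve('/etc/./passwd') -> '/etc/passwd/'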
def _get_uri(self, api):
    # absolute path? use it as-is
    if api.startswith('/'):
        return util.pretty_path(api)
    else:
        # otherwise, parse against the current dir for the actual path
        return util.pretty_path(self.parse_path(api))
def write_links_index(out_dir, links):
    """create index.html and index.json files for a given list of links"""
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    write_json_links_index(out_dir, links)
    write_html_links_index(out_dir, links)
    print('{green}[√] [{}] Updated main index files:{reset}'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **ANSI))
    print('    > {}/index.json'.format(pretty_path(out_dir)))
    print('    > {}/index.html'.format(pretty_path(out_dir)))
def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        check_links_structure(existing_links)

    new_links = []
    parser_name = None
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print('    > Adding {} new links to index from {} (parsed as {} format)'.format(
            num_new_links,
            pretty_path(import_path),
            parser_name,
        ))

    return all_links, new_links
def archive_links(archive_path, links, source=None, resume=None):
    check_dependencies()

    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)
    try:
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)
    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
            **ANSI,
            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            idx=idx + 1,
            timestamp=link['timestamp'],
            total=len(links),
        ))
        print('    Continue where you left off by running:')
        print('        {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            raise e
        raise SystemExit(1)
def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
    """get new links from file and optionally append them to links in existing archive"""
    all_links = []
    parser_name = None
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        all_links = validate_links(raw_links)

    # merge existing links in archive_path and new links
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        all_links = validate_links(existing_links + all_links)

    num_new_links = len(all_links) - len(existing_links)
    if SHOW_PROGRESS and import_path:
        # only report on imports (parser_name is unset otherwise)
        print()
        print('    > Adding {} new links to index from {} (parsed as {} format)'.format(
            num_new_links,
            pretty_path(import_path),
            parser_name,
        ))

    if only_new:
        return new_links(all_links, existing_links)

    return all_links
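# Both merge_links() variants above end with new_links(all_links,
# existing_links), but that helper is not defined in this section. A
# plausible sketch of what it needs to do, assuming each link dict carries
# a stable 'url' key to deduplicate on (that key choice is an assumption):
def new_links(all_links, existing_links):
    """return only the links that are not already in the existing index"""
    existing_urls = {link['url'] for link in existing_links}
    return [link for link in all_links if link['url'] not in existing_urls]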
def write_links_index(out_dir, links, finished=False):
    """create index.html and index.json files for a given list of links"""
    check_links_structure(links)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    print('{green}[*] [{}] Saving main index files...{reset}'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **ANSI,
    ))
    write_json_links_index(out_dir, links)
    print('    > {}/index.json'.format(pretty_path(out_dir)))

    write_html_links_index(out_dir, links, finished=finished)
    print('    > {}/index.html'.format(pretty_path(out_dir)))
def print_link_status_line(link_dir, link, is_new):
    print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n    {blue}{url}{reset}'.format(
        symbol='+' if is_new else '*',
        symbol_color=ANSI['green' if is_new else 'black'],
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **{**link, 'title': link['title'] or link['url']},
        **ANSI,
    ))
    print('    > {}{}'.format(pretty_path(link_dir), ' (new)' if is_new else ''))
def update_banner(self):
    self.banner_cwd = os.getcwd()
    prettycwd = u.pretty_path(self.banner_cwd)
    if self.mode == 'normal':
        banner_txt = "es:" + prettycwd + " > "
        self.banner_uncoloredlen = len(banner_txt)
        self.banner = u.mk_g(banner_txt)
    elif self.mode == 'speedy':
        banner_txt = "es:" + prettycwd + " $ "
        self.banner_uncoloredlen = len(banner_txt)
        self.banner = u.mk_y(banner_txt)
def log_link_archive(link_dir, link, update_existing):
    print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n    {blue}{url}{reset}'.format(
        symbol='*' if update_existing else '+',
        symbol_color=ANSI['black' if update_existing else 'green'],
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        **link,
        **ANSI,
    ))
    print('    > {}{}'.format(pretty_path(link_dir), '' if update_existing else ' (new)'))
    if link['type']:
        print('      i {}'.format(link['type']))
def _build_url(self, api, query):
    path = util.pretty_path(
        '/'.join(['', self.url['path'], api]), True, False
    )
    url = '%s://%s:%s%s' % (
        self.url['scheme'], self.url['hostname'], self.url['port'], path
    )
    # has the base URL been set to include query params?
    if self.url['query']:
        url = self.merge_query(url, self.url['query'])
    # add in manually passed query args
    if query:
        url = self.merge_query(url, query)
    # with everything merged into the URL, do a final split
    if url.find('?') >= 0:
        (url, query) = url.split('?', 1)
    else:
        query = ''
    # return the full URL, as well as our final query
    return url, query
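# A rough trace of how _build_url() composes its result. The client object
# and its url fields are hypothetical stand-ins; the exact parameter order
# in the merged query depends on merge_query(), which is not shown here:
#
#   client.url = {'scheme': 'https', 'hostname': 'api.example.com',
#                 'port': 443, 'path': '/v1', 'query': 'token=abc'}
#   client._build_url('users', 'limit=10')
#   # -> ('https://api.example.com:443/v1/users', 'token=abc&limit=10')
#
# Merging everything into the URL first and then splitting on '?' means the
# caller always gets a clean (url, query) pair, no matter whether the
# parameters came from the base URL or were passed in explicitly.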
def run(self, api, args=(), raw_response=False, full_response=False,
        get=None, post=None, files=None):
    # check and prep the data
    if api == '':
        raise Exception("Invalid service API: '%s'." % api)
    api = urllib.quote(api)
    curl = pycurl.Curl()
    data = [
        ('OMEGA_ENCODING', (curl.FORM_CONTENTS, 'json')),
        ('OMEGA_API_PARAMS', (curl.FORM_CONTENTS, self.encode(args)))
    ]
    if self._credentials:
        data.append(('OMEGA_CREDENTIALS',
                     (curl.FORM_CONTENTS, self.encode(self._credentials))))
    # include any extra post data
    if post:
        (name, value) = post.split('=', 1)
        data.append((name, (curl.FORM_CONTENTS, value)))
    if files:
        # add in our files to the data
        for name in files:
            data.append((name, (curl.FORM_FILE, files[name])))
    # figure out our URL and get args
    if self._use_https:
        url = ''.join(('https://', self._hostname))
    else:
        url = ''.join(('http://', self._hostname))
    url = '/'.join((':'.join((url, str(self._port))), self._folder))
    url = '/'.join((url, api))
    url = re.sub(r'/+', '/', util.pretty_path(url)).replace(':/', '://', 1)
    if get:
        url = '?'.join((url, get))
    # fire away
    curl.setopt(curl.URL, url)
    curl.setopt(curl.POST, 1)
    curl.setopt(curl.USERAGENT, self._useragent)
    curl.setopt(curl.COOKIEFILE, self._cookie_file)
    curl.setopt(curl.COOKIEJAR, self._cookie_file)
    if self._use_https:
        curl.setopt(curl.SSL_VERIFYPEER, 0)  # TODO: don't always assume
        curl.setopt(curl.SSL_VERIFYHOST, 0)  # TODO: don't always assume
    if data:
        curl.setopt(curl.HTTPPOST, data)
    else:
        curl.setopt(curl.POSTFIELDS, '&'.join(args))
    response = StringIO.StringIO()
    curl.setopt(curl.WRITEFUNCTION, response.write)
    curl.perform()
    response = response.getvalue()
    http_code = curl.getinfo(curl.HTTP_CODE)
    content_type = curl.getinfo(curl.CONTENT_TYPE) or ""
    if http_code < 200 or http_code >= 300:
        # see if we got json data back
        try:
            if content_type.startswith("application/json"):
                decoded = self.decode(response)
                if 'reason' in decoded:
                    error = decoded['reason']
                else:
                    error = response
            else:
                error = response
        except Exception:
            error = response
        raise Exception("Server returned HTTP code %s. Response:\n%s"
                        % (str(http_code), str(error)))
    curl.close()
    if raw_response:
        return response
    # decode the response and check whether or not it was successful
    # TODO: check response encoding in header
    try:
        if content_type.startswith("application/json"):
            response = self.decode(response)
        else:
            return response
    except Exception:
        raise Exception('Failed to decode API response.', response)
    # check to see if our API call was successful
    if 'result' in response and response['result'] == False:
        if 'reason' in response:
            if full_response:
                raise Exception('API "%s" failed.\n%s'
                                % (urllib.unquote(api), dbg.obj2str(response)))
            raise Exception(response['reason'])
        raise Exception('API "%s" failed, but did not provide an explanation. '
                        'Response: %s' % (api, response))
    if full_response:
        return response
    if 'data' in response:
        return response['data']
    return None
def request(self, method, api, params=(), raw_response=False,
            full_response=False, get=None, headers={}, verbose=False,
            no_format=False):
    '''New REST-friendly API invoker'''
    # check and prep the data
    if method is None or method == '':
        method = 'GET'
    method = method.upper()
    if api == '' or api is None:
        raise Exception("Invalid service API: '%s'." % api)
    api = urllib.quote(api)
    if self._credentials:
        creds = self._credentials
        md5 = hashlib.md5()
        md5.update(':'.join([creds['username'], creds['password']]))
        headers['Authentication'] = 'Basic ' + base64.b64encode(md5.hexdigest())
    http = self._http
    # figure out our URL and get args
    headers['Content-type'] = 'application/json'
    headers['Accept'] = 'application/json'
    url = util.pretty_path('/'.join(('', self._folder, api)), True)
    if get:
        url = '?'.join((url, get))
    if method == 'GET':
        url = '?'.join((url, '&'.join([
            '='.join((urllib.quote(name), urllib.quote(str(params[name]))))
            for name in params
        ])))
        data = None
    else:
        data = self.encode(params)
    # fire away
    if verbose:
        proto = 'https' if self._use_https else 'http'
        sys.stderr.write(
            '# Request: %s %s://%s:%d%s, params: "%s", headers: "%s", cookies: "%s"\n'
            % (method, proto, self._hostname, self._port, url, data,
               str(headers), str(http.cookies))
        )
    # http.request(method, url, data, headers)
    # be willing to try again if the socket got closed on us (e.g. timeout)
    tries = 0
    max_tries = 3
    response = None
    while tries < max_tries and response is None:
        tries += 1
        try:
            # start the request
            http.putrequest(method, url)
            # send our headers
            for hdr, value in headers.iteritems():
                http.putheader(hdr, value)
            # and our cookies too!
            if http.cookies:
                for value in http.cookies:
                    http.putheader('Cookie', value)
            # write the body
            if data:
                body_len = len(data)
                if body_len:
                    http.putheader('Content-Length', str(body_len))
            http.endheaders()
            if data:
                http.send(data)
            # get our response back from the server and parse
            response = http.getresponse()
        except socket.error, v:
            http.connect()
        except:
            raise
def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""

    start_ts = datetime.now().timestamp()

    if resume:
        print('{green}[▶] [{}] Resuming archive downloading from {}...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            len(links),
            **ANSI,
        ))

    check_links_structure(links)

    # prefetch the first link off the generator so that if we pause or fail
    # immediately we can show that we paused on the first link and not just None
    to_archive = Peekable(links_after_timestamp(links, resume))
    idx, link = 0, to_archive.peek(0)

    # loop over links and archive them
    try:
        check_dependencies()
        for idx, link in enumerate(to_archive):
            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
            archive_link(link_dir, link)
    except (KeyboardInterrupt, SystemExit, Exception) as e:
        print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
            **ANSI,
            now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            idx=idx + 1,
            timestamp=link['timestamp'],
            total=len(links),
        ))
        print('    To view your archive, open: {}/index.html'.format(
            OUTPUT_DIR.replace(REPO_DIR + '/', '')))
        print('    Continue where you left off by running:')
        print('        {} {}'.format(
            pretty_path(sys.argv[0]),
            link['timestamp'],
        ))
        if not isinstance(e, KeyboardInterrupt):
            print()
            raise e
        raise SystemExit(1)

    # print timing information & summary
    end_ts = datetime.now().timestamp()
    seconds = end_ts - start_ts
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)
    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
        duration,
        ANSI['reset'],
    ))
    print('    - {} entries skipped'.format(_RESULTS_TOTALS['skipped']))
    print('    - {} entries updated'.format(_RESULTS_TOTALS['succeded']))
    print('    - {} errors'.format(_RESULTS_TOTALS['failed']))
    print('    To view your archive, open: {}/index.html'.format(
        OUTPUT_DIR.replace(REPO_DIR + '/', '')))
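# archive_links() and update_archive() wrap their link generator in a
# Peekable so the first element can be shown in error messages without
# being consumed. The class is not defined in this section; this is a
# minimal sketch of just the behavior the two callers rely on
# (peek(0) plus ordinary iteration), not the real implementation:
import itertools

class Peekable(object):
    """iterator wrapper that allows looking ahead without consuming items"""

    def __init__(self, iterable):
        self._it = iter(iterable)
        self._cache = []

    def peek(self, n):
        # buffer items up through index n, then return the item at that index
        while len(self._cache) <= n:
            self._cache.append(next(self._it))
        return self._cache[n]

    def __iter__(self):
        # yield any buffered items first, then the rest of the iterator
        return itertools.chain(self._cache, self._it)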
def parse_args(self, expr, arg_slice=None):
    args = {
        'path': None,
        'verb': None,
        'api_args': {},
        'basic_auth': None,
        'cmd_args': [],
        'headers': {},
        'data': [],
        'extract': [],
        'exclude': [],
        'invert_color': False,
        'color': self.main_args['color'],
        'formatted': self.main_args['formatted'],
        'url': self.main_args['url'],
        'verbose': False,
        'stdout_redir': None,
        'redir_type': None,
        'shell': False,
        'query': [],
        'help': False,  # user just wanted some help
        'FILES': {},  # indexed by name below, so a dict rather than a list
        'oauth': {
            'consumer_key': None,
            'consumer_secret': None,
            'token': None,
            'token_secret': None
        }
    }
    if isinstance(expr, basestring):
        parts = shlex.split(expr)
    else:
        parts = expr  # already a list
    # check for any condensed parameters (e.g. -fr = -f, -r)
    for i in range(0, len(parts)):
        part = parts[i]
        if len(part) > 2 and part[0] == '-' and part[1] not in ['-', '+', '=']:
            # expand the parameters out
            parts = parts[:i] + \
                [''.join(['-', param]) for param in parts[i][1:]] + \
                parts[i + 1:]
    i = 0
    # iterate through each parameter and handle it
    while i < len(parts):
        part = parts[i]
        if len(part) == 0:
            pass
        elif part == '>' or part[0] == '>' or part == '>>':
            # output redirection! woot
            if part == '>' or part == '>>':
                i += 1
                if part == '>':
                    args['redir_type'] = 'w'
                else:
                    args['redir_type'] = 'a'
                if i == len(parts):
                    raise Exception("Missing file path to output result to.")
                args['stdout_redir'] = parts[i]
            else:
                if len(part) > 1 and part[0:2] == '>>':
                    args['stdout_redir'] = part[2:]
                    args['redir_type'] = 'a'
                else:
                    args['stdout_redir'] = part[1:]
                    args['redir_type'] = 'w'
        elif part == '-B' or part == '--basic':
            i += 1
            if i == len(parts):
                raise Exception("Missing HTTP basic auth user/pass parameter.")
            if ':' not in parts[i]:
                raise Exception("Expected HTTP basic auth in format 'user:pass'.")
            args['basic_auth'] = parts[i]
        elif part == '-F' or part == '--file':
            i += 1
            if i == len(parts):
                raise Exception("Missing value for file to upload.")
            # collect up the name
            if parts[i].find('=') == -1 or parts[i].find('&') != -1:
                raise Exception("Invalid file name=file_path pair.")
            (name, path) = parts[i].split('=', 1)
            # make sure the file exists
            if not os.path.isfile(path):
                raise Exception("Unable to either read or locate file '%s'." % path)
            args['FILES'][name] = path
            raise Exception("Not supported at the moment")
        elif part == '-Q' or part == '--query':
            i += 1
            if i == len(parts):
                raise Exception("Missing query name=value pair.")
            # make sure we have a valid pair
            if parts[i].find('=') == -1 or parts[i].find('&') != -1:
                raise Exception("Invalid query name=value pair.")
            args['query'].append(parts[i])
        elif part == '-i' or part == '--invert':
            args['invert_color'] = True
        elif part == '-c' or part == '--color':
            args['color'] = True
        elif part == '-C' or part == '--no-color':
            args['color'] = False
        elif part == '-v' or part == '--verbose':
            args['verbose'] = True
        elif part == '-f' or part == '--form':
            args['headers']['content-type'] = 'application/x-www-form-urlencoded'
        elif part == '-O' or part == '--oauth':
            # the next 4 parameters are for oauth
            if i + 4 >= len(parts):
                raise Exception("Missing one of the following values for --oauth: consumer key, consumer secret, token, token secret.")
            next_params = ['consumer_key', 'consumer_secret', 'token', 'token_secret']
            for ctr in range(0, 4):
                args['oauth'][next_params[ctr]] = parts[i + ctr + 1]
            i += 4
        elif part == '-h' or part == '--help':
            self.print_help()
            args['help'] = True
        elif part == '-H' or part == '--header':
            i += 1
            if i == len(parts):
                raise Exception("Missing value for HTTP header.")
            h_parts = parts[i].split(': ', 1)
            if len(h_parts) != 2:
                raise Exception("Invalid HTTP header.")
            args['headers'][h_parts[0].lower()] = h_parts[1]
        elif part == '-s' or part == '--shell':
            args['shell'] = True
        elif part == '-j' or part == '--json':
            i += 1
            if i == len(parts):
                raise Exception("Missing value for JSON API params.")
            try:
                api_args = self.decode(parts[i])
                if isinstance(api_args, dict):
                    args['api_args'].update(api_args)
                else:
                    raise JSONException("JSON values must be a dictionary of arguments.")
            except JSONException as e:
                sys.stderr.write('Invalid JSON: ' + e.message)
                raise e
            except Exception as e:
                sys.stderr.write('Invalid JSON: ' + e.message)
                raise JSONException(e.message)
        elif part == '-r' or part == '--raw':
            args['formatted'] = False
        elif part == '--url' or part == '-u':
            i += 1
            if i == len(parts):
                raise Exception("Missing value for URL.")
            args['url'] = parts[i]
        elif part == '-d' or part == '--data':
            i += 1
            if i == len(parts):
                raise Exception("Missing value for --data.")
            part = parts[i]
            if part.find('=') == -1:
                raise Exception("Invalid parameter for --data: expected format NAME[+]=PATH")
            args['data'].append(DataMap(*part.split('=', 1)))
        elif part == '-x' or part == '--extract':
            i += 1
            if i == len(parts):
                raise Exception("Missing value for --extract.")
            args['extract'].append(parts[i])
        elif part == '-X' or part == '--exclude':
            i += 1
            if i == len(parts):
                raise Exception("Missing value for --exclude.")
            args['exclude'].append(parts[i])
        else:
            # we always pick up the command/method first
            if args['verb'] is None:
                args['verb'] = part.lower()
                # process any aliases
                if args['verb'] in self.method_aliases:
                    args['verb'] = self.method_aliases[args['verb']]
            elif args['verb'] in self.http_methods and args['path'] is None:
                # collect the API -- unless this is an internal command
                args['path'] = util.pretty_path(self.parse_path(part), False, False)
            else:
                # anything else is a parameter
                if args['verb'] in self.http_methods:
                    # get the name/value
                    args['api_args'] = self.parse_param(part, args['api_args'])
                else:
                    args['cmd_args'].append(part)
        i += 1
    if arg_slice is not None:
        args = util.get_args(arg_slice, args)
    return args
                sys.stderr.write('Invalid JSON: ' + e.message)
                return
        elif part == '-f' or part == '--full':
            args['full_response'] = True
        elif part == '-r' or part == '--raw' or part == '-rr':
            # -rr = -r -r, twice means no formatting
            if part == '-rr' or args['raw_response']:
                args['raw_noformat'] = True
            args['raw_response'] = True
        else:
            # we always pick up the command first
            if cmd is None:
                cmd = part.lower()
            elif cmd in self._api_cmds and not api:
                # collect the API -- unless this is an internal command
                api = util.pretty_path(self.parse_path(part))
            else:
                # anything else is a parameter
                if cmd in self._api_cmds:
                    # get the name/value
                    api_params = self.parse_param(part, api_params)
                else:
                    cmd_args.append(part)
        i += 1
    # get any redirection ready, if we can
    # FIXME: if the API fails the file shouldn't be written to
    if stdout_redir is not None:
        try:
            file = open(stdout_redir, redir_type)
        except IOError, e:
            sys.stderr.write('! ' + str(e) + '\n')