def run(self):
    total = 0
    totals = {}
    bad_results = 0
    with open(self.hjson_path) as f:
        sites = json.load(f)
    rs = (grequests.head(s.get('url'),
                         hooks={'response': [self.hook_factory(s)]})
          for s in sites.get('base_urls'))
    for r in grequests.imap(rs, size=20):
        total += 1
        if totals.get(r.status_code):
            totals[r.status_code] += 1
        else:
            totals[r.status_code] = 1
        if r.status_code >= 400:
            bad_results += 1
    print('========================================================')
    print('Summary')
    print('========================================================')
    print('Total requests: %d' % total)
    print('Bad responses: %d' % bad_results)
    for sc in totals:
        print('Status Code %d: %d' % (sc, totals[sc]))
    self.dispatcher.command_complete.emit(0)
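# `hook_factory` is referenced above but not shown in this snippet. A minimal
# sketch of what such a per-site response hook could look like -- the body is
# an assumption for illustration, not the original implementation:
def hook_factory(self, site):
    def on_response(response, **kwargs):
        # requests passes the finished response through each hook; tie it back
        # to the site entry that produced it.
        print('%s -> %d' % (site.get('url'), response.status_code))
        return response
    return on_response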
def make_grequest(urls, content=False, size=5):
    """
    Return a dict mapping each url in urls to {'status_code': ...},
    including the response body under 'content' when content=True.
    """
    try:
        reqs = set()
        ret = dict()
        if content:
            reqs = (grequests.get(url) for url in urls)
        else:
            reqs = (grequests.head(url) for url in urls)
        res = grequests.map(reqs, stream=False, size=size)
        for url, r in zip(urls, res):
            log.info('Made Request %s :: %d ' % (url, r.status_code))
            if content:
                ret[url] = {'status_code': r.status_code, 'content': r.text}
            else:
                ret[url] = {'status_code': r.status_code}
        if ret:
            return ret
        raise Exception
    except Exception as e:
        log.exception('Error in make_grequest')
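# A minimal usage sketch for make_grequest, assuming `log` above is a
# configured logging.Logger; the URLs are placeholders:
statuses = make_grequest(['https://example.com', 'https://example.org'])
# e.g. {'https://example.com': {'status_code': 200}, ...}
pages = make_grequest(['https://example.com'], content=True, size=2)
# e.g. {'https://example.com': {'status_code': 200, 'content': '<!doctype html>...'}}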
def cdn_images(urls):
    rs = []
    for (k, v) in urls.items():
        hostname = urlparse(k).hostname
        path = urlparse(k).path
        query = urlparse(k).query
        key = ''.join([
            str(hashlib.md5(hostname).hexdigest())[:8],
            '/',
            hashlib.md5(k).hexdigest()
        ])
        urls[k] = ''.join(['http://p.cdn.sohu.com/', key])
        url = ''.join(['http://bjcnc.scs-in.sohucs.com/storage', path])
        if query:
            url = ''.join([url, '?', query])
        rs.append(
            grequests.head(url, headers={
                'x-scs-meta-mirror-host': hostname,
                'x-scs-meta-upload-key': key
            }))
    grequests.map(rs)
    return urls
def scan(self, session):
    time_started = time.strftime('%Y-%m-%d %H:%M:%S')
    author = getpass.getuser()
    http, https = self.supported.keys()[0], self.supported.keys()[1]
    url_factory = self.url_factory()
    urls = url_factory(http, session) + url_factory(https, session)
    self.session = session
    async_requests = [
        grequests.head(
            url=url,
            allow_redirects=False,
            headers={'User-Agent': settings.USER_AGENT, 'Host': host},
            hooks=dict(response=self.success_hook),
            timeout=settings.TIMEOUT)
        for host, url in urls]
    grequests.map(
        requests=async_requests,
        size=settings.CONCURRENT_REQUESTS,
        exception_handler=self.failure_hook)
    time_ended = time.strftime('%Y-%m-%d %H:%M:%S')
    scan_instance = ScanInstance(
        start_time=time_started,
        end_time=time_ended,
        author=author)
    self.session.add(scan_instance)
    export_xlsx(session)
def wake_sites():
    rs = []
    for user in User.query.filter(User.roles.any(Role.name == 'user'),
                                  User.roles.any(Role.name == 'decal')):
        if user.website:
            rs.append(grequests.head(user.website))
            print colored('Waking %s' % user.website, 'yellow')
    grequests.map(rs)
    print colored("%d sites awoken!" % len(rs), 'green')
def check_transparency_portal_existance(dataset, portal_urls):
    dataset['transparency_portal_url'] = 'None'
    dataset['status_code'] = 0
    for url in portal_urls:
        dataset['transparency_portal_url'] = dataset.apply(
            format_url, axis=1, args=(url,))
        rs = (grequests.head(u)
              for u in list(dataset.loc[dataset['status_code'] == 0,
                                        'transparency_portal_url']))
        responses = grequests.map(rs, exception_handler=exception_handler)
        responses = [get_status_code(r) for r in responses]
        dataset.loc[dataset['status_code'] == 0, 'status_code'] = responses
    dataset.loc[dataset['status_code'] == 0, 'transparency_portal_url'] = 'None'
def test_fetch_cards_sending_requests_by_batches_not_blocking_for_responses():
    """
    Send requests but don't block for the response.
    Use a request pool to keep a threshold of maximum number of requests.
    Use a callback to get notified of the response.
    """
    urls = [mtgurl.make_vanilla_url(cardname) for cardname in CARDS]
    reqs = (grequests.head(url, allow_redirects=True, callback=_on_response)
            for url in urls)
    pool = grequests.Pool(30)
    for req in reqs:
        grequests.send(req, pool)
    # Don't exit until we received the responses, otherwise we may lose some of them
    import time
    time.sleep(20)
def _check_urls(self):
    """
    Concurrently check batches of URLs, limited by the SITEMAP_CHECK_LIMIT
    setting, displaying progress in the console.
    """
    total = len(self.urls)
    for i in range(0, len(self.urls), settings.SITEMAP_CHECK_LIMIT):
        r = (grequests.head(u)
             for u in self.urls[i:i + settings.SITEMAP_CHECK_LIMIT])
        rs = grequests.map(r)
        self.error_urls += [x for x in rs
                            if x.status_code not in [200, 301, 302]]
        self.checked_urls += len(self.urls[i:i + settings.SITEMAP_CHECK_LIMIT])
        remained = total - self.checked_urls
        progress = round(100.0 * self.checked_urls / total, 2)
        sys.stdout.write(
            '\rChecked %.2f%% (%s URLs, %d errors, %d remained)'
            % (progress, self.checked_urls, len(self.error_urls), remained))
        sys.stdout.flush()
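# grequests.map() puts None in its result list for requests that raise
# (connection errors, timeouts), and the status-code filter above would then
# fail on it. A minimal sketch of an exception handler that at least logs the
# failure; it could be passed as grequests.map(r, exception_handler=_log_failed_url).
# The helper name is an assumption, not part of the original class:
def _log_failed_url(request, exception):
    sys.stdout.write('\nRequest to %s failed: %s\n' % (request.url, exception))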
def fetch_by_url_async(urls):
    reqs = (grequests.head(url, allow_redirects=True) for url in urls)
    responses = grequests.map(reqs, size=10,
                              exception_handler=_exception_handler)
    cardset_found = {}
    for response in responses:
        card_found = True
        try:
            _check_found(response)
        except NotFound:
            card_found = False
        finally:
            cardset_found[response.url] = card_found
    return cardset_found
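# _exception_handler and _check_found are used above but not shown in this
# snippet. A minimal sketch of what they could look like -- the bodies are
# assumptions for illustration:
def _exception_handler(request, exception):
    # Called by grequests for requests that raise; the failed slot in the
    # map() result stays None.
    print('Request to %s failed: %s' % (request.url, exception))

def _check_found(response):
    # Raise NotFound for anything that does not look like a successful hit.
    if response is None or response.status_code != 200:
        raise NotFound()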
def run(self, fn_on_response, *args, **kwargs):
    logger.info('executor start')
    start_time = time.time()
    no_more_task, urls = self.get_next_task(self.max_workers)
    while True:
        # Keep pulling batches of URLs until the task source runs dry.
        if not len(urls):
            break
        reqs = (grequests.head(u, timeout=self.timeout) for u in urls)
        for r in grequests.imap(reqs):
            fn_on_response(r)
        no_more_task, urls = self.get_next_task(self.max_workers)
        if not len(urls):
            break
    end_time = time.time()
    logger.info('executor done, %.3fs' % (end_time - start_time))
def main():
    today = datetime.date.today()
    with open('top-1m_{0}.csv'.format(today), 'r') as csvfile:
        raw = csv.reader(csvfile)
        alexa = [(rank, 'http://' + host) for rank, host in raw]
    alexa_chunks = []
    for x in range(0, len(alexa), 10):
        req = [grequests.head(u[1], timeout=1, allow_redirects=False)
               for u in alexa[x:x + 10]]
        rsp = grequests.map(req)
        durations = []
        for a in rsp:
            try:
                durations.append((a.url, a.elapsed))
            except:
                pass
        print durations
def check_urls(host, port, is_https=False, url_list=CGI_FILES):
    """
    Checks which URLs exist
    :return: Sequence of URLs to try and attack
    """
    import grequests
    attack_path = 'http://'
    if is_https:
        attack_path = 'https://'
    attack_path = attack_path + str(host) + ":" + str(port)
    attack_urls = [attack_path + url for url in url_list]
    reqs = (grequests.head(u, verify=False, timeout=TIMEOUT)
            for u in attack_urls)
    resps = grequests.map(reqs, size=15)
    valid_resps = [
        resp for resp in resps
        if resp and resp.status_code == requests.codes.ok
    ]
    urls = [resp.url for resp in valid_resps]
    return urls
def convert_url(filtered):
    """
    Next step is getting a job listing's URL; we do this by checking the
    Location header of the 'careerjet.co.th' link
    """
    job = (grequests.head(u, allow_redirects=False, verify=False, timeout=5)
           for x, u in filtered)
    try:
        mp = grequests.map(job)
    except requests.ConnectionError as e:
        print e
    except BaseException as e:
        print e
    converted_list = []
    db = conn()
    cursor = db.cursor()
    for url in mp:
        converted_list.append([url.headers['location']])
        cursor.execute(
            "UPDATE job_url SET converted_url='{url}' WHERE url='{original_url}'".format(
                url=url.headers['location'], original_url=url.url))
        db.commit()
        print 'data was updated :', url.headers['location']
    db.close()
def check_urls_in_rfm_resolve_correctly(records):
    """
    Do HEAD requests on all files in remote file manifests to test that the
    links we build point to real files.
    :return:
    """
    rfms = [r[1] for r in records]
    urls, file_lengths = [], []
    for rfm in rfms:
        for record in rfm:
            urls.append(record['url'])
            file_lengths.append(record['length'])
    rs = (grequests.head(u) for u in urls)
    map = grequests.map(rs)
    failures = [
        url for request, url in zip(map, urls)
        if request.status_code != 200
    ]
    if not failures:
        print('SUCCESS: All "URL"s in the remote file manifests '
              'resolved to files on s3! ({} files checked)'.format(len(urls)))
    else:
        print('FAIL: The following URLs did not resolve: \n{}'.format(
            '\n'.join(failures)))
    responses = [(url, length, request.headers['Content-Length'])
                 for request, url, length in zip(map, urls, file_lengths)
                 if int(request.headers['Content-Length']) != int(length)]
    infos = [('RFM Length: {}, Expected: {}, URL: {}'
              ''.format(length, expected, url))
             for url, length, expected in responses]
    if not responses:
        print('SUCCESS: All sizes match the rfms!')
    else:
        print('FAIL: The following URLs have mismatching size: \n{}'
              ''.format('\n'.join(infos)))
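# A minimal usage sketch, assuming each entry in `records` pairs an identifier
# with a remote file manifest (a list of {'url': ..., 'length': ...} dicts);
# the values below are placeholders, not real data:
sample_records = [
    ('dataset-1', [
        {'url': 'https://example-bucket.s3.amazonaws.com/file.bin',
         'length': 1024},
    ]),
]
check_urls_in_rfm_resolve_correctly(sample_records)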
def startTaskParallal(dataLIst):
    concurrent_limit = 100
    hdrs = {'connection': 'keep-alive'}
    urls = []
    tmp_data = {}
    for row in dataLIst:
        id, d, firstLabel, secondLabel, title, link, isExternal = row
        isExternal = int(isExternal)
        url = link
        if not isExternal:
            url = d + link
        urls.append([id, url, isExternal])
        tmp_data[id] = row
    rs = (grequests.head(u[1], allow_redirects=False,
                         params={'uniqueid': u[0]})
          if u[2]
          else grequests.get(u[1], allow_redirects=False,
                             params={'uniqueid': u[0]})
          for u in urls)
    # res = grequests.map(rs)
    for res in grequests.imap(rs, size=concurrent_limit):
        if res is not None:
            url = res.url
            # print(url)
            params = parse_qs(urlparse(url).query)
            if 'uniqueid' in params:
                key = int(params['uniqueid'][0])
                id, d, firstLabel, secondLabel, title, link, isExternal = tmp_data[key]
                data.append(
                    (d, firstLabel, secondLabel, title, link,
                     res.status_code, len(res.content), isExternal))
                ids.append(id)
    print(len(ids))
def test_fetch_cards_sending_requests_by_batches_blocking_for_responses():
    """
    Send requests blocking for the response.
    Use a request pool to keep a threshold of maximum number of requests.
    Block until all responses are received.
    """
    import time
    start = time.time()
    urls = [mtgurl.make_vanilla_url(cardname) for cardname in CARDS]
    reqs = (grequests.head(url, allow_redirects=True) for url in urls)
    responses = grequests.imap(reqs, size=30)
    delay = 1
    time.sleep(delay)
    index = 0
    for response in responses:
        print(response.url)
        assert response
        assert response.status_code in [200]
        index += 1
    stop = time.time()
    print(stop - start - delay)
def validate_images(results, image_urls):
    """
    Make sure images exist before we display them. Treat redirects as broken
    links since 99% of the time the redirect leads to a generic "not found"
    placeholder.

    Results are cached in redis and shared amongst all API servers in the
    cluster.
    """
    if not image_urls:
        return
    start_time = time.time()
    # Pull matching images from the cache.
    redis = get_redis_connection("default")
    cache_prefix = 'valid:'
    cached_statuses = redis.mget([cache_prefix + url for url in image_urls])
    cached_statuses = [
        int(b.decode('utf-8')) if b is not None else None
        for b in cached_statuses
    ]
    # Anything that isn't in the cache needs to be validated via HEAD request.
    to_verify = {}
    for idx, url in enumerate(image_urls):
        if cached_statuses[idx] is None:
            to_verify[url] = idx
    reqs = (
        grequests.head(u, allow_redirects=False, timeout=0.2, verify=False)
        for u in to_verify.keys()
    )
    verified = grequests.map(reqs, exception_handler=_validation_failure)
    # Cache newly verified image statuses.
    to_cache = {}
    for idx, url in enumerate(to_verify.keys()):
        cache_key = cache_prefix + url
        if verified[idx]:
            status = verified[idx].status_code
        # Response didn't arrive in time. Try again later.
        else:
            status = -1
        to_cache[cache_key] = status

    thirty_minutes = 60 * 30
    twenty_four_hours_seconds = 60 * 60 * 24
    pipe = redis.pipeline()
    if len(to_cache) > 0:
        pipe.mset(to_cache)
    for key, status in to_cache.items():
        # Cache successful links for a day, and broken links for 120 days.
        if status == 200:
            pipe.expire(key, twenty_four_hours_seconds)
        elif status == -1:
            # Content provider failed to respond; try again in a short interval
            pipe.expire(key, thirty_minutes)
        else:
            pipe.expire(key, twenty_four_hours_seconds * 120)
    pipe.execute()

    # Merge newly verified results with cached statuses
    for idx, url in enumerate(to_verify):
        cache_idx = to_verify[url]
        if verified[idx] is not None:
            cached_statuses[cache_idx] = verified[idx].status_code
        else:
            cached_statuses[cache_idx] = -1

    # Delete broken images from the search results response.
    for idx, _ in enumerate(cached_statuses):
        del_idx = len(cached_statuses) - idx - 1
        status = cached_statuses[del_idx]
        if status == 429 or status == 403:
            log.warning(
                'Image validation failed due to rate limiting or blocking. '
                'Affected URL: {}'.format(image_urls[idx]))
        elif status != 200:
            log.info('Deleting broken image with ID {} from results.'.format(
                results[del_idx]['identifier']))
            del results[del_idx]
    end_time = time.time()
    log.info('Validated images in {} '.format(end_time - start_time))
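# `_validation_failure` is referenced above but not shown in this snippet. A
# minimal sketch of a grequests exception handler compatible with that call --
# the body is an assumption; the real project may log differently:
def _validation_failure(request, exception):
    # grequests calls this for requests that raise; the slot in the map()
    # result stays None, which the caller records as status -1.
    log.warning('Image validation failed for {}: {}'.format(request.url, exception))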
def validate_images(query_hash, start_slice, results, image_urls):
    """
    Make sure images exist before we display them. Treat redirects as broken
    links since 99% of the time the redirect leads to a generic "not found"
    placeholder.

    Results are cached in redis and shared amongst all API servers in the
    cluster.
    """
    logger = parent_logger.getChild("validate_images")
    if not image_urls:
        logger.info("no image urls to validate")
        return
    logger.debug("starting validation")
    start_time = time.time()
    # Pull matching images from the cache.
    redis = django_redis.get_redis_connection("default")
    cache_prefix = "valid:"
    cached_statuses = redis.mget([cache_prefix + url for url in image_urls])
    cached_statuses = [
        int(b.decode("utf-8")) if b is not None else None
        for b in cached_statuses
    ]
    logger.debug(f"len(cached_statuses)={len(cached_statuses)}")
    # Anything that isn't in the cache needs to be validated via HEAD request.
    to_verify = {}
    for idx, url in enumerate(image_urls):
        if cached_statuses[idx] is None:
            to_verify[url] = idx
    logger.debug(f"len(to_verify)={len(to_verify)}")
    reqs = (
        grequests.head(u, allow_redirects=False, timeout=2, verify=False)
        for u in to_verify.keys()
    )
    verified = grequests.map(reqs, exception_handler=_validation_failure)
    # Cache newly verified image statuses.
    to_cache = {}
    for idx, url in enumerate(to_verify.keys()):
        cache_key = cache_prefix + url
        if verified[idx]:
            status = verified[idx].status_code
        # Response didn't arrive in time. Try again later.
        else:
            status = -1
        to_cache[cache_key] = status

    thirty_minutes = 60 * 30
    twenty_four_hours_seconds = 60 * 60 * 24
    pipe = redis.pipeline()
    if len(to_cache) > 0:
        pipe.mset(to_cache)
    for key, status in to_cache.items():
        # Cache successful links for a day, and broken links for 120 days.
        if status == 200:
            logger.debug("healthy link " f"key={key} ")
            pipe.expire(key, twenty_four_hours_seconds)
        elif status == -1:
            logger.debug("no response from provider " f"key={key}")
            # Content provider failed to respond; try again in a short interval
            pipe.expire(key, thirty_minutes)
        else:
            logger.debug("broken link " f"key={key} ")
            pipe.expire(key, twenty_four_hours_seconds * 120)
    pipe.execute()

    # Merge newly verified results with cached statuses
    for idx, url in enumerate(to_verify):
        cache_idx = to_verify[url]
        if verified[idx] is not None:
            cached_statuses[cache_idx] = verified[idx].status_code
        else:
            cached_statuses[cache_idx] = -1

    # Create a new dead link mask
    new_mask = [1] * len(results)
    # Delete broken images from the search results response.
    for idx, _ in enumerate(cached_statuses):
        del_idx = len(cached_statuses) - idx - 1
        status = cached_statuses[del_idx]
        if status == 429 or status == 403:
            logger.warning(
                "Image validation failed due to rate limiting or blocking. "
                f"url={image_urls[idx]} "
                f"status={status} "
            )
        elif status != 200:
            logger.info(
                "Deleting broken image from results "
                f"id={results[del_idx]['identifier']} "
                f"status={status} "
            )
            del results[del_idx]
            new_mask[del_idx] = 0

    # Merge and cache the new mask
    mask = get_query_mask(query_hash)
    if mask:
        new_mask = mask[:start_slice] + new_mask
    save_query_mask(query_hash, new_mask)

    end_time = time.time()
    logger.debug(
        "end validation "
        f"end_time={end_time} "
        f"start_time={start_time} "
        f"delta={end_time - start_time} "
    )
def main():
    parse_argument(sys.argv[1:])
    if LOCAL:
        all_links = get_local_license()
    else:
        all_links = get_global_license()
    GITHUB_BASE = (
        "https://raw.githubusercontent.com/creativecommons"
        "/creativecommons.org/master/docroot/legalcode/"
    )
    errors_total = 0
    for license in all_links:
        try:
            license_name = license.string
        except AttributeError:
            license_name = license
        caught_errors = 0
        page_url = "{}{}".format(GITHUB_BASE, license_name)
        print("\n")
        print("Checking:", license_name)
        # Refer to issue for more info on samplingplus_1.0.br.htm:
        # https://github.com/creativecommons/cc-link-checker/issues/9
        if license_name == "samplingplus_1.0.br.html":
            continue
        filename = license_name[:-len(".html")]
        base_url = create_base_link(filename)
        print("URL:", base_url)
        if LOCAL:
            source_html = request_local_text(license_name)
        else:
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_in_license = license_soup.find_all("a")
        verbose_print("Number of links found:", len(links_in_license))
        verbose_print("Errors and Warnings:")
        valid_anchors, valid_links = get_scrapable_links(
            base_url, links_in_license)
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links
                )
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/questions/21978115/using-grequests-to-make-several-thousand-get-requests-to-sourceforge-get-max-r/22839550#22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
                stored_links += check_links
            caught_errors = write_response(
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
            )
        if caught_errors:
            errors_total += caught_errors
            ERR_CODE = 1

    print("\nCompleted in: {}".format(time.time() - START_TIME))

    if OUTPUT_ERR:
        output_summary(all_links, errors_total)
        print("\nError file present at: ", OUTPUT.name)
        output_test_summary(errors_total)

    sys.exit(ERR_CODE)
def main():
    # get argument
    parser = OptionParser(
        usage='Usage: %prog [<CDN_URL>]'
        '\n\nArguments:'
        '\n CDN_URL Of the format "<scheme>://<fqdn>".'
        ' Trailing "/" not allowed.'
        '\n\nExamples:'
        '\n %prog https://tiles.cdn.mozilla.net'
    )
    parser.set_defaults(
        quiet=False,
        verbose=False,
    )
    parser.add_option(
        '-q', '--quiet',
        action='store_true',
        dest='quiet',
        help="Don't report NOTICE",
    )
    parser.add_option(
        '-v', '--verbose',
        action='store_true',
        dest='verbose',
        help='Report SUCCESS',
    )
    options, args = parser.parse_args()

    try:
        from splice.environment import Environment
        config = Environment.instance().config
        cdn = 'https://%s.s3.amazonaws.com' % config.S3['bucket']
        tile_index_key = config.S3['tile_index_key']
    except Exception:
        cdn = 'https://tiles.cdn.mozilla.net'
        tile_index_key = 'tile_index_v3.json'

    channels = ['desktop', 'android', 'desktop-prerelease', 'hello']

    if len(args) == 1:
        cdn = args.pop()
    elif len(args) > 1:
        parser.parse_args(['-h'])

    if not options.quiet:
        print(
            'NOTICE: crawling: %s/%s_%s' %
            (cdn, tuple(channels), tile_index_key)
        )
        print('NOTICE: calculating tiles urls')

    errors = []

    # extract tiles urls from tile index
    try:
        urls = [
            tiles_url
            for index in validate(
                grequests.imap(
                    (grequests.get('%s/%s_%s' % (cdn, channel, tile_index_key),
                                   allow_redirects=False)
                     for channel in channels),
                    size=10
                ),
                options.verbose,
                errors,
            )
            for key, value in index.json().iteritems()
            if '/' in key
            for tiles_url in value.values()
        ]
        tiles_urls = set()
        for url in urls:
            if type(url) is list:
                tiles_urls.update(url)
            else:
                tiles_urls.add(url)
        if not options.quiet:
            print('NOTICE: tiles urls extracted: %s' % len(tiles_urls))
            print('NOTICE: calculating image urls')

        # extract image urls from tiles
        image_urls = set([
            image_url
            for tiles in validate(
                grequests.imap(
                    (grequests.get(tiles_url, allow_redirects=False)
                     for tiles_url in tiles_urls),
                    size=10
                ),
                options.verbose,
                errors,
            )
            for value_x in tiles.json().values()
            for value_y in value_x
            for key, image_url in value_y.iteritems()
            if key in ['imageURI', 'enhancedImageURI']
        ])
        if not options.quiet:
            print('NOTICE: image urls extracted: %s' % len(image_urls))
            print('NOTICE: validating image urls')

        # Two things to notice here:
        # 1. expanding the list comprehension is necessary to get the 'validate'
        #    step above to actually evaluate (it's lazy.)
        # 2. the actual value of the list comprehension is dropped, not returned.
        [
            valid.url
            for valid in validate(
                grequests.imap(
                    (grequests.head(image_url, allow_redirects=False)
                     for image_url in image_urls),
                    size=10
                ),
                options.verbose,
                errors,
            )
        ]
    except Exception as e:
        msg = 'ERROR: %s' % e
        print(msg)
        print(traceback.format_exc())
        errors.append(msg)

    if errors:
        exit(1)
def main():
    args = parse_argument(sys.argv[1:])

    if args.local:
        license_names = get_local_licenses()
    else:
        license_names = get_github_licenses()
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        base_url = create_base_link(args, filename)
        context = f"\n\nChecking: {license_name}\nURL: {base_url}"
        if args.local:
            source_html = request_local_text(license_name)
        else:
            page_url = "{}{}".format(GITHUB_BASE, license_name)
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_in_license = license_soup.find_all("a")
        link_count = len(links_in_license)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args, base_url, links_in_license, context, context_printed
        )
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links
                )
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
                stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
                context,
                context_printed,
            )
        if caught_errors:
            errors_total += caught_errors
            exit_status = 1

    print("\nCompleted in: {}".format(time.time() - START_TIME))

    if args.output_errors:
        output_summary(args, license_names, errors_total)
        print("\nError file present at: ", args.output_errors.name)
        output_test_summary(errors_total)

    sys.exit(exit_status)
def check_legalcode(args):
    print("\n\nChecking LegalCode License...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        base_url = create_base_link(args, filename)
        context = f"\n\nChecking: legalcode\nURL: {base_url}"
        if args.local:
            source_html = request_local_text(LICENSE_LOCAL_PATH, license_name)
        else:
            page_url = "{}{}".format(LICENSE_GITHUB_BASE, license_name)
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_found = license_soup.find_all("a")
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args, base_url, links_found, context, context_printed
        )
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links
                )
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
                stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
                context,
                context_printed,
            )
        if caught_errors:
            errors_total += caught_errors
            exit_status = 1
    return license_names, errors_total, exit_status
def check_rdfs(args, index=False):
    if index:
        print("\n\nChecking index.rdf...\n\n")
        rdf_obj_list = get_index_rdf(args)
    else:
        print("\n\nChecking RDFs...\n\n")
        rdf_obj_list = get_rdf(args)
    if args.log_level <= INFO:
        if not index:
            print("Number of RDF files to be checked:", len(rdf_obj_list))
        else:
            print(
                "Number of RDF objects/sections to be checked in index.rdf:",
                len(rdf_obj_list),
            )
    errors_total = 0
    exit_status = 0
    for rdf_obj in rdf_obj_list:
        caught_errors = 0
        context_printed = False
        rdf_url = (
            rdf_obj["rdf:about"] if index else f"{rdf_obj['rdf:about']}rdf"
        )
        links_found = get_links_from_rdf(rdf_obj)
        checking = "URL" if not index else "RDF_ABOUT"
        context = f"\n\nChecking: \n{checking}: {rdf_url}"
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        base_url = rdf_url
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args,
            base_url,
            links_found,
            context,
            context_printed,
            rdf=True,
        )
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links
                )
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
                stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                rdf_url,
                rdf_obj,
                stored_anchors,
                context,
                context_printed,
            )
        if caught_errors:
            errors_total += caught_errors
            exit_status = 1
    return rdf_obj_list, errors_total, exit_status
def check_deeds(args):
    print("\n\nChecking Deeds...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        deed_base_url = create_base_link(args, filename, for_deeds=True)
        # Deeds template:
        # https://github.com/creativecommons/cc.engine/blob/master/cc/engine/templates/licenses/standard_deed.html
        # Scraping the html found on the active site
        if deed_base_url:
            context = f"\n\nChecking: deed\nURL: {deed_base_url}"
            page_url = deed_base_url
            source_html = request_text(page_url)
            license_soup = BeautifulSoup(source_html, "lxml")
            links_found = license_soup.find_all("a")
            link_count = len(links_found)
            if args.log_level <= INFO:
                print(f"{context}\nNumber of links found: {link_count}")
                context_printed = True
            base_url = deed_base_url
            valid_anchors, valid_links, context_printed = get_scrapable_links(
                args, base_url, links_found, context, context_printed
            )
            if valid_links:
                memoized_results = get_memoized_result(
                    valid_links, valid_anchors
                )
                stored_links = memoized_results[0]
                stored_anchors = memoized_results[1]
                stored_result = memoized_results[2]
                check_links = memoized_results[3]
                check_anchors = memoized_results[4]
                if check_links:
                    rs = (
                        # Since we're only checking for validity, we can
                        # retrieve only the headers/metadata
                        grequests.head(link, timeout=REQUESTS_TIMEOUT)
                        for link in check_links
                    )
                    responses = list()
                    # Explicitly close connections to free up file handles and
                    # avoid Connection Errors per:
                    # https://stackoverflow.com/a/22839550
                    for response in grequests.map(
                            rs, exception_handler=exception_handler):
                        try:
                            responses.append(response.status_code)
                            response.close()
                        except AttributeError:
                            responses.append(response)
                    memoize_result(check_links, responses)
                    stored_anchors += check_anchors
                    stored_result += responses
                    stored_links += check_links
                caught_errors = write_response(
                    args,
                    stored_links,
                    stored_result,
                    base_url,
                    license_name,
                    stored_anchors,
                    context,
                    context_printed,
                )
            if caught_errors:
                errors_total += caught_errors
                exit_status = 1
    return license_names, errors_total, exit_status
def send_requests(phone: str, count: int): password = GenerateInfo().password() username = GenerateInfo().username() email = GenerateInfo().email() vodafone = (f"+{phone[:2]}(" + f"{phone[2:5]}) " + f"{phone[5:8]}-" + f"{phone[8:10]}-" + f"{phone[10:12]}") russian_name = GenerateInfo().russian_name() iteration = 0 while iteration < count: requests = [ grequests.head( "https://secure.online.ua/ajax/check_phone/", params={"reg_phone": "+" + phone}, headers=head, ), grequests.post( "https://www.ozon.ru/api/composer-api.bx/_action/fastEntry", json={ "phone": phone, "otpId": 0 }, headers=head, ), grequests.post( "http://www.vodafone.ua/shop/ru/vodafone_customer/register/sendSms/", data={ "is_ajax": "true", "phone_number": vodafone, }, headers=head, ), grequests.post( "https://uklon.com.ua/api/v1/account/code/send", headers=uklon1, json={"phone": phone}, ), grequests.post( "https://partner.uklon.com.ua/api/v1/registration/sendcode", headers=uklon2, json={"phone": phone}, ), grequests.post( "https://www.moyo.ua/identity/registration", data={ "firstname": russian_name, "phone": phone, "email": email, }, headers=head, ), grequests.post( "https://koronapay.com/transfers/online/api/users/otps", data={"phone": phone}, headers=head, ), grequests.post( "https://n13423.yclients.com/api/v1/book_code/312054", data=json.dumps({"phone": phone}), headers=frisor, ), grequests.post( "https://kasta.ua/api/v2/login/", data={"phone": phone}, headers=head, ), grequests.post( "https://izi.ua/api/auth/register", json={ "phone": "+" + phone, "name": russian_name, "is_terms_accepted": "true", }, headers=head, ), grequests.post( "https://junker.kiev.ua/postmaster.php", data={ "tel": phone[2:], "name": username, "action": "callme", }, headers=head, ), grequests.post( "https://allo.ua/ua/customer/account/createPostVue/?currentTheme=main¤tLocale=uk_UA", data={ "firstname": russian_name, "telephone": phone, "email": email, "password": password, "form_key": "Zqqj7CyjkKG2ImM8", }, headers=head, ), grequests.post( "https://stores-api.zakaz.ua/user/signup/", json={"phone": phone}, headers=zakaz, ), grequests.post( "https://youla.ru/web-api/auth/request_code", data={"phone": phone}, headers=head, ), grequests.post( "https://cloud.mail.ru/api/v2/notify/applink", json={ "phone": "+" + phone, "api": 2, "email": email, "x-email": "x-email", }, headers=head, ), grequests.post( "https://myapi.beltelecom.by/api/v1/auth/check-phone?lang=ru", data={"phone": phone}, headers=head, ), grequests.post( url= f"https://www.sportmaster.ua/?module=users&action=SendSMSReg&phone=+{phone}", headers=head, ), grequests.post( "https://crm.getmancar.com.ua/api/veryfyaccount", json={ "phone": "+" + phone, "grant_type": "password", "client_id": "gcarAppMob", "client_secret": "SomeRandomCharsAndNumbersMobile", }, headers=head, ), grequests.post( "https://www.icq.com/smsreg/requestPhoneValidation.php", data={ "msisdn": phone, "locale": "en", "countryCode": "ru", "version": "1", "k": "ic1rtwz1s1Hj1O0r", "r": "46763", }, headers=head, ), grequests.post( "https://api.pozichka.ua/v1/registration/send", json={"RegisterSendForm": { "phone": "+" + phone }}, headers=head, ), grequests.post( "https://register.sipnet.ru/cgi-bin/exchange.dll/RegisterHelper", params={ "oper": 9, "callmode": 1, "phone": "+" + phone }, headers=head, ), grequests.post( "https://city24.ua/personalaccount/account/registration", data={"PhoneNumber": phone}, headers=head, ), grequests.post( "https://helsi.me/api/healthy/accounts/login", json={ "phone": phone, "platform": "PISWeb" }, 
headers=head, ), grequests.post( "https://cloud.mail.ru/api/v2/notify/applink", json={ "phone": "+" + phone, "api": 2, "email": email }, headers=head, ), grequests.post( "https://auth.multiplex.ua/login", json={"login": phone}, headers=head, ), grequests.post( "https://account.my.games/signup_send_sms/", data={"phone": phone}, headers=head, ), grequests.post( "https://cabinet.planetakino.ua/service/sms", params={"phone": phone}, headers=head, ), grequests.post( "https://youla.ru/web-api/auth/request_code", data={"phone": phone}, headers=head, ), grequests.post( "https://rutube.ru/api/accounts/sendpass/phone", data={"phone": "+" + phone}, headers=head, ), grequests.post( "https://www.mvideo.ru/internal-rest-api/common/atg/rest/actors/VerificationActor/getCode", params={"pageName": "registerPrivateUserPhoneVerificatio"}, data={ "phone": phone, "recaptcha": "off", "g-recaptcha-response": "" }, headers=head, ), grequests.post( "https://passport.twitch.tv/register?trusted_request=true", json={ "birthday": { "day": 12, "month": 10, "year": 2000 }, "client_id": "kd1unb4b3q4t58fwlpcbzcbnm76a8fp", "include_verification_code": True, "password": password, "phone_number": phone, "username": username, }, headers=head, ), grequests.post( "https://lk.belkacar.ru/register", data={"phone": "+" + phone}, headers=head, ), grequests.post( "https://api.ivi.ru/mobileapi/user/register/phone/v6", data={"phone": phone}, headers=head, ), grequests.post( "https://lk.belkacar.ru/get-confirmation-code", data={"phone": "+" + phone}, headers=head, ), grequests.post( "https://secure.online.ua/ajax/check_phone/", params={"reg_phone": phone}, header=head, ), grequests.post( "https://api.delitime.ru/api/v2/signup", data={ "SignupForm[username]": phone, "SignupForm[device_type]": 3 }, headers=head, ), grequests.post( "https://apteka366.ru/login/register/sms/send", data={"phone": phone}, headers=head, ), grequests.head( "https://fundayshop.com/ru/ru/secured/myaccount/myclubcard/resultClubCard.jsp?type=sendConfirmCode&phoneNumber={}" .format("+" + phone), headers=head, ), grequests.post( "https://gorzdrav.org/login/register/sms/send", data={"phone": phone}, headers=head, ), grequests.post( "https://eda.yandex/api/v1/user/request_authentication_code", json={"phone_number": phone}, headers=head, ), grequests.post( "https://eda.yandex/api/v1/user/request_authentication_code", json={"phone_number": "+" + phone}, headers=head, ), grequests.post( "https://my.dianet.com.ua/send_sms/", data={"phone": phone}, headers=head, ), grequests.post( "https://shafa.ua/api/v3/graphiql", json={ "operationName": "RegistrationSendSms", "variables": { "phoneNumber": "+" + phone }, "query": "mutation RegistrationSendSms($phoneNumber: String!) 
{\n unauthorizedSendSms(phoneNumber: $phoneNumber) {\n isSuccess\n userToken\n errors {\n field\n messages {\n message\n code\n __typename\n }\n __typename\n }\n __typename\n }\n}\n", }, headers=head, ), grequests.post( "https://my.telegram.org/auth/send_password", data={"phone": "+" + phone}, headers=head, ), grequests.head( f"https://cabinet.planetakino.ua/service/sms?phone={phone}", headers=head, ), grequests.post( "https://api.boosty.to/oauth/phone/authorize", data={"client_id": "+" + phone}, headers=head, ), grequests.post( "https://md-fashion.com.ua/bpm/validate-contact", data={"phone": "+" + phone}, headers=head, ), ] grequests.map(requests, gtimeout=3) iteration += 1 if iteration >= 5 and count >= 10: sleep(randint(2, 4)) print( f"\033[1;{choice(['32m', '33m', '34m', '35m', '36m'])}{iteration}/{count} кругов" )
def thumb(request):
    # get the width, height and source
    width = int(request.GET.get('width', 0))
    height = int(request.GET.get('height', 0))
    src = request.GET.get('src')

    # only continue if they passed a source
    if src:
        # the parts of our url
        parts = src.split('/')

        # some logical size limits
        if width > settings.MAX_IMAGE_SIZE:
            width = settings.MAX_IMAGE_SIZE
        if height > settings.MAX_IMAGE_SIZE:
            height = settings.MAX_IMAGE_SIZE

        # get the stuff we need out of our parts
        bucket = parts[3]
        filename = parts[-1]
        ext = filename.split('.')[-1]

        # create a thumbname for the image
        thumb_name = '%s_thumber_%s_%s.%s' % (
            ''.join(filename.split('.')[0:-1]), width, height, ext)
        thumb_src = '%s/_thumber/%s' % ('/'.join(parts[0:-1]), thumb_name)
        # put the thumbs in a dir called _thumber
        thumb_path = '/%s/_thumber/%s' % ('/'.join(parts[4:-1]), thumb_name)

        response = None

        # check to see if our thumb exists with a head request
        def exception_handle(req, exception):
            global response
            response = None

        # response = urllib2.urlopen(HeadRequest(thumb_src))
        response = grequests.map([grequests.head(thumb_src)])
        # a failed request maps to None, so guard before reading the status
        if response[0] is None or response[0].status_code >= 400:
            response = None

        # PROCESS THE IMAGE HERE
        # handle the image being missing
        # this is where we actually do the image thumbing and upload it to amazon
        if not response:
            # setup a place to store the local file and download it
            local_thumb_path = 'tmp/' + filename

            def exception_handler_image(req, exception):
                logger.debug("Image does not exist")
                logger.exception(exception)
                raise Http404

            # image = urllib2.urlopen(src)
            image = grequests.map([grequests.get(src)])
            # except urllib2.HTTPError, e:

            # write out the file (requests exposes the body via .content,
            # not urllib2's .read())
            with open(local_thumb_path, 'wb') as f:
                f.write(image[0].content)

            # do the resizing, save our image
            image = Image.open(local_thumb_path)
            image = image.resize((width, height), Image.ANTIALIAS)
            image.save(local_thumb_path)

            # send it back to where it came from
            upload_image(local_thumb_path, thumb_path, bucket)

        return redirect(thumb_src)
def head(self, url, **kwargs):
    """HTTP HEAD Method."""
    kwargs['auth'] = self.auth
    req = grequests.head(url, **kwargs)
    return self._run(req)
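# A minimal usage sketch, assuming this method lives on a small client class
# that stores `self.auth` and runs the prepared request through grequests in
# `self._run`; the class name and constructor below are assumptions for
# illustration, not part of the original code:
client = ApiClient(auth=('user', 'secret'))
response = client.head('https://example.com/resource', timeout=5)
print(response.status_code)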
def query_get_task_with_details(bot_memo, present_skill, bot_nlp): if ((bot_memo == {} or bot_memo['index']) and present_skill == 'get_task'): #requests can be used for synchronous requests # r = requests.get("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801')) # body1 = r.json() #grequests is faster url1 = [ "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json" ] rs1 = (grequests.get(u, auth=('pritamsa', 'rupu@0801')) for u in url1) #both imap and map can be used #reque = grequests.imap(rs,size=1) reque1 = grequests.map(rs1, size=1) response_array1 = [] for response1 in reque1: print(response1) x1 = response1.json() response_array1.append(x1) body1 = response_array1[0] no_of_tasks = len(body1["d"]["results"]) if (body1["d"]["results"]): #task details instance_id = body1["d"]["results"][0]["InstanceID"] task_title = body1["d"]["results"][0]["TaskTitle"] scrapped_po_no = task_title.split("order ", 1)[1] body2, body3 = take_action_async(scrapped_po_no) #po_header detail created_by_user = body2["d"]["CreatedByUser"] SupplierName = body2["d"]["SupplierName"] PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"] DocumentCurrency = body2["d"]["DocumentCurrency"] PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"] final_reply_string = '' concat_string_for_multiple_lineitems = '' per_item_desc_dict = {} all_item_details = {} #po item detail no_of_line_items = len(body3["d"]["results"]) for i in range(no_of_line_items): Material = body3["d"]["results"][i]["Material_Text"] Plant = body3["d"]["results"][i]["Plant"] OrderQuantity = body3["d"]["results"][i]["OrderQuantity"] netPriceItem = body3["d"]["results"][i]["NetPriceAmount"] documentCurrency = body3["d"]["results"][i]["DocumentCurrency"] price_present_item_with_currency = netPriceItem + documentCurrency item_no = 'item : ' + str(i + 1) # print(item_no) #item_no = dict(item_no) per_item_desc_dict = { item_no: { 'Material': Material, 'Plant': Plant, 'OrderQuantity': OrderQuantity, 'netPriceItem': price_present_item_with_currency } } all_item_details.update(per_item_desc_dict) #use this when sending the item details as string all in one reply # concat_string_for_multiple_lineitems = concat_string_for_multiple_lineitems \ # + 'Material: ' + Material + '.\n' + 'plant: ' + Plant + '.\n' \ # + 'OrderQuantity: ' + OrderQuantity + '.\n' get_task_string = '' get_task_string_with_header_detail = '' get_task_string = task_title + '.' + '\n' get_task_string_with_header_detail = 'created by user: '******'.' + '\n' + 'SupplierName: ' + SupplierName \ + '.' + '\n' + 'PurchaseOrderNetAmount: ' + PurchaseOrderNetAmount + ' ' + DocumentCurrency + '.'+'\n' #final_reply_string = 'Now you have got, '+ str(no_of_tasks) + ' pending tasks to approve. ' + get_task_string + get_task_string_with_header_detail +'You have: ' + str(no_of_line_items) +' items.\n'+ concat_string_for_multiple_lineitems + " say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details." final_reply_string = 'Now you have got, ' + str( no_of_tasks ) + ' pending tasks to approve. 
' + get_task_string + get_task_string_with_header_detail + 'You have: ' + str( no_of_line_items ) + ' items.\n' + " say get item details to get all the item details in this purchase order. Or,say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details." return final_reply_string, 1, instance_id, created_by_user, SupplierName, ( PurchaseOrderNetAmount + ' ' + DocumentCurrency ), '', all_item_details, no_of_line_items, scrapped_po_no #return 1for memory index as no memo is present in the beggining else: final_reply_string = 'no more tasks to approve in your inbox.' return final_reply_string, 1, bot_memo, bot_memo, bot_memo, bot_memo, '', '', '', bot_memo elif ( (bot_memo['index']) and (present_skill == 'get_next_task' or present_skill == 'ignore_task')): #requests can be used for synchronous requests # r = requests.get("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801')) # body1 = r.json() #grequests is faster url1 = [ "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json" ] rs1 = (grequests.get(u, auth=('pritamsa', 'rupu@0801')) for u in url1) #both imap and map can be used #reque = grequests.imap(rs,size=1) reque1 = grequests.map(rs1, size=1) response_array1 = [] for response1 in reque1: print(response1) x1 = response1.json() response_array1.append(x1) body1 = response_array1[0] no_of_tasks = len(body1["d"]["results"]) if ((len(body1["d"]["results"]) == 1)): instance_id = body1["d"]["results"][0]["InstanceID"] task_title = body1["d"]["results"][0]["TaskTitle"] scrapped_po_no = task_title.split("order ", 1)[1] body2, body3 = take_action_async(scrapped_po_no) #po_header detail created_by_user = body2["d"]["CreatedByUser"] SupplierName = body2["d"]["SupplierName"] PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"] DocumentCurrency = body2["d"]["DocumentCurrency"] PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"] final_reply_string = '' concat_string_for_multiple_lineitems = '' per_item_desc_dict = {} all_item_details = {} #po item detail no_of_line_items = len(body3["d"]["results"]) for i in range(no_of_line_items): Material = body3["d"]["results"][i]["Material_Text"] Plant = body3["d"]["results"][i]["Plant"] OrderQuantity = body3["d"]["results"][i]["OrderQuantity"] netPriceItem = body3["d"]["results"][i]["NetPriceAmount"] documentCurrency = body3["d"]["results"][i]["DocumentCurrency"] price_present_item_with_currency = netPriceItem + documentCurrency item_no = 'item : ' + str(i + 1) # print(item_no) #item_no = dict(item_no) per_item_desc_dict = { item_no: { 'Material': Material, 'Plant': Plant, 'OrderQuantity': OrderQuantity, 'netPriceItem': price_present_item_with_currency } } all_item_details.update(per_item_desc_dict) #use this when sending the item details as string all in one reply # concat_string_for_multiple_lineitems = concat_string_for_multiple_lineitems \ # + 'Material: ' + Material + '.\n' + 'plant: ' + Plant + '.\n' \ # + 'OrderQuantity: ' + OrderQuantity + '.\n' get_task_string = '' get_task_string_with_header_detail = '' get_task_string = task_title + '.' + '\n' get_task_string_with_header_detail = 'created by user: '******'.' + '\n' + 'SupplierName: ' + SupplierName \ + '.' 
+ '\n' + 'PurchaseOrderNetAmount: ' + PurchaseOrderNetAmount + ' ' + DocumentCurrency + '.'+'\n' # final_reply_string = 'Now you have got, '+ str(no_of_tasks) + ' pending tasks to approve. ' + get_task_string + get_task_string_with_header_detail +'You have: ' + str(no_of_line_items) +' items.\n'+ concat_string_for_multiple_lineitems + " say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details." final_reply_string = 'Now you have got, ' + str( no_of_tasks ) + ' pending tasks to approve. ' + get_task_string + get_task_string_with_header_detail + 'You have: ' + str( no_of_line_items ) + ' items.\n' + " say get item details to get all the item details in this purchase order. Or,say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details." return final_reply_string, 1, instance_id, created_by_user, SupplierName, ( PurchaseOrderNetAmount + ' ' + DocumentCurrency ), '', all_item_details, no_of_line_items, scrapped_po_no #return 1for memory index as no memo is present in the beggining elif ((len(body1["d"]["results"]) > 1) and bot_memo['index'] < len(body1["d"]["results"])): #task details instance_id = body1["d"]["results"][ bot_memo['index']]["InstanceID"] task_title = task_title = body1["d"]["results"][ bot_memo['index']]["TaskTitle"] #print(task_title) scrapped_po_no = task_title.split("order ", 1)[1] #print(scrapped_po_no) body2, body3 = take_action_async(scrapped_po_no) #po_header detail created_by_user = body2["d"]["CreatedByUser"] SupplierName = body2["d"]["SupplierName"] PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"] DocumentCurrency = body2["d"]["DocumentCurrency"] PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"] final_reply_string = '' concat_string_for_multiple_lineitems = '' per_item_desc_dict = {} all_item_details = {} #po item detail #only show one or two tasks no_of_line_items = len(body3["d"]["results"]) for i in range(no_of_line_items): Material = body3["d"]["results"][i]["Material_Text"] Plant = body3["d"]["results"][i]["Plant"] OrderQuantity = body3["d"]["results"][i]["OrderQuantity"] netPriceItem = body3["d"]["results"][i]["NetPriceAmount"] documentCurrency = body3["d"]["results"][i]["DocumentCurrency"] price_present_item_with_currency = netPriceItem + documentCurrency item_no = 'item : ' + str(i + 1) # print(item_no) #item_no = dict(item_no) per_item_desc_dict = { item_no: { 'Material': Material, 'Plant': Plant, 'OrderQuantity': OrderQuantity, 'netPriceItem': price_present_item_with_currency } } all_item_details.update(per_item_desc_dict) #use this when sending the item details as string all in one reply # concat_string_for_multiple_lineitems = concat_string_for_multiple_lineitems \ # + 'Material: ' + Material + '.\n' + 'plant: ' + Plant + '.\n' \ # + 'OrderQuantity: ' + OrderQuantity + '.\n' get_task_string = '' get_task_string_with_header_detail = '' get_task_string = task_title + '.' + '\n' get_task_string_with_header_detail = 'created by user: '******'.' + '\n' + 'SupplierName: ' + SupplierName \ + '.' 
+ '\n' + 'PurchaseOrderNetAmount: ' + PurchaseOrderNetAmount + ' ' + DocumentCurrency + '.'+'\n' # final_reply_string = get_task_string + get_task_string_with_header_detail +'You have: ' + str(no_of_line_items) +' items in this P.O.\n'+ concat_string_for_multiple_lineitems + " say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details." final_reply_string = 'Now you have got, ' + str( no_of_tasks ) + ' pending tasks to approve. ' + get_task_string + get_task_string_with_header_detail + 'You have: ' + str( no_of_line_items ) + ' items.\n' + " say get item details to get all the item details in this purchase order. Or,say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details." #print(get_task_string) #print(final_reply_string) return final_reply_string, bot_memo[ 'index'] + 1, instance_id, created_by_user, SupplierName, ( PurchaseOrderNetAmount + ' ' + DocumentCurrency ), '', all_item_details, no_of_line_items, scrapped_po_no elif (len(body1["d"]["results"]) > 0) and (bot_memo['index'] >= len( body1["d"]["results"])): final_reply_string = 'no more tasks to approve in your inbox.' return final_reply_string, bot_memo['index'], len( body1["d"]["results"] ), bot_memo['created_by'], bot_memo['SupplierName'], bot_memo[ 'PurchaseOrderNetAmount'], '', '', '', bot_memo[ 'scrapped_po_no'] else: final_reply_string = 'I think there are no more pending approvals for you. Say, "get my tasks", to get your pending approvals.' return final_reply_string, bot_memo['index'], len( body1["d"]["results"] ), bot_memo['created_by'], bot_memo['SupplierName'], bot_memo[ 'PurchaseOrderNetAmount'], '', '', '', bot_memo[ 'scrapped_po_no'] #repeat intent is handled via bot memory not via code # elif((bot_memo['index']) and present_skill == 'repeat'): # r = requests.get("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801')) # body1 = r.json() # if (body1["d"]["results"] and bot_memo['index'] <= len(body1["d"]["results"])): # #task details # instance_id = body1["d"]["results"][bot_memo['index']-1]["InstanceID"] # task_title = body1["d"]["results"][bot_memo['index']-1]["TaskTitle"] # scrapped_po_no = task_title.split("order ",1)[1] # body2,body3 = take_action_async(scrapped_po_no) # #po_header detail # created_by_user = body2["d"]["CreatedByUser"] # SupplierName = body2["d"]["SupplierName"] # PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"] # DocumentCurrency = body2["d"]["DocumentCurrency"] # PurchaseOrderNetAmount = body2["d"]["PurchaseOrderNetAmount"] # final_reply_string = '' # concat_string_for_multiple_lineitems = '' # #po item detail # #only show one or two tasks # no_of_line_items = len(body3["d"]["results"]) # for i in range(no_of_line_items): # Material = body3["d"]["results"][i]["Material_Text"] # Plant = body3["d"]["results"][i]["Plant"] # OrderQuantity = body3["d"]["results"][i]["OrderQuantity"] # concat_string_for_multiple_lineitems = concat_string_for_multiple_lineitems \ # + 'Material: ' + Material + '.\n' + 'plant: ' + Plant + '.\n' \ # + 'OrderQuantity: ' + OrderQuantity + '.\n' # get_task_string = '' # get_task_string_with_header_detail = '' # get_task_string = task_title + '\n' # get_task_string_with_header_detail = 'created_by_user: '******'.' 
+'\n' + 'SupplierName: ' + SupplierName \ # +'.' + '\n' + 'PurchaseOrderNetAmount: ' + PurchaseOrderNetAmount + ' ' + DocumentCurrency + '.' +'\n' # final_reply_string = get_task_string + get_task_string_with_header_detail +'You have: ' + str(no_of_line_items) +' items\n'+ concat_string_for_multiple_lineitems + " say approve to approve this task or say ignore to skip this task and move on to your next task, or say next to get your next task with details." # #print(get_task_string) # #print(final_reply_string) # return final_reply_string,bot_memo['index'],instance_id,created_by_user,SupplierName, (PurchaseOrderNetAmount + ' ' + DocumentCurrency) # elif(body1["d"]["results"] and bot_memo['index'] >= len(body1["d"]["results"])): # final_reply_string = 'no more tasks to approve...' # return final_reply_string,bot_memo['index'],len(body1["d"]["results"]),created_by_user,SupplierName, (PurchaseOrderNetAmount + ' ' + DocumentCurrency) # else: # final_reply_string = 'I am facing some issues now please try later' # return final_reply_string,bot_memo['index'],len(body1["d"]["results"]),created_by_user,SupplierName, (PurchaseOrderNetAmount + ' ' + DocumentCurrency) elif ((bot_memo['index']) and present_skill == 'approve'): after_approval_reply = 'successfully approved, please say,"get my tasks", to get your previous pending aapprovals from the beggining, or, say next to move on to your next task.' approval_failure_reply = "there was an issue with the server, Please try again later to approve..." session = requests.Session() header = {'x-csrf-token': 'Fetch'} present_task_instance_id = bot_memo['instanceID'] # response = session.head("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801'),headers=header) # if (response.status_code != 200): # return approval_failure_reply ,bot_memo['index'],present_task_instance_id,bot_memo['created_by'],bot_memo['SupplierName'], bot_memo['PurchaseOrderNetAmount'],approval_failure_reply,'','',bot_memo['scrapped_po_no'] # elif (response.status_code == 200): # cookie = session.cookies.get_dict() # print(cookie) # csrf = response.headers['x-csrf-token'] # #print(csrf) # #post # #approve # header_2 = {'x-csrf-token':csrf} # approve_po = session.post("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/Decision?sap-client=400&SAP__Origin='S4HMYINBOCLNT200'&InstanceID="+ "'"+present_task_instance_id +"'""&DecisionKey='0001'&Comments='test%20approve'",auth=HTTPBasicAuth('pritamsa', 'rupu@0801'),headers=header_2,cookies=cookie) # print('***************************************************************') # print(approve_po.status_code) # approval request posted asynchronously url3 = [ "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json" ] head_res1 = (grequests.head(u, auth=('pritamsa', 'rupu@0801'), headers=header) for u in url3) #both imap and map can be used #reque = grequests.imap(rs,size=1) reque3 = grequests.map(head_res1, size=1) response_array3 = [] for response3 in reque3: if (response3.status_code != 200): print("hey problem") return approval_failure_reply, bot_memo[ 'index'], present_task_instance_id, bot_memo[ 'created_by'], bot_memo['SupplierName'], bot_memo[ 'PurchaseOrderNetAmount'], approval_failure_reply, '', '', bot_memo[ 'scrapped_po_no'] 
            else:
                cookie = response3.cookies.get_dict()
                print(cookie)
                csrf = response3.headers['x-csrf-token']
                print(csrf)
                header_2 = {'x-csrf-token': csrf}
                url_post = [
                    "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/Decision?sap-client=400&SAP__Origin='S4HMYINBOCLNT200'&InstanceID=" + "'" + present_task_instance_id + "'" + "&DecisionKey='0001'&Comments='test%20approve'"
                ]
                post_res = (grequests.post(u_post, auth=('pritamsa', 'rupu@0801'),
                                           headers=header_2, cookies=cookie)
                            for u_post in url_post)
                post_reque = grequests.map(post_res, size=1)
                response_array_post = []
                for response_post in post_reque:
                    if response_post.status_code != 200:
                        print("hey problem in approving the request. Please try again later.")
                        return (approval_failure_reply, bot_memo['index'], present_task_instance_id,
                                bot_memo['created_by'], bot_memo['SupplierName'],
                                bot_memo['PurchaseOrderNetAmount'], approval_failure_reply, '', '',
                                bot_memo['scrapped_po_no'])
                    else:
                        return (after_approval_reply, bot_memo['index'], present_task_instance_id,
                                bot_memo['created_by'], bot_memo['SupplierName'],
                                bot_memo['PurchaseOrderNetAmount'], after_approval_reply, '', '',
                                bot_memo['scrapped_po_no'])
        # after this, call the "next" task-showing skill in the bot

    elif bot_memo['index'] and present_skill == 'reject':
        after_rejection_reply = ('Successfully rejected. Say "get my tasks" to get your pending '
                                 'approvals from the beginning, or say "next" to move on to your next task.')
        rejection_failure_reply = 'There was an issue with the server. Please try again later to reject.'
        session = requests.Session()
        header = {'x-csrf-token': 'Fetch'}
        present_task_instance_id = bot_memo['instanceID']
        # Synchronous version, kept for reference:
        # response = session.head("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json", auth=HTTPBasicAuth('pritamsa', 'rupu@0801'), headers=header)
        # if response.status_code != 200:
        #     return approval_failure_reply, bot_memo['index'], present_task_instance_id, bot_memo['created_by'], bot_memo['SupplierName'], bot_memo['PurchaseOrderNetAmount'], approval_failure_reply, '', '', bot_memo['scrapped_po_no']
        # elif response.status_code == 200:
        #     cookie = session.cookies.get_dict()
        #     print(cookie)
        #     csrf = response.headers['x-csrf-token']
        #     # print(csrf)
        #     # post / approve
        #     header_2 = {'x-csrf-token': csrf}
        #     approve_po = session.post("https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/Decision?sap-client=400&SAP__Origin='S4HMYINBOCLNT200'&InstanceID=" + "'" + present_task_instance_id + "'" + "&DecisionKey='0001'&Comments='test%20approve'", auth=HTTPBasicAuth('pritamsa', 'rupu@0801'), headers=header_2, cookies=cookie)
        #     print('***************************************************************')
        #     print(approve_po.status_code)

        # CSRF token fetched and the rejection posted asynchronously via grequests
        url4 = [
            "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/TaskCollection?sap-client=400&$filter=Status%20eq%20%27READY%27&$format=json"
        ]
        head_res4 = (grequests.head(u, auth=('pritamsa', 'rupu@0801'), headers=header) for u in url4)
        # both imap and map can be used
        # reque = grequests.imap(rs, size=1)
        reque4 = grequests.map(head_res4, size=1)
        response_array4 = []
        for response4 in reque4:
            if response4.status_code != 200:
                print("hey problem")
                return (rejection_failure_reply, bot_memo['index'], present_task_instance_id,
                        bot_memo['created_by'], bot_memo['SupplierName'],
                        bot_memo['PurchaseOrderNetAmount'], rejection_failure_reply, '', '',
                        bot_memo['scrapped_po_no'])
            else:
                cookie = response4.cookies.get_dict()
                print(cookie)
                csrf = response4.headers['x-csrf-token']
                print(csrf)
                header_2 = {'x-csrf-token': csrf}
                url_post = [
                    "https://p2001172697trial-trial.apim1.hanatrial.ondemand.com/p2001172697trial/Workflow_approval/Decision?sap-client=400&SAP__Origin='S4HMYINBOCLNT200'&InstanceID=" + "'" + present_task_instance_id + "'" + "&DecisionKey='0002'&Comments='test%20reject'"
                ]
                post_res = (grequests.post(u_post, auth=('pritamsa', 'rupu@0801'),
                                           headers=header_2, cookies=cookie)
                            for u_post in url_post)
                post_reque = grequests.map(post_res, size=1)
                response_array_post = []
                for response_post in post_reque:
                    if response_post.status_code != 200:
                        print("hey problem in rejecting the P.O. Please try again later.")
                        return (rejection_failure_reply, bot_memo['index'], present_task_instance_id,
                                bot_memo['created_by'], bot_memo['SupplierName'],
                                bot_memo['PurchaseOrderNetAmount'], rejection_failure_reply, '', '',
                                bot_memo['scrapped_po_no'])
                    else:
                        return (after_rejection_reply, bot_memo['index'], present_task_instance_id,
                                bot_memo['created_by'], bot_memo['SupplierName'],
                                bot_memo['PurchaseOrderNetAmount'], after_rejection_reply, '', '',
                                bot_memo['scrapped_po_no'])
        # after this, call the "next" task-showing skill in the bot

    # THIS LOGIC BELOW NEEDS TO BE RE-WRITTEN
    # ************************************************************************************************************
    # elif (bot_nlp['ordinal'] and len(bot_nlp['ordinal']) <= bot_memo['no_of_line_items']) and present_skill == 'get_item_details':
    elif present_skill == 'get_item_details':
        if bot_nlp['ordinal'] and len(bot_nlp['ordinal']) <= bot_memo['no_of_line_items']:
            # filter_item_ordinally = 'item : ' + (bot_nlp['ordinal'][bot_nlp['ordinal']['index']]['rank'])
            # print(filter_item_ordinally)
            print('///////////////////////////////////////////////////')
            # index of the first element of the nlp 'ordinal' entity array
            nlp_ordinal_filter_index = bot_nlp['ordinal'][0]['index']
            individual_item_filter_string = 'item : ' + str(nlp_ordinal_filter_index + 1)
            item_level_reply_ordinally = bot_memo['all_item_details'][individual_item_filter_string]
            print(item_level_reply_ordinally)
            return (str(item_level_reply_ordinally).strip('{}'), bot_memo['index'],
                    bot_memo['instanceID'], bot_memo['created_by'], bot_memo['SupplierName'],
                    bot_memo['PurchaseOrderNetAmount'], bot_memo['after_approval_reply'],
                    bot_memo['all_item_details'], bot_memo['no_of_line_items'],
                    bot_memo['scrapped_po_no'])
        elif bot_nlp['ordinal'] == False and bot_nlp['number'] and len(bot_nlp['number']) <= bot_memo['no_of_line_items']:
            # filter_item_ordinally = 'item : ' + (bot_nlp['ordinal'][bot_nlp['ordinal']['index']]['rank'])
            # print(filter_item_ordinally)
            print('///////////////////////////////////////////////////')
            # scalar value of the first element of the nlp 'number' entity array
            nlp_number_filter_index = bot_nlp['number'][0]['scalar']
            individual_item_filter_string = 'item : ' + str(nlp_number_filter_index)
            item_level_reply_numerically = bot_memo['all_item_details'][individual_item_filter_string]
            print(item_level_reply_numerically)
            return (str(item_level_reply_numerically), bot_memo['index'], bot_memo['instanceID'],
                    bot_memo['created_by'], bot_memo['SupplierName'],
                    bot_memo['PurchaseOrderNetAmount'], bot_memo['after_approval_reply'],
                    bot_memo['all_item_details'], bot_memo['no_of_line_items'],
                    bot_memo['scrapped_po_no'])
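
# A minimal sketch of the CSRF fetch-then-post pattern used in the approve and
# reject branches above, written as a standalone helper. The names
# (post_decision, task_url, decision_url) and the auth tuple are hypothetical
# placeholders for illustration, not part of the original bot code.
import grequests


def post_decision(task_url, decision_url, auth):
    """HEAD the task collection to fetch a CSRF token, then POST the decision."""
    token_req = grequests.head(task_url, auth=auth, headers={'x-csrf-token': 'Fetch'})
    head_resp = grequests.map([token_req], size=1)[0]
    if head_resp is None or head_resp.status_code != 200:
        return None  # token fetch failed; the caller should report the error

    csrf = head_resp.headers.get('x-csrf-token')
    cookies = head_resp.cookies.get_dict()
    decision_req = grequests.post(decision_url, auth=auth,
                                  headers={'x-csrf-token': csrf}, cookies=cookies)
    return grequests.map([decision_req], size=1)[0]
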
def main(doc, timeout, size, debug, allow_codes, whitelist):
    """
    Examples:

    Simple call
    $ vl README.md

    Adding debug output
    $ vl README.md --debug

    Adding a custom timeout for each url (in seconds)
    $ vl README.md -t 3

    Adding a custom size param, to throttle to n requests at a time
    $ vl README.md -s 1000

    Skipping some error codes. This will allow 500 and 404 responses to be ignored
    $ vl README.md -a 500,404

    Adding whitelists
    $ vl README.md -w server1.com,server2.com
    """
    t0 = time.time()
    links = [i[0] for i in LINK_RE.findall(doc.read())]
    request_urls = []
    counts = {}
    for link in links:
        # no static
        if is_static(link):
            STATICS.append(link)
            continue
        # no dupes
        if link in counts:
            counts[link] += 1
            continue
        else:
            counts[link] = 1

        parsed = urlparse(link)
        # fix no-scheme links
        if not parsed.scheme:
            link = 'http://{0}'.format(link)

        # whitelisted
        if whitelist:
            exists = [i for i in whitelist if i in parsed.netloc]
            if exists:
                WHITELISTED.append(link)
                continue

        request_urls.append(link)

    # removing dupes
    counts_keys = counts.keys()
    DUPES.extend([(i, counts[i]) for i in counts_keys if counts[i] > 1])

    requests = (grequests.head(u, timeout=timeout, verify=False)
                for u in request_urls)
    responses = grequests.imap(requests, exception_handler=handle_exception,
                               size=size)

    for res in responses:
        color = 'green'
        if is_error_code(res.status_code):
            if res.status_code not in allow_codes:
                ERRORS.append((res.status_code, res.url))
                color = 'red'
            else:
                WHITELISTED.append(res.url)
        status = click.style(str(res.status_code), fg=color)
        click.echo('[{}] {}'.format(status, res.url))

    errors_len = len(ERRORS)
    exceptions_len = len(EXCEPTIONS)
    dupes_len = len(DUPES)
    white_len = len(WHITELISTED)

    if errors_len:
        click.echo()
        click.echo('Failed URLs:')
        for code, url in ERRORS:
            code = click.style(str(code), fg='red')
            click.echo('[{0}] {1}'.format(code, url))

    if exceptions_len and debug:
        import ssl
        click.echo('Exceptions raised:')
        click.echo('Note: OpenSSL Version = {0}'.format(ssl.OPENSSL_VERSION))
        click.secho('Check URLs for possible false positives', fg='yellow')
        for url, exception in EXCEPTIONS:
            click.echo('- {0}'.format(url))
            click.secho('{0}'.format(exception), fg='red', bold=True)

    if dupes_len and debug:  # pragma: nocover
        click.echo('Dupes:')
        for url, count in DUPES:
            click.secho('- {0} - {1} times'.format(url, count), fg='yellow', bold=True)

    if white_len and debug:
        click.echo()
        click.echo('Whitelisted (allowed codes and whitelisted param)')
        for url in WHITELISTED:
            click.secho('- {0}'.format(url), fg='magenta')

    click.secho('Total Links Parsed {0}'.format(len(links)), fg='green')
    click.secho('Total Errors {0}'.format(errors_len), fg='red')
    click.secho('Total Exceptions {0}'.format(exceptions_len), fg='red')
    click.secho('Total Dupes {0}'.format(dupes_len), fg='yellow')
    click.secho('Total whitelisted {0}'.format(white_len), fg='yellow')
    click.secho('Total static {0}'.format(len(STATICS)), fg='yellow')

    if debug:
        click.echo('Execution time: {0:.2f} seconds'.format(time.time() - t0))

    if errors_len:
        sys.exit(1)
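
# main() above references module-level helpers and accumulators that are not
# shown in this snippet (LINK_RE, STATICS, DUPES, WHITELISTED, ERRORS,
# EXCEPTIONS, is_static, is_error_code, handle_exception). A rough sketch of
# plausible definitions follows; the bodies are assumptions for illustration,
# not the tool's actual implementation.
import re

# The real pattern presumably also matches scheme-less links, which is why
# main() prepends 'http://' when urlparse() finds no scheme.
LINK_RE = re.compile(r'((https?|ftp)://[^\s\'"<>()\[\]]+)')

STATIC_EXTS = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.css', '.js')

STATICS, DUPES, WHITELISTED, ERRORS, EXCEPTIONS = [], [], [], [], []


def is_static(link):
    # Anything ending in a known asset extension is treated as static and skipped.
    return link.lower().endswith(STATIC_EXTS)


def is_error_code(status_code):
    # 4xx and 5xx responses count as errors unless explicitly allowed.
    return status_code >= 400


def handle_exception(request, exception):
    # grequests calls this for requests that raised; collect them for --debug output.
    EXCEPTIONS.append((request.url, exception))
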
def main(sensor, start_date, days, api_endpoint):
    api = Api(api_endpoint)
    logger.info('Checking consistency for %s from %s over %s days' % (sensor, start_date, days))

    aoi_nw = (-180, 90)
    aoi_se = (180, -90)
    aoi_ne = (aoi_se[0], aoi_nw[1])
    aoi_sw = (aoi_nw[0], aoi_se[1])
    aoi = [aoi_nw, aoi_ne, aoi_se, aoi_sw, aoi_nw]

    wrong_urls = list()

    for delta_day in range(1, days):
        start_time = time.time()
        start_date_date = parse(start_date) + datetime.timedelta(days=delta_day)
        end_date_date = start_date_date + datetime.timedelta(days=1)
        logger.info('Checking consistency for %s between %s and %s' %
                    (sensor, start_date_date.isoformat(), end_date_date.isoformat()))

        # Object representation
        results = api.search_dataset(aoi, 100, start_date_date, end_date_date, sensor,
                                     full_objects=False)

        url_resources = list()
        missing_urls = list()
        missing_types = list()

        for r in results:
            if r['resources']['s3public']['zip'] is not None:
                url_resources.append(r['resources']['s3public']['zip'])
            else:
                missing_urls.append('%s:%s' % (r['tile_identifier'], r['entity_id']))
                missing_types.append('zip')

            if r['resources']['metadata'] is not None:
                url_resources.append(r['resources']['metadata'])
            else:
                missing_urls.append('%s:%s' % (r['tile_identifier'], r['entity_id']))
                missing_types.append('metadata')

            if r['resources']['quicklook'] is not None:
                url_resources.append(r['resources']['quicklook'])
            else:
                missing_urls.append('%s:%s' % (r['tile_identifier'], r['entity_id']))
                missing_types.append('quicklook')

        logger.info('total scans: %d' % len(url_resources))
        logger.info('already missed resources: %d' % len(missing_urls))

        if False:
            # synchronous check, disabled in favour of the chunked grequests path below
            for counter, res in enumerate(url_resources):
                req = requests.head(res)
                if req.status_code != requests.codes.ok:
                    print(res, req.status_code)
                    missing_urls.append(res)
                    print(res)
                if (counter % 25) == 0:
                    print(counter)
        else:
            counter = 0
            for url_parts in chunks(url_resources, 500):
                counter += 1
                rs = (grequests.head(u) for u in url_parts)
                res = grequests.map(rs)
                for req in res:
                    if req is not None:
                        if req.status_code != requests.codes.ok:
                            wrong_urls.append(req)
                            missing_types.append('zip_registered')
                        else:
                            print(req.url, req)

        if len(wrong_urls) > 0:
            for req in wrong_urls:
                print(req)
                if req.status_code != requests.codes.ok:
                    append_data('/tmp/wrong_urls.txt', req.url)

        if len(missing_urls) > 0:
            append_data('/tmp/missing_urls.txt', missing_urls)

        if len(missing_types) > 0:
            for rtype in ['zip_registered', 'quicklook', 'metadata', 'zip']:
                logger.info('%d:%s' % (operator.countOf(missing_types, rtype), rtype))

        logger.info('wrong resources: %d' % len(wrong_urls))
        logger.info('Executed in %f secs.' % (time.time() - start_time))

    print('Wrong URLs:', wrong_urls)
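
# The consistency check above depends on chunks() and append_data(), which are
# defined elsewhere in that project. A minimal sketch of what they are assumed
# to do (note append_data is called with both a single URL string and a list
# of missing URLs):
def chunks(items, n):
    """Yield successive n-sized slices from a list."""
    for i in range(0, len(items), n):
        yield items[i:i + n]


def append_data(path, data):
    """Append one line, or a list of lines, to a plain-text report file."""
    lines = data if isinstance(data, list) else [data]
    with open(path, 'a') as f:
        for line in lines:
            f.write('%s\n' % line)
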