def summarized():
    if request.method == 'POST':
        user_input = request.form['user_input']
        summary_length = int(request.form['summary_length'])

        if checkers.is_url(user_input):
            if not evaluate.check_url_valid(user_input):
                abort(404, user_input)

        if request.form.get('checkmode'):
            check_mode = request.form['checkmode']
            ref_summary = request.form['ref_summary']
            if checkers.is_url(ref_summary):
                if not evaluate.check_url_valid(ref_summary):
                    abort(404, ref_summary)
        else:
            check_mode = False
            ref_summary = ""

        # BERT summary
        # bert_summary = summarizers.bert_summary(user_input, ref_summary, check_mode, num_sentences_out=summary_length)

        # LSA summary
        lsa_summary = summarizers.lsa_summary(user_input, ref_summary, check_mode, num_sentences_out=summary_length)

        # Luhn summary
        luhn_summary = summarizers.luhn_summary(user_input, ref_summary, check_mode, num_sentences_out=summary_length)

        # LexRank summary
        lex_summary = summarizers.lex_summary(user_input, ref_summary, check_mode, num_sentences_out=summary_length)

        # Results
        sum_result = {
            "user_input": user_input,
            "summary_length": summary_length,
            "check_mode": check_mode,
            "ref_summary": ref_summary,
            "summaries": {
                # "BERT SUM": bert_summary,
                "Latent Semantic Analysis": lsa_summary,
                "Luhn": luhn_summary,
                "Lex Rank": lex_summary
            }
        }

        if request.form.get('checkmode'):
            # Get the best summary
            best_summary = evaluate.get_best_summary(sum_result)
            sum_result["best_summary"] = best_summary
            # Arrange summaries
            sum_result["summaries"] = evaluate.order_summary(best_summary, sum_result)

        return render_template("summary_results.html", results=sum_result)
    else:
        return render_template("summary.html")
def validate(row: pd.Series):
    """Best-effort (lightweight) validation of the input data."""
    official_page = row["official_page"]
    if official_page and not checkers.is_url(official_page):
        raise ValidationWarning("The official URL (official_page) is invalid")

    detail_page = row["detail_page"]
    if detail_page and not checkers.is_url(detail_page):
        raise ValidationWarning("The detail page URL (detail_page) is invalid")

    # Validate the zip code and telephone number formats (not strict)
    remove_char_regex = r"[ -‐-‑ー−‒–—―ー ]"  # (rough set of separator characters)
    tel = re.sub(remove_char_regex, "", row["tel"])
    if tel and not re.match(r"^0\d{9,10}$", tel):
        raise ValidationWarning("The telephone number (tel) format is invalid")  # 9-10 digits starting with 0

    zip_code = re.sub(remove_char_regex, "", row["zip_code"])
    if zip_code and not re.match(r"\d{7}$", zip_code):
        raise ValidationWarning("The zip code (zip_code) format is invalid")  # 7 half-width digits

    # Fields that must not contain HTML tags
    for target in [
        "shop_name",
        "address",
        "official_page",
        "detail_page",
        "opening_hours",
        "closing_day",
        "area_name",
    ]:
        text = row.get(target)
        if not text:
            continue
        if len(text) != len(w3lib.html.remove_tags(text)):
            raise ValidationWarning(f"{target} contains HTML tags")

    # Sanity check of the geocoding result derived from the zip code
    try:
        zip_code = row["zip_code"]
        if not zip_code:
            return
        pref = cached_posuto_pref(zip_code)
    except KeyError:
        # MEMO: the zip code may be a (special) one that does not exist in posuto's data.
        # These are so-called "large-office individual numbers"; nothing can be done about
        # them, so treat validation as successful.
        logger.info(f"Unknown zip code (possibly a large-office individual number?): zip code={zip_code}")
        return
    except Exception as e:
        # MEMO: posuto raised an internal error for some other unusual zip code
        logger.warning(e, stack_info=True)
        logger.warning(f"unknown posuto error, zip code={zip_code}")
        raise ValidationWarning("This zip code causes an error inside posuto (internal processing error)")

    norm_addr = row.get("normalized_address")
    if norm_addr and not norm_addr.startswith(pref):
        raise ValidationWarning(
            f"The prefecture derived from the zip code is {pref}, but the geocoded address is {norm_addr}")
def extract_data(html_page):
    temporary_variable = ''
    parsed_html_page = parse_html_content(html_page.content)
    links = parsed_html_page.find_all('a')
    for link in links:
        if link.get('href') == '/locations/':
            temporary_variable = link.get('href')
            break

    if temporary_variable != '' and (not checkers.is_url(temporary_variable)):
        CustomConstants.URL_TO_BE_VISITED.add(
            NetworkUtil.get_absolute_url(temporary_variable))
    else:
        return CustomConstants.SOMETHING_WENT_WRONG_WHILE_FETCHING_LOCATIONS

    html_page = NetworkUtil.read_from_network(
        CustomConstants.URL_TO_BE_VISITED.pop())
    parsed_html_page = parse_html_content(html_page.content)
    location_cards = parsed_html_page.find_all(class_='location card')

    if len(location_cards) > 0:
        clear_set_data()
        for location_card in location_cards:
            link = location_card.get('href')
            if checkers.is_url(link):
                CustomConstants.URL_TO_BE_VISITED.add(link)
            else:
                link = NetworkUtil.get_absolute_url(link)
                CustomConstants.URL_TO_BE_VISITED.add(link)  # add the resolved absolute URL

    room_links = set()
    for location in CustomConstants.URL_TO_BE_VISITED:
        html = NetworkUtil.read_from_network(location)
        parsed_html = parse_html_content(html.content)
        room_links.update(extract_rooms_feed(parsed_html))
        time.sleep(3.0)

    clear_set_data()
    room_detail_list = list()
    for room_link in room_links:
        html_page = NetworkUtil.read_from_network(room_link)
        parsed_html_page = parse_html_content(html_page.content)
        room_detail = extract_room_detail(parsed_html_page)
        room_detail_list.append(room_detail)
        time.sleep(3.0)

    return room_detail_list
def valid_url(url: str) -> bool:
    """Check that the URL is well formatted."""
    parsed_url = urlparse(url)
    if not (checkers.is_url(parsed_url.geturl())
            or checkers.is_ip_address(parsed_url.geturl())):
        # prepend http if missing
        parsed_url = parsed_url._replace(**{"scheme": "http"})
        parsed_url = parsed_url._replace(**{"netloc": parsed_url[2]})  # move path to netloc
        parsed_url = parsed_url._replace(**{"path": ""})
        # check again with fixed url
        if not (checkers.is_url(parsed_url.geturl())
                or checkers.is_ip_address(parsed_url.geturl())):
            return False
    return True
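# Illustrative usage of valid_url above (a sketch, not part of the original code; it
# assumes urllib.parse.urlparse and validator_collection.checkers are imported as in
# the function):
#   valid_url("https://example.com")  # True  - already well formed
#   valid_url("example.com")          # True  - "http" is prepended and the path becomes the netloc
#   valid_url("not a url")            # False - still invalid after the fix-up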
def crawler(nthreads=None, url=None, output=None, all_links=None):
    """Web crawler starts from URL to all found links under the same netloc."""
    # Check if URL is valid
    if not checkers.is_url(url):
        print(f'The url you have entered is not valid. URL: {str(url)}')
        exit(1)

    crawler = Crawler(url=url, num_threads=nthreads, output=output, all_links=all_links)

    start = time()
    crawler.start()
    end = time()

    crawler.print_results()
    print(f'\n\nCrawling took {int(end-start)} seconds.')
    return
async def run_image(self, context, opts):
    """
    Update the image link of a project
    @param context:
    @param opts:
    @return:
    """
    user = User(context.message.author.id, context.guild.id, context)

    shortname = opts[0].lower() if opts else None
    img = opts[1] if len(opts) > 1 else None

    # Make sure the project exists.
    project = user.get_project(shortname)
    if not project:
        return await context.send(
            user.get_mention() + ', ' + lib.get_string('project:err:noexists', user.get_guild()).format(shortname))

    # Check it's a valid image link.
    if not checkers.is_url(img) and img is not None:
        return await context.send(
            user.get_mention() + ', ' + lib.get_string('project:err:link', user.get_guild()).format(img))

    project.set_image(img)

    return await context.send(
        user.get_mention() + ', ' + lib.get_string('project:image', user.get_guild()))
def load(source) -> etree._Element:  # pylint: disable=protected-access
    '''
    Load an XML document

    args:
        source: XML source. Either path, url, string, or loaded LXML Element

    returns:
        Loaded XML object tree, or None on invalid source
    '''
    if not isinstance(source, (str, bytes)) or len(source) < 1:
        # pylint: disable=protected-access
        return source if isinstance(source, etree._ElementTree) else None

    source = source.strip()
    if source[0] == ord('<'):  # Handle source as bytes
        source = io.BytesIO(source)
    elif source[0] == '<':  # Handle source as string
        source = io.StringIO(source)
    elif checkers.is_file(source):  # Handle source as local file
        pass  # etree.parse handles local file paths natively
    elif checkers.is_url(source):  # Handle source as URL
        response = requests.get(source, timeout=10)
        if not response:
            app.logger.warning(
                f"Failed to retrieve XML URL (or timed out): {source}")
            return None
        source = io.BytesIO(response.content)
    else:
        app.logger.warning(
            f"XML source is not valid file, URL, or XML string. {source[:40]}"
            + (len(source) > 40) * '...')
        return None

    return etree.parse(source)
def input_title(event, context):
    url = event['url']

    # dynamo table info
    table = os.environ.get('DYNAMO_TABLE')

    # validate url
    if checkers.is_url(url):
        key = str(hash(url))

        # dynamo operations
        dynamo = boto3.resource('dynamodb', region_name=region)
        table = dynamo.Table(table)
        table.put_item(Item={'titleid': key, 'url': url, 'status': 'PENDING'})

        return {
            'statusCode': 200,
            'body': json.dumps({"id": key}),
            'headers': {
                'Content-Type': 'application/json',
            }
        }
    else:
        return {
            'statusCode': 400,
            # body must be a JSON string, matching the success branch above
            'body': json.dumps({"error": "invalid url"}),
            'headers': {
                'Content-Type': 'application/json',
            }
        }
def get_profile_urls(driver, n_pages=5):
    """
    Return a list without repetitions of alphabetically sorted URLs taken
    from the results of a given query on Google search.

    :param driver: selenium chrome driver object
    :param n_pages: int number of google pages to loop over
    :return: list of linkedin-profile URLs
    """
    linkedin_urls = []
    for i in range(n_pages):
        urls = driver.find_elements_by_class_name("yuRUbf [href]")
        # links = [url.get_attribute('href') for url in urls]
        linkedin_urls += [
            url.get_attribute('href') for url in urls
            if checkers.is_url(url.get_attribute('href')) and re.search(
                r"^https://[a-z]+\.linkedin\..*$", url.get_attribute('href'))
        ]
        print(linkedin_urls)
        sleep(0.5)
        if i > 1:
            try:
                next_button_url = driver.find_element_by_css_selector(
                    '#pnnext').get_attribute('href')
                driver.get(next_button_url)
            except NoSuchElementException:
                break

    linkedin_urls_no_rep = sorted(
        list(dict.fromkeys([url for url in linkedin_urls])))

    return linkedin_urls_no_rep
async def parse_search(ctx, search: str, loop: asyncio.BaseEventLoop = None):
    loop = loop or asyncio.get_event_loop()
    source_type = "GDrive"

    if checkers.is_url(search):
        return search

    gdrive_folder_id = config['gdrive_id']
    if not gdrive_folder_id:
        return search

    source_init = Source(ctx, source_type=source_type, loop=loop)
    try:
        sources = await source_init.get_playlist(gdrive_folder_id, include_name=True)
    except SourceError as e:
        await ctx.send(
            'An error occurred while processing this request: {}'.format(str(e)))
        return search

    for each_source in sources:
        if search.lower() in each_source['name'].lower():
            search = f"https://drive.google.com/file/d/{each_source['id']}/view"
            break

    return search
def create(event, context):
    data = json.loads(event['body'])

    if 'url' not in data:
        logging.error('URL parameter not provided')
        return {
            'statusCode': 422,
            'body': json.dumps({'error_message': 'Insufficient data'})
        }

    url = data['url']
    if not url:
        logging.error('URL value missing')
        return {
            'statusCode': 422,
            'body': json.dumps({'error_message': 'URL missing'})
        }

    if not checkers.is_url(url):
        logging.error('URL is invalid')
        return {
            'statusCode': 422,
            'body': json.dumps({'error_message': 'URL invalid'})
        }

    if 'id' in data:
        id = data['id']
    else:
        id = generate(size=6)

    url_added = UrlModel(id=id, url=url, created=datetime.now())
    url_added.save()

    return {'statusCode': 200, 'body': json.dumps({'id': id, 'url': url})}
def generate_shop_url(domain_or_url):
    url = None
    if checkers.is_domain(domain_or_url):
        url = f'https://{domain_or_url}'
    elif checkers.is_url(domain_or_url):
        url = domain_or_url
    return url
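# Hedged usage sketch for generate_shop_url (exact results depend on how
# validator_collection's is_domain / is_url classify the input):
#   generate_shop_url("example.com")          # -> "https://example.com"
#   generate_shop_url("https://example.com")  # -> "https://example.com"
#   generate_shop_url("not a shop")           # -> None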
def test_build_url(self):
    """Test URL builder."""
    # without URL parameters
    item_url = self.plg_utils.build_url(
        base_url="https://guts.github.io/mkdocs-rss-plugin/", path="changelog")
    self.assertTrue(checkers.is_url(item_url))

    # with URL parameters
    item_url = self.plg_utils.build_url(
        base_url="https://guts.github.io/mkdocs-rss-plugin/",
        path="changelog",
        args_dict={"utm_source": "test_rss"},
    )
    print(item_url)
    self.assertTrue(checkers.is_url(item_url))
def register(url):
    """Registers a URL.

    Validates the input URL, and returns with a non-zero exit code if the URL is
    invalid. If the URL is valid, adds the URL to an internal, persistent registry.
    """
    # Check if URL is valid
    if not checkers.is_url(url):
        print("Error: invalid URL.")
        return sys.exit(os.EX_DATAERR)

    home = os.path.expanduser("~")
    file_path = os.path.join(home, FILE_PATH)

    if not os.path.exists(FILE_DIR):
        try:
            os.makedirs(os.path.dirname(FILE_DIR))
        except OSError as exc:
            # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    open(file_path, "a").close()

    with open(file_path, "r+") as file:
        for line in file:
            if url in line:
                return sys.exit(os.EX_OK)
        # Write a space-delimited row of the form "url content_num_bytes
        # load_time_seconds refresh_time".
        file.write(" ".join([url, "0", "0", "0"]) + "\n")

    return sys.exit(os.EX_OK)
def validate_url(url):
    """
    Determine if a given URL is valid.
    Return True if so, False if not
    """
    if not url:
        raise ValueError("No url provided")
    return checkers.is_url(url)
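# Hedged usage sketch for validate_url (checkers.is_url returns a plain boolean):
#   validate_url("https://example.com")  # -> True
#   validate_url("not-a-url")            # -> False
#   validate_url("")                     # -> raises ValueError("No url provided")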
async def handle_url(message: Message, state: FSMContext):
    if checkers.is_url(message.text):
        await States.url.set()
        await state.update_data(url=message.text)
        markup = get_keyboard()
        await message.answer("Options:", reply_markup=markup)
    else:
        await message.answer("Invalid link.")
def source_create_url_input(self, message) -> str:
    '''Validates url input for source creation.'''
    input_text = input(WHITE + f"\n{message}: ")
    if checkers.is_url(input_text) is False:
        print(RED + "Error: Entered data is invalid. Please check and try again.")
        return self.source_create_url_input(message)
    return input_text  # returns entered string if all validation rules are met
def test_extract_single():
    extracted = extract_single("http://google.com")
    # Image path probably pretty stable; ideally we extract from a page
    # that never changes like a self-hosted testing page set up for this
    # purpose.
    assert re.findall("http://google.com/images.*.png", extracted["images"][0])
    for link in extracted["links"]:
        assert checkers.is_url(link)
def url(u):
    """Download a video of a given URL."""
    if checkers.is_url(u):
        if download(u):
            click.echo('Success!')
        else:
            click.echo('Download error. No fap :(')
    else:
        click.echo('URL error.')
def upload(ctx: Dict[str, Any], tags: str, filepath: str) -> None:
    api = ctx['api']
    namespace = ctx['namespace']
    t = remove_if_in_target(namespace, tags.split('/'))

    if checkers.is_url(filepath):
        link = f'<head><meta http-equiv="Refresh" content="0; URL={filepath}"></head>'
        print(api.put_latest(tags=namespace + t, content=str.encode(link)))
        return

    with Path(filepath).open(mode='rb') as f:
        print(api.put_latest(tags=namespace + t, content=f))
async def e_img(self, ctx, event_id, url):
    """Check if the event exists and the URL format is valid, then update the image url field."""
    if self.get_event(event_id) is None:
        # no event with this id
        await ctx.send(error_msgs['no_event'].format(event_id))
    elif checkers.is_url(url) is False:
        # url format invalid
        await ctx.send(error_msgs["url_format"])
    else:
        # update image_url field
        await ctx.send(
            self.update_event(event_id, 'image_url', url, ctx.message.created_at))
def index_post():
    text = request.form['text']
    if checkers.is_url(text):
        key = Controller().db_insert(text)
        if key:
            return 'Your new url is http://127.0.0.1:5000/' + key
        else:
            return 'Error'
    else:
        return 'Error: Must be a valid url'
def f_valid_url(target):
    import requests
    from validator_collection import validators, checkers

    if checkers.is_url(target):
        request = requests.get(target)
        if request.status_code == 200:
            return 1
        else:
            return 0
    else:
        return 0
async def _network_reach(self):
    """Reach a website and return its HTTP code."""
    response = ""
    if self.args:
        if len(self.args) == 1:
            if checkers.is_url(self.args[0]):
                response = network.network_reach(self.args[0], "")
            else:
                # Not a full URL: retry with an https:// prefix.
                url = "https://" + self.args[0]
                if checkers.is_url(url):
                    response = network.network_reach(self.args[0], "")
        if len(self.args) == 2:
            if checkers.is_url(self.args[0]) and self.args[1] == "details":
                response = network.network_reach(self.args[0], "details")
            else:
                # Not a full URL: retry with an https:// prefix.
                url = "https://" + self.args[0]
                if checkers.is_url(url):
                    response = network.network_reach(self.args[0], "details")
    else:
        response = "Im not soothsayer...Give me an url !"

    await send_text_to_room(self.client, self.room.room_id, response)
async def test(ctx, *, url):
    if checkers.is_url(url):
        await ctx.send("Just a sec...")
        await testportal.getTest(url)
        screenshots = os.listdir('screenshots')
        length = len(screenshots)
        for x in range(length):
            await ctx.send(file=discord.File('screenshots/screenshot' + str(x + 1) + '.png'))
        deleteScreenshots()
    else:
        await ctx.send("The Url is not valid")
def _parse_url(param, default):
    if param is not None and not isinstance(param, (bool, str)):
        raise ValueError(f"Invalid parameter input {param}.")
    if param is None or param is False:
        return False
    elif param is True:
        return default
    elif checkers.is_url(str(param)):
        return str(param)
    else:
        raise URLError(f"Invalid url: {param}")
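# Hedged usage sketch for _parse_url (the URLs below are illustrative only; "default"
# stands for whatever fallback URL the caller passes in):
#   _parse_url(None, "https://example.com/feed")                          # -> False (disabled)
#   _parse_url(True, "https://example.com/feed")                          # -> "https://example.com/feed"
#   _parse_url("https://example.com/custom", "https://example.com/feed")  # -> "https://example.com/custom"
#   _parse_url("not a url", "https://example.com/feed")                   # -> raises URLError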
def resumeur(request):
    input_texte = request.POST['input_texte']
    article_dico = {}  # avoid a NameError when the input is not a URL
    # try:
    if checkers.is_url(input_texte):
        url = input_texte
        article_dico = article_extraction(url)
        # article_dico["summary_2"] = ktrain_texte_resumeur(article_dico["texte"], lang='fr')
    # else:
    #     pass
    # except:
    #     pass
    return render(request, "summarizer/result.html", article_dico)
def read_from_network(url):
    try:
        if checkers.is_url(url):
            response = requests.get(url, headers=CustomConstants.REQUEST_HEADER)
            CustomConstants.URL_VISITED.add(url)
            return response
        else:
            CustomConstants.URL_TO_BE_VISITED.add(url)
            return CustomConstants.URL_IS_NOT_VALID
    except:
        CustomConstants.URL_TO_BE_VISITED.add(url)
        return CustomConstants.ERROR_OCCURED_WHILE_SENDING_REQUEST
async def convert(self, ctx, argument):
    with suppress(Exception):
        mem = await member_converter.convert(ctx, argument)
        return str(mem.avatar_url_as(static_format='png', size=1024))

    with suppress(Exception):
        emoji = await emoji_converter.convert(ctx, str(argument))
        return str(emoji.url)

    if ctx.message.attachments:
        with suppress(Exception):
            return str(ctx.message.attachments[0].url)
    elif checkers.is_url(str(argument)):
        return str(argument)
    else:
        return 'error'
def urls_to_collect(urls_file: str):
    """Collect urls given urls in a file."""
    urls = []
    for url in open(urls_file):
        url = url.strip()
        if url.startswith("#"):
            # comment lines should be ignored
            continue
        if len(url) == 0:
            # ignore empty lines
            continue
        if checkers.is_url(url) is not True:
            logging.warning("invalid url: %s" % url)
            continue
        urls.append(url)
    return urls
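# Hedged example of an input file accepted by urls_to_collect (illustrative content;
# comment lines and blank lines are skipped, invalid entries are logged and dropped):
#
#   # seed urls
#   https://example.com/page-1
#   https://example.com/page-2
#   not-a-valid-url   <- skipped with a warning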