def extract_urls(text):
    extractor = URLExtract()
    if type(text) is str:
        urls = extractor.find_urls(text)
        return urls
    elif type(text) is list:
        urls = []
        for x in text:
            url_x = extractor.find_urls(x)
            urls.extend(url_x)
        return urls
    else:
        print("Provided text type (%s) is not currently supported. "
              "Please supply either a list of string objects or a string object." % str(type(text)))
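# Hedged usage sketch (not part of the original source): assumes URLExtract has been imported
# via `from urlextract import URLExtract`. Shows the two input types extract_urls() accepts.
sample = "Docs live at https://example.com/docs with a mirror at http://mirror.example.org"
print(extract_urls(sample))                     # URLs found in a single string
print(extract_urls([sample, "no links here"]))  # URLs collected across a list of strings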
def str_clean2(self, item):
    extractor = URLExtract()  # note: unused here; tldextract does the actual domain extraction
    url = tldextract.extract(item).registered_domain
    return url
print('', end='')
pqc()
# abstract
pq()
print(abstract, end='')
pqc()
# cover_image (may not work, but might as well try)
pq()
cover_image = 'https://cross-currents.berkeley.edu' + row['Image']
print(cover_image, end='')
pqc()
# pdf_url, extract from the File column
extractor = URLExtract()
pdf_urls = extractor.find_urls(row['File'])
if len(pdf_urls) >= 1:
    # sometimes the extractor finds more than one URL, we should just always use the first
    pdf_url = pdf_urls[0]
else:
    pdf_url = ('ERROR, no PDF URL found, content-type: ' + row['Content type'] +
               '; Content ID: ' + row['Content ID'] +
               '; Article Type: ' + row['Article Type'])
pq()
print(urllib.parse.unquote(pdf_url), end='')
pqc()
# supplementalfile_url
pq()
def google_scrap(termo, dadobruto=False, gdr=None):
    """
    Performs Scrap in the Google search website/Realiza Scrap no Google Search.

    :param termo: Search term/Termo de busca.
    :param dadobruto: To save the raw data/Para salvar o dado bruto.
    :param gdr: Uploads to Google Drive/Sobe arquivo para o Google Drive.
    """
    block = ['https://pastebin.com']
    final_urls = []
    urls = []
    clean = []
    vazou = []
    vazio = []
    google_urls = [
        'https://www.google.com/search?q=site:pastebin.com+intext:leak&sxsrf=ALeKk03cedAQ3Y7jlzXHY8LImOO_gJGxMQ:1606136317667&source=lnt&tbs=qdr:d&sa=X&ved=2ahUKEwjcobOF3JjtAhWbF7kGHbo3BO4QpwV6BAgFECY&biw=1366&bih=629',
        'https://www.google.com/search?biw=1366&bih=629&tbs=qdr%3Ad&sxsrf=ALeKk02AVQ6YXyUuLeavYsIZjr__SUBBKQ%3A1606136641749&ei=QbO7X5eGLaLD5OUP7dCoqAU&q=site%3Apastebin.com+intext%3A*%3A*&oq=site%3Apastebin.com+intext%3A*%3A*&gs_lcp=CgZwc3ktYWIQA1C3kgJY9KMCYN6nAmgAcAB4AIABVogBhwWSAQE4mAEAoAEBqgEHZ3dzLXdpesABAQ&sclient=psy-ab&ved=0ahUKEwjXqvef3ZjtAhWiIbkGHW0oClUQ4dUDCA0&uact=5',
        'https://www.google.com/search?biw=1366&bih=629&tbs=qdr%3Ad&sxsrf=ALeKk008FbvhwTD4Qyhal8ibZGTuwj5DwQ%3A1606136886229&ei=NrS7X7-_DYHX5OUPpMaYqAg&q=site%3Apastebin.com+intext%3A%22Target%3A%22&oq=site%3Apastebin.com+intext%3A%22Target%3A%22&gs_lcp=CgZwc3ktYWIQA1DEG1iMMWCSNGgAcAB4AIABWYgBnQOSAQE1mAEAoAEBqgEHZ3dzLXdpesABAQ&sclient=psy-ab&ved=0ahUKEwi_ssGU3pjtAhWBK7kGHSQjBoUQ4dUDCA0&uact=5',
        'https://www.google.com/search?biw=1366&bih=629&tbs=qdr%3Ad&sxsrf=ALeKk01een-cvWz4vY0qsb4w_IbGk4Ym0w%3A1606136893453&ei=PbS7X9f6GoDC5OUP7amBiA0&q=site%3Apastebin.com+intext%3Apassword&oq=site%3Apastebin.com+intext%3Apassword&gs_lcp=CgZwc3ktYWIQA1DlF1ivIGCNIWgAcAB4AIABZ4gB9gWSAQM4LjGYAQCgAQGqAQdnd3Mtd2l6wAEB&sclient=psy-ab&ved=0ahUKEwiXjfqX3pjtAhUAIbkGHe1UANEQ4dUDCA0&uact=5',
        'https://www.google.com/search?biw=1366&bih=629&tbs=qdr%3Ad&sxsrf=ALeKk01EHsZ3TIvfjuSTMJN4z9lThqH_AA%3A1606136962270&ei=grS7X5P8D4Cg5OUPlP6QyAU&q=site%3Apastebin.com+intext%3Aemail&oq=site%3Apastebin.com+intext%3Aemail&gs_lcp=CgZwc3ktYWIQA1DD3ANY3_wDYMr-A2gBcAB4AIABX4gB2QmSAQIxNZgBAKABAaoBB2d3cy13aXrAAQE&sclient=psy-ab&ved=0ahUKEwiTxeK43pjtAhUAELkGHRQ_BFkQ4dUDCA0&uact=5'
    ]
    header = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
        'Accept': 'text/html, application/xhtml + xml, application/xml; q = 0.9, image/webp',
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'https://www.google.com/'
    }
    for open_urls in google_urls:
        sleep_tempo = random.uniform(0, 6)
        r = requests.get(open_urls, headers=header)
        sites = r.text
        extractor = URLExtract()
        urls = extractor.find_urls(sites)
        for content in urls:
            if "google" not in content:
                if "http" in content:
                    if content not in block:
                        final_urls.append(content)
        sleep_numero = random.uniform(0, 8)
        time.sleep(sleep_numero)
    for rep in final_urls:
        if '&' in rep:
            rep = rep.replace('&', '')
        clean.append(rep)
    for search in clean:
        url = search
        sleep_tempo = random.uniform(0, 6)
        if 'pastebin' in search:
            alterar_string = search[-8:]
            reponse_gk = requests.get(search, headers=header)
            vazamento_gd = reponse_gk.text
            html = BeautifulSoup(reponse_gk.text, 'html.parser')
            if termo in vazamento_gd:
                if dadobruto == True:
                    alt_string = alterar_string.replace('/', '')
                    Path(f'C:/Users/{getpass.getuser()}/Documents/Oblivion').mkdir(parents=True, exist_ok=True)
                    with open(f'{documentos}/Oblivion/RAW_pastebin_{alt_string}.txt', 'w',
                              encoding='utf-16') as file_gd:
                        for content in html.select('.li1'):
                            leak = content.select_one('.de1')
                            lk = leak.text
                            file_gd.write(lk)
                    if gdr == 'PySide2.QtCore.Qt.CheckState.Checked':
                        subir_arquivo_drive_raiz(id_gdr, f'RAW_pastebin_{alt_string}.txt', 'text/plain',
                                                 f'{documentos}/Oblivion/RAW_pastebin_{alt_string}.txt')
                lista_temp_gd = vazamento_gd.split('\n')
                for i in lista_temp_gd:
                    if termo in i:
                        alt_string = alterar_string.replace('/', '')
                        termo_novo = f'{termo}:pastebin.com/{alt_string}'
                        return termo_novo
# Get polarity based on the blob
polarityInt = analysis.sentiment.polarity
# Get subjectivity based on the blob
subjectivityInt = analysis.sentiment.subjectivity
# If polarity is greater than 0 the tweet is positive, otherwise negative
if polarityInt > 0.0:
    polarityStr = 'Positive'
else:
    polarityStr = 'Negative'
# If subjectivity is greater than 0.5 the tweet is subjective, otherwise objective
if subjectivityInt > 0.5:
    subjectivityStr = 'Subjective'
else:
    subjectivityStr = 'Objective'
# initialize var for url
url = None
words = tweet.text.split()
link = URLExtract()
urls = link.find_urls(tweet.text)
for word in words:
    if 'http' in word:
        url = word
fileOut.writerow([tweet.text, polarityStr, subjectivityStr, url])
print(tweet.text)
print('Polarity: ', polarityInt)
print('Subjectivity:', subjectivityInt)
def get_data(self, url: str, freeword: str, date: str) -> dict:
    result_dict = {
        "取得日時": date,
        "検索語句": freeword,
        "店名": None,
        "住所": None,
        "定休日": None,
        "お店のホームページ": None,
        "席数": None,
        "スタッフ数": None,
        "スタッフ募集": None,
        "ホットペッパービューティ上のHP": None,
        "電話番号": None,
        "口コミ総数": 0,
        "総合": 0,
        "雰囲気": 0,
        "接客サービス": 0,
        "技術・仕上がり": 0,
        "メニュー・料金": 0
    }
    error_num = 0
    extractor = URLExtract()
    while True:
        if error_num >= 10:
            logger.warning("同一のURLに対するエラーが10回続いたので、このURLからの取得を終了します。")
            return result_dict
        try:
            req = requests.get(url)
            if int(req.status_code) != 200:
                logger.error("Error {}: このページを取得出来ません。".format(req.status_code))
                return result_dict
            else:
                html = lxml.html.fromstring(req.text)
                result_tmp = {
                    i.text_content(): j.text_content()
                    for i, j in zip(html.cssselect("th.w120"), html.cssselect("th.w120 ~ td"))
                }
                result_dict["店名"] = [
                    i.text_content() for i in html.cssselect("p.detailTitle > a")
                ][0]
                result_dict["ホットペッパービューティ上のHP"] = url
                kuchikomi_dict = self.get_kuchikomi(url.split("?")[0] + "review/")
                result_tmp.update(kuchikomi_dict)
                for key in result_tmp.keys():
                    if key in result_dict.keys():
                        if key == "電話番号":
                            result_dict[key] = self.get_tel(url.split("?")[0] + "tel/")
                        elif key == "お店のホームページ" or key == "ホットペッパービューティ上のHP" or key == "スタッフ募集":
                            result_dict[key] = extractor.find_urls(result_tmp[key])[0]
                        else:
                            result_dict[key] = result_tmp[key]
                return result_dict
        except ConnectionError:
            logger.warning("Connection Errorが発生しました。")
            error_num += 1
            time.sleep(5)
def parse_eml(eml):
    hashes = []
    urls = []
    responses = []
    for part in eml.walk():
        # note: the original compared the unbound `get_content_type` method to "text/html";
        # the missing call and the and/or grouping are fixed here
        if part.get_content_disposition() != "attachment" and \
                (part.get_content_type() == "text/plain"
                 or part.get_content_type() == "text/html"):
            text = str(part.get_payload(decode=True)).replace("\\n", " ")
            extractor = URLExtract()
            urls = list(set(extractor.find_urls(text)))
        if part.get_content_disposition() == "attachment":
            attach = base64.b64decode(part.get_payload())
            hashes.append(hashlib.sha256(attach).hexdigest())
    print(f"hashes: {hashes}")
    print(f"urls: {urls}")
    for shasum in hashes:
        artifact = Artifact.query.filter_by(handle=shasum).first()
        if artifact:
            print(f"{shasum} already exists in DB")
            responses.append(json.loads(artifact.response))
        else:
            params = {'apikey': vtapi, 'resource': shasum}
            headers = {"Accept-Encoding": "gzip, deflate"}
            response = requests.get(
                'https://www.virustotal.com/vtapi/v2/file/report',
                params=params,
                headers=headers)
            json_response = response.json()
            artifact = Artifact(handle=shasum, response=json.dumps(json_response))
            db.session.add(artifact)
            db.session.commit()
            responses.append(json_response)
    for url in urls:
        artifact = Artifact.query.filter_by(handle=url).first()
        if artifact:
            print(f"{url} already exists in DB")
            responses.append(json.loads(artifact.response))
        else:
            headers = {"Accept-Encoding": "gzip, deflate"}
            params = {'apikey': vtapi, 'resource': url}
            response = requests.post(
                'https://www.virustotal.com/vtapi/v2/url/report',
                params=params,
                headers=headers)
            json_response = response.json()
            artifact = Artifact(handle=url, response=json.dumps(json_response))
            db.session.add(artifact)
            db.session.commit()
            responses.append(json_response)
    return responses
        ]
        claim_dict[id] = {
            'claim': claim,
            'title': title,
            'claim_proc': proc_claim,
            'title_proc': proc_title,
            'claim_clean': clean_claim,
            'title_clean': clean_title,
        }
    cnt += 1
json.dump(claim_dict, open('my_code/proc_data/claim_dict.json', 'w'))

url_extr = URLExtract()
for split in split_mp:
    data_loc = 'data/' + split
    data_dict = {}
    cnt = 0
    with open(data_loc + '/tweets.queries.tsv', 'r') as f:
        for line in f:
            if cnt:
                id, tweet = line.strip().split('\t')
                urls = url_extr.find_urls(tweet)
                proc_twit = text_processor_twit.pre_process_doc(tweet)
                clean_twit = [
                    word for word in proc_twit
                    if not re.search("[^a-z0-9.,\s]+", word)
def get_band_disco(soup, current_records):
    # URLExtract instance.
    extractor = URLExtract()
    # Open a session with the database.
    engine = create_engine('sqlite:///swedish_bands.db')
    # These two are required for every database session.
    Base.metadata.create_all(bind=engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    # From the "soup" object (its content looks like band_page.html) find <div id="band_disco">.
    disco_finder = soup.find("div", {"id": "band_disco"})
    # The resulting tags are converted to a string.
    s_disco_finder = str(disco_finder)
    # Extract every URL present.
    disco_url = extractor.find_urls(s_disco_finder)
    # Take the first URL and assign it to a variable.
    url = disco_url[0]
    # Make a request with that URL.
    r = requests.get(url)
    # Handle unusual characters, in case there are any.
    r.encoding = 'utf-8'
    # Turn the response into a BeautifulSoup object for later use.
    disco_soup = BeautifulSoup(r.content, 'html.parser')
    # From "disco_soup" (content similar to disco.html) get every <tr> tag.
    disco_entries = disco_soup.find_all("tr")
    # Drop the first one because it is not needed.
    disco_entries.pop(0)
    # -> For each element in disco_entries:
    for item in disco_entries:
        # -> Instantiate the discography record and fill it in.
        discography = fact.factory("discography")
        discography.band_id = current_records
        # -> Try the following:
        try:
            # -> In a loop with x < 3:
            for x in range(3):
                # -> Find all <td> tags, indexed by 'x'.
                s = item.find_all("td")[x]
                # -> The discography attributes come in three parts, so branch on the index:
                if x == 0:
                    discography.name = str(s.getText())
                if x == 1:
                    discography.release_type = str(s.getText())
                if x == 2:
                    discography.year = str(s.getText())
            # -> Add the row.
            session.add(discography)
            # Save the changes.
            session.commit()
            # Close the session.
            session.close()
        except Exception as e:
            # If the band has no releases, just move on to the next one.
            session.close()
def __init__(self, message=''):
    self.extractor = URLExtract()
    self.message = message
async def useless(event):  # sourcery no-metrics
    """Custom profile pics"""
    input_str = event.pattern_match.group(2)
    ext = re.findall(r"-\w+", input_str)
    try:
        flag = ext[0].replace("-", "")
        input_str = input_str.replace(ext[0], "").strip()
    except IndexError:
        flag = None
    list_link = get_collection_list("CUSTOM_PFP_LINKS")
    if flag is None:
        if gvarstatus("CUSTOM_PFP") is not None and gvarstatus("CUSTOM_PFP") == "true":
            return await edit_delete(event, "`Custom pfp is already enabled`")
        if not list_link:
            return await edit_delete(event, "**ಠ∀ಠ There are no links for custom pfp...**")
        addgvar("CUSTOM_PFP", True)
        await edit_delete(event, "`Starting custom pfp....`")
        await custompfploop()
        return
    if flag == "l":
        if not list_link:
            return await edit_delete(event, "**ಠ∀ಠ There are no links set for custom pfp...**")
        links = "**Available links for custom pfp are here:-**\n\n"
        for i, each in enumerate(list_link, start=1):
            links += f"**{i}.** {each}\n"
        await edit_delete(event, links, 60)
        return
    if flag == "s":
        if gvarstatus("CUSTOM_PFP") is not None and gvarstatus("CUSTOM_PFP") == "true":
            delgvar("CUSTOM_PFP")
            await event.client(
                functions.photos.DeletePhotosRequest(
                    await event.client.get_profile_photos("me", limit=1)))
            return await edit_delete(event, "`Custom pfp has been stopped now`")
        return await edit_delete(event, "`Custom pfp hasn't been enabled`")
    reply = await event.get_reply_message()
    if not input_str and reply:
        input_str = reply.text
    if not input_str:
        return await edit_delete(
            event, "**ಠ∀ಠ Reply to a valid link or give a valid link url as input...**")
    extractor = URLExtract()
    plink = extractor.find_urls(input_str)
    if len(plink) == 0:
        return await edit_delete(
            event, "**ಠ∀ಠ Reply to a valid link or give a valid link url as input...**")
    if flag == "a":
        for i in plink:
            if not is_in_list("CUSTOM_PFP_LINKS", i):
                add_to_list("CUSTOM_PFP_LINKS", i)
        await edit_delete(
            event, f"**{len(plink)} pictures successfully added to custom pfps**")
    elif flag == "r":
        for i in plink:
            if is_in_list("CUSTOM_PFP_LINKS", i):
                rm_from_list("CUSTOM_PFP_LINKS", i)
        await edit_delete(
            event, f"**{len(plink)} pictures successfully removed from custom pfps**")
class ImageEmbed:
    def __init__(self, client, channel_ids, twitter_consumer_key, twitter_consumer_secret,
                 twitter_access_token_key, twitter_access_token_secret):
        self.client = client
        self.channel_ids = channel_ids
        self.extractor = URLExtract()
        self.httpsession = aiohttp.ClientSession()
        self.message_cache = deque(maxlen=100)
        self.forced_embeds = deque(maxlen=100)
        self.ready = asyncio.Event()
        self.ready.set()
        self.twitter_pattern = re.compile("twitter.com/\w+/status/(\d+)")
        self.deviantart_pattern = re.compile("deviantart\.com.*.\d")
        self.pixiv_pattern = re.compile("www\.pixiv\.net\/en\/artworks\/(\d+)")
        self.deviantart_url = "https://backend.deviantart.com/oembed?url={}"
        self.twitterapi = twitter.Api(
            consumer_key=twitter_consumer_key,
            consumer_secret=twitter_consumer_secret,
            access_token_key=twitter_access_token_key,
            access_token_secret=twitter_access_token_secret,
            tweet_mode="extended")
        self.pixiv_session_url = "https://api.pixiv.moe/session"
        self.pixiv_url = "https://www.pixiv.net/ajax/illust/{}?lang=en"

    def should_spoiler(self, url, content):
        url = re.escape(url)
        match = re.search("\|\|\s*{}\s+\|\|".format(url), content)
        if match:
            return True
        return False

    async def get_rich_embed(self, url, message, force_ignore_embeds):
        return await self.get_twitter_embed(url, message, force_ignore_embeds) or \
            await self.get_deviantart_embed(url, message, force_ignore_embeds) or \
            await self.get_pixiv_embed(url, message, force_ignore_embeds)

    async def on_message(self, message):
        await self.post_image_embeds(message)

    async def post_image_embeds(self, message, channel=None, force_ignore_embeds=False):
        if message.channel.id not in self.channel_ids or message.author == self.client.user:
            return
        if not channel:
            channel = message.channel
        self.ready.clear()
        urls = self.extractor.find_urls(message.content, True)
        urls = [url for url in urls if self.filter_link(url, message.content)]
        if any(self.pixiv_pattern.search(line) for line in urls) and not force_ignore_embeds:
            self.forced_embeds.append(message)
            if len(message.embeds):
                await message.edit(suppress=True)
        spoiler = []
        embeds = []
        for url in urls:
            rich_embed = await self.get_rich_embed(url, message, force_ignore_embeds)
            if not rich_embed:
                continue
            embed, attachment = rich_embed
            if embed:
                embeds.append((embed, attachment))
                if self.should_spoiler(url, message.content):
                    spoiler.append(embed)
        to_cache = []
        for embed, attachment in embeds[:4]:
            if embed in spoiler:
                em_msg = await channel.send("||https://corr.in/s ||", embed=embed, files=attachment)
            else:
                em_msg = await channel.send(embed=embed, file=attachment)
            to_cache.append(em_msg)
        self.cache_message(message, to_cache)
        self.ready.set()

    def cache_message(self, message, embed_msgs):
        chosen = None
        for cache in self.message_cache:
            if message == cache["msg"]:
                chosen = cache
                break
        if not chosen:
            chosen = {"msg": message, "embed_msgs": []}
            self.message_cache.append(chosen)
        for em in embed_msgs:
            chosen["embed_msgs"].append(em)

    async def on_message_delete(self, message):
        if message.channel.id not in self.channel_ids or message.author == self.client.user:
            return
        await self.ready.wait()
        chosen = None
        for cache in self.message_cache:
            if message == cache["msg"]:
                chosen = cache
                break
        if chosen:
            for to_delete in chosen["embed_msgs"]:
                try:
                    await to_delete.delete()
                except discord.errors.NotFound:
                    continue
            self.message_cache.remove(chosen)

    async def on_message_edit(self, before, after):
        urls = []
        if after in self.forced_embeds and len(after.embeds):
            await after.edit(suppress=True)
            return
        for embed in after.embeds:
            if embed.url:
                url = embed.url
                url = url.replace("mobile.twitter.com", "twitter.com")
                urls.append(url)
        await self.ready.wait()
        chosen = None
        for cache in self.message_cache:
            if after == cache["msg"]:
                chosen = cache
                break
        if chosen:
            for potential in list(chosen["embed_msgs"]):
                if len(potential.embeds) and potential.embeds[0].url in urls:
                    try:
                        await potential.delete()
                    except discord.errors.NotFound:
                        continue
                    chosen["embed_msgs"].remove(potential)

    def filter_link(self, url, message_content):
        return message_content.count("<" + url + ">") < message_content.count(url)

    async def get_twitter_embed(self, url, message, force_ignore_embeds):
        url = url.replace("mobile.twitter.com", "twitter.com")
        twitter_id = self.twitter_pattern.search(url)
        if not twitter_id:
            return None
        twitter_id = int(twitter_id.group(1))
        tweet_status = self.twitterapi.GetStatus(twitter_id)
        if not tweet_status:
            return None
        if not hasattr(tweet_status, "media") or not tweet_status.media or len(tweet_status.media) == 0:
            return None
        if message not in self.forced_embeds and not force_ignore_embeds:
            for embed in message.embeds:
                if embed.footer and embed.footer.text == "Twitter":
                    if url == embed.url:
                        return None
        embed = discord.Embed(description=tweet_status.full_text, color=1942002, url=url)
        embed.set_footer(
            text="Twitter",
            icon_url="https://abs.twimg.com/icons/apple-touch-icon-192x192.png")
        embed.set_image(url=tweet_status.media[0].media_url_https + "?name=large")
        embed.set_author(name="{} ({})".format(tweet_status.user.name,
                                               tweet_status.user.screen_name),
                         url="https://twitter.com/{}".format(tweet_status.user.screen_name),
                         icon_url=tweet_status.user.profile_image_url_https)
        embed.add_field(name="Retweets", value=tweet_status.retweet_count, inline=True)
        embed.add_field(name="Likes", value=tweet_status.favorite_count, inline=True)
        return embed, None

    async def get_deviantart_embed(self, url, message, force_ignore_embeds):
        da_link = self.deviantart_pattern.search(url)
        if not da_link:
            return None
        da_link = da_link[0]
        if message not in self.forced_embeds and not force_ignore_embeds:
            for embed in message.embeds:
                if embed.provider and embed.provider.name == "DeviantArt":
                    if da_link in embed.url:
                        return None
        async with self.httpsession.get(self.deviantart_url.format(da_link)) as resp:
            if resp.status < 200 or resp.status >= 300:
                return None
            result = await resp.json()
        if result["type"] != "photo":
            return None
        embed = discord.Embed(title="{} by {} on DeviantArt".format(result["title"],
                                                                    result["author_name"]),
                              color=395021, url=url)
        embed.set_image(url=result["url"])
        embed.set_author(
            name=result["author_name"],
            url=result["author_url"],
            icon_url="https://st.deviantart.net/eclipse/icons/android-192.png")
        return embed, None

    async def get_pixiv_embed(self, url, message, force_ignore_embeds):
        pixiv_link = self.pixiv_pattern.search(url)
        if not pixiv_link:
            return None
        pixiv_id = int(pixiv_link.group(1))
        pixiv = await self.fetch_pixiv(pixiv_id)
        if not pixiv:
            return None
        embed = discord.Embed(description=pixiv.get("description", None),
                              color=12123135, url=url, title=pixiv.get("title", None))
        embed.set_footer(
            text="Pixiv",
            icon_url="https://s.pximg.net/common/images/apple-touch-icon.png")
        image = pixiv["urls"]["regular"]
        file_object = None
        file_extension = image.split(".")[-1]
        file_name = "image.{}".format(file_extension)
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
            "accept-language": "en-US,en;q=0.9",
            "referer": "https://www.pixiv.net/"
        }
        async with self.httpsession.get(image, headers=headers) as resp:
            file_object = io.BytesIO(await resp.read())
        file_object.seek(0)
        discord_file = discord.File(file_object, file_name)
        embed.set_image(url="attachment://{}".format(file_name))
        embed.set_author(name="{}".format(pixiv["userName"]),
                         url="https://www.pixiv.net/en/users/{}".format(pixiv["userId"]))
        return embed, discord_file

    async def fetch_pixiv(self, pixiv_id):
        now = datetime.datetime.now()
        headers = {
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
            "accept-language": "en-US,en;q=0.9",
            "referer": "https://www.pixiv.net/en/artworks/{}".format(pixiv_id)
        }
        async with self.httpsession.get(self.pixiv_url.format(pixiv_id), headers=headers) as resp:
            if resp.status < 200 or resp.status >= 300:
                return None
            result = await resp.json()
            return result["body"]
        return None
def check_citation(self, citation):
    # NOTE: when implementing, wrap the method in a try/except and print out any error + the citation status
    try:
        pattern = re.compile("[ ][0-9]{4}")
        result = pattern.search(citation)
        self.year = result.group(0)[1:]
    except:
        raise Exception("Unable to find year in citation.")
    self.citation_status = MLACitationStatus.AUTHOR
    cursor = 0
    while True:
        ascii_value = ord(citation[cursor])
        # check if the current character is not " &-'." or any alphanumeric in English or Latin-1
        if citation[cursor:cursor + 2] != ". " and (
                ascii_value == 32 or ascii_value == 39 or 44 <= ascii_value <= 46
                or 65 <= ascii_value <= 90 or 97 <= ascii_value <= 122
                or 192 <= ascii_value <= 255):
            cursor += 1
        else:
            break
    if cursor != 0:
        author_section = ""
        if citation[cursor:cursor + 2] == ". ":
            author_section = citation[:cursor + 1]
        else:
            raise Exception("Bad formatting in the author section (unknown error).")
        # three or more authors
        if ", et al." in author_section:
            temp = author_section.replace(", et al", "")
            authors = temp.split(", ")
            filteredAuthor = [self.filter_latin(i) for i in authors]
            if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                    and re.match("^[A-Z][A-Za-z-'. ]+[.]$", filteredAuthor[1]) is not None:
                self.authors.append(authors[0] + ", et al.")
            else:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
        # two authors
        elif ", and " in author_section:
            authors = author_section.split(", and ")
            if ", " not in authors[0]:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
            firstAuthor = authors[0].split(", ")
            filteredFirstAuthor = [self.filter_latin(i) for i in firstAuthor]
            if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredFirstAuthor[0]) is not None \
                    and re.match("^[A-Z][A-Za-z-'. ]+$", filteredFirstAuthor[1]) is not None:
                self.authors.append(firstAuthor[0])
            else:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
            if " " not in authors[1]:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
            secondAuthor = authors[1].split(" ", 1)
            filteredSecondAuthor = [self.filter_latin(i) for i in secondAuthor]
            if re.match("^[A-Z][A-Za-z-']+$", filteredSecondAuthor[0]) is not None \
                    and re.match("^[A-Za-z][A-Za-z-'. ]+[.]$", filteredSecondAuthor[1]) is not None:
                self.authors.append(filteredSecondAuthor[1][:-1])
            elif re.match("^[A-Za-z][.]$", filteredSecondAuthor[1]) is not None:
                author_cursor = cursor + 2
                actualSecondAuthor = ""
                while citation[author_cursor:author_cursor + 2] != ". ":
                    actualSecondAuthor += citation[author_cursor]
                    author_cursor += 1
                self.authors.append(actualSecondAuthor)
            else:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
        # one author
        elif ", " in author_section:
            authors = author_section.split(", ")
            filteredAuthor = [self.filter_latin(i) for i in authors]
            if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                    and re.match("^[A-Z][A-Za-z-' ]+[.]$", filteredAuthor[1]) is not None:
                self.authors.append(authors[0])
            else:
                raise Exception("Bad formatting in the author section: '" + author_section + "'")
        elif "et. al." in author_section or "et.al." in author_section:
            raise Exception("'Et al.' should not have a period after the 'Et'.")
        # no match; bad formatting
        else:
            raise Exception("Bad formatting in the author section: '" + author_section + "'")
    self.citation_status = MLACitationStatus.TITLE
    cursor += 1
    # check the title section
    if citation[cursor:cursor + 3] == "<i>":
        cursor += 3
    elif citation[cursor + 1:cursor + 4] == "<i>":
        cursor += 4
    elif citation[cursor + 1] == "\"":
        cursor += 2
    elif citation[cursor - 1:cursor + 1] == ".\"":
        raise Exception("Bad formatting in the title section.")
    title = ""
    while citation[cursor] != ".":
        title += citation[cursor]
        cursor += 1
    title = title.replace("\"", "")
    title = title.replace("</i>", "")
    if title[0] == " ":
        title = title[1:]
    if citation[cursor + 1] == "\"":
        cursor += 2
    else:
        cursor += 1
    # now cursor should be at the beginning of italics
    result = url("https://brettterpstra.com/titlecase/?title=" + title)
    title_cased_title = result.read().decode('utf-8')
    if title != title_cased_title:
        self.warnings.append("the title might contain improper capitalization: '" + title + "'")
    self.title = title
    # check for url
    self.citation_status = MLACitationStatus.URL
    extractor = URLExtract()
    if extractor.has_urls(citation):
        urls = extractor.find_urls(citation)
        self.url = urls[0][:-1]
        if self.url + "." not in citation:
            raise Exception("Bad formatting in the URL section.")
    if citation[cursor:cursor + 3] != "<i>" and citation[cursor + 1:cursor + 4] != "<i>":
        self.warnings.append("the container may not exist or may not be italicized")
    elif citation[cursor:cursor + 3] == "<i>" and citation[cursor + 1:cursor + 4] == "<i>":
        self.warnings.append(
            "the container might exist when not necessary (if the citation is about a book), or the block immediately following the title may be improperly italicized."
        )
    if self.url != "":
        citation.replace(self.url + ".", "")
    # check for other info
    # right now, it's too complex to validate the entire MLA citation without prior knowledge
    # on what type of citation it is, so the other info is just stored without checking
    self.citation_status = MLACitationStatus.OTHER_INFO
    remainingText = citation[cursor:]
    info = remainingText.split(", ")
    self.otherInfo = [i for i in info]
import re, os
import json

import spacy  # required for spacy.load below; missing from the original import list
from nltk.corpus import stopwords
from urlextract import URLExtract

my_loc = os.path.dirname(__file__)
nlp = spacy.load('en_core_web_lg')
split_mp = {'training': 'train', 'dev': 'val', 'test': 'test'}
text_processor_wiki = get_text_processor(word_stats='english')
text_processor_twit = get_text_processor(word_stats='twitter')
df_stopwords = set(stopwords.words('english'))
url_extr = URLExtract()

for split in split_mp:
    tr_file = open('data/%s.tsv' % (split), 'r')
    data_dict = {}
    cnt = 0
    for line in tr_file:
        if cnt:
            if split != 'test':
                topic, id, link, content, claim, worthy = line.strip().split('\t')
            else:
                topic, id, link, content = line.strip().split('\t')
                claim, worthy = 0, 0
            urls = url_extr.find_urls(content)
# keywords
pq()
pqc()
# abstract
pq()
pqc()
# cover_image (may not work, but might as well try)
pq()
cover_image = 'https://cross-currents.berkeley.edu' + photoessay['Image']
print(cover_image, end='')
pqc()
# pdf_url, extract from the File column
extractor = URLExtract()
pdf_urls = extractor.find_urls(photoessay['File'])
if len(pdf_urls) >= 1:
    # sometimes the extractor finds more than one URL, we should just always use the first
    pdf_url = pdf_urls[0]
else:
    pdf_url = ('ERROR, no PDF URL found, content-type: ' + photoessay['Content type'] +
               '; Content ID: ' + photoessay['Content ID'] +
               '; Article Type: ' + photoessay['Article Type'])
pq()
print(urllib.parse.unquote(pdf_url), end='')
pqc()
# Add 3 blank cells here at the end, because supplemental files follow on additional lines
print(3 * '\t', end='')
print('')  # let's wrap up this photoessay
spotifyPlaylistId = spotifyConfig["playlistID"]
spotifyCtr = spotifyConfig["ctr"]
spotifyUser = spotifyConfig["spotifyUser"]

# Youtube config stuff
with open("config/YTconfig.json") as f:
    youtubeConfig = json.load(f)

with open("config/slack.json") as f:
    slackConfig = json.load(f)

slackToken = slackConfig["token"]
slackChannel = slackConfig["channel"]
slackTeam = slackConfig["team"]
slack_client = slack.WebClient(token=slackToken)

extractor = URLExtract()  # declare extractor for later


# method to post a (parameter) message to slack, visible to channel
def slack_response(message, userID):
    print("Sending slack response.")
    message = ("{}".format(message))
    slack_client.chat_postMessage(token=slackToken,
                                  as_user=False,
                                  channel=slackChannel,
                                  text=message)


# method to post an ephemeral message to the chat - only the user will see it
def getwebsite(self, data):
    extractor = URLExtract()
    urls = extractor.find_urls(data)
    return ' , '.join(urls)
def initialize(self):
    self.logNotify("Initializing LibreSelery")
    self.seleryPackageInfo = os_utils.getPackageInfo("libreselery")
    if self.seleryPackageInfo:
        self.log("LibreSelery version [%s]" % self.seleryPackageInfo["version"])
    else:
        # when project is executed locally without installation, seleryPackageInfo is empty
        self.log("LibreSelery version [undefined]")
    self.log("Preparing Configuration")
    # find all configs in potentially given config directory
    foundConfigs = []
    if self.config.config_dir:
        for root, dirs, files in os.walk(self.config.config_dir):
            for f in files:
                ext = os.path.splitext(f)[1]
                if ext == ".yml":
                    foundConfigs.append(os.path.join(root, f))
    # group all found configs together with individually given configuration paths from user on top
    self.config.config_paths = foundConfigs + self.config.config_paths
    # apply yaml config to our configuration if possible
    self.log("Loading configurations %s" % self.config.config_paths)  # the original format string lacked the %s placeholder
    [print(" -- %s" % path) for path in self.config.config_paths]
    [self.loadYaml(path) for path in self.config.config_paths]
    # finalize our configuration settings
    self.config.finalize()
    # load the README file and check if the wallet address for donation matches the configured wallet
    # address. Before payout this address is also matched against the address of the coinbase user
    extractor = URLExtract()
    fundingPath = self._getFile("README.md")
    if fundingPath is not None:
        self.log("Loading funding file [%s] for bitcoin wallet" % fundingPath)
        mdfile = open("README.md", "r")
        mdstring = mdfile.read()
        urls = extractor.find_urls(mdstring)
        badge_string = "https://badgen.net/badge/LibreSelery-Donation/"
        for url in urls:
            if badge_string in url:
                self.config.bitcoin_address = url.split(badge_string, 1)[1]
                self.log("Found bitcoin address [%s]" % self.config.bitcoin_address)
    else:
        self.log(
            "Using bitcoin address from configuration file for validation check [%s]"
            % self.config.bitcoin_address)
    # Create a new QR code based on the configured wallet address
    self.log("Creating QR code PNG image for funders")
    wallet_qrcode = QRCode(error_correction=1)
    wallet_qrcode.add_data(self.config.bitcoin_address)
    wallet_qrcode.best_fit()
    wallet_qrcode.makeImpl(False, 6)
    wallet_image = wallet_qrcode.make_image()
    wallet_image.save(
        os.path.join(self.config.result_dir, "public", "wallet_qrcode.png"))
    # load tooling url
    if self.config.include_tooling_and_runtime and self.config.tooling_path:
        with open(self.config.tooling_path) as f:
            self.config.toolrepos = yaml.safe_load(f)
        if self.config.toolrepos is not None:
            self.log("Tooling file loaded [%s]" % self.config.toolrepos)
        else:
            self.log("No tooling urls found")
    else:
        self.log("Tooling not included")
    # load our environment variables
    self.loadEnv()
    self.logNotify("Initialized")
    self.log(str(self.getConfig()))
def main():
    x = 1
    while x <= pages:
        url_org = f'https://github.com/search?p={x}&q=org%3A{organization}+{query}&type=code'
        page = s.get(url_org).text
        if 'We couldn’t find any code matching' in page:
            print(colored('\nNo Repositories Found. Please check the Organization name.', 'red'))
            sys.exit(1)
        soup = BeautifulSoup(page, 'html5lib')
        url_list = []
        for link in soup.findAll('a'):
            inside_file = link.get('href')
            full_url = 'https://github.com/' + inside_file
            head = full_url.partition('#')
            url_list.append(head[0])
        final_url_list = set(url_list)
        final_url_list = list(final_url_list)
        total_repositories = len(final_url_list)
        print("\n")
        if total_repositories == 0 and x < 2:
            print(colored("Make sure your credentials are properly configured.", 'red'))
            sys.exit(1)
        if total_repositories == 0:
            print('Cannot find more S3 Buckets.')
            sys.exit(1)
        else:
            print(f"Fetching Data from Page: {x}")
            print("\n")
        for i in (final_url_list):
            inner_url = i
            inner_url_fetch = s.get(inner_url).text
            extractor = URLExtract()
            for bucketurl in extractor.gen_urls(inner_url_fetch):
                if bucketurl not in exclude and 'https://github.com/' not in bucketurl and args.q in bucketurl:
                    try:
                        check_takeover = requests.get(bucketurl)
                        status = check_takeover.status_code
                        o1 = (f'[{status}] - {bucketurl}\n')
                        if args.o:
                            file = open(args.o, 'a')
                            file.write(o1)
                        print(f'[{status}] - {bucketurl} ')
                    except:
                        pass
                    try:
                        check_takeover_response = check_takeover.content
                        check_takeover_response = str(check_takeover_response)
                        if 'NoSuchBucket' in check_takeover_response:
                            s3_text = (colored('[S3 Bucket Takeover]', 'green'))
                            o2 = (f'{s3_text} : {bucketurl}\n')
                            print(f'{s3_text} : {bucketurl}')
                            if args.o:
                                file = open(args.o, 'a')
                                file.write(o2)
                    except:
                        pass
        x = x + 1
def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    for url in extractor.gen_urls(body):
        if len(url) < 5 or '.' not in url:
            continue
        if url.count('http') == 1:
            url = url.split('http')[1]
            url = 'http{}'.format(url)
        if '(' in url:
            rurl = url.split('(')
            if extractor.has_urls(rurl[1]):
                url = rurl[1]
            elif extractor.has_urls(rurl[0]):
                url = rurl[0]
            else:
                continue
        if ')' in url:
            lurl = url.split(')')
            if extractor.has_urls(lurl[0]):
                url = lurl[0]
            elif extractor.has_urls(lurl[1]):
                url = lurl[1]
            else:
                continue
        sem = 0
        for suffix in excluded:
            if url.endswith(suffix):
                sem = 1
        if sem == 1:
            continue
        # """
        if '[IMG]' in url:
            try:
                url = url.split('[IMG]')[1]
            except IndexError:
                pass
        if '[/IMG]' in url:
            try:
                url = url.split('[/IMG]')[0]
            except IndexError:
                pass
        if url.endswith('?fb'):
            url = url.replace('?fb', '')
        if url.endswith('?noredirect'):
            url = url.replace('?noredirect', '')
        elif url.endswith('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium'):
            url = url.replace('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium', '')
        elif url.endswith('?s=sms'):
            url = url.replace('?s=sms', '')
        if '//m.imgur.com' in url:
            url = url.replace('//m.imgur.com', '//imgur.com')
        if url.startswith('https://thumbs.gfycat.com/'):
            url = url.replace('https://thumbs.gfycat.com/', 'https://gfycat.com/')
        if url.endswith('-size_restricted.gif'):
            url = url.replace('-size_restricted.gif', '')
        # """
        urlset.add(url)
    return urlset
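# Hedged usage sketch (not part of the original source): illustrates how the parenthesis and
# suffix clean-up above might behave on a hypothetical markdown-style body.
body = "look at [this](https://m.imgur.com/abc123?fb) and https://thumbs.gfycat.com/Example-size_restricted.gif"
print(extract_urls(body))
# Expected, under the assumptions above: the imgur link rewritten to //imgur.com without '?fb',
# and the gfycat thumbnail rewritten to the canonical https://gfycat.com/Example page.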
def blog_post_send_to_preview(self):
    if not self.preview_user_id:
        raise Exception(_('必须输入用户信息'))
    ctx = self._context.copy()
    wx_media = self.env['wx.media']
    active_model = ctx.get('active_model')
    active_ids = ctx.get('active_ids', [])
    blogs = self.env[active_model].browse(active_ids)
    server_url = self.env['ir.config_parameter'].sudo().get_param('server_url')
    articless = []
    for blog in blogs:
        thumb_media_id = False
        wx_file_path = get_module_resource('e2yun_blog_post_list_extends', 'static/wx')
        # file_image = blog.main_image
        if True:
            if blog.cover_properties:
                cover_properties = eval(blog.cover_properties)
                if 'background-image' in eval(blog.cover_properties):
                    imageurl = cover_properties['background-image'].replace('url(', '').replace(')', '')
                    if 'http' not in imageurl:
                        # imageurl = server_url + imageurl
                        attench_id = imageurl.replace('/web/image/', '')[
                            0: imageurl.replace('/web/image/', '').index('/')]
                        datas = self.env['ir.attachment'].browse(int(attench_id)).datas
                        img = base64.b64decode(datas)
                        file = open('%s/thumb.jpg' % wx_file_path, 'wb')
                        file.write(img)
                        file.close()
                    else:
                        urlretrieve(imageurl, '%s/thumb.jpg' % wx_file_path)
                    quality = 80
                    step = 5
                    while os.path.getsize('%s/thumb.jpg' % wx_file_path) / 1024 > 64:
                        file_path = '%s/thumb.jpg' % wx_file_path
                        im = Image.open(file_path)
                        # get the image size:
                        # w, h = im.size
                        # scale it down:
                        # im.resize((int(w / 0.8), int(h / 0.8)), Image.ANTIALIAS)
                        # save the resized image in JPEG format:
                        if im.mode == "P":
                            im = im.convert('RGB')
                        im.save(file_path, 'JPEG', quality=quality)
                        if quality - step < 0:
                            break
                        quality -= step
                else:
                    raise Exception(_('必须要有封面图片,请在文章编辑中输入!'))
                # img = base64.b64decode(blog.main_image)
                # file = open('%s/thumb.gif' % wx_file_path, 'wb')
                # file.write(img)
                # file.close()
                thumb_media_upload = wx_media.upload_image('%s/thumb.jpg' % wx_file_path)
                thumb_media_id = thumb_media_upload['thumb_media_id']
            else:
                raise Exception(_('必须要有封面图片,请在文章编辑中填入!'))
        extractor = URLExtract()
        urls = extractor.find_urls(blog.content, only_unique=True)
        wx_content = blog.content
        for url in urls:
            try:
                urlretrieve(url, '%s/news.jpg' % wx_file_path)
                import imghdr
                imgType = imghdr.what('%s/news.jpg' % wx_file_path)
                if imgType:
                    news_media_upload = wx_media.upload_news_picture('%s/news.jpg' % wx_file_path)
                    wx_content = wx_content.replace(url, news_media_upload['url'])
            except:
                continue
        blog.wx_content = wx_content
        blog.thumb_media_id = thumb_media_id
        blog.transfer_to_wx_flag = True
        try:
            os.remove('%s/thumb.jpg' % wx_file_path)
            os.remove('%s/news.jpg' % wx_file_path)
        except:
            pass
        blog_url = server_url + blog.website_url
        articles = {
            "thumb_media_id": blog.thumb_media_id,
            "author": blog.create_uid.name,
            "title": blog.name,
            "content_source_url": blog_url,
            "content": '%s' % blog.wx_content,
            "digest": blog.subtitle,
            "show_cover_pic": 1,
            "need_open_comment": 1,
            "only_fans_can_comment": 1
        }
        articless.append(articles)
    randon_number = random.randint(100000, 999999)
    mediaid = wx_media.upload_articles(articless, '我的文章-%s' % randon_number)
    print(mediaid)
    wx_media = self.env['wx.media'].search([('media_id', '=', mediaid['media_id'])])
    preview_user_id = self.preview_user_id
    self.env['wx.send.mass'].create(
        {'wx_media_id': wx_media.id, 'preview_user_id': preview_user_id.id}).preview_send()
    return {
        'warning': {
            'title': 'Tips',
            'message': '同步成功'
        }
    }
class VK:
    def __init__(self):
        log_tag = 'VK - init'
        self.settings_tag = 'VK'
        self.extractor = URLExtract()
        self.config = Config()
        try:
            self.vk_bot = vk_api.VkApi(
                token=str(self.config.read(self.settings_tag, 'bot_token')))
            self.api_bot_vk = self.vk_bot.get_api()
            Log().info(log_tag, 'Инициализация токена-бота VK успешна.')
        except Exception as e:
            Log().error(log_tag, e)
        p_name = 'ЛИНКЕР'
        p_channel = 'hackathon'
        p_version = '0.0.1'
        desc = 'Бот, создающий сокращенные vk.cc ссылки прямо в диалоге.'
        self.info = f'{p_name} {p_version} ({p_channel})\n\n{desc}\n\nбеседа %peer_id%'

    def long_poll(self):
        tag = 'VK - Message LongPoll'
        from vk_api.bot_longpoll import VkBotLongPoll, VkBotEventType
        long_poll_bot = VkBotLongPoll(
            self.vk_bot, int(self.config.read(self.settings_tag, "community_id")))
        for event in long_poll_bot.listen():
            try:
                if event.type == VkBotEventType.MESSAGE_NEW:
                    Log().info(
                        tag, f'Новое сообщение от \"https://vk.com/id{event.obj.from_id}\".\n'
                        f'Текст сообщения:\t\n{event.obj.text}\n'
                        f'Прикрепленные аттачи:\t\n{event.obj.attachments}\n'
                        f'Пересланные сообщения:\t\n{event.obj.fwd_messages}')
                    self.listener(event)
                elif event.type == VkBotEventType.MESSAGE_REPLY:
                    Log().info(tag, f'Бот ответил в чате {event.obj.peer_id}.')
                else:
                    Log().info(
                        tag, f'Обнаружено новое действие: {event.type} от '
                        f'\"https://vk.com/id{event.obj.from_id}\"')
            except Exception as e:
                Log().error(tag, e)

    def listener(self, event):
        tag = "VK - Message Listener"
        Log().info(tag, 'Обрабатываю сообщение...')
        from_id = event.obj.from_id
        peer_id = event.obj.peer_id
        msg_text = str(event.obj.text)
        msg_attach = event.obj.attachments
        msg_fwd = event.obj.fwd_messages
        Log().info(tag, 'Обработка завершена. ')
        if self.extractor.has_urls(msg_text) or msg_attach or msg_fwd:
            response_links = []
            if self.extractor.has_urls(msg_text):
                links = self.extractor.find_urls(msg_text)
                Log().info(tag, 'Найдены объекты типа ссылка.')
                if len(links) > 1:
                    for i in range(len(links)):
                        response_links.append(self.get_cc_link(links[i], 0)['short_url'])
                else:
                    response_links.append(self.get_cc_link(links, 0)['short_url'])
            if msg_attach:
                for i in range(len(msg_attach)):
                    attach_type = msg_attach[i]['type']
                    if attach_type == 'link':
                        ath_url = msg_attach[i][attach_type]['url']
                        response_links.append(str(self.get_cc_link(ath_url, 0)['short_url']))
            if msg_fwd:
                for i_fwd in range(len(msg_fwd)):
                    fwd_text = msg_fwd[i_fwd]['text']
                    fwd_attaches = msg_fwd[i_fwd]['attachments']
                    for i_ath in range(len(fwd_attaches)):
                        fwd_ath_type = fwd_attaches[i_ath]['type']
                        if fwd_ath_type == 'link':
                            fwd_ath_link = msg_fwd[i_fwd]['attachments'][i_ath][fwd_ath_type]['url']
                            response_links.append(str(self.get_cc_link(fwd_ath_link, 0)['short_url']))
                    if self.extractor.find_urls(fwd_text):
                        response_links.append(str(self.get_cc_link(fwd_text, 0)['short_url']))
            response_links_wd = list(dict.fromkeys(response_links))
            if len(response_links_wd) > 1:
                response_str = '🔗 Вот твои ссылки из сообщения:\n\n'
                for i_link in range(len(response_links_wd)):
                    response_str += response_links_wd[i_link] + '\n'
            else:
                response_str = '🔗 Была найдена лишь одна ссылка в сообщении: ' + response_links_wd[0]
            self.send_message(peer_id, response_str)
        elif (from_id == 140830142) and \
                (msg_text.__contains__('info') or msg_text.__contains__('инфо') or msg_text.__contains__('i')) or \
                (msg_text.__contains__('ping') or msg_text.__contains__('пинг')):
            Log().info(tag, 'Инфо о боте.')
            self.send_message(peer_id, 'понг')
            self.send_message(peer_id, self.info.replace("%peer_id%", str(peer_id)))
        else:
            Log().info(tag, 'Неизвестная команда.')
            self.send_message(event.obj.peer_id, '🐸 Ссылок нет.')

    def get_cc_link(self, url, private):
        cc_link = self.api_bot_vk.utils.getShortLink(url=url, private=private)
        return cc_link

    def send_message(self, user_id, text):
        self.api_bot_vk.messages.send(peer_id=user_id,
                                      message=text,
                                      random_id=get_random_id(),
                                      dont_parse_links=1)
import re
import tensorflow as tf
from urlextract import URLExtract
import os, sys

url_extract_api = URLExtract()
fwobj = tf.gfile.GFile("/data/albert/my_chinese_pretrain.txt", "w")


def clean(text):
    text = re.sub("""(<[=0-9a-zA-Z\/&"":_\\.]+>;?)+""", "", text)
    text = re.sub("""((&|#|$)+[0-9a-zA-Z]+;?)+""", "", text)
    text = re.sub("""[★☆\u3000]+""", "", text)
    try:
        urls = url_extract_api.find_urls(text)
        for url in urls:
            text = text.replace(url, "")
        return text
    except:
        return text


def process(document):
    init_len = 0
    index = 0
    document = "".join(document)
    document = clean(document)
    sentences = re.split(r"([。!!??;;])", document)
    document = ["".join(i) for i in zip(sentences[0::2], sentences[1::2])]
    context = "".join(document)
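# Hedged usage sketch (not part of the original source): clean() should drop URLs and the
# decorative characters matched above. Illustrative only, since the module also opens a fixed
# output path (/data/albert/...) at import time.
# print(clean("项目主页 https://example.com/project ★欢迎访问"))  # the URL and the '★' marker are stripped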
def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    try:
        generatedUrls = extractor.gen_urls(body)
        for url in generatedUrls:
            if len(url) < 5 or '.' not in url:
                continue
            if url.count('http') == 1:
                url = url.split('http')[1]
                url = 'http{}'.format(url)
            if '(' in url:
                rurl = url.split('(')
                if extractor.has_urls(rurl[1]):
                    url = rurl[1]
                elif extractor.has_urls(rurl[0]):
                    url = rurl[0]
                else:
                    continue
            if ')' in url:
                lurl = url.split(')')
                if extractor.has_urls(lurl[0]):
                    url = lurl[0]
                elif extractor.has_urls(lurl[1]):
                    url = lurl[1]
                else:
                    continue
            sem = 0
            for suffix in excluded:
                if url.endswith(suffix):
                    sem = 1
            if sem == 1:
                continue
            # """
            if '[IMG]' in url:
                try:
                    url = url.split('[IMG]')[1]
                except IndexError:
                    pass
            if '[/IMG]' in url:
                try:
                    url = url.split('[/IMG]')[0]
                except IndexError:
                    pass
            if url.endswith('?fb'):
                url = url.replace('?fb', '')
            if url.endswith('?noredirect'):
                url = url.replace('?noredirect', '')
            elif url.endswith('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium'):
                url = url.replace('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium', '')
            elif url.endswith('?s=sms'):
                url = url.replace('?s=sms', '')
            if '//m.imgur.com' in url:
                url = url.replace('//m.imgur.com', '//imgur.com')
            if url.startswith('https://thumbs.gfycat.com/'):
                url = url.replace('https://thumbs.gfycat.com/', 'https://gfycat.com/')
            if url.endswith('-size_restricted.gif'):
                url = url.replace('-size_restricted.gif', '')
            # """
            urlset.add(url)
        return urlset
    except AttributeError as e:
        # log and fall through; the `finally` below returns regardless, so re-raising here
        # would only make the message below unreachable
        print(
            "While generating urls, an AttributeError (specifically {e}) was raised. Moving on without "
            "extracting urls for now. This is likely an error with the python library URLExtract "
            "(https://github.com/lipoja/URLExtract). The issue has been fixed (see issue fix here: "
            "https://github.com/lipoja/URLExtract/commit/aa51f52e77b104932c49fb14882c632f12b6e940) but it has "
            "not been included in the most recent release. Please install the version from GitHub to fix this "
            "issue (eg. pip3 install git+https://github.com/lipoja/URLExtract.git)".format(e=e))
    finally:
        return urlset  # which is empty
def __init__(self):
    self.extractor = URLExtract()
from typing import List
import hashlib
from urllib.parse import urlparse

from urlextract import URLExtract
import aiocache
import aiohttp
import discord
from discord.ext import commands

from alttprbot.database import config  # TODO switch to ORM
from alttprbot.util import http
from alttprbot import models

urlextractor = URLExtract()


class Moderation(commands.Cog):
    def __init__(self, bot):
        self.bot: commands.Bot = bot

    @commands.Cog.listener()
    async def on_message(self, message: discord.Message):
        # don't moderate if a DM
        if message.guild is None:
            return
        # don't moderate if sent by a real bot
        if message.author.id == self.bot.user.id:
            return
def get_urls(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    return ','.join(urls), len(urls)
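# Hedged usage sketch (not part of the original source): get_urls() returns a comma-joined
# string of URLs plus their count.
urls_joined, url_count = get_urls("see https://example.com and https://example.org/page")
print(url_count)    # 2
print(urls_joined)  # "https://example.com,https://example.org/page"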
def find_urls_in_text(text):
    extractor = URLExtract()
    return set(extractor.find_urls(text))
import re

from urlextract import URLExtract

from settings import REMOVE_URLS_WL_DATA as WHITELIST

url_extractor = URLExtract()


def remove_urls(text, limit_not_remove=140, placeholder='***'):
    """Removes URLs from given text

    Args:
        text (str): Text
        limit_not_remove (int, optional): If text has less than 'limit_not_remove' symbols then don't process it. Defaults to 140.
        placeholder (str, optional): Placeholder for URL. Defaults to '***'.

    Returns:
        str: Text
    """
    if len(text) < limit_not_remove:
        return text
    urls = url_extractor.find_urls(text)
    for url in urls:
        allowed = False
        for white_listed in WHITELIST:
            if url.find(white_listed) != -1:
                allowed = True
                break
        if allowed is False:
            text = text.replace(url, placeholder)
    return text
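# Hedged usage sketch (not part of the original source): assumes REMOVE_URLS_WL_DATA (imported
# as WHITELIST) contains e.g. 'example.org'; a sufficiently long text keeps whitelisted URLs
# and masks the rest with the placeholder.
long_text = "Read https://example.org/docs but avoid https://spam.example.net/offer. " * 3
print(remove_urls(long_text, limit_not_remove=100))
# the example.org link would be kept, the other URL replaced by '***'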
def urlextract():
    return URLExtract()
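# Hedged usage sketch (not part of the original source): if the factory above is registered as
# a pytest fixture elsewhere (e.g. decorated with @pytest.fixture), a test could consume it so:
def test_finds_url(urlextract):
    assert "https://example.com" in urlextract.find_urls("go to https://example.com now")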
def extract_urls(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    return urls
"WRB": 1, "``": 1, "$": 1, "#": 1 } co = 0 for k in POSDICT.keys(): POSDICT[k] = co co += 1 helper = nltk.help.upenn_tagset() linkHash = pickle.load(open("URLCache_new.json", 'rb')) print("cache load finish") ext = URLExtract() extract = extractor() def getLink(h): if h in linkHash: return linkHash.get(h) return tldextract.extract(h).domain def emExtract(texts1, texts2): emDict = {} count = 0 for text in texts1: