def clean_text(text):
    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
    STOPWORDS = set(stopwords.words('english')).union(["gt"])
    extractor = URLExtract()
    info = " "
    for url in extractor.gen_urls(text):
        try:
            if "youtube" in url or "youtu.be" in url:
                content = urlopen(url, timeout=1).read()
                content = BeautifulSoup(content).find('title').string  # HTML decoding
                info += " " + content
        except:
            continue
    text += info
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    text = remove_special_chars_digits(text)
    text = word_tokenize(text)
    text = remove_repeat(text)
    text = np.asarray(text)
    text = lemmatize_new(text)
    text = ' '.join(word for word in text if word not in STOPWORDS)
    TextPreprocess.num += 1
    print(TextPreprocess.num, " " + text)
    return text

def _process_links(self):
    extractor = URLExtract(extract_email=True)
    for url in extractor.gen_urls(self.subtext):
        if _is_email(url):
            href = f"mailto:{url}"
        else:
            href = url
        link = f"<a href={href}>{url}</a>"
        self.subtext = self.subtext.replace(url, link)

def read_stream_from_assia_tv(self, response, event_url, event_name, event_date):
    scripts = response.css("script")
    extractor = URLExtract()
    for s in scripts:
        text = s.get()
        if extractor.has_urls(text):
            for url in extractor.gen_urls(text):
                if "video.assia.tv" in url:
                    self.logger.info("#read_stream_from_assia_tv - found video stream url %s!" % (url))
                    if "m3u8" in url:

async def on_message(message):
    # print message
    # print(message.author, ": ", message.content)
    if message.author.bot:
        return

    # extract URLs
    extractor = URLExtract()
    URLs = list(extractor.gen_urls(message.content))

    # drop URLs that match anything in the IGNORE list
    # (filtering into a new list avoids mutating URLs while indexing into it)
    URLs = [url for url in URLs if not any(item in url for item in IGNORE)]

    # ignore message if there were no URLs
    if URLs == []:
        return

    # create attachments from URLs
    attachments = get_attachments(message, URLs)
    # print("Attachments created!")
    # print(*attachments, sep="\n")

    # get trello board
    all_boards = trello_client.list_boards()
    channel = message.channel.name
    Board = get_board(all_boards, "Dump")

    # get/create list for channel
    List = get_list(lists=Board.list_lists(), name=channel)
    if List is None:
        List = Board.add_list(channel)

    # create card for each attachment
    attached = []
    for attachment in attachments:
        author = str(message.author.name)
        result = create_card(List, author, attachment)
        attached.append(result)

    if len(attached) == len(attachments):
        reply = 'I have attached these links to Trello for you. Please remember to organise them at https://trello.com/b/SAJvonRo/dump.'
    else:
        reply = 'Oops! Something went wrong. Please attach these links manually to https://trello.com/.'

    # send reply and delete after 5 seconds
    sent = await message.channel.send(reply, delete_after=5)
    # suppress embeds
    await sent.edit(suppress=True)

def message(update: Update, context: CallbackContext) -> None:
    extractor = URLExtract()
    if extractor.has_urls(update.message.text):
        result_text = update.message.text
        for url in extractor.gen_urls(update.message.text):
            print(f"Url found: {url}")
            unshorten_url = unshort_url(url)
            print(f"Unshorten: {unshorten_url}")
            sanitized_url = trim_utm(unshorten_url)
            print(f"Sanitized: {sanitized_url}")
            if url != sanitized_url:
                result_text = result_text.replace(url, sanitized_url)
        if result_text != update.message.text:
            update.message.reply_text(result_text)

def extract_urls_from_string(text):
    """
    Returns all the raw extracted urls as a list

    If a URL has a scheme, it ensures that it is http/s
    URLs like www. are sanitised in the normalise_url
    """
    text = text.lower()
    extractor = URLExtract(cache_dir=tempfile.gettempdir())
    # keep only URLs with no scheme or an http/https scheme;
    # filtering into a new list avoids skipping items while iterating
    results = [
        url for url in extractor.gen_urls(text)
        if urlparse(url).scheme in ('', 'http', 'https')
    ]
    return results

def domains_func():
    extractor = URLExtract()
    with open(r"alllinks.txt", "r") as infile:
        for line in infile:
            for url in extractor.gen_urls(line):
                domain = urlparse(url).netloc
                if domain == "":
                    f = open(r"domainsout.txt", "a")
                    f.write(url + "\n")
                    f.close()
                else:
                    f = open(r"domainsout.txt", "a")
                    f.write(domain + "\n")
                    f.close()

def save(self, *args, **kwargs):
    """
    set the slug for the first time only
    - slugify the title with a random alphanumeric
    """
    if self.date_updated is None:
        self.slug = slugify(
            self.title + '-' + secrets.token_urlsafe(LENGTH_OF_RANDOM_ALPHANUMERIC_SLUG))

    # Anonymous Ideas will always be public
    if self.user is None:
        self.visibility = True

    # Linkify the links
    extractor = URLExtract()
    for url in extractor.gen_urls(self.concept):
        self.concept = self.concept.replace(
            url, "<a href={}>{}</a>".format(url, url))

    super(Idea, self).save(*args, **kwargs)

def getNamespacesfromIRIs(self, meta_source):
    extractor = URLExtract()
    namespaces = set()
    if meta_source is not None:
        for url in set(extractor.gen_urls(str(meta_source))):
            namespace_candidate = url.rsplit('/', 1)[0]
            if namespace_candidate != url:
                namespaces.add(namespace_candidate)
            else:
                namespace_candidate = url.rsplit('#', 1)[0]
                if namespace_candidate != url:
                    namespaces.add(namespace_candidate)
    vocabs = Preprocessor.getLinkedVocabs()
    lod_namespaces = [d['namespace'] for d in vocabs if 'namespace' in d]
    for ns in namespaces:
        if ns + '/' in lod_namespaces:
            self.namespaces.append(ns + '/')
        elif ns + '#' in lod_namespaces:
            self.namespaces.append(ns + '#')

# print(url_to_parse.group("url"))
# print("\n\n")
parsing_list.append(url_to_parse.group("url"))

# I do still have more to learn regarding regex in python
md = markdown.Markdown()
extractor = URLExtract()
comp_url_list = []
item_total = 0
for pos, url_list in enumerate(parsing_list):
    print(">>>> ", url_list)
    url_text = url_parse(url_list)
    html = md.reset().convert(url_parse(url_list))
    # print(html)
    comp_url_list.extend(extractor.gen_urls(html))

comp_url_list.sort()
comp_url_list = [a[0] for a in itertools.groupby(comp_url_list)]

# XXX: Make URL list above unique
# Converting using
comp_url_list = set(comp_url_list)

# blacklisted "URLs"
# XXX: Generate this list from a file
# remove will raise error if item is not there, so I should use
# discard, instead
# comp_url_list.discard('foo')
comp_url_list.remove('Postgres.app')
comp_url_list.remove('CONTRIBUTING.md')
comp_url_list.remove('pgconfig.org')
comp_url_list.remove('PostgreSQL.org')

import json
import time

from urlextract import URLExtract

start = time.time()

extractor = URLExtract()

with open('result.json', encoding="utf8") as f:
    data = json.load(f)

posts = json.dumps(data)

links = []
for url in extractor.gen_urls(posts):
    links.append(url)

with open("links.txt", "w") as f:
    for link in links:
        f.write(str(link) + "\n")

end = time.time()
print(end - start)

def row_tweet_to_urls(row):
    extractor = URLExtract()
    return list(extractor.gen_urls(row['tweet']))

def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    for url in extractor.gen_urls(body):
        if len(url) < 5 or '.' not in url:
            continue
        if url.count('http') == 1:
            url = url.split('http')[1]
            url = 'http{}'.format(url)
        if '(' in url:
            rurl = url.split('(')
            if extractor.has_urls(rurl[1]):
                url = rurl[1]
            elif extractor.has_urls(rurl[0]):
                url = rurl[0]
            else:
                continue
        if ')' in url:
            lurl = url.split(')')
            if extractor.has_urls(lurl[0]):
                url = lurl[0]
            elif extractor.has_urls(lurl[1]):
                url = lurl[1]
            else:
                continue
        sem = 0
        for suffix in excluded:
            if url.endswith(suffix):
                sem = 1
        if sem == 1:
            continue
        # """
        if '[IMG]' in url:
            try:
                url = url.split('[IMG]')[1]
            except IndexError:
                pass
        if '[/IMG]' in url:
            try:
                url = url.split('[/IMG]')[0]
            except IndexError:
                pass
        if url.endswith('?fb'):
            url = url.replace('?fb', '')
        if url.endswith('?noredirect'):
            url = url.replace('?noredirect', '')
        elif url.endswith('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium'):
            url = url.replace('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium', '')
        elif url.endswith('?s=sms'):
            url = url.replace('?s=sms', '')
        if '//m.imgur.com' in url:
            url = url.replace('//m.imgur.com', '//imgur.com')
        if url.startswith('https://thumbs.gfycat.com/'):
            url = url.replace('https://thumbs.gfycat.com/', 'https://gfycat.com/')
        if url.endswith('-size_restricted.gif'):
            url = url.replace('-size_restricted.gif', '')
        # """
        urlset.add(url)
    return urlset

def fetching(search_words, numberofitem=2):
    extractor = URLExtract()
    date_since = "2019-12-16"
    new_search = search_words + " -filter:retweets"
    print(new_search)
    tweets = tw.Cursor(api.search, q=new_search, lang="en",
                       since=date_since).items(numberofitem)

    id = []
    textLen = []
    retweetsCount = []
    favoriteCount = []
    source = []
    language = []
    text = []
    retweeted = []
    favourited = []
    date = []
    name = []
    screenName = []
    location = []
    url = []
    followers_count = []
    friends_count = []
    listed_count = []
    favorite_count = []
    statuses_count = []
    verified = []
    prot = []
    senti = []
    imgurl = []
    raw_tweet = []

    for t in tweets:
        raw_tweet.append(t)
        id.append(t.id)
        text.append(t.text)
        textLen.append(len(t.text))
        retweetsCount.append(t.retweet_count)
        favoriteCount.append(t.favorite_count)
        source.append(t.source)
        language.append(t.lang)
        date.append(t.created_at)
        favourited.append(t.favorited)
        retweeted.append(t.retweeted)
        name.append(t.user.name)
        imgurl.append(t.user.profile_image_url)
        screenName.append(t.user.screen_name)
        location.append(t.user.location)
        if t.user.url:  # not t.user.url:
            temp = ""
            for url_ in extractor.gen_urls(t.text):
                temp = url_
            if temp:
                url.append(url_)
            else:
                url.append(t.user.url)
        else:
            temp = ""
            for url_ in extractor.gen_urls(t.text):
                temp = url_
            if temp:
                url.append(temp)
            else:
                url.append('https://twitter.com/')
        followers_count.append(t.user.followers_count)
        friends_count.append(t.user.friends_count)
        listed_count.append(t.user.listed_count)
        favorite_count.append(t.user.favourites_count)
        statuses_count.append(t.user.statuses_count)
        prot.append(t.user.protected)
        verified.append(t.user.verified)
        senti.append(analyze_sentiment(t.text))

    df = pd.DataFrame(name, columns=['userName'])
    df['userID'] = pd.DataFrame(id, columns=['userID'])
    df['text'] = pd.DataFrame(text, columns=['text'])
    df['textLen'] = pd.DataFrame(textLen, columns=['textLen'])
    df['retweetsCount'] = pd.DataFrame(retweetsCount, columns=['retweetsCount'])
    df['favoriteCount'] = pd.DataFrame(favoriteCount, columns=['favoriteCount'])
    df['source'] = pd.DataFrame(source, columns=['source'])
    df['language'] = pd.DataFrame(language, columns=['language'])
    df['date'] = pd.DataFrame(date, columns=['date'])
    df['favourited'] = pd.DataFrame(favourited, columns=['favourited'])
    df['retweeted'] = pd.DataFrame(retweeted, columns=['retweeted'])
    df['userLocation'] = pd.DataFrame(location, columns=['userLocation'])
    df['URL'] = pd.DataFrame(url, columns=['URL'])
    df['userfollowers_count'] = pd.DataFrame(followers_count, columns=['userfollowers_count'])
    df['userfriends_count'] = pd.DataFrame(friends_count, columns=['userfriends_count'])
    df['userListed_count'] = pd.DataFrame(listed_count, columns=['userListed_count'])
    df['userFavorites_count'] = pd.DataFrame(favorite_count, columns=['userFavorites_count'])
    df['userStatuses_count'] = pd.DataFrame(statuses_count, columns=['userStatuses_count'])
    df['userVerified'] = pd.DataFrame(verified, columns=['userVerified'])
    df['userProtected'] = pd.DataFrame(prot, columns=['userProtected'])
    df['sentiment'] = pd.DataFrame(senti, columns=['sentiment'])
    df['rawTweet'] = pd.DataFrame(raw_tweet, columns=['rawTweet'])
    df['screenName'] = pd.DataFrame(screenName, columns=['screenName'])
    df['imgUrl'] = pd.DataFrame(imgurl, columns=['imgUrl'])
    return df

def main():
    x = 1
    while x <= pages:
        url_org = f'https://github.com/search?p={x}&q=org%3A{organization}+{query}&type=code'
        page = s.get(url_org).text
        soup = BeautifulSoup(page, 'html5lib')
        url_list = []
        for link in soup.findAll('a'):
            inside_file = link.get('href')
            if f'/{organization}/' in inside_file:
                full_url = 'https://github.com' + inside_file
                head = full_url.partition('#')
                url_list.append(head[0])
        final_url_list = set(url_list)
        final_url_list = list(final_url_list)
        total_repositories = len(final_url_list)
        print(f'\nFetched {total_repositories} repositories from page {x} that contain S3 Buckets.')
        print("\n")
        if total_repositories == 0 and x < 2:
            print(colored("Make sure your credentials are properly configured.", 'red'))
            sys.exit(1)
        if total_repositories == 0:
            print('Cannot find more S3 Buckets.')
            sys.exit(1)
        for i in final_url_list:
            inner_url = i
            inner_url_fetch = s.get(inner_url).text
            extractor = URLExtract()
            for bucketurl in extractor.gen_urls(inner_url_fetch):
                if bucketurl not in exclude and 's3.amazonaws.com' in bucketurl:
                    try:
                        check_takeover = requests.get(bucketurl)
                        status = check_takeover.status_code
                        o1 = f'[{status}] - {bucketurl}\n'
                        if args.o:
                            file = open(args.o, 'a')
                            file.write(o1)
                        print(f'[{status}] - {bucketurl} ')
                    except:
                        pass
                    try:
                        check_takeover_response = check_takeover.content
                        check_takeover_response = str(check_takeover_response)
                        if 'NoSuchBucket' in check_takeover_response:
                            s3_text = colored('[S3 Bucket Takeover]', 'green')
                            o2 = f'{s3_text} : {bucketurl}\n'
                            print(f'{s3_text} : {bucketurl}')
                            if args.o:
                                file = open(args.o, 'a')
                                file.write(o2)
                    except:
                        pass
        x = x + 1

def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    try:
        generatedUrls = extractor.gen_urls(body)
        for url in generatedUrls:
            if len(url) < 5 or '.' not in url:
                continue
            if url.count('http') == 1:
                url = url.split('http')[1]
                url = 'http{}'.format(url)
            if '(' in url:
                rurl = url.split('(')
                if extractor.has_urls(rurl[1]):
                    url = rurl[1]
                elif extractor.has_urls(rurl[0]):
                    url = rurl[0]
                else:
                    continue
            if ')' in url:
                lurl = url.split(')')
                if extractor.has_urls(lurl[0]):
                    url = lurl[0]
                elif extractor.has_urls(lurl[1]):
                    url = lurl[1]
                else:
                    continue
            sem = 0
            for suffix in excluded:
                if url.endswith(suffix):
                    sem = 1
            if sem == 1:
                continue
            # """
            if '[IMG]' in url:
                try:
                    url = url.split('[IMG]')[1]
                except IndexError:
                    pass
            if '[/IMG]' in url:
                try:
                    url = url.split('[/IMG]')[0]
                except IndexError:
                    pass
            if url.endswith('?fb'):
                url = url.replace('?fb', '')
            if url.endswith('?noredirect'):
                url = url.replace('?noredirect', '')
            elif url.endswith('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium'):
                url = url.replace('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium', '')
            elif url.endswith('?s=sms'):
                url = url.replace('?s=sms', '')
            if '//m.imgur.com' in url:
                url = url.replace('//m.imgur.com', '//imgur.com')
            if url.startswith('https://thumbs.gfycat.com/'):
                url = url.replace('https://thumbs.gfycat.com/', 'https://gfycat.com/')
            if url.endswith('-size_restricted.gif'):
                url = url.replace('-size_restricted.gif', '')
            # """
            urlset.add(url)
        return urlset
    except AttributeError as e:
        print(
            "While generating urls, an AttributeError (specifically {e}) was raised. "
            "Moving on without extracting urls for now. This is likely an error with the "
            "python library URLExtract (https://github.com/lipoja/URLExtract). The issue "
            "has been fixed (see issue fix here: "
            "https://github.com/lipoja/URLExtract/commit/aa51f52e77b104932c49fb14882c632f12b6e940) "
            "but has not been included in the most recent release. Please install the version "
            "from GitHub to fix this issue (eg. pip3 install git+https://github.com/lipoja/URLExtract.git)"
            .format(e=e))
    finally:
        return urlset  # empty if an AttributeError was raised

def renderMarkdown(text, ignoreLinks=False, heading=False, alignment=False,
                   properties=False, view_type=False):
    isAttribute = False

    if ':hiccup' in text:
        # THIS DOES NOT WORK WELL !!! VERY BROKEN
        # text = 'hr '
        data = re.sub(r'\n', '', text.strip())
        data = re.sub(r':hiccup \[:hr\]', r'<hr>', data)
        data = re.sub(r'(\[\s*?):([\w-]+)', r'\1"\2",', data)
        data = re.sub(r':([\w-]+)', r'"\1":', data)
        data = re.sub(r'([\}\]\:][\s]*?)(\w+)([\s]*?[\[\{\]])', r'\1"\2"\3', data)
        data = re.sub(r'([\}\]\"])([\s\n]*?)([\[\{\"])', r'\1,\2\3', data)
        # print(data[9:])
        # data = re.sub(r'(hr)', r'hr', data)  # this tag is not being converted correctly
        # print(data[10:])
        # print(json.loads(data[10:]))
        # print(convert(data))
        # return convert(data)
        return data

    if ignoreLinks is False:
        global wordcount
        wordcount += len(text.split())

    # todo correctly render page alias {{alias: [[Roam Research]] Roam}}
    # todo fix URLs that contain a #
    # todo if attribute exists set a flag so the attribute can be picked up and attributed to the parent block
    if re.match(r'\b(.+)\:\:', text, flags=0):
        isAttribute = True

    text = re.sub(r'^\[\[>\]\](.*)', r'<blockquote>\1</blockquote>', text)  # blockquote
    text = re.sub(r'\b(.+)\:\:', lambda x: _processAttribute(x, text), text)  # attributes
    text = re.sub(r'^(\-\-\-)$', r'<hr>', text)
    text = re.sub(r'{{\[\[TODO\]\]}}', _processCheckmark(False), text)  # unchecked TO DO
    text = re.sub(r'{{{\[\[DONE\]\]}}}}', _processCheckmark(True), text)  # checked TO DO alt
    text = re.sub(r'{{\[\[DONE\]\]}}', _processCheckmark(True), text)  # checked TO DO
    text = re.sub(r'\!\[([^\[\]]*?)\]\((.+?)\)', r'<img src="\2" alt="\1" />', text)  # markdown images
    text = re.sub(r'\{\{\[\[youtube\]\]:(.+?)\}\}',
                  lambda x: _processExternalEmbed(x, text, "youtube"), text)  # external clojure embeds
    text = re.sub(r'\{\{\[\[query\]\]:(.+?)\}\}',
                  lambda x: _processQueries(x, text), text)  # queries
    text = re.sub(r'\{\{(.*):.*[^\{\}]\((.+?)\)\)(.*)\}\}',
                  lambda x: _processInternalEmbed(x, text), text)  # clojure embeds and block aliases
    text = re.sub(r'\{\{(.*):.*[^\{\}]\[(.+?)\]\](.*)\}\}',
                  lambda x: _processInternaPagelEmbed(x, text), text)  # clojure page aliases
    text = re.sub(r'\{\{\[\[slider\]\](.*)\}\}',
                  lambda x: _processSlider(x, text, properties), text)  # sliders
    text = re.sub(r'(\{\{or:(.+?)\}\})',
                  lambda x: _processTextVersion(x, text), text)  # text versioning

    if ignoreLinks:
        text = re.sub(r'\[\[(.+?)\]\]', r'\1', text)  # page links
        text = re.sub(r'\[([^\[\]]+?)\]\((.+?)\)', r'\1', text)  # external links
        text = re.sub(r'\b(.+)\:\:', lambda x: _processAttribute(x, text), text)  # attributes
    else:
        text = re.sub(r'\[([^\[\]]+?)\]\(\[\[(.+?)\]\]\)',
                      lambda x: _processInternalAlias(x, text), text)  # internal page aliases
        text = re.sub(r'\[([^\[\]]+?)\]\(\(\((.+?)\)\)\)',
                      lambda x: _processInternalBlockAlias(x, text), text)  # internal block aliases
        text = re.sub(r'\[([^\[\]]+?)\]\(([^\[\]\(].+?)\)',
                      lambda x: _processExternalAlias(x, text), text)  # external aliases
        text = re.sub(r'(?<!href="\/[A-Za-z0-9\-\_]{8})(#(\w+))',
                      lambda x: _processInternalTag(x, text), text)  # tags without brackets
        text = re.sub(r'(\#\[\[(.+?)\]\])',
                      lambda x: _processInternalTag(x, text), text)  # tag with brackets
        text = re.sub(r'(?<!\#)\[\[(.+?)\]\]',
                      lambda x: _processInternalLink(x, text), text)  # pages with brackets

    text = re.sub(r'\n', r'<br>', text)  # newline
    text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)  # bold
    text = re.sub(r'\_\_(.*?)\_\_', r'<em>\1</em>', text)  # italic
    text = re.sub(r'\~\~(.+?)\~\~', r'<s>\1</s>', text)  # strikethrough
    text = re.sub(r'\^\^(.+?)\^\^', r'<span class="highlight">\1</span>', text)  # highlight
    text = re.sub(r'\`\`\`(.+?)\`\`\`', r'<code>\1</code>', text)  # large codeblock
    text = re.sub(r'\`(.+?)\`', r'<code>\1</code>', text)  # inline codeblock

    def isBlockPrivate(blockID, blockText):
        if blockID in block_ids:
            # print("block not private")
            # print(blockText)
            # print(blockID)
            return renderMarkdown(block_ids[blockID]['string'])
        else:
            # print("block is private")
            # print(blockText)
            pass

    text = re.sub(r'\(\((.+?)\)\)', lambda x: isBlockPrivate(x.group(1), text), text)  # block ref

    # deal with bare URLs
    # not a huge fan of this
    forbidden_chars = ['<a', '<img', '[', '<code', '<iframe']
    results = []
    for substring in forbidden_chars:
        results.append(substring in text)
    if not any(results):
        extractor = URLExtract()
        if extractor.has_urls(text):
            for url in extractor.gen_urls(text):
                text = text.replace(url, _processBareURL(url))
            # print(text)

    if heading:
        text = f'<h{heading}>{text}</h{heading}>'
    if alignment:
        text = f'<div style="text-align:{alignment};">{text}</div>'

    return text