def clean_text(text):
    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
    STOPWORDS = set(stopwords.words('english')).union(["gt"])
    extractor = URLExtract()
    info = " "
    for url in extractor.gen_urls(text):
        try:
            if "youtube" in url or "youtu.be" in url:
                content = urlopen(url, timeout=1).read()
                content = BeautifulSoup(content).find('title').string  # HTML decoding
                info += " " + content
        except:
            continue
    text += info
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    text = remove_special_chars_digits(text)
    text = word_tokenize(text)
    text = remove_repeat(text)
    text = np.asarray(text)
    text = lemmatize_new(text)
    text = ' '.join(word for word in text if word not in STOPWORDS)
    TextPreprocess.num += 1
    print(TextPreprocess.num, " " + text)
    return text

def _process_links(self):
    extractor = URLExtract(extract_email=True)
    for url in extractor.gen_urls(self.subtext):
        if _is_email(url):
            href = f"mailto:{url}"
        else:
            href = url
        link = f"<a href={href}>{url}</a>"
        self.subtext = self.subtext.replace(url, link)

def read_stream_from_assia_tv(self, response, event_url, event_name, event_date):
    scripts = response.css("script")
    extractor = URLExtract()
    for s in scripts:
        text = s.get()
        if extractor.has_urls(text):
            for url in extractor.gen_urls(text):
                if "video.assia.tv" in url:
                    self.logger.info("#read_stream_from_assia_tv - found video stream url %s!" % (url))
                    if "m3u8" in url:

async def on_message(message):
    # print message
    # print(message.author, ": ", message.content)
    if message.author.bot:
        return

    # extract URLs
    extractor = URLExtract()
    URLs = list(extractor.gen_urls(message.content))

    # drop URLs that match anything in the IGNORE list
    # (filtering into a new list avoids mutating URLs while indexing into it)
    URLs = [url for url in URLs if not any(item in url for item in IGNORE)]

    # ignore message if there were no URLs
    if URLs == []:
        return

    # create attachments from URLs
    attachments = get_attachments(message, URLs)
    # print("Attachments created!")
    # print(*attachments, sep="\n")

    # get trello board
    all_boards = trello_client.list_boards()
    channel = message.channel.name
    Board = get_board(all_boards, "Dump")

    # get/create list for channel
    List = get_list(lists=Board.list_lists(), name=channel)
    if List is None:
        List = Board.add_list(channel)

    # create card for each attachment
    attached = []
    for attachment in attachments:
        author = str(message.author.name)
        result = create_card(List, author, attachment)
        attached.append(result)

    if len(attached) == len(attachments):
        reply = 'I have attached these links to Trello for you. Please remember to organise them at https://trello.com/b/SAJvonRo/dump.'
    else:
        reply = 'Oops! Something went wrong. Please attach these links manually to https://trello.com/.'

    # send reply and delete after 5 seconds
    sent = await message.channel.send(reply, delete_after=5)
    # suppress embeds
    await sent.edit(suppress=True)

def message(update: Update, context: CallbackContext) -> None:
    extractor = URLExtract()
    if extractor.has_urls(update.message.text):
        result_text = update.message.text
        for url in extractor.gen_urls(update.message.text):
            print(f"Url found: {url}")
            unshorten_url = unshort_url(url)
            print(f"Unshorten: {unshorten_url}")
            sanitized_url = trim_utm(unshorten_url)
            print(f"Sanitized: {sanitized_url}")
            if url != sanitized_url:
                result_text = result_text.replace(url, sanitized_url)
        if result_text != update.message.text:
            update.message.reply_text(result_text)

def extract_urls_from_string(text):
    """
    Returns all the raw extracted urls as a list

    If a URL has a scheme, it ensures that it is http/s
    URLs like www. are sanitised in the normalise_url
    """
    text = text.lower()
    extractor = URLExtract(cache_dir=tempfile.gettempdir())
    # keep only URLs with no scheme or an http/https scheme;
    # filtering into a new list avoids skipping items while iterating
    results = [
        url for url in extractor.gen_urls(text)
        if urlparse(url).scheme in ('', 'http', 'https')
    ]
    return results

def domains_func():
    extractor = URLExtract()
    with open(r"alllinks.txt", "r") as infile:
        for line in infile:
            for url in extractor.gen_urls(line):
                domain = urlparse(url).netloc
                if domain == "":
                    f = open(r"domainsout.txt", "a")
                    f.write(url + "\n")
                    f.close()
                else:
                    f = open(r"domainsout.txt", "a")
                    f.write(domain + "\n")
                    f.close()

def save(self, *args, **kwargs):
    """
    set the slug for the first time only
    - slugify the title with a random alphanumeric
    """
    if self.date_updated is None:
        self.slug = slugify(
            self.title + '-' + secrets.token_urlsafe(LENGTH_OF_RANDOM_ALPHANUMERIC_SLUG))

    # Anonymous Ideas will always be public
    if self.user is None:
        self.visibility = True

    # Linkify the links
    extractor = URLExtract()
    for url in extractor.gen_urls(self.concept):
        self.concept = self.concept.replace(
            url, "<a href={}>{}</a>".format(url, url))

    super(Idea, self).save(*args, **kwargs)

def getNamespacesfromIRIs(self, meta_source):
    extractor = URLExtract()
    namespaces = set()
    if meta_source is not None:
        for url in set(extractor.gen_urls(str(meta_source))):
            namespace_candidate = url.rsplit('/', 1)[0]
            if namespace_candidate != url:
                namespaces.add(namespace_candidate)
            else:
                namespace_candidate = url.rsplit('#', 1)[0]
                if namespace_candidate != url:
                    namespaces.add(namespace_candidate)
    vocabs = Preprocessor.getLinkedVocabs()
    lod_namespaces = [d['namespace'] for d in vocabs if 'namespace' in d]
    for ns in namespaces:
        if ns + '/' in lod_namespaces:
            self.namespaces.append(ns + '/')
        elif ns + '#' in lod_namespaces:
            self.namespaces.append(ns + '#')

# print(url_to_parse.group("url"))
# print("\n\n")
parsing_list.append(url_to_parse.group("url"))

# I do still have more to learn regarding regex in python
md = markdown.Markdown()
extractor = URLExtract()
comp_url_list = []
item_total = 0
for pos, url_list in enumerate(parsing_list):
    print(">>>> ", url_list)
    url_text = url_parse(url_list)
    html = md.reset().convert(url_parse(url_list))
    # print(html)
    comp_url_list.extend(extractor.gen_urls(html))

comp_url_list.sort()
comp_url_list = [a[0] for a in itertools.groupby(comp_url_list)]

# XXX: Make URL list above unique
# Converting using
comp_url_list = set(comp_url_list)

# blacklisted "URLs"
# XXX: Generate this list from a file
# remove will raise error if item is not there, so I should use
# discard, instead
# comp_url_list.discard('foo')
comp_url_list.remove('Postgres.app')
comp_url_list.remove('CONTRIBUTING.md')
comp_url_list.remove('pgconfig.org')
comp_url_list.remove('PostgreSQL.org')

import json
import time

from urlextract import URLExtract

start = time.time()

extractor = URLExtract()

with open('result.json', encoding="utf8") as f:
    data = json.load(f)

posts = json.dumps(data)

links = []
for url in extractor.gen_urls(posts):
    links.append(url)

with open("links.txt", "w") as f:
    for link in links:
        f.write(str(link) + "\n")

end = time.time()
print(end - start)

def row_tweet_to_urls(row):
    extractor = URLExtract()
    return list(extractor.gen_urls(row['tweet']))

def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    for url in extractor.gen_urls(body):
        if len(url) < 5 or '.' not in url:
            continue
        if url.count('http') == 1:
            url = url.split('http')[1]
            url = 'http{}'.format(url)
        if '(' in url:
            rurl = url.split('(')
            if extractor.has_urls(rurl[1]):
                url = rurl[1]
            elif extractor.has_urls(rurl[0]):
                url = rurl[0]
            else:
                continue
        if ')' in url:
            lurl = url.split(')')
            if extractor.has_urls(lurl[0]):
                url = lurl[0]
            elif extractor.has_urls(lurl[1]):
                url = lurl[1]
            else:
                continue
        sem = 0
        for suffix in excluded:
            if url.endswith(suffix):
                sem = 1
        if sem == 1:
            continue
        # """
        if '[IMG]' in url:
            try:
                url = url.split('[IMG]')[1]
            except IndexError:
                pass
        if '[/IMG]' in url:
            try:
                url = url.split('[/IMG]')[0]
            except IndexError:
                pass
        if url.endswith('?fb'):
            url = url.replace('?fb', '')
        if url.endswith('?noredirect'):
            url = url.replace('?noredirect', '')
        elif url.endswith('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium'):
            url = url.replace('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium', '')
        elif url.endswith('?s=sms'):
            url = url.replace('?s=sms', '')
        if '//m.imgur.com' in url:
            url = url.replace('//m.imgur.com', '//imgur.com')
        if url.startswith('https://thumbs.gfycat.com/'):
            url = url.replace('https://thumbs.gfycat.com/', 'https://gfycat.com/')
        if url.endswith('-size_restricted.gif'):
            url = url.replace('-size_restricted.gif', '')
        # """
        urlset.add(url)
    return urlset

def fetching(search_words, numberofitem=2):
    extractor = URLExtract()
    date_since = "2019-12-16"
    new_search = search_words + " -filter:retweets"
    print(new_search)
    tweets = tw.Cursor(api.search, q=new_search, lang="en",
                       since=date_since).items(numberofitem)

    id = []
    textLen = []
    retweetsCount = []
    favoriteCount = []
    source = []
    language = []
    text = []
    retweeted = []
    favourited = []
    date = []
    name = []
    screenName = []
    location = []
    url = []
    followers_count = []
    friends_count = []
    listed_count = []
    favorite_count = []
    statuses_count = []
    verified = []
    prot = []
    senti = []
    imgurl = []
    raw_tweet = []

    for t in tweets:
        raw_tweet.append(t)
        id.append(t.id)
        text.append(t.text)
        textLen.append(len(t.text))
        retweetsCount.append(t.retweet_count)
        favoriteCount.append(t.favorite_count)
        source.append(t.source)
        language.append(t.lang)
        date.append(t.created_at)
        favourited.append(t.favorited)
        retweeted.append(t.retweeted)
        name.append(t.user.name)
        imgurl.append(t.user.profile_image_url)
        screenName.append(t.user.screen_name)
        location.append(t.user.location)
        if t.user.url:  # not t.user.url:
            temp = ""
            for url_ in extractor.gen_urls(t.text):
                temp = url_
            if temp:
                url.append(url_)
            else:
                url.append(t.user.url)
        else:
            temp = ""
            for url_ in extractor.gen_urls(t.text):
                temp = url_
            if temp:
                url.append(temp)
            else:
                url.append('https://twitter.com/')
        followers_count.append(t.user.followers_count)
        friends_count.append(t.user.friends_count)
        listed_count.append(t.user.listed_count)
        favorite_count.append(t.user.favourites_count)
        statuses_count.append(t.user.statuses_count)
        prot.append(t.user.protected)
        verified.append(t.user.verified)
        senti.append(analyze_sentiment(t.text))

    df = pd.DataFrame(name, columns=['userName'])
    df['userID'] = pd.DataFrame(id, columns=['userID'])
    df['text'] = pd.DataFrame(text, columns=['text'])
    df['textLen'] = pd.DataFrame(textLen, columns=['textLen'])
    df['retweetsCount'] = pd.DataFrame(retweetsCount, columns=['retweetsCount'])
    df['favoriteCount'] = pd.DataFrame(favoriteCount, columns=['favoriteCount'])
    df['source'] = pd.DataFrame(source, columns=['source'])
    df['language'] = pd.DataFrame(language, columns=['language'])
    df['date'] = pd.DataFrame(date, columns=['date'])
    df['favourited'] = pd.DataFrame(favourited, columns=['favourited'])
    df['retweeted'] = pd.DataFrame(retweeted, columns=['retweeted'])
    df['userLocation'] = pd.DataFrame(location, columns=['userLocation'])
    df['URL'] = pd.DataFrame(url, columns=['URL'])
    df['userfollowers_count'] = pd.DataFrame(followers_count, columns=['userfollowers_count'])
    df['userfriends_count'] = pd.DataFrame(friends_count, columns=['userfriends_count'])
    df['userListed_count'] = pd.DataFrame(listed_count, columns=['userListed_count'])
    df['userFavorites_count'] = pd.DataFrame(favorite_count, columns=['userFavorites_count'])
    df['userStatuses_count'] = pd.DataFrame(statuses_count, columns=['userStatuses_count'])
    df['userVerified'] = pd.DataFrame(verified, columns=['userVerified'])
    df['userProtected'] = pd.DataFrame(prot, columns=['userProtected'])
    df['sentiment'] = pd.DataFrame(senti, columns=['sentiment'])
    df['rawTweet'] = pd.DataFrame(raw_tweet, columns=['rawTweet'])
    df['screenName'] = pd.DataFrame(screenName, columns=['screenName'])
    df['imgUrl'] = pd.DataFrame(imgurl, columns=['imgUrl'])
    return df

def main():
    x = 1
    while x <= pages:
        url_org = f'https://github.com/search?p={x}&q=org%3A{organization}+{query}&type=code'
        page = s.get(url_org).text
        soup = BeautifulSoup(page, 'html5lib')
        url_list = []
        for link in soup.findAll('a'):
            inside_file = link.get('href')
            if f'/{organization}/' in inside_file:
                full_url = 'https://github.com' + inside_file
                head = full_url.partition('#')
                url_list.append(head[0])
        final_url_list = set(url_list)
        final_url_list = list(final_url_list)
        total_repositories = len(final_url_list)
        print(f'\nFetched {total_repositories} repositories from page {x} that contain S3 Buckets.')
        print("\n")
        if total_repositories == 0 and x < 2:
            print(colored("Make sure your credentials are properly configured.", 'red'))
            sys.exit(1)
        if total_repositories == 0:
            print('Cannot find more S3 Buckets.')
            sys.exit(1)
        for i in final_url_list:
            inner_url = i
            inner_url_fetch = s.get(inner_url).text
            extractor = URLExtract()
            for bucketurl in extractor.gen_urls(inner_url_fetch):
                if bucketurl not in exclude and 's3.amazonaws.com' in bucketurl:
                    try:
                        check_takeover = requests.get(bucketurl)
                        status = check_takeover.status_code
                        o1 = f'[{status}] - {bucketurl}\n'
                        if args.o:
                            file = open(args.o, 'a')
                            file.write(o1)
                        print(f'[{status}] - {bucketurl} ')
                    except:
                        pass
                    try:
                        check_takeover_response = check_takeover.content
                        check_takeover_response = str(check_takeover_response)
                        if 'NoSuchBucket' in check_takeover_response:
                            s3_text = colored('[S3 Bucket Takeover]', 'green')
                            o2 = f'{s3_text} : {bucketurl}\n'
                            print(f'{s3_text} : {bucketurl}')
                            if args.o:
                                file = open(args.o, 'a')
                                file.write(o2)
                    except:
                        pass
        x = x + 1

def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    try:
        generatedUrls = extractor.gen_urls(body)
        for url in generatedUrls:
            if len(url) < 5 or '.' not in url:
                continue
            if url.count('http') == 1:
                url = url.split('http')[1]
                url = 'http{}'.format(url)
            if '(' in url:
                rurl = url.split('(')
                if extractor.has_urls(rurl[1]):
                    url = rurl[1]
                elif extractor.has_urls(rurl[0]):
                    url = rurl[0]
                else:
                    continue
            if ')' in url:
                lurl = url.split(')')
                if extractor.has_urls(lurl[0]):
                    url = lurl[0]
                elif extractor.has_urls(lurl[1]):
                    url = lurl[1]
                else:
                    continue
            sem = 0
            for suffix in excluded:
                if url.endswith(suffix):
                    sem = 1
            if sem == 1:
                continue
            # """
            if '[IMG]' in url:
                try:
                    url = url.split('[IMG]')[1]
                except IndexError:
                    pass
            if '[/IMG]' in url:
                try:
                    url = url.split('[/IMG]')[0]
                except IndexError:
                    pass
            if url.endswith('?fb'):
                url = url.replace('?fb', '')
            if url.endswith('?noredirect'):
                url = url.replace('?noredirect', '')
            elif url.endswith('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium'):
                url = url.replace('_d.jpg?maxwidth=640&shape=thumb&fidelity=medium', '')
            elif url.endswith('?s=sms'):
                url = url.replace('?s=sms', '')
            if '//m.imgur.com' in url:
                url = url.replace('//m.imgur.com', '//imgur.com')
            if url.startswith('https://thumbs.gfycat.com/'):
                url = url.replace('https://thumbs.gfycat.com/', 'https://gfycat.com/')
            if url.endswith('-size_restricted.gif'):
                url = url.replace('-size_restricted.gif', '')
            # """
            urlset.add(url)
        return urlset
    except AttributeError as e:
        print(
            "While generating urls, an AttributeError (specifically {e}) was raised. "
            "Moving on without extracting urls for now. This is likely an error with the "
            "python library URLExtract (https://github.com/lipoja/URLExtract). The issue "
            "has been fixed (see issue fix here: "
            "https://github.com/lipoja/URLExtract/commit/aa51f52e77b104932c49fb14882c632f12b6e940) "
            "but has not been included in the most recent release. Please install the version "
            "from GitHub to fix this issue (eg. pip3 install git+https://github.com/lipoja/URLExtract.git)"
            .format(e=e))
    finally:
        return urlset  # empty if an AttributeError was raised

def renderMarkdown(text, ignoreLinks=False, heading=False, alignment=False,
                   properties=False, view_type=False):
    isAttribute = False

    if ':hiccup' in text:
        # THIS DOES NOT WORK WELL !!! VERY BROKEN
        # text = 'hr '
        data = re.sub(r'\n', '', text.strip())
        data = re.sub(r':hiccup \[:hr\]', r'<hr>', data)
        data = re.sub(r'(\[\s*?):([\w-]+)', r'\1"\2",', data)
        data = re.sub(r':([\w-]+)', r'"\1":', data)
        data = re.sub(r'([\}\]\:][\s]*?)(\w+)([\s]*?[\[\{\]])', r'\1"\2"\3', data)
        data = re.sub(r'([\}\]\"])([\s\n]*?)([\[\{\"])', r'\1,\2\3', data)
        # print(data[9:])
        # data = re.sub(r'(hr)', r'hr', data)  # this tag is not being converted correctly
        # print(data[10:])
        # print(json.loads(data[10:]))
        # print(convert(data))
        # return convert(data)
        return data

    if ignoreLinks is False:
        global wordcount
        wordcount += len(text.split())

    # todo correctly render page alias {{alias: [[Roam Research]] Roam}}
    # todo fix URLs that contain a #
    # todo if attribute exists set a flag so the attribute can be picked up and attributed to the parent block
    if re.match(r'\b(.+)\:\:', text, flags=0):
        isAttribute = True

    text = re.sub(r'^\[\[>\]\](.*)', r'<blockquote>\1</blockquote>', text)  # blockquote
    text = re.sub(r'\b(.+)\:\:', lambda x: _processAttribute(x, text), text)  # attributes
    text = re.sub(r'^(\-\-\-)$', r'<hr>', text)
    text = re.sub(r'{{\[\[TODO\]\]}}', _processCheckmark(False), text)  # unchecked TO DO
    text = re.sub(r'{{{\[\[DONE\]\]}}}}', _processCheckmark(True), text)  # checked TO DO alt
    text = re.sub(r'{{\[\[DONE\]\]}}', _processCheckmark(True), text)  # checked TO DO
    text = re.sub(r'\!\[([^\[\]]*?)\]\((.+?)\)', r'<img src="\2" alt="\1" />', text)  # markdown images
    text = re.sub(r'\{\{\[\[youtube\]\]:(.+?)\}\}',
                  lambda x: _processExternalEmbed(x, text, "youtube"), text)  # external clojure embeds
    text = re.sub(r'\{\{\[\[query\]\]:(.+?)\}\}',
                  lambda x: _processQueries(x, text), text)  # queries
    text = re.sub(r'\{\{(.*):.*[^\{\}]\((.+?)\)\)(.*)\}\}',
                  lambda x: _processInternalEmbed(x, text), text)  # clojure embeds and block aliases
    text = re.sub(r'\{\{(.*):.*[^\{\}]\[(.+?)\]\](.*)\}\}',
                  lambda x: _processInternaPagelEmbed(x, text), text)  # clojure page aliases
    text = re.sub(r'\{\{\[\[slider\]\](.*)\}\}',
                  lambda x: _processSlider(x, text, properties), text)  # sliders
    text = re.sub(r'(\{\{or:(.+?)\}\})',
                  lambda x: _processTextVersion(x, text), text)  # text versioning

    if ignoreLinks:
        text = re.sub(r'\[\[(.+?)\]\]', r'\1', text)  # page links
        text = re.sub(r'\[([^\[\]]+?)\]\((.+?)\)', r'\1', text)  # external links
        text = re.sub(r'\b(.+)\:\:', lambda x: _processAttribute(x, text), text)  # attributes
    else:
        text = re.sub(r'\[([^\[\]]+?)\]\(\[\[(.+?)\]\]\)',
                      lambda x: _processInternalAlias(x, text), text)  # internal page aliases
        text = re.sub(r'\[([^\[\]]+?)\]\(\(\((.+?)\)\)\)',
                      lambda x: _processInternalBlockAlias(x, text), text)  # internal block aliases
        text = re.sub(r'\[([^\[\]]+?)\]\(([^\[\]\(].+?)\)',
                      lambda x: _processExternalAlias(x, text), text)  # external aliases
        text = re.sub(r'(?<!href="\/[A-Za-z0-9\-\_]{8})(#(\w+))',
                      lambda x: _processInternalTag(x, text), text)  # tags without brackets
        text = re.sub(r'(\#\[\[(.+?)\]\])',
                      lambda x: _processInternalTag(x, text), text)  # tag with brackets
        text = re.sub(r'(?<!\#)\[\[(.+?)\]\]',
                      lambda x: _processInternalLink(x, text), text)  # pages with brackets

    text = re.sub(r'\n', r'<br>', text)  # newline
    text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)  # bold
    text = re.sub(r'\_\_(.*?)\_\_', r'<em>\1</em>', text)  # italic
    text = re.sub(r'\~\~(.+?)\~\~', r'<s>\1</s>', text)  # strikethrough
    text = re.sub(r'\^\^(.+?)\^\^', r'<span class="highlight">\1</span>', text)  # highlight
    text = re.sub(r'\`\`\`(.+?)\`\`\`', r'<code>\1</code>', text)  # large codeblock
    text = re.sub(r'\`(.+?)\`', r'<code>\1</code>', text)  # inline codeblock

    def isBlockPrivate(blockID, blockText):
        if blockID in block_ids:
            # print("block not private")
            # print(blockText)
            # print(blockID)
            return renderMarkdown(block_ids[blockID]['string'])
        else:
            # print("block is private")
            # print(blockText)
            pass

    text = re.sub(r'\(\((.+?)\)\)', lambda x: isBlockPrivate(x.group(1), text), text)  # block ref

    # deal with bare URLs
    # not a huge fan of this
    forbidden_chars = ['<a', '<img', '[', '<code', '<iframe']
    results = []
    for substring in forbidden_chars:
        results.append(substring in text)
    if not any(results):
        extractor = URLExtract()
        if extractor.has_urls(text):
            for url in extractor.gen_urls(text):
                text = text.replace(url, _processBareURL(url))
            # print(text)

    if heading:
        text = f'<h{heading}>{text}</h{heading}>'
    if alignment:
        text = f'<div style="text-align:{alignment};">{text}</div>'

    return text