Example #1
def extract_urls(text):
    extractor = URLExtract()
    if type(text) is str:
        urls = extractor.find_urls(text)
        return urls
    elif type(text) is list:
        urls = []
        for x in text:
            url_x = extractor.find_urls(x)
            urls.extend(url_x)
        return urls
    else:
        print("Provided text type (%s) is not currently supported. Please supply either a list of string objects or a string object." % str(type(text)))
Example #2
 def str_clean2(self, item):
     extractor = URLExtract()
     url = tldextract.extract(item).registered_domain
     return url
Example #3
        print('', end='')
        pqc()

        #abstract
        pq()
        print(abstract, end='')
        pqc()

        #cover_image (may not work, but might as well try)
        pq()
        cover_image = 'https://cross-currents.berkeley.edu' + row['Image']
        print(cover_image, end='')
        pqc()

        #pdf_url, extract from the File column
        extractor = URLExtract()
        pdf_urls = extractor.find_urls(row['File'])
        # sometimes the extractor finds more than one URL; always use the first
        if len(pdf_urls) >= 1:
            pdf_url = pdf_urls[0]
        else:
            pdf_url = ('ERROR, no PDF URL found, content-type: ' + row['Content type'] +
                       '; Content ID: ' + row['Content ID'] +
                       '; Article Type: ' + row['Article Type'])
        pq()
        print(urllib.parse.unquote(pdf_url), end='')
        pqc()

        #supplementalfile_url
        pq()
Example #4
def google_scrap(termo, dadobruto=False, gdr=None):
    """
    Performs Scrap in the Google search website/Realiza Scrap no Google Search.


    :param credencial: Password, e-mail or document/Senha, e-mail ou documento.
    :param dadobruto: To save the raw data/Para salvar o dado bruto.
    :param gdr: Uploads to Google Drive/Sobe arquivo para o Google Drive.
    """
    block = ['https://pastebin.com']
    final_urls = []
    urls = []
    clean = []
    vazou = []
    vazio = []

    google_urls = [
        'https://www.google.com/search?q=site:pastebin.com+intext:leak&sxsrf=ALeKk03cedAQ3Y7jlzXHY8LImOO_gJGxMQ:1606136317667&source=lnt&tbs=qdr:d&sa=X&ved=2ahUKEwjcobOF3JjtAhWbF7kGHbo3BO4QpwV6BAgFECY&biw=1366&bih=629',
        'https://www.google.com/search?biw=1366&bih=629&tbs=qdr%3Ad&sxsrf=ALeKk02AVQ6YXyUuLeavYsIZjr__SUBBKQ%3A1606136641749&ei=QbO7X5eGLaLD5OUP7dCoqAU&q=site%3Apastebin.com+intext%3A*%3A*&oq=site%3Apastebin.com+intext%3A*%3A*&gs_lcp=CgZwc3ktYWIQA1C3kgJY9KMCYN6nAmgAcAB4AIABVogBhwWSAQE4mAEAoAEBqgEHZ3dzLXdpesABAQ&sclient=psy-ab&ved=0ahUKEwjXqvef3ZjtAhWiIbkGHW0oClUQ4dUDCA0&uact=5',
        'https://www.google.com/search?biw=1366&bih=629&tbs=qdr%3Ad&sxsrf=ALeKk008FbvhwTD4Qyhal8ibZGTuwj5DwQ%3A1606136886229&ei=NrS7X7-_DYHX5OUPpMaYqAg&q=site%3Apastebin.com+intext%3A%22Target%3A%22&oq=site%3Apastebin.com+intext%3A%22Target%3A%22&gs_lcp=CgZwc3ktYWIQA1DEG1iMMWCSNGgAcAB4AIABWYgBnQOSAQE1mAEAoAEBqgEHZ3dzLXdpesABAQ&sclient=psy-ab&ved=0ahUKEwi_ssGU3pjtAhWBK7kGHSQjBoUQ4dUDCA0&uact=5',
        'https://www.google.com/search?biw=1366&bih=629&tbs=qdr%3Ad&sxsrf=ALeKk01een-cvWz4vY0qsb4w_IbGk4Ym0w%3A1606136893453&ei=PbS7X9f6GoDC5OUP7amBiA0&q=site%3Apastebin.com+intext%3Apassword&oq=site%3Apastebin.com+intext%3Apassword&gs_lcp=CgZwc3ktYWIQA1DlF1ivIGCNIWgAcAB4AIABZ4gB9gWSAQM4LjGYAQCgAQGqAQdnd3Mtd2l6wAEB&sclient=psy-ab&ved=0ahUKEwiXjfqX3pjtAhUAIbkGHe1UANEQ4dUDCA0&uact=5',
        'https://www.google.com/search?biw=1366&bih=629&tbs=qdr%3Ad&sxsrf=ALeKk01EHsZ3TIvfjuSTMJN4z9lThqH_AA%3A1606136962270&ei=grS7X5P8D4Cg5OUPlP6QyAU&q=site%3Apastebin.com+intext%3Aemail&oq=site%3Apastebin.com+intext%3Aemail&gs_lcp=CgZwc3ktYWIQA1DD3ANY3_wDYMr-A2gBcAB4AIABX4gB2QmSAQIxNZgBAKABAaoBB2d3cy13aXrAAQE&sclient=psy-ab&ved=0ahUKEwiTxeK43pjtAhUAELkGHRQ_BFkQ4dUDCA0&uact=5'
    ]

    header = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
        'Accept': 'text/html, application/xhtml + xml, application/xml; q = 0.9, image/webp',
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'https://www.google.com/'
    }

    for open_urls in google_urls:
        sleep_tempo = random.uniform(0, 6)
        r = requests.get(open_urls, headers=header)
        sites = r.text
        extractor = URLExtract()
        urls = extractor.find_urls(sites)

        for content in urls:

            if "google" not in content:

                if "http" in content:

                    if content not in block:
                        final_urls.append(content)

        sleep_numero = random.uniform(0, 8)
        time.sleep(sleep_numero)

    for rep in final_urls:

        if '&amp' in rep:
            rep = rep.replace('&amp', '')
            clean.append(rep)

    for search in clean:
        url = search
        sleep_tempo = random.uniform(0, 6)

        if 'pastebin' in search:
            alterar_string = search[-8:]
            reponse_gk = requests.get(search, headers=header)
            vazamento_gd = reponse_gk.text
            html = BeautifulSoup(reponse_gk.text, 'html.parser')

            if termo in vazamento_gd:

                if dadobruto == True:
                    alt_string = alterar_string.replace('/','')

                    Path(f'C:/Users/{getpass.getuser()}/Documents/Oblivion').mkdir(parents=True, exist_ok=True)
                    with open (f'{documentos}/Oblivion/RAW_pastebin_{alt_string}.txt','w', encoding='utf-16') as file_gd:
                        for content in html.select('.li1'):
                            leak = content.select_one('.de1')
                            lk = leak.text
                            file_gd.write(lk)

                    if gdr == 'PySide2.QtCore.Qt.CheckState.Checked':
                        subir_arquivo_drive_raiz(id_gdr,
                                                 f'RAW_pastebin_{alt_string}.txt',
                                                 'text/plain',
                                                 f'{documentos}/Oblivion/RAW_pastebin_{alt_string}.txt')

                lista_temp_gd = vazamento_gd.split('\n')
                for i in lista_temp_gd:

                    if termo in i:
                        alt_string = alterar_string.replace('/', '')
                        termo_novo = f'{termo}:pastebin.com/{alt_string}'
                        return termo_novo
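The innermost checks above keep only links that are not Google's own, look like http(s) URLs, and are not on the block list. A standalone sketch of that same filter with invented sample data:

block = ['https://pastebin.com']
urls = [
    'https://www.google.com/search?q=leak',
    'https://pastebin.com',
    'https://pastebin.com/AbCd1234',
    'ftp://example.org/file',
]
final_urls = [u for u in urls if 'google' not in u and 'http' in u and u not in block]
print(final_urls)  # ['https://pastebin.com/AbCd1234']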
Example #5
        #Get polarity based on the blob
        polarityInt = analysis.sentiment.polarity

        #Get subjectivity based on the blob
        subjectivityInt = analysis.sentiment.subjectivity

        # If polarity is greater than 0 then the tweet is positive, otherwise negative
        if polarityInt > 0.0:
            polarityStr = 'Positive'
        else: polarityStr = 'Negative'

        # If subjectivity is greater than 0.5 then the tweet is subjective, otherwise objective
        if subjectivityInt > 0.5:
            subjectivityStr = 'Subjective'
        else: subjectivityStr = 'Objective'

        #initialize var for url
        url = None
        words = tweet.text.split()
        link = URLExtract()
        urls = link.find_urls(tweet.text)
        for word in words:
            if 'http' in word:
                url = word

        fileOut.writerow([tweet.text, polarityStr, subjectivityStr, url])
        print(tweet.text)
        print('Polarity: ', polarityInt)
        print('Subjectivity:', subjectivityInt)

Example #6
    def get_data(self, url: str, freeword: str, date: str) -> dict:
        result_dict = {
            "取得日時": date,
            "検索語句": freeword,
            "店名": None,
            "住所": None,
            "定休日": None,
            "お店のホームページ": None,
            "席数": None,
            "スタッフ数": None,
            "スタッフ募集": None,
            "ホットペッパービューティ上のHP": None,
            "電話番号": None,
            "口コミ総数": 0,
            "総合": 0,
            "雰囲気": 0,
            "接客サービス": 0,
            "技術・仕上がり": 0,
            "メニュー・料金": 0
        }
        error_num = 0
        extractor = URLExtract()
        while True:
            if error_num >= 10:
                logger.warning("同一のURLに対するエラーが20回続いたので、このURLからの取得を終了します。")
                return result_dict

            try:
                req = requests.get(url)
                if int(req.status_code) != 200:
                    logger.error("Error {}: このページを取得出来ません。".format(
                        req.status_code))
                    return result_dict

                else:
                    html = lxml.html.fromstring(req.text)
                    result_tmp = {
                        i.text_content(): j.text_content()
                        for i, j in zip(html.cssselect("th.w120"),
                                        html.cssselect("th.w120 ~ td"))
                    }
                    result_dict["店名"] = [
                        i.text_content()
                        for i in html.cssselect("p.detailTitle > a")
                    ][0]
                    result_dict["ホットペッパービューティ上のHP"] = url
                    kuchikomi_dict = self.get_kuchikomi(
                        url.split("?")[0] + "review/")
                    result_tmp.update(kuchikomi_dict)

                    for key in result_tmp.keys():
                        if key in result_dict.keys():
                            if key == "電話番号":
                                result_dict[key] = self.get_tel(
                                    url.split("?")[0] + "tel/")
                            elif key == "お店のホームページ" or key == "ホットペッパービューティ上のHP" or key == "スタッフ募集":
                                result_dict[key] = extractor.find_urls(
                                    result_tmp[key])[0]
                            else:
                                result_dict[key] = result_tmp[key]

                    return result_dict

            except ConnectionError:
                logger.warning("Connection Errorが発生しました。")
                error_num += 1
                time.sleep(5)
Example #7
def parse_eml(eml):
    hashes = []
    urls = []
    responses = []

    for part in eml.walk():
        if part.get_content_disposition() != "attachment" and \
           part.get_content_type() in ("text/plain", "text/html"):
            text = str(part.get_payload(decode=True)).replace("\\n", " ")
            extractor = URLExtract()
            urls = list(set(extractor.find_urls(text)))

        if part.get_content_disposition() == "attachment":
            attach = base64.b64decode(part.get_payload())
            hashes.append(hashlib.sha256(attach).hexdigest())

    print(f"hashes: {hashes}")
    print(f"urls: {urls}")

    for shasum in hashes:
        artifact = Artifact.query.filter_by(handle=shasum).first()
        if (artifact):
            print(f"{shasum} already exists in DB")
            responses.append(json.loads(artifact.response))
        else:
            params = {'apikey': vtapi, 'resource': shasum}
            headers = {"Accept-Encoding": "gzip, deflate"}
            response = requests.get(
                'https://www.virustotal.com/vtapi/v2/file/report',
                params=params,
                headers=headers)
            json_response = response.json()

            artifact = Artifact(handle=shasum,
                                response=json.dumps(json_response))
            db.session.add(artifact)
            db.session.commit()

            responses.append(json_response)

    for url in urls:
        artifact = Artifact.query.filter_by(handle=url).first()
        if (artifact):
            print(f"{url} already exists in DB")
            responses.append(json.loads(artifact.response))
        else:

            headers = {
                "Accept-Encoding": "gzip, deflate",
            }
            params = {'apikey': vtapi, 'resource': url}
            response = requests.post(
                'https://www.virustotal.com/vtapi/v2/url/report',
                params=params,
                headers=headers)
            json_response = response.json()

            artifact = Artifact(handle=url, response=json.dumps(json_response))
            db.session.add(artifact)
            db.session.commit()

            responses.append(json_response)

    return responses
Example #8
                ]

                claim_dict[id] = {
                    'claim': claim,
                    'title': title,
                    'claim_proc': proc_claim,
                    'title_proc': proc_title,
                    'claim_clean': clean_claim,
                    'title_clean': clean_title,
                }

            cnt += 1

    json.dump(claim_dict, open('my_code/proc_data/claim_dict.json', 'w'))

url_extr = URLExtract()
for split in split_mp:
    data_loc = 'data/' + split
    data_dict = {}
    cnt = 0
    with open(data_loc + '/tweets.queries.tsv', 'r') as f:
        for line in f:
            if cnt:
                id, tweet = line.strip().split('\t')
                urls = url_extr.find_urls(tweet)

                proc_twit = text_processor_twit.pre_process_doc(tweet)

                clean_twit = [
                    word for word in proc_twit
                    if not re.search("[^a-z0-9.,\s]+", word)
Example #9
    def get_band_disco(soup, current_records):
        # URLExtract instance.
        extractor = URLExtract()

        # Open a session with the database.
        engine = create_engine('sqlite:///swedish_bands.db')
        # These two are needed for every database session.
        Base.metadata.create_all(bind=engine)
        Session = sessionmaker(bind=engine)
        session = Session()

        # From the "soup" object (its content will look like band_page.html) find <div id="band_disco">.
        disco_finder = soup.find("div", {"id": "band_disco"})
        # The resulting tags are converted to a string.
        s_disco_finder = str(disco_finder)
        # Extract all URLs present.
        disco_url = extractor.find_urls(s_disco_finder)

        # Take the first URL and assign it to a variable.
        url = disco_url[0]
        # Make a request with that URL.
        r = requests.get(url)

        # Handle odd characters, in case there are any.
        r.encoding = 'utf-8'

        # Convert the response into a BeautifulSoup object for later use.
        disco_soup = BeautifulSoup(r.content, 'html.parser')

        # From the "disco_soup" object (its content will look like disco.html) get all <tr> tags.
        disco_entries = disco_soup.find_all("tr")

        # Remove the first one because it is not needed.
        disco_entries.pop(0)

        # -> For each element in disco_entries:
        for item in disco_entries:
            # -> Instantiate the discography object and insert it.
            discography = fact.factory("discography")
            discography.band_id = current_records
            # -> Try:
            try:
                # -> In a loop with x < 3:
                for x in range(3):
                    # -> Find all <td> tags using index 'x'.
                    s = item.find_all("td")[x]
                    # -> Since the discography attributes come in 3 parts here, branch on x:
                    if x == 0:
                        discography.name = str(s.getText())
                    if x == 1:
                        discography.release_type = str(s.getText())
                    if x == 2:
                        discography.year = str(s.getText())
                    # -> Add the row.
                    session.add(discography)
                # Save changes.
                session.commit()
                # Close the session.
                session.close()
            except Exception as e:
                # If the band has no releases, just move on to the next one.
                session.close()
Example #10
 def __init__(self, message=''):
     self.extractor = URLExtract()
     self.message = message
Example #11
async def useless(event):  # sourcery no-metrics
    """Custom profile pics"""
    input_str = event.pattern_match.group(2)
    ext = re.findall(r"-\w+", input_str)
    try:
        flag = ext[0].replace("-", "")
        input_str = input_str.replace(ext[0], "").strip()
    except IndexError:
        flag = None
    list_link = get_collection_list("CUSTOM_PFP_LINKS")
    if flag is None:
        if gvarstatus("CUSTOM_PFP") is not None and gvarstatus(
                "CUSTOM_PFP") == "true":
            return await edit_delete(event, "`Custom pfp is already enabled`")
        if not list_link:
            return await edit_delete(
                event, "**ಠ∀ಠ  There no links for custom pfp...**")
        addgvar("CUSTOM_PFP", True)
        await edit_delete(event, "`Starting custom pfp....`")
        await custompfploop()
        return
    if flag == "l":
        if not list_link:
            return await edit_delete(
                event, "**ಠ∀ಠ  There no links set for custom pfp...**")
        links = "**Available links for custom pfp are here:-**\n\n"
        for i, each in enumerate(list_link, start=1):
            links += f"**{i}.**  {each}\n"
        await edit_delete(event, links, 60)
        return
    if flag == "s":
        if gvarstatus("CUSTOM_PFP") is not None and gvarstatus(
                "CUSTOM_PFP") == "true":
            delgvar("CUSTOM_PFP")
            await event.client(
                functions.photos.DeletePhotosRequest(
                    await event.client.get_profile_photos("me", limit=1)))
            return await edit_delete(event, "`Custompfp has been stopped now`")
        return await edit_delete(event, "`Custom pfp hasn't been enabled`")
    reply = await event.get_reply_message()
    if not input_str and reply:
        input_str = reply.text
    if not input_str:
        return await edit_delete(
            event,
            "**ಠ∀ಠ  Reply to valid link or give valid link url as input...**")
    extractor = URLExtract()
    plink = extractor.find_urls(input_str)
    if len(plink) == 0:
        return await edit_delete(
            event,
            "**ಠ∀ಠ  Reply to valid link or give valid link url as input...**")
    if flag == "a":
        for i in plink:
            if not is_in_list("CUSTOM_PFP_LINKS", i):
                add_to_list("CUSTOM_PFP_LINKS", i)
        await edit_delete(
            event,
            f"**{len(plink)} pictures sucessfully added to custom pfps**")
    elif flag == "r":
        for i in plink:
            if is_in_list("CUSTOM_PFP_LINKS", i):
                rm_from_list("CUSTOM_PFP_LINKS", i)
        await edit_delete(
            event,
            f"**{len(plink)} pictures sucessfully removed from custom pfps**")
Example #12
class ImageEmbed:
    def __init__(self, client, channel_ids, twitter_consumer_key,
                 twitter_consumer_secret, twitter_access_token_key,
                 twitter_access_token_secret):
        self.client = client
        self.channel_ids = channel_ids
        self.extractor = URLExtract()
        self.httpsession = aiohttp.ClientSession()
        self.message_cache = deque(maxlen=100)
        self.forced_embeds = deque(maxlen=100)
        self.ready = asyncio.Event()

        self.ready.set()

        self.twitter_pattern = re.compile(r"twitter.com/\w+/status/(\d+)")
        self.deviantart_pattern = re.compile(r"deviantart\.com.*.\d")
        self.pixiv_pattern = re.compile(r"www\.pixiv\.net\/en\/artworks\/(\d+)")

        self.deviantart_url = "https://backend.deviantart.com/oembed?url={}"

        self.twitterapi = twitter.Api(
            consumer_key=twitter_consumer_key,
            consumer_secret=twitter_consumer_secret,
            access_token_key=twitter_access_token_key,
            access_token_secret=twitter_access_token_secret,
            tweet_mode="extended")

        self.pixiv_session_url = "https://api.pixiv.moe/session"
        self.pixiv_url = "https://www.pixiv.net/ajax/illust/{}?lang=en"

    def should_spoiler(self, url, content):
        url = re.escape(url)
        match = re.search(r"\|\|\s*{}\s+\|\|".format(url), content)
        if match:
            return True
        return False

    async def get_rich_embed(self, url, message, force_ignore_embeds):
        return await self.get_twitter_embed(url, message, force_ignore_embeds) or \
            await self.get_deviantart_embed(url, message, force_ignore_embeds) or \
            await self.get_pixiv_embed(url, message, force_ignore_embeds)

    async def on_message(self, message):
        await self.post_image_embeds(message)

    async def post_image_embeds(self,
                                message,
                                channel=None,
                                force_ignore_embeds=False):
        if message.channel.id not in self.channel_ids or message.author == self.client.user:
            return
        if not channel:
            channel = message.channel
        self.ready.clear()
        urls = self.extractor.find_urls(message.content, True)
        urls = [url for url in urls if self.filter_link(url, message.content)]
        if any(self.pixiv_pattern.search(line)
               for line in urls) and not force_ignore_embeds:
            self.forced_embeds.append(message)
            if len(message.embeds):
                await message.edit(suppress=True)
        spoiler = []
        embeds = []
        for url in urls:
            rich_embed = await self.get_rich_embed(url, message,
                                                   force_ignore_embeds)
            if not rich_embed:
                continue
            embed, attachment = rich_embed
            if embed:
                embeds.append((embed, attachment))
                if self.should_spoiler(url, message.content):
                    spoiler.append(embed)
        to_cache = []
        for embed, attachment in embeds[:4]:
            if embed in spoiler:
                em_msg = await channel.send("||https://corr.in/s ||",
                                            embed=embed,
                                            files=attachment)
            else:
                em_msg = await channel.send(embed=embed, file=attachment)
            to_cache.append(em_msg)
        self.cache_message(message, to_cache)
        self.ready.set()

    def cache_message(self, message, embed_msgs):
        chosen = None
        for cache in self.message_cache:
            if message == cache["msg"]:
                chosen = cache
                break
        if not chosen:
            chosen = {"msg": message, "embed_msgs": []}
            self.message_cache.append(chosen)
        for em in embed_msgs:
            chosen["embed_msgs"].append(em)

    async def on_message_delete(self, message):
        if message.channel.id not in self.channel_ids or message.author == self.client.user:
            return
        await self.ready.wait()
        chosen = None
        for cache in self.message_cache:
            if message == cache["msg"]:
                chosen = cache
                break
        if chosen:
            for to_delete in chosen["embed_msgs"]:
                try:
                    await to_delete.delete()
                except discord.errors.NotFound:
                    continue
            self.message_cache.remove(chosen)

    async def on_message_edit(self, before, after):
        urls = []
        if after in self.forced_embeds and len(after.embeds):
            await after.edit(suppress=True)
            return
        for embed in after.embeds:
            if embed.url:
                url = embed.url
                url = url.replace("mobile.twitter.com", "twitter.com")
                urls.append(url)
        await self.ready.wait()
        chosen = None
        for cache in self.message_cache:
            if after == cache["msg"]:
                chosen = cache
                break
        if chosen:
            for potential in list(chosen["embed_msgs"]):
                if len(potential.embeds) and potential.embeds[0].url in urls:
                    try:
                        await potential.delete()
                    except discord.errors.NotFound:
                        continue
                    chosen["embed_msgs"].remove(potential)

    def filter_link(self, url, message_content):
        return message_content.count("<" + url +
                                     ">") < message_content.count(url)

    async def get_twitter_embed(self, url, message, force_ignore_embeds):
        url = url.replace("mobile.twitter.com", "twitter.com")
        twitter_id = self.twitter_pattern.search(url)
        if not twitter_id:
            return None
        twitter_id = int(twitter_id.group(1))
        tweet_status = self.twitterapi.GetStatus(twitter_id)
        if not tweet_status:
            return None
        if not hasattr(tweet_status, "media") or not tweet_status.media or len(
                tweet_status.media) == 0:
            return None
        if message not in self.forced_embeds and not force_ignore_embeds:
            for embed in message.embeds:
                if embed.footer and embed.footer.text == "Twitter":
                    if url == embed.url:
                        return None
        embed = discord.Embed(description=tweet_status.full_text,
                              color=1942002,
                              url=url)
        embed.set_footer(
            text="Twitter",
            icon_url="https://abs.twimg.com/icons/apple-touch-icon-192x192.png"
        )
        embed.set_image(url=tweet_status.media[0].media_url_https +
                        "?name=large")
        embed.set_author(name="{} ({})".format(tweet_status.user.name,
                                               tweet_status.user.screen_name),
                         url="https://twitter.com/{}".format(
                             tweet_status.user.screen_name),
                         icon_url=tweet_status.user.profile_image_url_https)
        embed.add_field(name="Retweets",
                        value=tweet_status.retweet_count,
                        inline=True)
        embed.add_field(name="Likes",
                        value=tweet_status.favorite_count,
                        inline=True)
        return embed, None

    async def get_deviantart_embed(self, url, message, force_ignore_embeds):
        da_link = self.deviantart_pattern.search(url)
        if not da_link:
            return None
        da_link = da_link[0]
        if message not in self.forced_embeds and not force_ignore_embeds:
            for embed in message.embeds:
                if embed.provider and embed.provider.name == "DeviantArt":
                    if da_link in embed.url:
                        return None
        async with self.httpsession.get(
                self.deviantart_url.format(da_link)) as resp:
            if resp.status < 200 or resp.status >= 300:
                return None
            result = await resp.json()
            if result["type"] != "photo":
                return None
            embed = discord.Embed(title="{} by {} on DeviantArt".format(
                result["title"], result["author_name"]),
                                  color=395021,
                                  url=url)
            embed.set_image(url=result["url"])
            embed.set_author(
                name=result["author_name"],
                url=result["author_url"],
                icon_url=
                "https://st.deviantart.net/eclipse/icons/android-192.png")
            return embed, None

    async def get_pixiv_embed(self, url, message, force_ignore_embeds):
        pixiv_link = self.pixiv_pattern.search(url)
        if not pixiv_link:
            return None
        pixiv_id = int(pixiv_link.group(1))
        pixiv = await self.fetch_pixiv(pixiv_id)
        if not pixiv:
            return None
        embed = discord.Embed(description=pixiv.get("description", None),
                              color=12123135,
                              url=url,
                              title=pixiv.get("title", None))
        embed.set_footer(
            text="Pixiv",
            icon_url="https://s.pximg.net/common/images/apple-touch-icon.png")
        image = pixiv["urls"]["regular"]
        file_object = None
        file_extension = image.split(".")[-1]
        file_name = "image.{}".format(file_extension)
        headers = {
            "user-agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
            "accept-language": "en-US,en;q=0.9",
            "referer": "https://www.pixiv.net/"
        }
        async with self.httpsession.get(image, headers=headers) as resp:
            file_object = io.BytesIO(await resp.read())
            file_object.seek(0)
        discord_file = discord.File(file_object, file_name)
        embed.set_image(url="attachment://{}".format(file_name))
        embed.set_author(name="{}".format(pixiv["userName"]),
                         url="https://www.pixiv.net/en/users/{}".format(
                             pixiv["userId"]))
        return embed, discord_file

    async def fetch_pixiv(self, pixiv_id):
        now = datetime.datetime.now()
        headers = {
            "user-agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
            "accept-language": "en-US,en;q=0.9",
            "referer": "https://www.pixiv.net/en/artworks/{}".format(pixiv_id)
        }
        async with self.httpsession.get(self.pixiv_url.format(pixiv_id),
                                        headers=headers) as resp:
            if resp.status < 200 or resp.status >= 300:
                return None
            result = await resp.json()
            return result["body"]
        return None
Example #13
    def check_citation(self, citation):
        #NOTE: when implementing, wrap the method in a try catch and print out any error + the citation status

        try:
            pattern = re.compile("[ ][0-9]{4}")
            result = pattern.search(citation)
            self.year = result.group(0)[1:]
        except:
            raise Exception("Unable to find year in citation.")

        self.citation_status = MLACitationStatus.AUTHOR

        cursor = 0

        while True:
            ascii_value = ord(citation[cursor])

            # advance while we have not hit ". " and the character is one of " ',-." or an English/Latin-1 letter
            if citation[cursor:cursor + 2] != ". " and (
                    ascii_value == 32 or ascii_value == 39
                    or 44 <= ascii_value <= 46 or 65 <= ascii_value <= 90
                    or 97 <= ascii_value <= 122 or 192 <= ascii_value <= 255):
                cursor += 1
            else:
                break

        if cursor != 0:
            author_section = ""
            if citation[cursor:cursor + 2] == ". ":
                author_section = citation[:cursor + 1]
            else:
                raise Exception(
                    "Bad formatting in the author section (unknown error).")

            # three or more authors
            if ", et al." in author_section:
                temp = author_section.replace(", et al", "")
                authors = temp.split(", ")
                filteredAuthor = [self.filter_latin(i) for i in authors]

                if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                and re.match("^[A-Z][A-Za-z-'. ]+[.]$", filteredAuthor[1]) is not None:
                    self.authors.append(authors[0] + ", et al.")
                else:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

            # two authors
            elif ", and " in author_section:
                authors = author_section.split(", and ")
                if ", " not in authors[0]:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

                firstAuthor = authors[0].split(", ")
                filteredFirstAuthor = [
                    self.filter_latin(i) for i in firstAuthor
                ]

                if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredFirstAuthor[0]) is not None \
                and re.match("^[A-Z][A-Za-z-'. ]+$", filteredFirstAuthor[1]) is not None:
                    self.authors.append(firstAuthor[0])
                else:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

                if " " not in authors[1]:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

                secondAuthor = authors[1].split(" ", 1)
                filteredSecondAuthor = [
                    self.filter_latin(i) for i in secondAuthor
                ]

                if re.match("^[A-Z][A-Za-z-']+$", filteredSecondAuthor[0]) is not None \
                and re.match("^[A-Za-z][A-Za-z-'. ]+[.]$", filteredSecondAuthor[1]) is not None:
                    self.authors.append(filteredSecondAuthor[1][:-1])

                elif re.match("^[A-Za-z][.]$",
                              filteredSecondAuthor[1]) is not None:
                    author_cursor = cursor + 2
                    actualSecondAuthor = ""

                    while citation[author_cursor:author_cursor + 2] != ". ":
                        actualSecondAuthor += citation[author_cursor]
                        author_cursor += 1

                    self.authors.append(actualSecondAuthor)

                else:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

            # one author
            elif ", " in author_section:
                authors = author_section.split(", ")
                filteredAuthor = [self.filter_latin(i) for i in authors]

                if re.match("^[A-Za-z][A-Za-z-' ]+$", filteredAuthor[0]) is not None \
                and re.match("^[A-Z][A-Za-z-' ]+[.]$", filteredAuthor[1]) is not None:
                    self.authors.append(authors[0])
                else:
                    raise Exception("Bad formatting in the author section: '" +
                                    author_section + "'")

            elif "et. al." in author_section or "et.al." in author_section:
                raise Exception(
                    "'Et al.' should not have a period after the 'Et'.")
            # no match; bad formatting
            else:
                raise Exception("Bad formatting in the author section: '" +
                                author_section + "'")

        self.citation_status = MLACitationStatus.TITLE
        cursor += 1
        # check the title section
        if citation[cursor:cursor + 3] == "<i>":
            cursor += 3
        elif citation[cursor + 1:cursor + 4] == "<i>":
            cursor += 4
        elif citation[cursor + 1] == "\"":
            cursor += 2
        elif citation[cursor - 1:cursor + 1] == ".\"":
            raise Exception("Bad formatting in the title section.")

        title = ""

        while citation[cursor] != ".":
            title += citation[cursor]
            cursor += 1

        title = title.replace("\"", "")
        title = title.replace("</i>", "")

        if title[0] == " ":
            title = title[1:]

        if citation[cursor + 1] == "\"":
            cursor += 2
        else:
            cursor += 1
        #now cursor should be at the beginning of italics

        result = url("https://brettterpstra.com/titlecase/?title=" + title)
        title_cased_title = result.read().decode('utf-8')

        if title != title_cased_title:
            self.warnings.append(
                "the title might contain improper capitalization: '" + title +
                "'")

        self.title = title

        # check for url
        self.citation_status = MLACitationStatus.URL

        extractor = URLExtract()
        if extractor.has_urls(citation):
            urls = extractor.find_urls(citation)
            self.url = urls[0][:-1]
            if self.url + "." not in citation:
                raise Exception("Bad formatting in the URL section.")

            if citation[cursor:cursor +
                        3] != "<i>" and citation[cursor + 1:cursor +
                                                 4] != "<i>":
                self.warnings.append(
                    "the container may not exist or may not be italicized")

        elif citation[cursor:cursor +
                      3] == "<i>" and citation[cursor + 1:cursor + 4] == "<i>":
            self.warnings.append(
                "the container might exist when not necessary (if the citation is about a book), or the block immediately following the title may be improperly italicized."
            )

        if self.url != "":
            citation = citation.replace(self.url + ".", "")

        # check for other info
        # right now, it's too complex to validate the entire MLA citation without prior knowledge on what type of citation it is,
        # so the other info is just stored without checking
        self.citation_status = MLACitationStatus.OTHER_INFO

        remainingText = citation[cursor:]
        info = remainingText.split(", ")
        self.otherInfo = [i for i in info]
Example #14
import re, os
from nltk.corpus import stopwords
import json
from urlextract import URLExtract

my_loc = os.path.dirname(__file__)

nlp = spacy.load('en_core_web_lg')

split_mp = {'training': 'train', 'dev': 'val', 'test': 'test'}

text_processor_wiki = get_text_processor(word_stats='english')
text_processor_twit = get_text_processor(word_stats='twitter')

df_stopwords = set(stopwords.words('english'))
url_extr = URLExtract()

for split in split_mp:
    tr_file = open('data/%s.tsv' % (split), 'r')
    data_dict = {}
    cnt = 0
    for line in tr_file:
        if cnt:
            if split != 'test':
                topic, id, link, content, claim, worthy = line.strip().split(
                    '\t')
            else:
                topic, id, link, content = line.strip().split('\t')
                claim, worthy = 0, 0

            urls = url_extr.find_urls(content)
Example #15
    #keywords
    pq()
    pqc()
    
    #abstract
    pq()
    pqc()
    
    #cover_image (may not work, but might as well try)
    pq()
    cover_image = 'https://cross-currents.berkeley.edu'+photoessay['Image']
    print(cover_image, end='')
    pqc()
    
    #pdf_url, extract from the File column
    extractor = URLExtract()
    pdf_urls = extractor.find_urls(photoessay['File'])
    if len(pdf_urls) >= 1: #sometimes the extractor finds more than one URL, we should just always use the first
      pdf_url = pdf_urls[0]
    else:
      pdf_url = 'ERROR, no PDF URL found, content-type: ' + photoessay['Content type'] + '; Content ID: ' + photoessay['Content ID'] + '; Article Type: ' + photoessay['Article Type']
    pq()
    print(urllib.parse.unquote(pdf_url), end='')
    pqc()

    # Add 3 blank cells here at the end, because supplemental files follow on additional lines
    print(3*'\t', end='')    

    print('') # let's wrap up this photoessay

Example #16
    spotifyPlaylistId = spotifyConfig["playlistID"]
    spotifyCtr = spotifyConfig["ctr"]
    spotifyUser = spotifyConfig["spotifyUser"]

# Youtube config stuff
with open("config/YTconfig.json") as f:
    youtubeConfig = json.load(f)

with open("config/slack.json") as f:
    slackConfig = json.load(f)
    slackToken = slackConfig["token"]
    slackChannel = slackConfig["channel"]
    slackTeam = slackConfig["team"]

slack_client = slack.WebClient(token=slackToken)
extractor = URLExtract()  # declare extractor for later

# method to post a (parameter) message to slack, visible to channel


def slack_response(message, userID):
    print("Sending slack response.")

    message = ("{}".format(message))
    slack_client.chat_postMessage(token=slackToken,
                                  as_user=False,
                                  channel=slackChannel,
                                  text=message)


# method to post an ephemeral message to the chat - only the user will see it
Example #17
 def getwebsite(self, data):
     extractor = URLExtract()
     urls = extractor.find_urls(data)
     return '  ,  '.join(urls)
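A rough standalone sketch of what the method above does, without the class wrapper (the sample text is invented):

from urlextract import URLExtract

def getwebsite(data):
    # Same body as the method above, minus self
    extractor = URLExtract()
    urls = extractor.find_urls(data)
    return '  ,  '.join(urls)

print(getwebsite("Mirrors: https://example.com and http://mirror.example.org"))
# roughly: https://example.com  ,  http://mirror.example.org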
Example #18
    def initialize(self):
        self.logNotify("Initializing LibreSelery")

        self.seleryPackageInfo = os_utils.getPackageInfo("libreselery")
        if self.seleryPackageInfo:
            self.log("LibreSelery version [%s]" % self.seleryPackageInfo["version"])
        else:
            # when project is executed locally without installation, seleryPackageInfo is empty
            self.log("LibreSelery version [undefined]")

        self.log("Preparing Configuration")
        # find all configs in potentially given config directory
        foundConfigs = []
        if self.config.config_dir:
            for root, dirs, files in os.walk(self.config.config_dir):
                for f in files:
                    ext = os.path.splitext(f)[1]
                    if ext == ".yml":
                        foundConfigs.append(os.path.join(root, f))
        # group all found configs together with individually given configuration paths from user on top
        self.config.config_paths = foundConfigs + self.config.config_paths
        # apply yaml config to our configuration if possible
        self.log("Loading configurations" % self.config.config_paths)
        [print(" -- %s" % path) for path in self.config.config_paths]
        [self.loadYaml(path) for path in self.config.config_paths]

        # finalize our configuration settings
        self.config.finalize()

        # load the README file and check if wallet address for donation matches the configured wallet address. Before payout this address is also matched against the address of the coinbase user
        extractor = URLExtract()
        fundingPath = self._getFile("README.md")
        if fundingPath is not None:
            self.log("Loading funding file [%s] for bitcoin wallet" % fundingPath)
            mdfile = open("README.md", "r")
            mdstring = mdfile.read()
            urls = extractor.find_urls(mdstring)
            badge_string = "https://badgen.net/badge/LibreSelery-Donation/"
            for url in urls:
                if badge_string in url:
                    self.config.bitcoin_address = url.split(badge_string, 1)[1]
                    self.log("Found bitcoin address [%s]" % self.config.bitcoin_address)
        else:
            self.log(
                "Using bitcoin address from configuration file for validation check [%s]"
                % self.config.bitcoin_address
            )

        # Create a new QR code based on the configured wallet address
        self.log("Creating QR code PNG image for funders")
        wallet_qrcode = QRCode(error_correction=1)
        wallet_qrcode.add_data(self.config.bitcoin_address)
        wallet_qrcode.best_fit()
        wallet_qrcode.makeImpl(False, 6)
        wallet_image = wallet_qrcode.make_image()
        wallet_image.save(
            os.path.join(self.config.result_dir, "public", "wallet_qrcode.png")
        )

        # load tooling url
        if self.config.include_tooling_and_runtime and self.config.tooling_path:
            with open(self.config.tooling_path) as f:
                self.config.toolrepos = yaml.safe_load(f)
            if self.config.toolrepos is not None:
                self.log("Tooling file loaded [%s]" % self.config.toolrepos)
            else:
                self.log("No tooling urls found")
        else:
            self.log("Tooling not included")

        # load our environment variables
        self.loadEnv()

        self.logNotify("Initialized")
        self.log(str(self.getConfig()))
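A toy sketch of the badge-URL parsing step above (the badge URL and wallet address are invented; the real code iterates over the URLs found in README.md):

badge_string = "https://badgen.net/badge/LibreSelery-Donation/"
url = badge_string + "bc1qexampleaddress0000000000000000000000"
if badge_string in url:
    bitcoin_address = url.split(badge_string, 1)[1]
    print(bitcoin_address)  # bc1qexampleaddress0000000000000000000000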
Example #19
def main():
    x = 1
    while x <= pages:
        url_org = f'https://github.com/search?p={x}&q=org%3A{organization}+{query}&type=code'


        page = s.get(url_org).text
        if 'We couldn’t find any code matching' in page:
            print(colored('\nNo Repositories Found. Please check the Organization name.' , 'red'))
            sys.exit(1)
        soup = BeautifulSoup(page, 'html5lib')

        url_list = []

        for link in soup.findAll('a'):
            inside_file = link.get('href')
            full_url = 'https://github.com/' + inside_file

            head = full_url.partition('#')
            url_list.append(head[0])
            
        final_url_list = set(url_list)
        final_url_list = list(final_url_list)


        total_repositories = len(final_url_list)
        
        print("\n")
        if total_repositories == 0 and x < 2:
            print(colored("Make sure your credentials are properly configured.", 'red'))
            sys.exit(1)
        if total_repositories ==0:
            print('Cannot find more S3 Buckets.')
            sys.exit(1)
        else:
            print(f"Fetching Data from Page: {x}")
            print("\n")


        for i in (final_url_list):
            inner_url = i
            inner_url_fetch = s.get(inner_url).text
            extractor = URLExtract()
            for bucketurl in extractor.gen_urls(inner_url_fetch):
                if bucketurl not in exclude and 'https://github.com/' not in bucketurl and args.q in bucketurl:
                    try:
                        check_takeover = requests.get(bucketurl)
                        status = check_takeover.status_code
                        o1 = (f'[{status}] - {bucketurl}\n')
                        if args.o:
                            file = open(args.o, 'a')
                            file.write(o1)
                        print(f'[{status}] - {bucketurl} ')
                    except:
                        pass
                    try:
                        check_takeover_response = check_takeover.content
                        check_takeover_response = str(check_takeover_response)
                        if 'NoSuchBucket' in check_takeover_response:
                            s3_text = (colored('[S3 Bucket Takeover]', 'green'))
                            o2 = (f'{s3_text} : {bucketurl}\n')
                            print(f'{s3_text} : {bucketurl}')
                            if args.o:
                                file=open(args.o, 'a')
                                file.write(o2)

                    except:
                        pass
                    
        x=x+1
Example #20
def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    for url in extractor.gen_urls(body):
        if len(url) < 5 or '.' not in url:
            continue
        if url.count('http') == 1:
            url = url.split('http')[1]
            url = 'http{}'.format(url)
        if '(' in url:
            rurl = url.split('(')
            if extractor.has_urls(rurl[1]):
                url = rurl[1]
            elif extractor.has_urls(rurl[0]):
                url = rurl[0]
            else:
                continue
        if ')' in url:
            lurl = url.split(')')
            if extractor.has_urls(lurl[0]):
                url = lurl[0]
            elif extractor.has_urls(lurl[1]):
                url = lurl[1]
            else:
                continue
        sem = 0
        for suffix in excluded:
            if url.endswith(suffix):
                sem = 1
        if sem == 1:
            continue
        # """
        if '[IMG]' in url:
            try:
                url = url.split('[IMG]')[1]
            except IndexError:
                pass
        if '[/IMG]' in url:
            try:
                url = url.split('[/IMG]')[0]
            except IndexError:
                pass
        if url.endswith('?fb'):
            url = url.replace('?fb', '')
        if url.endswith('?noredirect'):
            url = url.replace('?noredirect', '')
        elif url.endswith(
                '_d.jpg?maxwidth=640&amp;shape=thumb&amp;fidelity=medium'):
            url = url.replace(
                '_d.jpg?maxwidth=640&amp;shape=thumb&amp;fidelity=medium', '')
        elif url.endswith('?s=sms'):
            url = url.replace('?s=sms', '')
        if '//m.imgur.com' in url:
            url = url.replace('//m.imgur.com', '//imgur.com')
        if url.startswith('https://thumbs.gfycat.com/'):
            url = url.replace('https://thumbs.gfycat.com/',
                              'https://gfycat.com/')
        if url.endswith('-size_restricted.gif'):
            url = url.replace('-size_restricted.gif', '')
        # """
        urlset.add(url)
    return urlset
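A quick sketch of how the cleaner above behaves on forum-style markup, assuming extract_urls() from this example is in scope; the sample body is invented:

body = ("Look at [IMG]https://m.imgur.com/abc123.jpg[/IMG] and "
        "https://thumbs.gfycat.com/SomeClip-size_restricted.gif")
for url in extract_urls(body):
    print(url)
# roughly: https://imgur.com/abc123.jpg and https://gfycat.com/SomeClip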
Example #21
    def blog_post_send_to_preview(self):
        if not self.preview_user_id:
            raise Exception(_('必须输入用户信息'))
        ctx = self._context.copy()
        wx_media = self.env['wx.media']

        active_model = ctx.get('active_model')
        active_ids = ctx.get('active_ids', [])

        blogs = self.env[active_model].browse(active_ids)
        server_url = self.env['ir.config_parameter'].sudo().get_param('server_url')
        articless = []
        for blog in blogs:
            thumb_media_id = False
            wx_file_path = get_module_resource('e2yun_blog_post_list_extends', 'static/wx')
            # file_image = blog.main_image
            if True:
                if blog.cover_properties:
                    cover_properties = eval(blog.cover_properties)
                    if 'background-image' in eval(blog.cover_properties):
                        imageurl = cover_properties['background-image'].replace('url(', '').replace(')', '')
                        if 'http' not in imageurl:
                            # imageurl = server_url + imageurl

                            attench_id = imageurl.replace('/web/image/', '')[
                                         0: imageurl.replace('/web/image/', '').index('/')]

                            datas = self.env['ir.attachment'].browse(int(attench_id)).datas

                            img = base64.b64decode(datas)
                            file = open('%s/thumb.jpg' % wx_file_path, 'wb')
                            file.write(img)
                            file.close()

                        else:
                            urlretrieve(imageurl, '%s/thumb.jpg' % wx_file_path)
                        quality = 80
                        step = 5
                        while os.path.getsize('%s/thumb.jpg' % wx_file_path) / 1024 > 64:
                            file_path = '%s/thumb.jpg' % wx_file_path
                            im = Image.open(file_path)
                            # Get the image size:
                            # w, h = im.size
                            # Scale it down:
                            # im.resize((int(w / 0.8), int(h / 0.8)), Image.ANTIALIAS)
                            # Save the scaled image in JPEG format:
                            if im.mode == "P":
                                im = im.convert('RGB')
                            im.save(file_path, 'JPEG', quality=quality)
                            if quality - step < 0:
                                break
                            quality -= step
                    else:
                        raise Exception(_('必须要有封面图片,请在文章编辑中输入!'))

                    # img = base64.b64decode(blog.main_image)
                    # file = open('%s/thumb.gif' % wx_file_path, 'wb')
                    # file.write(img)
                    # file.close()

                    thumb_media_upload = wx_media.upload_image('%s/thumb.jpg' % wx_file_path)
                    thumb_media_id = thumb_media_upload['thumb_media_id']
                else:
                    raise Exception(_('必须要有封面图片,请在文章编辑中填入!'))
                extractor = URLExtract()
                urls = extractor.find_urls(blog.content, only_unique=True)
                wx_content = blog.content
                for url in urls:
                    try:
                        urlretrieve(url, '%s/news.jpg' % wx_file_path)
                        import imghdr
                        imgType = imghdr.what('%s/news.jpg' % wx_file_path)
                        if imgType:
                            news_media_upload = wx_media.upload_news_picture('%s/news.jpg' % wx_file_path)
                            wx_content = wx_content.replace(url, news_media_upload['url'])
                    except:
                        continue

                blog.wx_content = wx_content
                blog.thumb_media_id = thumb_media_id
                blog.transfer_to_wx_flag = True
                try:
                    os.remove('%s/thumb.jpg' % wx_file_path)
                    os.remove('%s/news.jpg' % wx_file_path)
                except:
                    pass
            blog_url = server_url + blog.website_url
            articles = {
                "thumb_media_id": blog.thumb_media_id,
                "author": blog.create_uid.name,
                "title": blog.name,
                "content_source_url": blog_url,
                "content": '%s' % blog.wx_content,
                "digest": blog.subtitle,
                "show_cover_pic": 1,
                "need_open_comment": 1,
                "only_fans_can_comment": 1
            }
            articless.append(articles)

        randon_number = random.randint(100000, 999999)
        mediaid = wx_media.upload_articles(articless, '我的文章-%s' % randon_number)
        print(mediaid)

        wx_media = self.env['wx.media'].search([('media_id', '=', mediaid['media_id'])])
        preview_user_id = self.preview_user_id

        self.env['wx.send.mass'].create(
            {'wx_media_id': wx_media.id, 'preview_user_id': preview_user_id.id}).preview_send()

        return {
            'warning': {
                'title': 'Tips',
                'message': '同步成功'
            }
        }
Example #22
class VK:
    def __init__(self):

        log_tag = 'VK - init'
        self.settings_tag = 'VK'

        self.extractor = URLExtract()
        self.config = Config()
        try:
            self.vk_bot = vk_api.VkApi(
                token=str(self.config.read(self.settings_tag, 'bot_token')))
            self.api_bot_vk = self.vk_bot.get_api()
            Log().info(log_tag, 'Инициализация токена-бота VK успешна.')
        except Exception as e:
            Log().error(log_tag, e)

        p_name = 'ЛИНКЕР'
        p_channel = 'hackathon'
        p_version = '0.0.1'
        desc = 'Бот, создающий сокращенные vk.cc ссылки прямо в диалоге.'
        self.info = f'{p_name} {p_version} ({p_channel})\n\n{desc}\n\nбеседа %peer_id%'

    def long_poll(self):
        tag = 'VK - Message LongPoll'
        from vk_api.bot_longpoll import VkBotLongPoll, VkBotEventType

        long_poll_bot = VkBotLongPoll(
            self.vk_bot,
            int(self.config.read(self.settings_tag, "community_id")))

        for event in long_poll_bot.listen():
            try:
                if event.type == VkBotEventType.MESSAGE_NEW:
                    Log().info(
                        tag,
                        f'Новое сообщение от \"https://vk.com/id{event.obj.from_id}\".\n'
                        f'Текст сообщения:\t\n{event.obj.text}\n'
                        f'Прикрепленные аттачи:\t\n{event.obj.attachments}\n'
                        f'Пересланные сообщения:\t\n{event.obj.fwd_messages}')
                    self.listener(event)

                elif event.type == VkBotEventType.MESSAGE_REPLY:
                    Log().info(tag, f'Бот ответил в чате {event.obj.peer_id}.')

                else:
                    Log().info(
                        tag, f'Обнаружено новое действие: {event.type} от '
                        f'\"https://vk.com/id{event.obj.from_id}\"')

            except Exception as e:
                Log().error(tag, e)

    def listener(self, event):
        tag = "VK - Message Listener"
        Log().info(tag, 'Обрабатываю сообщение...')
        from_id = event.obj.from_id
        peer_id = event.obj.peer_id
        msg_text = str(event.obj.text)
        msg_attach = event.obj.attachments
        msg_fwd = event.obj.fwd_messages
        Log().info(tag, 'Обработка завершена. ')

        if self.extractor.has_urls(msg_text) or msg_attach or msg_fwd:
            response_links = []
            if self.extractor.has_urls(msg_text):
                links = self.extractor.find_urls(msg_text)
                Log().info(tag, 'Найдены объекты типа ссылка.')
                for link in links:
                    response_links.append(
                        self.get_cc_link(link, 0)['short_url'])

            if msg_attach:
                for i in range(len(msg_attach)):
                    attach_type = msg_attach[i]['type']
                    if attach_type == 'link':
                        ath_url = msg_attach[i][attach_type]['url']
                        response_links.append(
                            str(self.get_cc_link(ath_url, 0)['short_url']))

            if msg_fwd:
                for i_fwd in range(len(msg_fwd)):
                    fwd_text = msg_fwd[i_fwd]['text']
                    fwd_attaches = msg_fwd[i_fwd]['attachments']
                    for i_ath in range(len(fwd_attaches)):
                        fwd_ath_type = fwd_attaches[i_ath]['type']
                        if fwd_ath_type == 'link':
                            fwd_ath_link = msg_fwd[i_fwd]['attachments'][
                                i_ath][fwd_ath_type]['url']
                            response_links.append(
                                str(
                                    self.get_cc_link(fwd_ath_link,
                                                     0)['short_url']))

                    for fwd_url in self.extractor.find_urls(fwd_text):
                        response_links.append(
                            str(self.get_cc_link(fwd_url, 0)['short_url']))

            response_links_wd = list(dict.fromkeys(response_links))

            if not response_links_wd:
                response_str = '🐸 Ссылок нет.'
            elif len(response_links_wd) > 1:
                response_str = '🔗 Вот твои ссылки из сообщения:\n\n'
                for link in response_links_wd:
                    response_str += link + '\n'
            else:
                response_str = ('🔗 Была найдена лишь одна ссылка в сообщении: '
                                + response_links_wd[0])

            self.send_message(peer_id, response_str)

        elif (((from_id == 140830142) and
               ('info' in msg_text or 'инфо' in msg_text or 'i' in msg_text)) or
              ('ping' in msg_text or 'пинг' in msg_text)):
            Log().info(tag, 'Инфо о боте.')
            self.send_message(peer_id, 'понг')
            self.send_message(peer_id,
                              self.info.replace("%peer_id%", str(peer_id)))

        else:
            Log().info(tag, 'Неизвестная команда.')
            self.send_message(event.obj.peer_id, '🐸 Ссылок нет.')

    def get_cc_link(self, url, private):
        cc_link = self.api_bot_vk.utils.getShortLink(url=url, private=private)
        return cc_link

    def send_message(self, user_id, text):
        self.api_bot_vk.messages.send(peer_id=user_id,
                                      message=text,
                                      random_id=get_random_id(),
                                      dont_parse_links=1)
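
# A minimal standalone sketch of the detect -> shorten -> deduplicate flow used in
# listener() above, with the vk.cc call replaced by a hypothetical echo stub so it
# runs without a VK token:
from urlextract import URLExtract

extractor = URLExtract()

def shorten(url):
    # stand-in for api_bot_vk.utils.getShortLink(); the real call also returns a
    # dict with a 'short_url' key
    return {'short_url': url}

msg_text = 'docs: https://vk.com/dev and https://vk.com/dev again'
links = extractor.find_urls(msg_text) if extractor.has_urls(msg_text) else []
short_links = [shorten(link)['short_url'] for link in links]
unique_links = list(dict.fromkeys(short_links))  # drop duplicates, keep order
print(unique_links)  # ['https://vk.com/dev']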
Example #23
0
import re
import tensorflow as tf
from urlextract import URLExtract
import os, sys
url_extract_api = URLExtract()

fwobj = tf.gfile.GFile("/data/albert/my_chinese_pretrain.txt", "w")


def clean(text):
    text = re.sub("""(<[=0-9a-zA-Z\/&"":_\\.]+>;?)+""", "", text)
    text = re.sub("""((&|#|$)+[0-9a-zA-Z]+;?)+""", "", text)
    text = re.sub("""[★☆\u3000]+""", "", text)
    try:
        urls = url_extract_api.find_urls(text)
        for url in urls:
            text = text.replace(url, "")
        return text
    except:
        return text


def process(document):
    init_len = 0
    index = 0
    document = "".join(document)
    document = clean(document)
    sentences = re.split(r"([。!!??;;])", document)
    document = ["".join(i) for i in zip(sentences[0::2], sentences[1::2])]

    context = "".join(document)
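
# The snippet above is cut off; as a standalone check of the URL stripping and the
# sentence-pairing trick used in process() (the sample string is made up and reuses
# the module-level url_extract_api defined above; no TensorFlow needed):
sample = "第一句话。详情见 http://example.com 哦!还有第二句话?结尾"
for url in url_extract_api.find_urls(sample):
    sample = sample.replace(url, "")

sentences = re.split(r"([。!!??;;])", sample)
# pair every sentence with the delimiter that followed it; a trailing fragment
# without closing punctuation ("结尾" here) is dropped by this pairing
paired = ["".join(p) for p in zip(sentences[0::2], sentences[1::2])]
print(paired)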
Example #24
0
def extract_urls(body):
    urlset = set()
    extractor = URLExtract()
    excluded = [
        '.id', '.you', '.lol', '.like', '.now', '.my', '.love', '.phone',
        '.how', '.post', '.me', '.got', '.hot', '.im', '.best'
    ]
    try:
        generatedUrls = extractor.gen_urls(body)
        for url in generatedUrls:
            if len(url) < 5 or '.' not in url:
                continue
            if url.count('http') == 1:
                url = url.split('http')[1]
                url = 'http{}'.format(url)
            if '(' in url:
                rurl = url.split('(')
                if extractor.has_urls(rurl[1]):
                    url = rurl[1]
                elif extractor.has_urls(rurl[0]):
                    url = rurl[0]
                else:
                    continue
            if ')' in url:
                lurl = url.split(')')
                if extractor.has_urls(lurl[0]):
                    url = lurl[0]
                elif extractor.has_urls(lurl[1]):
                    url = lurl[1]
                else:
                    continue
            if any(url.endswith(suffix) for suffix in excluded):
                continue
            # """
            if '[IMG]' in url:
                try:
                    url = url.split('[IMG]')[1]
                except IndexError:
                    pass
            if '[/IMG]' in url:
                try:
                    url = url.split('[/IMG]')[0]
                except IndexError:
                    pass
            if url.endswith('?fb'):
                url = url.replace('?fb', '')
            if url.endswith('?noredirect'):
                url = url.replace('?noredirect', '')
            elif url.endswith(
                    '_d.jpg?maxwidth=640&amp;shape=thumb&amp;fidelity=medium'):
                url = url.replace(
                    '_d.jpg?maxwidth=640&amp;shape=thumb&amp;fidelity=medium',
                    '')
            elif url.endswith('?s=sms'):
                url = url.replace('?s=sms', '')
            if '//m.imgur.com' in url:
                url = url.replace('//m.imgur.com', '//imgur.com')
            if url.startswith('https://thumbs.gfycat.com/'):
                url = url.replace('https://thumbs.gfycat.com/',
                                  'https://gfycat.com/')
            if url.endswith('-size_restricted.gif'):
                url = url.replace('-size_restricted.gif', '')
            # """
            urlset.add(url)
        return urlset
    except AttributeError as e:
        print(
            "While generating urls, an AttributeError (specifically {e}) was raised. "
            "Moving on without extracting urls for now. This is likely an error with the "
            "python library URLExtract (https://github.com/lipoja/URLExtract). The issue "
            "has been fixed (see the fix here: "
            "https://github.com/lipoja/URLExtract/commit/aa51f52e77b104932c49fb14882c632f12b6e940) "
            "but it has not been included in the most recent release. Please install the "
            "version from GitHub to fix this issue (e.g. "
            "pip3 install git+https://github.com/lipoja/URLExtract.git).".format(e=e))
        return urlset  # may contain any urls collected before the error
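
# Illustrative call of extract_urls() above; the body text is made up and the exact
# result can vary slightly between URLExtract versions:
body = ("see https://m.imgur.com/abc123?fb and "
        "https://thumbs.gfycat.com/SomeClip-size_restricted.gif or example.com")
print(extract_urls(body))
# roughly expected: {'https://imgur.com/abc123', 'https://gfycat.com/SomeClip', 'example.com'}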
Example #25
0
 def __init__(self):
     self.extractor = URLExtract()
Example #26
0
from typing import List
import hashlib

from urllib.parse import urlparse
from urlextract import URLExtract

import aiocache
import aiohttp
import discord
from discord.ext import commands

from alttprbot.database import config  # TODO switch to ORM
from alttprbot.util import http
from alttprbot import models

urlextractor = URLExtract()


class Moderation(commands.Cog):
    def __init__(self, bot):
        self.bot: commands.Bot = bot

    @commands.Cog.listener()
    async def on_message(self, message: discord.Message):
        # don't moderate if a DM
        if message.guild is None:
            return

        # don't moderate if sent by a real bot
        if message.author.id == self.bot.user.id:
            return
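        # The snippet is cut off above; a purely hypothetical continuation showing how
        # the module-level urlextractor could be used for link moderation here (the
        # warn-only response and its wording are assumptions, not the project's logic):
        if urlextractor.has_urls(message.content):
            await message.channel.send(
                f"{message.author.mention}, that message contains a link.")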
Example #27
0
def get_urls(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    return ','.join(urls), len(urls)
Example #28
0
def find_urls_in_text(text):
    extractor = URLExtract()
    return set(extractor.find_urls(text))
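
# Quick illustrative calls of get_urls() and find_urls_in_text() above
# (the sample strings are made up):
print(get_urls("see https://example.com and http://example.org"))
# -> ('https://example.com,http://example.org', 2)
print(find_urls_in_text("https://example.com mentioned twice: https://example.com"))
# -> {'https://example.com'}; the set drops the duplicate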
Example #29
0
import re
from urlextract import URLExtract
from settings import REMOVE_URLS_WL_DATA as WHITELIST

url_extractor = URLExtract()


def remove_urls(text, limit_not_remove=140, placeholder='***'):
    """Removes URLs from given text

    Args:
        text (str): Text
        limit_not_remove (int, optional): If text has less
        than 'limit_not_remove' symbols then dont process it. Defaults to 140.
        placeholder (str, optional): Placeholder for URL. Defaults to '***'.

    Returns:
        str: Text
    """
    if len(text) < limit_not_remove:
        return text

    urls = url_extractor.find_urls(text)
    for url in urls:
        allowed = False
        for white_listed in WHITELIST:
            if url.find(white_listed) != -1:
                allowed = True
                break
        if allowed is False:
            text = text.replace(url, placeholder)

    return text
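
# Illustrative call of remove_urls() above; settings.REMOVE_URLS_WL_DATA is
# project-specific, so here we simply assume it whitelists 'example.com' and use a
# sample text that is longer than the 140-character threshold:
sample = ("Read the docs at https://example.com/docs or the mirror at "
          "https://other-site.org/docs before asking questions here, please. "
          "This sentence only pads the sample text past the 140-character limit.")
print(remove_urls(sample))
# with 'example.com' whitelisted, the first URL stays and the second becomes '***'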
Example #30
0
def urlextract():
    return URLExtract()
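
# The helper above reads like a pytest fixture whose @pytest.fixture decorator was
# lost in the snippet; a self-contained sketch of that reading (pytest assumed):
import pytest
from urlextract import URLExtract

@pytest.fixture
def urlextract():
    return URLExtract()

def test_find_urls(urlextract):
    assert urlextract.find_urls("go to https://example.com") == ["https://example.com"]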
Example #31
0
def extract_urls(text):
    extractor = URLExtract()
    urls = extractor.find_urls(text)
    return urls
Example #32
0
    "WRB": 1,
    "``": 1,
    "$": 1,
    "#": 1
}

co = 0
for k in POSDICT.keys():
    POSDICT[k] = co
    co += 1

helper = nltk.help.upenn_tagset()

linkHash = pickle.load(open("URLCache_new.json", 'rb'))
print("cache load finish")
ext = URLExtract()

extract = extractor()


def getLink(h):
    if h in linkHash:
        return linkHash.get(h)

    return tldextract.extract(h).domain
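
# Illustrative call of getLink() above: a host missing from the pickled cache falls
# back to tldextract's domain parsing (the URL is made up):
print(getLink("https://sub.example.co.uk/page"))  # prints 'example'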


def emExtract(texts1, texts2):
    emDict = {}
    count = 0
    for text in texts1: