示例#1
0
def search_DDG_DORKS(TITLE, TEXT_0):
    """Search each configured domain for TITLE and analyse the result pages.

    For every domain in ``config.FC_list`` a ``site:<domain> <TITLE>`` query
    is sent through a ``Duckduckgo`` engine. Each result is printed and its
    link is downloaded. When the response status is 2xx, the link does not
    contain ``.pdf`` and the domain is not listed in
    ``config.BL_parserPhone``, the page text (after ``er.remove_tags``) is
    handed to the parsers, compared against ``TEXT_0`` with ``compareTEXT``
    and a line is written through ``generateLOG``.

    Args:
        TITLE: Search terms appended to every ``site:`` query.
        TEXT_0: Reference text each downloaded page is compared with.

    Returns:
        None. Output is printed and logged.
    """
    engine = Duckduckgo()
    for FC_domain in config.FC_list:
        results = engine.search(f"site:{FC_domain} {TITLE}")
        for r in results:
            print("|--[INFO][GOOGLE][RESULTS][>] " + r["title"] + " | " +
                  r["text"] + " | " + r["link"])

            try:
                # extract() yields three parts; the last two are joined into
                # the domain that is checked against the blacklist.
                _, td, tsu = extract(r["link"])
                domain = td + '.' + tsu

                web = requests.get(r["link"], timeout=3)
                print("|----[INFO][WEB][HTTP CODE][>] " +
                      str(web.status_code) + "\n")

                # Only successful (2xx) responses carry content worth
                # parsing; error pages must not be analysed or logged.
                is_success = 200 <= web.status_code < 300
                is_pdf = ".pdf" in r["link"]
                if (is_success and not is_pdf
                        and domain not in config.BL_parserPhone):
                    TEXT = er.remove_tags(str(web.text))

                    parser.FC_words_in_text(TEXT)
                    parser.parserMAIN(TEXT)

                    ratio = compareTEXT(TEXT_0, TEXT)
                    print(f"|----[INFO][COMPARE TEXTS][>] Ratio: {ratio}")

                    # Save the info in a log.
                    # NOTE(review): `target` is not defined in this function;
                    # presumably a module-level name — confirm.
                    data = f"{r['title']} ||| {r['link']} ||| {r['text']}, ||| {ratio} \n"
                    generateLOG(data, target)

                print("")
                # Pause between requests after every completed download.
                time.sleep(2)

            except Exception as e:
                # Best-effort: report the failure and move on to the next
                # result instead of aborting the whole search.
                print("|----[ERROR][HTTP CONNECTION][>] " + str(e))
示例#2
0
def news_parser(url, target):
    """Download the article at ``url``, report on it and store it.

    The article is fetched and parsed as Spanish-language content, its
    title, authors and publish date are printed, its text is passed to the
    ``parser`` helpers, and the extracted fields are handed to
    ``news_insertMongoDB`` together with ``target``.

    Args:
        url: Address of the news article to process.
        target: Value forwarded unchanged to ``news_insertMongoDB``.

    Returns:
        None.
    """
    # Download and parse the news article.
    article = Article(url, language='es')
    article.download()
    article.parse()

    # Print the headline metadata.
    print(f"|----[INFO][WEB][>] {article.title}")
    print(f"|--------[INFO][WEB][AUTHORS][>] {article.authors}")
    print(f"|--------[INFO][WEB][PUBLISH DATE][>] {article.publish_date}")

    # Run every text parser over the article body, in this fixed order.
    body = article.text
    text_parsers = (
        parser.parser_email,
        parser.parser_DNI,
        parser.parser_IBAN,
        parser.parser_n_tlfn,
        parser.FC_words_in_text,
    )
    for run_parser in text_parsers:
        run_parser(body)
    print(f"|--------[INFO][WEB][URL][>] {url}")

    # Store the article.
    news_insertMongoDB(
        target,
        url,
        article.title,
        article.authors,
        article.text,
        article.publish_date,
        article.top_image,
        article.movies,
        article.html,
    )