Example No. 1
def markdown(site):
    handler = html2text.HTML2Text()
    handler.ignore_links = True
    url = f"https://archive.org/wayback/available?url={site}"
    jhtml = json.loads(get_html(url).decode('utf-8'))
    html = get_html(jhtml["archived_snapshots"]["closest"]["url"])
    html = html.decode('utf-8')
    md = handler.handle(html)
    return md
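The availability endpoint called above normally answers with a small JSON document, and the snapshot URL the function extracts sits under archived_snapshots → closest → url (when no snapshot exists, archived_snapshots comes back empty and the lookup raises KeyError). A sketch of the typical shape, with illustrative values:

# Illustrative response from https://archive.org/wayback/available?url=<site>;
# the timestamp and snapshot URL depend on what the Wayback Machine has stored.
example_response = {
    "url": "example.com",
    "archived_snapshots": {
        "closest": {
            "status": "200",
            "available": True,
            "url": "http://web.archive.org/web/20200101000000/http://example.com/",
            "timestamp": "20200101000000",
        }
    },
}
snapshot_url = example_response["archived_snapshots"]["closest"]["url"]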
Example No. 2
def markdown(site):
    handler = html2text.HTML2Text()
    handler.ignore_links = True
    url = f"https://archive.org/wayback/available?url={site}"
    jhtml = json.loads(get_html(url).decode('utf-8'))
    html = get_html(jhtml["archived_snapshots"]["closest"]["url"])
    html = html.decode('utf-8', errors='ignore')
    md = handler.handle(html)
    md = filterer.applyGenericFilter(md)
    if site is not None:
        if 'folha' in site:
            md = filterer.applyFolhaFilter(md)
        if 'estadao' in site:
            md = filterer.applyEstadaoFilter(md)
    return md
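A hypothetical call, assuming filterer and its applyGenericFilter/applyFolhaFilter helpers are defined elsewhere in the same module; the article URL is only an illustration:

# Hypothetical usage: fetch the latest archived copy of a Folha article and
# return it as Markdown with the generic and Folha-specific filters applied.
md = markdown("https://www.folha.uol.com.br/poder/2020/05/some-article.shtml")
print(md[:500])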
Example No. 3
def update_SQL_through_flask(df, sql_ip):
    """
    This function updates the SQL DB which tracks the source url for redirecting
    input:
            - df: a dataframe with the data to be saved in SQL
            - sql_ip: the ip_address of the SQL DB ("10.71.0.111")
    output: none
    """

    for i, row in df.iterrows():
        solr_id = row['solr_id'].replace(' ', '%20')

        web_url = row['web_url'].replace("'", '').replace(';',
                                                          '').replace('"', '')
        redirect_url = web_url.replace(':', '**').replace('/', '!!')

        flask_url = 'http://' + sql_ip + '/link/' + solr_id + '/' + redirect_url  #old server

        try:
            html_PR = get_html(flask_url)
        except Exception:
            print('was not able to access the url ', flask_url,
                  ' to save as an html file')

    print('successful update of SQL DB: ', sql_ip)
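A hypothetical call with a minimal DataFrame; the function only reads the solr_id and web_url columns, and the IP is the one quoted in the docstring:

import pandas as pd

# Hypothetical input: one row per document whose redirect target should be updated.
df = pd.DataFrame({
    'solr_id': ['ACME Corp 2020-05-01'],
    'web_url': ['https://www.example.com/press-release'],
})
update_SQL_through_flask(df, '10.71.0.111')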
Example No. 4
def get_PR_summary(url, proxies, path, html_name, version):
    html_PR = None  #ensures the check below works even if the request fails
    try:
        #html_PR = get_html(url, proxy)
        html_PR = get_html(url)
    except Exception:
        print('was not able to access the url ', url,
              ' to save as an html file')

    if html_PR:
        if b'PDF' in html_PR[:6]:  #if it's a pdf, don't save it as an html
            return None
        #saves the html of the page to a text file
        with open(path + html_name, "wb") as file:  #open file in binary mode
            file.write(html_PR)
        soup = BeautifulSoup(html_PR, "html.parser")

        summary = ""  #initialize a string
        summary_cache = []
        if version == 'RSS':
            body_tag = soup.body
            for child in body_tag.descendants:
                print(child)
                if child.string and child.string != '\n' and child.string not in summary_cache:
                    #print(child.string)
                    summary = summary + child.string + "\n"
                    summary_cache.append(child.string)
            soup_text = soup.find_all('p')
            for i in range(0, len(soup_text)):
                if soup_text[i].text.strip() not in summary_cache:
                    summary = summary + soup_text[i].text.strip() + " "
                    summary_cache.append(soup_text[i].text.strip())
            return summary
        else:
            soup_text = soup.find_all('p')
            for i in range(0, len(soup_text)):
                summary = summary + soup_text[i].text.strip() + " "
        return summary
    else:
        return None
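A hypothetical call; the URL, path, and file name are placeholders, and proxies is accepted but not used by the version shown above:

# Hypothetical usage: save the page under ./html/ and collect its visible text.
summary = get_PR_summary("https://www.example.com/press/acme-announcement.html",
                         proxies=None,
                         path="./html/",
                         html_name="ACME_PR1.html",
                         version="RSS")
if summary:
    print(summary[:300])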
Example No. 5
def shopping(query, pages=1):
    results = []
    for i in range(pages):
        url = _get_shopping_url(query, i)
        html = get_html(url)
        if html:
            j = 0
            soup = BeautifulSoup(html, "html.parser")

            products = soup.findAll("div", "g")
            for prod in products:
                res = ShoppingResult()

                divs = prod.findAll("div")
                for div in divs:
                    match = re.search(
                        "from (?P<count>[0-9]+) stores", div.text.strip())
                    if match:
                        res.store_count = match.group("count")
                        break

                h3 = prod.find("h3", "r")
                if h3:
                    a = h3.find("a")
                    if a:
                        res.compare_url = a["href"]
                    res.name = h3.text.strip()

                psliimg = prod.find("div", "psliimg")
                if psliimg:
                    img = psliimg.find("img")
                    if img:
                        res.thumb = img["src"]

                f = prod.find("div", "f")
                if f:
                    res.subtext = f.text.strip()

                price = prod.find("div", "psliprice")
                if price:
                    res.min_price = price.text.strip()

                results.append(res)
                j = j + 1
    return results
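ShoppingResult itself is not shown in these examples; a minimal sketch of the container the parser above fills in, limited to the attributes it actually assigns (the real class may carry more):

class ShoppingResult:
    """Plain container for one Google Shopping hit (sketch only)."""

    def __init__(self):
        self.name = None         #product title taken from the h3 heading
        self.compare_url = None  #"compare prices" link
        self.thumb = None        #thumbnail image URL
        self.subtext = None      #secondary description line
        self.min_price = None    #lowest listed price
        self.store_count = None  #number of stores carrying the product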
Example No. 6
def shopping(query, pages=1):
    results = []
    for i in range(pages):
        url = _get_shopping_url(query, i)
        html = get_html(url)
        if html:
            j = 0
            soup = BeautifulSoup(html, "html.parser")

            products = soup.findAll("li", "g")
            for prod in products:
                res = ShoppingResult()

                divs = prod.findAll("div")
                for div in divs:
                    match = re.search(
                        "from (?P<count>[0-9]+) stores", div.text.strip())
                    if match:
                        res.store_count = match.group("count")
                        break

                h3 = prod.find("h3", "r")
                if h3:
                    a = h3.find("a")
                    if a:
                        res.compare_url = a["href"]
                    res.name = h3.text.strip()

                psliimg = prod.find("div", "psliimg")
                if psliimg:
                    img = psliimg.find("img")
                    if img:
                        res.thumb = img["src"]

                f = prod.find("div", "f")
                if f:
                    res.subtext = f.text.strip()

                price = prod.find("div", "psliprice")
                if price:
                    res.min_price = price.text.strip()

                results.append(res)
                j = j + 1
    return results
Example No. 7
def convert(amount, from_currency, to_currency):
    """Method to convert currency.

    Args:
        amount: numeric amount to convert
        from_currency: currency denomination of the amount to convert
        to_currency: target currency denomination to convert to
    """

    # same currency, no conversion
    if from_currency == to_currency:
        return amount * 1.0

    req_url = _get_currency_req_url(amount, from_currency, to_currency)
    response = get_html(req_url)
    rate = _parse_currency_response(response, to_currency)

    return rate
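A hypothetical call, assuming _get_currency_req_url and _parse_currency_response build and parse the rate request as their names suggest; note that same-currency conversions short-circuit and return the amount as a float:

# Hypothetical usage: convert 100 US dollars into euros via the remote rate service.
amount_in_eur = convert(100, 'USD', 'EUR')
print(amount_in_eur)

# Same-currency calls never hit the network.
assert convert(100, 'USD', 'USD') == 100.0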
Example No. 8
    def search(self,
               query,
               area='com',
               ncr=False,
               void=True,
               time_period=False,
               sort_by_date=False,
               first_page=0):
        for i in range(first_page, first_page + self.pages):
            url = _get_search_url(query,
                                  i,
                                  lang=self.lang,
                                  area=area,
                                  ncr=ncr,
                                  time_period=time_period,
                                  sort_by_date=sort_by_date)
            print(f'Search URL: {url}&tbm=nws')
            html = parse(get_html(url + "&tbm=nws"))
            links = html.xpath('//div[@id="rso"]/descendant::a/@href')

            #keep only absolute links; relative URLs ('/...') stay on Google
            self.results.extend(link for link in links if link[0] != '/')
Example No. 9
def search(query, pages=1, lang='en', void=True):
    """Returns a list of GoogleResult.

    Args:
        query: String to search in google.
        pages: Number of pages where results must be taken.

    Returns:
        A GoogleResult object."""

    results = []
    for i in range(pages):
        url = _get_search_url(query, i, lang=lang)
        html = get_html(url)

        if html:
            soup = BeautifulSoup(html, "html.parser")
            lis = soup.findAll("div", attrs={"class": "g"})
            
            j = 0
            for li in lis:
                res = GoogleResult()

                res.page = i
                res.index = j

                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)
                if void is True:
                    if res.description is None:
                        continue
                results.append(res)
                j += 1

    return results
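A hypothetical call; each GoogleResult carries the fields populated above (name, link, description, and so on), and with the default void=True any result without a description is skipped:

# Hypothetical usage: print the title and target URL of first-page results.
for res in search("python web scraping", pages=1):
    print(res.name, res.link)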
Example No. 10
from google.modules.utils import get_html
import html2text

handler = html2text.HTML2Text()
handler.ignore_links = True

url = "https://link.estadao.com.br/noticias/inovacao,as-pessoas-estao-repensando-suas-casas-diz-presidente-do-quinto-andar,70003324028"
html = get_html(url)
html = html.decode('utf-8')
md = handler.handle(html)
print(md)
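Every example on this page leans on the get_html helper imported here from google.modules.utils. A minimal sketch of such a helper, assuming it returns the response body as bytes (the callers decode it themselves) and a falsy value when the request fails; the real implementation may set different headers or retry:

import urllib.request


def get_html(url):
    """Sketch of the assumed helper: fetch a URL and return its body as bytes, or None."""
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urllib.request.urlopen(request) as response:
            return response.read()  #raw bytes; callers call .decode('utf-8') on it
    except Exception:
        return None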
Example No. 11
def get_PR(A, row, f, k, col, version):
    #A is a table containing the new PR links
    #row is a row of A, f is a text file, k is the iteration index over A
    #col is the maximum number of PR links

    summary_cache = []  #to check for summary duplicates coming from different urls

    if version == 'PR':
        start = 2
    else:
        start = 1
    if k > 0:  #skip the first row because it is the header

        for x in range(start, col):  #col is 100
            flag = False
            if row[x]:
                if row[x].startswith('/'):
                    if row[x].startswith('//'):
                        url = 'http://' + row[x].lstrip('//')
                    else:
                        #there was a bug here @V2.0 - YMR
                        root_path = get_root_path(row[1], 0)
                        url = root_path + row[x]
                else:
                    if row[x].startswith('./'):
                        root_path = get_root_path(row[1], 1)
                        url = root_path + row[x].lstrip('.')
                    else:
                        url = row[x]

                try:
                    #apps2.shareholder is the third party noise link that gives stock data
                    #.php is a redirect link to share (FB, LinkedIn, Twitter)
                    if 'apps2.shareholder' not in url and ".php" not in url:
                        url2 = urllib.parse.quote_plus(url,
                                                       '/:!#$%^&*()_-+=[]{}?',
                                                       'utf-8')
                        url2_path = urllib.parse.urlparse(url).path
                        #sometimes it may be .jpg, .png
                        ext = os.path.splitext(url2_path)[1]
                        name = row[0] + "_PR" + str(x - 1) + "_" + date
                        response = get_html(url2)
                        #if there are no errors opening the socket connection, then download
                        if response:
                            print("this is the modified_url of ", url2)
                            if b'PDF' not in response[:6]:
                                if not ext:
                                    ext = '.html'
                                if 'html' in ext:
                                    ext = '.html'
                                print('this is my extension {0}'.format(ext))
                                with open(html_directory + "/" + name + ext,
                                          'wb') as out_file:
                                    print(html_directory + "/" + name + ext)
                                    out_file.write(response)


                                #html_count += 1  #there was a bug here @V2.0 - YMR
                                flag = True

                            else:
                                print("\n------we will attempt to download a PDF instead\n")
                                #there was a bug here @V2.0 - YMR
                                pdf_name = row[0] + "_PR" + str(x - 1) + "_" + date
                                pdf_response = get_pdf(url2)

                                if pdf_response:
                                    with open(
                                            pdf_directory + "/" + pdf_name +
                                            '.pdf', 'wb') as out_file:
                                        out_file.write(pdf_response)
                                    flag = True
                except Exception as e:
                    print('error in get_PR function:', e)
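get_root_path is another external helper. From the way it is called above, a plausible sketch is given below, where the second argument 0 asks for the scheme-plus-host root (used for paths starting with '/') and 1 for the directory of the page (used for paths starting with './'). This is an assumption about the helper's contract, not its actual code:

from urllib.parse import urlparse
import posixpath


def get_root_path(url, mode):
    """Sketch (assumed contract): mode 0 -> 'https://host', mode 1 -> 'https://host/dir/of/page'."""
    parts = urlparse(url)
    root = parts.scheme + "://" + parts.netloc
    if mode == 0:
        return root
    return root + posixpath.dirname(parts.path)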
Example No. 12
def get_PR_link_cache(data, row, fail):

    time_start = time.perf_counter()  #time.clock() was removed in Python 3.8
    end = len(data)

    print('start caching PRESS RELEASES links')
    j = 0
    url = row[2]
    #if the company has no PR website of its own, skip it
    #(the fallback search on PRNewsWire.com is disabled)
    if url == '':
        return None

    #html_res = get_html(url, proxy)
    html_res = get_html(url)

    #each list of links can't have more than 32,000 characters to fit in an excel cell
    link_caches = [[] for _ in range(10)]
    if not html_res:
        fail.append(url)
        print("no html results")
    #collects all the links within the web page
    if html_res:
        soup = BeautifulSoup(html_res, "html.parser")
        divs = soup.find_all('a')
        #check for website denying access while displaying an error page
        if len(divs) == 0:  #if no links were found in the webpage, move to the next company
            print("divs a href is empty!!")
            return None
        for a in divs:
            f = 0
            f_link_check = 0
            link = ''
            try:
                if a["href"].startswith("http://"):
                    link = a["href"]
                    f_link_check = link_check(link)
                    f = 1
                elif a["href"].startswith("./"):
                    link = a["href"]
                    f_link_check = link_check(link)
                    f = 1
                elif a["href"].startswith("/"):
                    link = a["href"]
                    f_link_check = link_check(link)
                    f = 1
                elif a["href"].startswith("https://"):
                    link = a["href"]
                    f_link_check = link_check(link)
                    f = 1
                elif a["href"].startswith("/url?"):
                    m = re.match('/url\?(url|q)=(.+?)&', link)
                    if m and len(m.groups()) == 2:
                        link = unquote(m.group(2))
                        f_link_check = link_check(link)
                        f = 1
                else:
                    link = '/' + a["href"]
                    f_link_check = link_check(link)
                    f = 1
                #keeps the number of characters per list below the excel cell limit
                if f == 1 and f_link_check == 0:
                    for cache in link_caches:
                        if sum(len(i) for i in cache) < 28000:
                            cache.append(link)
                            break
            except Exception as e:
                print('no href found')

    if link_caches[0]:
        print('PR link cache found and saved')
    for cache in link_caches:
        if cache:
            cache.sort()
            data[data.index(row)].append(cache)

    time_end = time.perf_counter()
    delta = time_end - time_start  #elapsed caching time in seconds
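link_check is also defined elsewhere; the loop above only keeps a link when it returns 0. A sketch of what such a filter might look like, assuming 0 means keep and 1 means discard, with mail, script, and in-page anchors treated as noise. This is an assumption, not the project's actual filter:

def link_check(link):
    """Sketch of the assumed filter: return 0 to keep a link, 1 to discard it."""
    noise_prefixes = ('mailto:', 'javascript:', 'tel:', '#')
    if link.startswith(noise_prefixes):
        return 1
    return 0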