Example #1
    def run(self, params={}):
        url = params.get("url", "")

        # Only fetch when the URL carries an explicit scheme, e.g. http://
        if "://" in url:
            response = googlesearch.get_page(url)
            return {"web_page": response.decode("utf-8")}
        else:
            self.logger.info("A valid URL was not passed; be sure to include the prefix, e.g. http://")
            raise Exception("A valid URL was not passed")
Example #2
    def run(self, params={}):
        url = params.get('url', '')

        # Only fetch when the URL carries an explicit scheme, e.g. http://
        if '://' in url:
            response = googlesearch.get_page(url)
            return {'web_page': response.decode('utf-8')}
        else:
            self.logger.info(
                'A valid URL was not passed; be sure to include the prefix, e.g. http://'
            )
            raise Exception('A valid URL was not passed')
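The two snippets above are the same plugin action: they fetch a page with googlesearch.get_page only when the URL includes an explicit scheme. A minimal sketch of how such an action might be driven, assuming `action` is an instance of the (not shown) plugin class that defines run():

# Hypothetical invocation; the surrounding plugin class and its logger
# are assumptions, not shown in the snippets above.
result = action.run(params={'url': 'http://example.com'})
print(result['web_page'])  # decoded HTML of the fetched page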
Example #3
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from googlesearch import get_page


def did_you_mean(query):
    # URL-encode the query so multi-word queries survive the GET request
    html = get_page('http://www.google.com/search?q=' + quote_plus(query))
    soup = BeautifulSoup(html, 'html.parser')

    # Google marks its "Did you mean" suggestion with an <a class="gL9Hy"> link
    answer = soup.find('a', attrs={'class': 'gL9Hy'})

    result = answer.find('i') if answer is not None else None
    return result.text if result is not None else None
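A usage sketch for the helper above, assuming the imports shown; note that the gL9Hy class name is tied to Google's current result markup and may change without notice:

# Returns Google's spelling suggestion, or None when there is none.
suggestion = did_you_mean('pyhton language')
if suggestion is not None:
    print('Did you mean:', suggestion)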
Example #4
# These names are provided by the "google" package's googlesearch module
from googlesearch import filter_result, get_page, get_random_user_agent, search


def search_google(word, stp=5):
    # Run the search, stopping after `stp` results
    query = str(word)

    query_result = search(query=query,
                          tld='com',
                          lang='en',
                          num=5,
                          start=0,
                          stop=stp)

    # Download each result page with a randomized user agent
    results = []
    for res in query_result:
        res = filter_result(res)
        html = get_page(res, get_random_user_agent())

        results.append({'link': res, 'page': html})

    return results
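A usage sketch, assuming search_google is defined as above; get_page returns the raw response bytes, so each 'page' value is a bytes object:

# Fetch the pages behind the first three results for a query.
for hit in search_google('python web scraping', stp=3):
    print(hit['link'], len(hit['page']))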
Example #5
    def process_url(self, url: str):
        # Skip URLs that have already been processed
        if url in self.urls:
            return

        self.urls.add(url)

        try:
            soup = CheesySoup(GS.get_page(url), 'html.parser')

            lean_text = soup.lean_text
            doc = Nlp(lean_text)

            # Keep only entities made of letters, digits, spaces, dots,
            # underscores and hyphens (hyphen last so it is literal)
            for ent in doc.ents:
                if re.match(r'[A-Za-z0-9 _.-]{2,}', str(ent)):
                    self._add_entity(name=str(ent), type=ent.label_, url=url)

            for cve in get_CVEs(lean_text):
                self._add_entity(name=cve, type='CVE', url=url)

        except Exception as e:
            print(f'Processing "{url}" failed:\n', e)
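This snippet relies on several project-specific helpers (CheesySoup, Nlp, get_CVEs) that are not shown. As a rough sketch of what get_CVEs could look like, a regular expression over the standard CVE-YYYY-NNNN identifier format is enough (a hypothetical reconstruction, not the project's actual helper):

import re

def get_CVEs(text):
    # CVE IDs are "CVE-", a four-digit year, and a sequence of 4+ digits.
    return re.findall(r'CVE-\d{4}-\d{4,}', text, flags=re.IGNORECASE)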
Example #6
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from googlesearch import get_page


def did_you_mean(query, trace=False):
    result = None
    try:
        # Trace
        if trace is True:
            print(
                "\t\t\"Did You Mean (from Google) check\" for the query: \"" +
                query + "\"")

        # URL-encode the query so multi-word queries survive the GET request
        html = get_page('http://www.google.com/search?q=' + quote_plus(query))
        soup = BeautifulSoup(html, 'html.parser')

        answer = soup.find('a', attrs={'class': 'gL9Hy'})

        result = answer.find('i') if answer is not None else None
        result = result.text if result is not None else None

        # Trace only when requested; guard against result being None so the
        # string concatenation below cannot fail
        if trace is True:
            if result is None:
                print(
                    "\t\tNo correction from \"Did You Mean check\" for the query: \""
                    + query + "\"\n")
            else:
                print("\t\t\"" + query + "\" is replaced with \"" + result +
                      "\"\n")

        return result
    except Exception:
        # Swallow network and parsing errors; fall through and return None
        pass

    # Trace
    if trace is True:
        print("\t\tPassed \"Did You Mean check\" for the query: \"" + query +
              "\"\n")

    return None
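With tracing enabled, the function prints its progress and still returns the suggestion, or None when Google offers no correction or the request fails. A usage sketch, assuming the same imports as Example #3:

corrected = did_you_mean('pyhton', trace=True)
query = corrected or 'pyhton'  # fall back to the original query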
Example #7
import time

from bs4 import BeautifulSoup
import googlesearch as google  # the "google" package installs the googlesearch module


def mine_google_links(query,
                      tld='com',
                      lang='fa',
                      tbs='0',
                      safe='off',
                      num=10,
                      start=0,
                      stop=None,
                      domains=None,
                      pause=2.0,
                      tpe='',
                      country='',
                      extra_params=None,
                      user_agent=None):
    hashes = set()
    count = 0

    # Restrict the search to specific domains via site: operators
    if domains:
        query = query + ' ' + ' OR '.join('site:' + domain
                                          for domain in domains)
    query = google.quote_plus(query)
    if not extra_params:
        extra_params = {}

    # Refuse extra GET parameters that clash with the built-in ones
    for builtin_param in google.url_parameters:
        if builtin_param in extra_params:
            raise ValueError(
                'GET parameter "%s" is overlapping with '
                'the built-in GET parameter' % builtin_param)

    # Grab the Google home page first to pick up its cookies; the url_*
    # templates are filled in from the local variables via vars()
    google.get_page(google.url_home % vars(), user_agent)
    if start:
        if num == 10:
            url = google.url_next_page % vars()
        else:
            url = google.url_next_page_num % vars()
    else:
        if num == 10:
            url = google.url_search % vars()
        else:
            url = google.url_search_num % vars()

    while not stop or count < stop:
        last_count = count
        # Append the caller's extra GET parameters to the request URL
        for k, v in extra_params.items():
            k = google.quote_plus(k)
            v = google.quote_plus(v)
            url = url + ('&%s=%s' % (k, v))

        # Sleep between requests to avoid being rate-limited, then parse
        time.sleep(pause)
        html = google.get_page(url, user_agent)
        soup = BeautifulSoup(html, 'html.parser')
        # Each result is wrapped in a <div class="ezO2md"> block
        news_item = soup.find_all('div', class_='ezO2md')
        for item in news_item:
            snippet = ''
            title = ''
            try:
                a = item.find('a')
                link = a['href']
                title_span = a.find('span')
                if title_span:
                    title = title_span.text
                # The table cell text holds "date · snippet"
                details = item.find('td').text.split('·')
                if len(details) < 2:
                    date = ''
                else:
                    date = details[0]
                    snippet = details[1]
            except Exception:
                # Skip result blocks that do not match the expected markup
                continue
            # Normalize the link and de-duplicate results across pages
            link = google.filter_result(link)
            if not link:
                continue
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)
            yield title, link, snippet, date
            count += 1
            if stop and count >= stop:
                return
        # Stop if the page produced no new results; otherwise go to the next page
        if last_count == count:
            break
        start += num
        if num == 10:
            url = google.url_next_page % vars()
        else:
            url = google.url_next_page_num % vars()
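Because mine_google_links is a generator, results stream out as each page is parsed, so callers can stop early without fetching further pages. A usage sketch, assuming the googlesearch module exposes the url_* templates referenced above:

# Collect up to 20 links for a query restricted to two domains.
for title, link, snippet, date in mine_google_links(
        'zero-day exploit', lang='en',
        domains=['example.com', 'example.org'], stop=20):
    print(date, title, link)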