Example #1
def main():
    print_banner()

    root_url = get_args()

    links = list()

    tree = sitemap_tree_for_homepage(root_url)

    for page in tree.all_pages():
        links.append(page.url)

    hc = hashek.Hashek()

    print("\nGathered %d links from %s" % (len(links), root_url))

    errors_fixes = list()

    for link in links:
        text = extract_text_from_link(link)

        print("Checking link: %s" % link)

        suggestions_dict = hc.check_text(text)

        errors_fixes.append({link: suggestions_dict})

    hc.close()
Example #2
def ask_for_url():
    print("What URL do you want to search?")
    url = input('>')
    print("Do you want to search more than one URL?")
    answer = input('>').lower()
    if answer.startswith('y'):
        # Check sitemap of URL
        tree = sitemap_tree_for_homepage(url)
        #for page in tree.all_pages():
        #   print(page)
        # Tell user how many pages are found
        number_of_pages_found = len(list(tree.all_pages()))
        print(f'A total of {number_of_pages_found} pages were found.')

        # Allow user to specify how many pages to scrape
        print("How many pages do you want to scrape?")
        pages_to_scrape = int(input('>'))
        pages_scraped = []
        for page in tree.all_pages():
            pages_scraped.append(page.url)
            if len(pages_scraped) == pages_to_scrape:
                break

        return pages_scraped
    else:
        return [url]
Example #3
def generate_sitemap(domain):

    # Generate the sitemap
    tree = sitemap_tree_for_homepage(domain)

    # Initialise the list of links
    links = []

    # Iterate through all URLs found by the sitemap generator
    for page in tree.all_pages():
        url = page.url

        # Some sites will not include the domain name in front of the URL, so add it in
        if url[0] == '/':
            url = domain + url

        # This is the structure of the db
        # Needs work to improve search functionality
        link_entry = {'url': url, 'domain': 'https://' + strip_domain(domain)}

        # Add this entry to the list of links to write out
        links.append(link_entry)

    # Write the links to a file (one for each domain)
    write_to_file(domain, links)
Example #4
def get_urls(url):
    tree = sitemap_tree_for_homepage(url)

    urls = []
    for page in tree.all_pages():
        urls.append(page.url)

    return urls
Example #5
def get_sitemap_tree(common_list):
    """get all links from sitemap"""
    sitemap_tree = []
    for link in common_list:
        web_client = _RequestsWebClient()
        tree = sitemap_tree_for_homepage(link, web_client)
        for page in tree.all_pages():
            sitemap_tree.append(page.url)
    return sitemap_tree
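Constructing a new _RequestsWebClient inside the loop works, but the client can also be created once and passed by keyword, as Example #6 does; a minimal sketch of that variation (the function name get_sitemap_urls is made up here, and reusing one client across homepages is an assumption):

def get_sitemap_urls(common_list):
    """Collect page URLs from the sitemaps of every homepage in common_list."""
    web_client = _RequestsWebClient()  # assumed safe to reuse for all homepages
    urls = []
    for link in common_list:
        tree = sitemap_tree_for_homepage(homepage_url=link, web_client=web_client)
        urls.extend(page.url for page in tree.all_pages())
    return urls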
Example #6
def fetch_sitemap_pages_for_media_id(db: DatabaseHandler,
                                     media_id: int) -> None:
    """Fetch and store all pages (news stories or not) from media's sitemap tree."""
    media = db.find_by_id(table='media', object_id=media_id)
    if not media:
        raise Exception("Unable to find media with ID {}".format(media_id))

    media_url = media['url']

    log.info("Fetching sitemap pages for media ID {} ({})...".format(
        media_id, media_url))
    web_client = _SitemapWebClient()
    sitemaps = sitemap_tree_for_homepage(homepage_url=media_url,
                                         web_client=web_client)
    log.info("Fetched pages for media ID {} ({}).".format(media_id, media_url))

    log.info("Storing sitemap pages for media ID {} ({})...".format(
        media_id, media_url))

    insert_counter = 0
    for page in sitemaps.all_pages():
        db.query(
            """
            INSERT INTO media_sitemap_pages (
                media_id, url, last_modified, change_frequency, priority,
                news_title, news_publish_date
            ) VALUES (
                %(media_id)s, %(url)s, %(last_modified)s, %(change_frequency)s, %(priority)s,
                %(news_title)s, %(news_publish_date)s
            )
            ON CONFLICT (url) DO NOTHING
        """, {
                'media_id': media_id,
                'url': page.url,
                'last_modified': page.last_modified,
                'change_frequency': (page.change_frequency.value
                                     if page.change_frequency is not None else None),
                'priority': page.priority,
                'news_title': page.news_story.title if page.news_story is not None else None,
                'news_publish_date': (page.news_story.publish_date
                                      if page.news_story is not None else None),
            })

        insert_counter += 1
        if insert_counter % 1000 == 0:
            log.info("Inserted {} URLs...".format(insert_counter))

    log.info("Done storing {} sitemap pages for media ID {} ({}).".format(
        insert_counter, media_id, media_url))
Example #7
def import_domains():
    with open('ru_domains.txt') as file_data:
        for item in file_data:
            domain = item.split(';')[0]
            try:
                req = requests.get('https://' + domain, timeout=3)
                if req.status_code == requests.codes.ok:
                    sitemap_tree = sitemap_tree_for_homepage('https://' + domain)
                    for page in sitemap_tree.all_pages():
                        print(page.url)
            except requests.exceptions.RequestException:
                continue
Example #8
def add_links_from_sitemap_xml(self):
    if self.sitemap_xml_processor is None:
        return
    assert self.website.main_page_url in self.website.url_nodes
    root_page = self.website.main_page_url.strip('/')
    tree = sitemap_tree_for_homepage(root_page)
    cnt = 0
    useful = 0
    for page in tree.all_pages():
        cnt += 1
        weight = self.sitemap_xml_processor(page.url)
        if weight > TLinkInfo.MINIMAL_LINK_WEIGHT:
            if page.url not in self.pages_to_process:
                useful += 1
                link_info = TLinkInfo(TClickEngine.sitemap_xml, self.website.main_page_url, page.url, anchor_text="")
                link_info.weight = weight
                self.add_link_wrapper(link_info)
    self.logger.info("processed {} links from {}/sitemap.xml found {} useful links".format(cnt, root_page, useful))
Example #9
def main(req: func.HttpRequest) -> func.HttpResponse:
    name = req.params.get('url')
    now = datetime.now()
    if not name:
        try:
            req_body = req.get_json()
        except ValueError:
            pass
        else:
            name = req_body.get('url')
    logging.info('Python HTTP trigger function processed a request.')
    try:
        tree = sitemap_tree_for_homepage(name)
        # Give the sitemap walk a budget of roughly 3 minutes
        cutoff_time = now + timedelta(minutes=3)
        output = ""
        count = 0
        for page in tree.all_pages():
            if datetime.now() > cutoff_time:
                break
            found = page.url
            if found not in output:
                output += '"' + found + '",'
                count += 1

        output = "[" + output[:-1] + "]"
        print(output)
        print(count)
    except Exception:
        output = "Something went wrong"

    if output == "":
        return func.HttpResponse(
            "Please pass a url"
        )
    else:
        return func.HttpResponse(output)
            
Example #10
def search_site(site, keywords, data):
    try:
        tree = sitemap_tree_for_homepage(site)
        counter = 0
        for page in tree.all_pages():
            page_text = ''
            for keyword in keywords:
                if not data.get(page.url): data[page.url] = {}
                if data[page.url].get(keyword, -1) >= 0: continue
                if not page_text:
                    print(
                        'getting page text')  # only get page_text if necessary
                    page_text = requests.get(page.url).text
                print("Searching on %s for %s" % (page.url, keyword))
                data = store_match_count(data, page.url, page_text, keyword)

            counter += 1
            if counter >= 20:
                save_data(data)
                counter = 0
    except Exception:
        print("failed to search on %s" % site)

    return data
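store_match_count and save_data are helpers defined elsewhere in that project and are not shown above; purely as an illustration, a hypothetical count-and-store helper consistent with the guard data[page.url].get(keyword, -1) >= 0 might look like this (the case-insensitive counting is an assumption, not the original logic):

def store_match_count(data, url, page_text, keyword):
    # Hypothetical sketch: record how many times the keyword appears on the page
    data[url][keyword] = page_text.lower().count(keyword.lower())
    return data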
Example #11
    two_factor = ("--two_factor" in args)

    print("Logging in")
    if two_factor:
        print("Two Factor authentication required")
        print("Not Implemented")
        exit()

    login_info = r.login(acc_info[0], acc_info[1])
    print("Successfully Logged in")

    root = 'https://robinhood.com/'
    collections = []
    stock_hs = set()

    tree = sitemap_tree_for_homepage('https://robinhood.com/sitemap.xml')
    pages = tree.all_pages()
    urls = [p.url.replace(root, "") for p in pages]

    for url in urls:
        if 'collections' in url:
            collections.append(url.replace('collections/', ''))
        elif 'stocks' in url:
            stock_hs.add(url.replace('stocks/', ''))

    print(len(collections))
    print(len(stock_hs))
    print("Updating list of stocks from collections set")
    print(stock_hs)
    for col in collections:
        print(col)
Example #12
'''
Parse sitemap and write it to csv file
'''

import csv
import argparse
from usp.tree import sitemap_tree_for_homepage


def write_csv(tree, csv_file):
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(['url', 'priority', 'last_modified'])
        for line in tree.all_pages():
            writer.writerow(
                [line.url,
                 str(line.priority),
                 str(line.last_modified)])


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Generate sitemap csv from given url")
    parser.add_argument('--url', default="http://www.freshdirect.com/")
    parser.add_argument('--csv', default="sitemap.csv")

    args = parser.parse_args()
    print(args)

    tree = sitemap_tree_for_homepage(args.url)
    write_csv(tree, args.csv)
Example #13
from usp.tree import sitemap_tree_for_homepage

tree = sitemap_tree_for_homepage('https://hostingspell.com/')
print(tree)
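To get at the individual URLs rather than the whole tree object, the pages can be iterated; a minimal sketch using the per-page fields seen in the other examples here (url, priority, last_modified):

from usp.tree import sitemap_tree_for_homepage

tree = sitemap_tree_for_homepage('https://hostingspell.com/')

# Iterate every page discovered across the site's sitemaps
for page in tree.all_pages():
    print(page.url, page.priority, page.last_modified)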
Example #14
def pages_from_sitemap(page_url: str) -> list:
    tree = sitemap_tree_for_homepage(page_url)
    return [page.url for page in tree.all_pages()]
Example #15
from usp.tree import sitemap_tree_for_homepage
import re

tree = sitemap_tree_for_homepage('https://documents.polycom.com/')

for page in tree.all_pages():
    # page is a SitemapPage object, so search its string representation
    data = re.findall(r"(https://[^\s,)]+)", str(page))
    for url in data:
        print(url)
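Since each item yielded by all_pages() exposes the page address directly (as the other examples here use), the regular expression is optional; a minimal sketch of the same loop with page.url:

from usp.tree import sitemap_tree_for_homepage

tree = sitemap_tree_for_homepage('https://documents.polycom.com/')

# Each SitemapPage carries its URL directly
for page in tree.all_pages():
    print(page.url)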
Example #16
from usp.tree import sitemap_tree_for_homepage
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output",
                        dest="output_path",
                        required=False,
                        default="downloaded_sitemap_urls.txt")
    parser.add_argument("urls", nargs="*")
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    if len(args.urls) > 0:
        url = args.urls[0]
    else:
        url = "http://sokirko.info"
    print("download all sitemaps from {}".format(url))
    tree = sitemap_tree_for_homepage(url)
    urls = list(tree.all_pages())
    print("write {} urls to {}".format(len(urls), args.output_path))
    with open(args.output_path, "w") as outp:
        for u in urls:
            outp.write("{}\n".format(u.url))
Example #17
def dealer_urls(car_make: str,
                model: str,
                zip_code: int,
                dist_range: int = 100,
                min_stars: int = 4,
                prices_arg='full'):
    api = API
    url = 'https://maps.googleapis.com/maps/api/place/textsearch/json?'
    geoloc = Nominatim(user_agent="PriceScraper")

    try:
        # Geocode once and reuse the result instead of querying Nominatim twice
        location = geoloc.geocode({'postalcode': zip_code})
        lat, long = location[1][0], location[1][1]
    except Exception:
        raise ValueError(
            str(zip_code) +
            ' is not a valid zip code, try again with an existing zip code')

    r = requests.get(url + 'query=' + car_make + '+Dealerships&location=' +
                     str(lat) + ',' + str(long) + '&radius=' +
                     str(dist_range) + '&key=' + api)
    ids = [res['place_id'] for res in r.json()['results']]
    print('Found ' + str(len(ids)) + ' matching ' + car_make +
          ' dealerships within ' + str(dist_range) + ' miles of ' +
          str(zip_code))

    url2 = 'https://maps.googleapis.com/maps/api/place/details/json?'
    print('Getting urls for matching ' + car_make + ' Dealers....')

    url_list = []
    for place_id in ids:
        try:
            request = requests.get(url2 + 'place_id=' + place_id +
                                   '&fields=name,rating,website' + '&key=' +
                                   api).json()
            print([
                request['result']['name'], request['result']['rating'],
                request['result']['website'].split('/')[2]
            ])
            url_list.append([
                request['result']['name'], request['result']['rating'],
                request['result']['website'].split('/')[2]
            ])
        except Exception:
            # Skip places whose details response lacks a rating or website
            pass

    # url_list = [[requests.get(url2 + 'place_id=' + place_id + '&fields=name,rating,website' + '&key=' + api).json()['result']['name'], requests.get(url2 + 'place_id=' + place_id + '&fields=name,rating,website' + '&key=' + api).json()['result']['rating'], 'https://' + requests.get(url2 + 'place_id=' + place_id + '&fields=name,rating,website' + '&key=' + api).json()['result']['website'].split('/')[2]] for place_id in ids if 'rating' in requests.get(url2 + 'place_id=' + place_id + '&fields=name,rating,website' + '&key=' + api).json()['result'] and 'website' in requests.get(url2 + 'place_id=' + place_id + '&fields=name,rating,website' + '&key=' + api).json()['result'] and car_make.lower() in requests.get(url2 + 'place_id=' + place_id + '&fields=name,rating,website' + '&key=' + api).json()['result']['website'].lower()]

    url_df = pd.DataFrame(url_list,
                          columns=['Dealership Name', 'Rating', 'URL'])
    url_df = url_df[url_df['Rating'] >= min_stars]
    if len(url_df) == 0:
        raise ValueError(
            'Too few rows remaining after filter. Try changing the minimum star rating for dealerships'
        )

    sitemap = []
    for row in range(len(url_df)):
        url = url_df.iloc[row]['URL']
        if 'https' not in url:
            tree = sitemap_tree_for_homepage('https://' +
                                             url.replace('http://', ''))
        else:
            tree = sitemap_tree_for_homepage(url)

        try:
            with timeout(400, exception=RuntimeError):
                each = []
                for page in tree.all_pages():
                    if model.lower() in page.url.lower() and (
                            'new' in page.url.lower()
                            or 'inventory' in page.url.lower()):
                        each.append(page.url)
                    else:
                        pass
                sitemap.append(each)
        except RuntimeError as e:
            sitemap.append([])

    url_df['Sitemap'] = sitemap
    url_df = url_df[url_df['Sitemap'].str.len() > 0]

    # 'full' needs the first three prices found on each page (MSRP plus two discounts); otherwise one price suffices
    prices_index = (3 if prices_arg == 'full' else 1)
    prices = []
    for i in range(len(url_df)):
        site = url_df.iloc[i]['Sitemap']
        for url in site:
            page = requests.get(url)
            soup = bs4.BeautifulSoup(page.content, 'html.parser')

            try:
                name = soup.find_all(text=re.compile('[0-9]{4} ' + car_make +
                                                     ' ' + model +
                                                     ' [A-Za-z]{1,10}'))
                price = [
                    int(str(x).strip('$').replace(',', ''))
                    for x in soup.find_all(
                        text=re.compile(r'^\$[0-9]{2},[0-9]{3}'))
                ]

                if len(price) == 0:
                    break

                if len(name[0]) > 120:
                    j = 1
                    while j < len(name):
                        if len(name[j]) > 120:
                            j += 1
                        else:
                            prices.append([
                                url_df.iloc[i]['Dealership Name'], name[j],
                                sorted(price[:prices_index]), url
                            ])
                            print('Getting price for a ' + name[j])
                            break
                    prices.append([
                        url_df.iloc[i]['Dealership Name'],
                        car_make + ' ' + model,
                        sorted(price[:prices_index]), url
                    ])
                else:
                    print('Getting price for a ' + name[0])
                    prices.append([
                        url_df.iloc[i]['Dealership Name'], name[0],
                        sorted(price[:prices_index]), url
                    ])
            except Exception as e:
                print(e)

    prices_dat = pd.DataFrame(
        prices, columns=['Dealership Name', 'Model', 'Prices', 'URL'])

    if prices_arg == 'full':
        prices_dat = prices_dat[prices_dat['Prices'].str.len() > 2]
        prices_dat['Prices_MSRP'] = [x[2] for x in prices_dat['Prices']]
        prices_dat['Prices_First_Discount'] = [
            x[1] for x in prices_dat['Prices']
        ]
        prices_dat['Prices_Final_Discount'] = [
            x[0] for x in prices_dat['Prices']
        ]
        prices_dat = prices_dat.drop('Prices', axis=1)

    else:
        prices_dat['Prices_MSRP'] = [x[0] for x in prices_dat['Prices']]
        prices_dat = prices_dat.drop('Prices', axis=1)

    prices_dat.to_csv(os.path.join(os.path.abspath('.'),
                                   'interface/static/interface/user_files/') +
                      car_make + '_' + model + '_' + 'within_' +
                      str(dist_range) + '_miles_of_' + str(zip_code) + '_' +
                      '_prices_' + prices_arg + '.csv',
                      index=False)
Example #18

import signal

from usp.tree import sitemap_tree_for_homepage


class TimeOutException(Exception):
    pass


def alarm_handler(signum, frame):
    print("ALARM signal received")
    raise TimeOutException()


docID = 0
urllist = []
recipes_dict = {}

tree = sitemap_tree_for_homepage("https://www.bbcgoodfood.com/")
for page in tree.all_pages():
    url = page.url
    if "https://www.bbcgoodfood.com/recipes" in url:
        urllist.append(page.url)

with open('url.txt', 'w') as fp:
    for url in urllist:
        fp.write(url + "\n")

signal.signal(signal.SIGALRM, alarm_handler)

with open('14_02_20.txt', 'w') as fp:
    for url in urllist:
        signal.alarm(8)
        print(url)