def allowed_to_crawl(url):
    req = pituophis.parse_url(url)
    # HTML/URL: selectors point outside Gopherspace; never crawl them
    if 'URL:' in req.path:
        return False
    # fetch and cache each host's robots.txt once
    if req.host not in robotstxt:
        try:
            robots_req = pituophis.parse_url(
                'gopher://{}:{}/0/robots.txt'.format(req.host, req.port))
            robotstxt[req.host] = robots_req.get().text()
        except Exception:
            robotstxt[req.host] = ''
    robots = robotstxt[req.host]
    allowed = True
    for line in robots.replace('\r\n', '\n').split('\n'):
        if line.startswith('Disallow:'):
            selector = line.replace('Disallow: ', '').replace('Disallow:', '').strip('/')
            if selector and selector in req.path:
                allowed = False
    if allowed:
        if ('Disallow: ' + req.path) in robots:
            allowed = False
        elif 'Disallow: *' in robots:
            allowed = False
    return allowed
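# The function above assumes a module-level robotstxt cache and pituophis
# imported by the crawler script; a minimal sketch of that scaffolding, with
# the example URL purely illustrative.
import pituophis

robotstxt = {}  # host -> text of that host's robots.txt ('' if unavailable)

# Example use: only crawl a selector when the host's robots.txt permits it.
# if allowed_to_crawl('gopher://gopher.example.org/1/'):
#     crawl('gopher://gopher.example.org/1/')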
def alt(request):
    if request.path.startswith(settings['search_path']):
        # any characters after the search path act as an item type filter
        typestring = request.path.replace(settings['search_path'], '').replace('/', '')
        types = list(typestring)
        menu = []
        if settings['root_text'] is not None:
            menu.append(p.Item(itype='1', text=settings['root_text'], path='/',
                               host=request.host, port=request.port))
        if settings['new_search_text'] is not None:
            menu.append(p.Item(itype='7', text=settings['new_search_text'],
                               path=settings['search_path'],
                               host=request.host, port=request.port))
        if request.path != settings['search_path'] and settings['new_search_text_same_filter'] is not None:
            menu.append(p.Item(itype='7', text=settings['new_search_text_same_filter'],
                               path=request.path, host=request.host, port=request.port))
        if settings['results_caption'] is not None:
            menu.append(p.Item(text=settings['results_caption'].format(request.query, len(db['items']))))
        if settings['types_caption'] is not None and len(types):
            menu.append(p.Item(text=settings['types_caption'].format(', '.join(types))))
        if not settings['allow_empty_queries'] and request.query == '':
            return p.Item(text=settings['empty_queries_not_allowed_msg'], itype='3')
        items = db['items']
        for item in items:
            # match the query against the URL and every recorded title
            sampling = item
            for title in db['items'][item]['titles']:
                sampling += title
            if request.query.lower() in sampling.lower():
                req = p.parse_url(item)
                # honour the item type filter, if any
                if len(types) == 0 or req.type in types:
                    try:
                        menu.append(p.Item(text=''))
                        menu.append(p.Item(itype=req.type, text=items[item]['titles'][0],
                                           path=req.path, host=req.host, port=req.port))
                        menu.append(p.Item(text='URL: ' + req.url()))
                        if len(items[item]['titles']) > 1 and settings['alternate_titles']:
                            menu.append(p.Item(text='Alternate titles:'))
                            for title in items[item]['titles'][1:]:
                                menu.append(p.Item(text=' ' + title))
                        if settings['referrers']:
                            menu.append(p.Item(text='Referred by:'))
                            for referrer in items[item]['referrers']:
                                menu.append(p.Item(text=' ' + referrer))
                    except Exception:
                        pass
        return menu
    else:
        e = copy.copy(p.errors['404'])
        e.text = e.text.format(request.path)
        return e
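# One possible way to run alt() as a search endpoint; a sketch assuming
# pituophis exposes a serve() function that takes a handler callback (check
# the library's documentation for the exact signature), and that settings and
# db have been loaded beforehand.
import pituophis as p

if __name__ == '__main__':
    p.serve('127.0.0.1', 70, handler=alt)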
def go(url):
    global gophertree, openNodes, loadedTextURL
    window.FindElement('-LOADING-').update(visible=True)
    req = pituophis.parse_url(url)
    window.FindElement('-QUERY-').update(req.url())
    if req.type in texttypes:
        if req.type in ['1', '7']:
            # menus and search results get a fresh tree
            gophertree = sg.TreeData()
            gophertree.insert('', key=req.url(), text=req.url(),
                              values=[req.url()], icon=icons[req.type])
            parentNode = req.url()
            history.append(req.url())
            openNodes = []
            populate(parentNode, req)
        else:
            try:
                resp = req.get()
                loadedTextURL = req.url()
                window.FindElement('-OUTPUT-').update(resp.text())
            except Exception:
                sg.popup("We're sorry!", req.url() + ' could not be fetched. Try again later.')
    else:
        # binary types are downloaded instead of displayed
        dlpath = dlPopup(req.url())
        if dlpath is not None:
            window.FindElement('-DOWNLOADS-').update(
                value='Downloading {}'.format(dlpath))
            threading.Thread(target=download_thread, args=(req, dlpath, gui_queue),
                             daemon=True).start()
    window.FindElement('-LOADING-').update(visible=False)
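# download_thread() and gui_queue are referenced above but defined elsewhere
# in the client. A rough sketch of such a worker, assuming gui_queue is a
# queue.Queue polled by the event loop and that the response exposes its raw
# bytes as .binary; names and message strings are illustrative only.
import queue

gui_queue = queue.Queue()

def download_thread(req, dlpath, gui_queue):
    # Fetch the selector, write it to the chosen path, then notify the GUI.
    try:
        resp = req.get()
        with open(dlpath, 'wb') as f:
            f.write(resp.binary)
        gui_queue.put('Downloaded ' + dlpath)
    except Exception as e:
        gui_queue.put('Download failed: ' + str(e))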
def crawl(url, cooldown=86400):
    tocrawl = []
    req = pituophis.parse_url(url)
    if req.url() in db['menus']:
        if time.time() < db['menus'][req.url()]['last_crawled'] + cooldown:
            print('Not crawling', url, 'due to', str(cooldown) + 's cooldown')
            return False
    try:
        if settings['limit_host']:
            if settings['limit_host'] != req.host:
                return False
            if not req.path.startswith(settings['path_must_start_with']):
                return False
        save()
        if req.type in crawl_types:
            if allowed_to_crawl(req.url()):
                print('Waiting to crawl', req.url() + '...')
                time.sleep(settings['delay'])
                resp = req.get()
                print('Crawling ' + req.url())
                db['menus'][req.url()] = {'last_crawled': 0}
                dead = False
                for item in resp.menu():
                    if item.type not in settings['ignore_types']:
                        surl = item.request().url()
                        record = True
                        if settings['limit_host']:
                            if settings['only_record_host']:
                                if item.request().host != settings['limit_host']:
                                    record = False
                            if not req.path.startswith(settings['path_must_start_with']):
                                record = False
                        if '../' in surl:
                            record = False
                        if record:
                            print('Recording item for URL', surl)
                            # record!
                            if surl not in db['items']:
                                db['items'][surl] = {}
                                db['items'][surl]['titles'] = []
                                db['items'][surl]['referrers'] = []
                            if item.text not in db['items'][surl]['titles']:
                                db['items'][surl]['titles'].append(item.text)
                            if req.url() not in db['items'][surl]['referrers']:
                                db['items'][surl]['referrers'].append(req.url())
                            # if it's a crawlable type, queue it up
                            if item.type in crawl_types:
                                tocrawl.append(item.request().url())
                    # a type 3 item means the menu reported an error
                    if item.type == '3':
                        dead = True
                if dead:
                    db['menus'].pop(req.url(), None)
                    db['items'].pop(req.url(), None)
                else:
                    db['menus'][req.url()] = {'last_crawled': time.time()}
                save()
                for tc in tocrawl:
                    crawl(tc, settings['cooldown'])
                save()
    except Exception:
        print('WARN: Failed to fetch', req.url())
        traceback.print_exc()
        db['menus'].pop(req.url(), None)
        db['items'].pop(req.url(), None)
# Re-crawl everything already in the database, then start from the seed URL.
for key in db['menus'].copy().keys():
    crawl(key, settings['cooldown'])
for item in db['items'].copy().keys():
    req = pituophis.parse_url(item)
    if req.type == '1':
        crawl(item, settings['cooldown'])
crawl(settings['crawl_url'], settings['cooldown'])
save()
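# The crawler relies on module-level settings, db, crawl_types and a save()
# helper that are defined elsewhere. A rough sketch of what they might look
# like; every key, value and filename below is an assumption for illustration.
import json

crawl_types = ['1']  # only menus are crawled recursively

settings = {
    'crawl_url': 'gopher://gopher.example.org/1/',  # hypothetical seed URL
    'delay': 1,                 # seconds to sleep before each fetch
    'cooldown': 86400,          # seconds before a menu may be re-crawled
    'limit_host': '',           # restrict crawling to one host ('' = no limit)
    'only_record_host': False,
    'path_must_start_with': '',
    'ignore_types': ['i'],
}

db = {'menus': {}, 'items': {}}

def save():
    # Persist the crawl database between runs.
    with open('db.json', 'w') as f:
        json.dump(db, f)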
elif event == '_TREE_':
    if value == previousvalue:
        previousevent = None
        # DOUBLE CLICK
        # TODO: cooldown
        window.FindElement('-LOADING-').update(visible=True)
        url = value['_TREE_'][0]
        if url.endswith(' <cached>'):
            # strip the ' <cached>' suffix and force a fresh fetch
            url = url[:-9]
            del cache[url]
            go(url)
        else:
            if url.startswith('gopher'):
                req = pituophis.parse_url(url)
                if req.type == '1':
                    parentNode = url
                    if value['-USETREE-']:
                        populate(parentNode, req)
                    else:
                        go(parentNode)
                elif req.type == '7':
                    q = sg.popup_get_text('Search on ' + req.host, '')
                    if q is not None:
                        req.query = q
                        go(req.url())
                elif req.type != 'i':
                    go(req.url())
        window.FindElement('-LOADING-').update(visible=False)
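# The branch above assumes event-loop bookkeeping kept elsewhere in the
# client; roughly (illustrative only):
#
#   cache = {}            # url -> cached menu text, shown with a ' <cached>' suffix
#   previousvalue = None  # values from the previous read, for double-click detection
#
#   while True:
#       event, value = window.read()
#       ...               # handle events, including the '_TREE_' branch above
#       previousvalue = value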