Example #1
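# Decide whether a URL may be crawled by checking the host's robots.txt,
# fetched once per host and cached in the module-level robotstxt dict.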
def allowed_to_crawl(url):
    req = pituophis.parse_url(url)
    if 'URL:' in req.path:
        return False
    # Fetch and cache this host's robots.txt (conventional Gopher selector /robots.txt, type 0)
    if req.host not in robotstxt:
        try:
            resp = pituophis.parse_url(
                'gopher://{}:{}/0/robots.txt'.format(req.host, req.port)).get()
            robotstxt[req.host] = resp.text()
        except Exception:
            robotstxt[req.host] = ''
    robots = robotstxt[req.host]
    allowed = True
    for line in robots.replace('\r\n', '\n').split('\n'):
        if line.startswith('Disallow:'):
            line = line.replace('Disallow: ', '').replace('Disallow:',
                                                          '').strip('/')
            # an empty Disallow line means "allow everything", so skip it
            if line and line in req.path:
                allowed = False
    if allowed:
        if ('Disallow: ' + req.path) in robots:
            allowed = False
        elif 'Disallow: *' in robots:
            allowed = False
    return allowed
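A minimal sketch of the state this function expects, plus a call (the cache name comes from the code above; the URL is just an example):

import pituophis

robotstxt = {}  # per-host cache of fetched robots.txt text

if allowed_to_crawl('gopher://gopher.example.org/1/docs'):
    print('allowed to crawl')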
Example #2
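# Search handler for a Pituophis Gopher server: builds a menu of indexed items from
# db whose URL or titles match request.query, optionally filtered by item type.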
def alt(request):
    if request.path.startswith(settings['search_path']):
        typestring = request.path.replace(settings['search_path'], '').replace('/', '')
        types = list(typestring)
        menu = []
        if settings['root_text'] is not None:
            menu.append(p.Item(itype='1', text=settings['root_text'], path='/', host=request.host, port=request.port))
        if settings['new_search_text'] is not None:
            menu.append(p.Item(itype='7', text=settings['new_search_text'], path=settings['search_path'], host=request.host, port=request.port))
        if request.path != settings['search_path'] and settings['new_search_text_same_filter'] is not None:
            menu.append(p.Item(itype='7', text=settings['new_search_text_same_filter'], path=request.path, host=request.host, port=request.port))
        if settings['results_caption'] is not None:
            menu.append(p.Item(text=settings['results_caption'].format(request.query, len(db['items']))))
        if settings['types_caption'] is not None and types:
            menu.append(p.Item(text=settings['types_caption'].format(', '.join(types))))
        if (not settings['allow_empty_queries']) and request.query == '':
            return p.Item(text=settings['empty_queries_not_allowed_msg'], itype='3')
        items = db['items']
        for item in items:
            sampling = item
            for title in db['items'][item]['titles']:
                sampling += title
            if request.query.lower() in sampling.lower():
                req = p.parse_url(item)
                yes = False
                if len(types) == 0:
                    yes = True
                else:
                    if req.type in types:
                        yes = True
                if yes:
                    try:
                        menu.append(p.Item(text=''))
                        menu.append(p.Item(itype=req.type, text=items[item]['titles'][0], path=req.path, host=req.host, port=req.port))
                        menu.append(p.Item(text='URL: ' + req.url()))
                        if len(items[item]['titles']) > 1:
                            if settings['alternate_titles']:
                                menu.append(p.Item(text='Alternate titles:'))
                                for title in items[item]['titles'][1:]:
                                    menu.append(p.Item(text='  ' + title))
                            if settings['referrers']:
                                menu.append(p.Item(text='Referred by:'))
                                for referrer in items[item]['referrers']:
                                    menu.append(p.Item(text='  ' + referrer))
                    except Exception:
                        pass
        return menu
    else:
        e = copy.copy(p.errors['404'])
        e.text = e.text.format(request.path)
        return e
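alt() relies on module-level settings and db dictionaries. A rough sketch of their shape, inferred from the keys read above (every value here is a placeholder):

import copy
import pituophis as p

settings = {
    'search_path': '/search',
    'root_text': 'Home',
    'new_search_text': 'New search',
    'new_search_text_same_filter': 'New search (same filter)',
    'results_caption': 'Results for "{}" ({} items indexed)',
    'types_caption': 'Filtering types: {}',
    'allow_empty_queries': False,
    'empty_queries_not_allowed_msg': 'Empty queries are not allowed.',
    'alternate_titles': True,
    'referrers': True,
}
db = {'items': {}}  # maps a Gopher URL to {'titles': [...], 'referrers': [...]}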
Example #3
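# Open a Gopher URL in the GUI client: menus and search results populate the tree,
# text items are shown in the output pane, and other types are downloaded in a thread.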
def go(url):
    global gophertree, openNodes, loadedTextURL

    window.FindElement('-LOADING-').update(visible=True)

    req = pituophis.parse_url(url)
    window.FindElement('-QUERY-').update(req.url())
    if req.type in texttypes:
        if req.type in ['1', '7']:
            gophertree = sg.TreeData()
            gophertree.insert('',
                              key=req.url(),
                              text=req.url(),
                              values=[req.url()],
                              icon=icons[req.type])
            parentNode = req.url()
            history.append(req.url())
            openNodes = []
            populate(parentNode, req)
        else:
            try:
                resp = req.get()
                loadedTextURL = req.url()
                window.FindElement('-OUTPUT-').update(resp.text())
            except Exception:
                sg.popup("We're sorry!",
                         req.url() + ' could not be fetched. Try again later.')
    else:
        dlpath = dlPopup(req.url())
        if dlpath is not None:
            window.FindElement('-DOWNLOADS-').update(
                value='Downloading {}'.format(dlpath))
            threading.Thread(target=download_thread,
                             args=(req, dlpath, gui_queue),
                             daemon=True).start()

    window.FindElement('-LOADING-').update(visible=False)
Example #4
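# Recursively crawl a Gopher menu: record each item's URL, titles and referrers in db,
# respecting robots.txt, the configured delay, and a per-menu cooldown (in seconds).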
def crawl(url, cooldown=86400):
    tocrawl = []
    req = pituophis.parse_url(url)
    if req.url() in db['menus']:
        if time.time() < db['menus'][req.url()]['last_crawled'] + cooldown:
            print('Not crawling', url, 'due to', str(cooldown) + 's cooldown')
            return False
    try:
        if settings['limit_host']:
            if settings['limit_host'] != req.host:
                return False
        if not req.path.startswith(settings['path_must_start_with']):
            return False
        save()
        if req.type in crawl_types:
            if allowed_to_crawl(req.url()):
                print('Waiting to crawl', req.url() + '...')
                time.sleep(settings['delay'])
                resp = req.get()
                print('Crawling ' + req.url())
                db['menus'][req.url()] = {'last_crawled': 0}
                dead = False
                for item in resp.menu():
                    if item.type not in settings['ignore_types']:
                        surl = item.request().url()
                        record = True
                        if settings['limit_host']:
                            if settings['only_record_host']:
                                if item.request().host != settings['limit_host']:
                                    record = False
                        # check the linked item's own selector, not the parent menu's path
                        if not item.request().path.startswith(
                                settings['path_must_start_with']):
                            record = False
                        if '../' in surl:
                            record = False
                        if record:
                            print('Recording item for URL', surl)
                            # record!
                            if surl not in db['items']:
                                db['items'][surl] = {}
                                db['items'][surl]['titles'] = []
                                db['items'][surl]['referrers'] = []
                            if item.text not in db['items'][surl]['titles']:
                                db['items'][surl]['titles'].append(item.text)
                            if req.url() not in db['items'][surl]['referrers']:
                                db['items'][surl]['referrers'].append(
                                    req.url())
                            # if it's a crawl type, let's do that uwu
                            if item.type in crawl_types:
                                tocrawl.append(item.request().url())
                    if item.type == '3':
                        dead = True
                if dead:
                    db['menus'].pop(req.url(), None)
                    db['items'].pop(req.url(), None)
                else:
                    db['menus'][req.url()] = {'last_crawled': time.time()}
                save()
                for tc in tocrawl:
                    crawl(tc, settings['cooldown'])
                    save()
    except Exception:
        print('WARN: Failed to fetch', req.url())
        traceback.print_exc()
        db['menus'].pop(req.url(), None)
        db['items'].pop(req.url(), None)
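crawl() (and allowed_to_crawl() from Example #1) lean on module-level state. A rough sketch of that state, inferred from the keys the code reads (all values are illustrative, and save() is only a stub):

import time
import traceback
import pituophis

crawl_types = ['1']                       # item types that get fetched and parsed
robotstxt = {}                            # per-host robots.txt cache
db = {'menus': {}, 'items': {}}
settings = {
    'crawl_url': 'gopher://gopher.example.org/1/',
    'limit_host': 'gopher.example.org',   # falsy value = crawl any host
    'only_record_host': True,
    'path_must_start_with': '',
    'ignore_types': ['i'],
    'delay': 1,                           # seconds to wait before each request
    'cooldown': 86400,                    # seconds before a menu is re-crawled
}

def save():
    pass  # the real crawler would persist db to disk here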
Example #5
# crawl() is defined as in Example #4; below is the top-level crawl driver.


for key in db['menus'].copy().keys():
    crawl(key, settings['cooldown'])

for item in db['items'].copy().keys():
    req = pituophis.parse_url(item)
    if req.type == '1':
        crawl(item, settings['cooldown'])

crawl(settings['crawl_url'], settings['cooldown'])
save()
Example #6
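# Excerpt from the client's PySimpleGUI event loop: selecting the same tree row twice
# counts as a double click and opens the selected URL.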
    elif event == '_TREE_':
        if value == previousvalue:
            previousevent = None
            # DOUBLE CLICK
            # TODO: cooldown
            window.FindElement('-LOADING-').update(visible=True)

            url = value['_TREE_'][0]

            if url.endswith(' <cached>'):
                url = url[:-9]
                del cache[url]
                go(url)
            else:
                if url.startswith('gopher'):
                    req = pituophis.parse_url(url)
                    if req.type == '1':
                        parentNode = url
                        if value['-USETREE-']:
                            populate(parentNode, req)
                        else:
                            go(parentNode)
                    elif req.type == '7':
                        q = sg.popup_get_text('Search on ' + req.host, '')
                        if q is not None:
                            req.query = q
                            go(req.url())
                    elif req.type != 'i':
                        go(req.url())

                    window.FindElement('-LOADING-').update(visible=False)