Exemplo n.º 1
0
class Searcher:
    def __init__(self, cfg, db, env):
        self.cfg = cfg
        self.db = db
        self.env = env
        self.findex = Findex(self.db)

    def _key_check(self, keyword):
        if isinstance(keyword, dict):
            if not "key" in keyword or not keyword["key"]:
                raise SearchException("Search query must contain 4 characters or more")

            keyword = keyword["key"][0]

        block = ["-", ",", "+", "_", "%"]
        for b in block:
            keyword = keyword.replace(b, " ")

        if len(keyword) < 4:
            raise SearchException("Search query must contain 4 characters or more")

        return keyword

    def search(self, vars):
        val = self._key_check(vars)
        val = val.lower()

        filtered = False
        start_dbtime = datetime.now()

        # to-do: move this to API (or make api.py use this class)
        q = self.db.query(Files)

        # if this is later set with Files.<column_name>, it will be sorted on this.
        sort = ""

        sdata = {"protocols": [], "hosts": [], "exts": [], "cats": [], "fsize": 0}

        if "protocols" in vars:
            protocols = [z.lower() for z in vars["protocols"]]
            plookup = {"ftp": 0, "http": 1, "smb": 2}

            protocols_ids = []
            if isinstance(protocols, list):
                protocols = [z.lower() for z in protocols]

                for p in protocols:
                    if p in plookup and not plookup[p] in protocols_ids:
                        protocols_ids.append(plookup[p])

            if protocols_ids:
                sdata["protocols"] = protocols_ids

        else:
            sdata["protocols"] = [0, 1, 2]

        if "hosts" in vars:
            dhosts = vars["hosts"]

            if isinstance(dhosts, list):
                if not dhosts[0] == "*":
                    host_ids = []
                    for host in dhosts:
                        host_results = (
                            self.db.query(Resources)
                            .filter(Resources.address == host)
                            .filter(Resources.protocol.in_(sdata["protocols"]))
                            .all()
                        )

                        for host_result in host_results:
                            host_ids.append(host_result.id)

                    if host_ids:
                        sdata["hosts"] = host_ids
                    else:
                        raise SearchException("Could not find any host entries for specified host(s)")

        if sdata["hosts"]:
            q = q.filter(Files.resource_id.in_(sdata["hosts"]))
            filtered = True

        if "cats" in vars:
            clookup = {"unknown": 0, "documents": 1, "movies": 2, "music": 3, "pictures": 4}

            dformats = []
            for cat in [z.lower() for z in vars["cats"]]:
                if cat in clookup:
                    dformats.append(clookup[cat])
                else:
                    dformats.append(int(cat))

            if isinstance(dformats, list):
                q = q.filter(Files.file_format.in_(dformats))

                for dformat in dformats:
                    sdata["cats"].append(dformat)
        else:
            sdata["cats"] = [0, 1, 2, 3, 4]

        for i in [0, 1, 2, 3, 4]:
            if not i in sdata["cats"]:
                filtered = True

        if "exts" in vars:
            exts = vars["exts"]

            if isinstance(exts, list):
                exts = [z.replace(".", "") for z in exts if z]

                q = q.filter(Files.file_ext.in_(exts))
                filtered = True

                for ext in exts:
                    sdata["exts"].append(ext)
        elif "." in val:
            spl = val.split(".", 1)
            ext = spl[1].replace(",", "").strip()
            q = q.filter(Files.file_ext == ext)
            sdata["exts"].append(ext)

            val = self._key_check(spl[0])
            filtered = True

        if "size" in vars:
            fsize = vars["size"]

            if isinstance(fsize, list):
                fsize = int(fsize[0])

                sizes = {
                    0: "*",
                    1: (0, 8388600),
                    2: (8388600, 134220000),
                    3: (134220000, 536870912),
                    4: (536870912, 2147483648),
                    5: (2147483648, 8589934592),
                    6: (8589934592),
                }

                if fsize == 0:
                    pass
                elif 1 <= fsize <= 5:
                    q = q.filter(Files.file_size > sizes[fsize][0], Files.file_size < sizes[fsize][1])
                    filtered = True
                elif fsize == 6:
                    q = q.filter(Files.file_size > sizes[fsize])
                    filtered = True

                sdata["fsize"] = fsize
                sort = "file_size"

        if "path" in vars:
            path = vars["path"]

            if isinstance(path, list):
                path = path[0]

                if len(path) > 3:
                    path = quote_plus(path)
                    q = q.filter(Files.file_path.like(path + "%"))
                    filtered = True

        if "host" in vars:
            host = vars["host"]

            if isinstance(host, list):
                host = int(host[0])
                q = q.filter(Files.resource_id == host)
                filtered = True

        q = q.filter(Files.searchable.like("%" + val + "%")).limit(600)

        results = {}
        results["data"] = q.all()
        results["load_dbtime"] = (datetime.now() - start_dbtime).total_seconds()

        if sort:
            results["data"] = sorted(results["data"], key=lambda k: k.file_size, reverse=True)

        # to-do: dont do this here
        for r in results["data"]:
            host = self.findex.get_resource_objects(r.resource_id)
            setattr(r, "resource", host)

        results["data"] = self.findex.set_humanize(results["data"])
        results["data"] = self.findex.set_icons(env=self.env, files=results["data"])
        sdata["filtered"] = filtered

        return {"sdata": sdata, "results": results, "key": jinja2.escape(vars["key"][0])}
Exemplo n.º 2
0
class Searcher():
    def __init__(self, cfg, db):
        self.cfg = cfg
        self.db = db
        self.findex = Findex(self.db)

    def _key_check(self, keyword):
        if isinstance(keyword, dict):
            if not 'key' in keyword or not keyword['key']:
                raise SearchException(
                    'Search query must contain 4 characters or more')

            keyword = keyword['key'][0]

        block = ['-', ',', '+', '_', '%']
        for b in block:
            keyword = keyword.replace(b, ' ')

        if len(keyword) < 4:
            raise SearchException(
                'Search query must contain 4 characters or more')

        return keyword

    def search(self, vars):
        val = self._key_check(vars)
        val = val.lower()

        filtered = False
        start_dbtime = datetime.now()

        # to-do: move this to API (or make api.py use this class)
        q = self.db.query(Files)

        # if this is later set with Files.<column_name>, it will be sorted on this.
        sort = ''

        sdata = {
            'protocols': [],
            'hosts': [],
            'exts': [],
            'cats': [],
            'fsize': 0
        }

        if 'protocols' in vars:
            protocols = [z.lower() for z in vars['protocols']]
            plookup = {'ftp': 0, 'http': 1, 'smb': 2}

            protocols_ids = []
            if isinstance(protocols, list):
                protocols = [z.lower() for z in protocols]

                for p in protocols:
                    if p in plookup and not plookup[p] in protocols_ids:
                        protocols_ids.append(plookup[p])

            if protocols_ids:
                sdata['protocols'] = protocols_ids

        else:
            sdata['protocols'] = [0, 1, 2]

        if 'hosts' in vars:
            dhosts = vars['hosts']

            if isinstance(dhosts, list):
                if not dhosts[0] == '*':
                    host_ids = []
                    for host in dhosts:
                        host_results = self.db.query(Hosts).filter(
                            Hosts.address == host).filter(
                                Hosts.protocol.in_(sdata['protocols'])).all()

                        for host_result in host_results:
                            host_ids.append(host_result.id)

                    if host_ids:
                        sdata['hosts'] = host_ids
                    else:
                        raise SearchException(
                            'Could not find any host entries for specified host(s)'
                        )

        if sdata['hosts']:
            q = q.filter(Files.host_id.in_(sdata['hosts']))
            filtered = True

        if 'cats' in vars:
            clookup = {
                'unknown': 0,
                'documents': 1,
                'movies': 2,
                'music': 3,
                'pictures': 4
            }

            dformats = []
            for cat in [z.lower() for z in vars['cats']]:
                if cat in clookup:
                    dformats.append(clookup[cat])
                else:
                    dformats.append(int(cat))

            if isinstance(dformats, list):
                q = q.filter(Files.file_format.in_(dformats))

                for dformat in dformats:
                    sdata['cats'].append(dformat)
        else:
            sdata['cats'] = [0, 1, 2, 3, 4]

        for i in [0, 1, 2, 3, 4]:
            if not i in sdata['cats']:
                filtered = True

        if 'exts' in vars:
            exts = vars['exts']

            if isinstance(exts, list):
                exts = [z.replace('.', '') for z in exts if z]

                q = q.filter(Files.file_ext.in_(exts))
                filtered = True

                for ext in exts:
                    sdata['exts'].append(ext)
        elif '.' in val:
            spl = val.split('.', 1)
            ext = spl[1].replace(',', '').strip()
            q = q.filter(Files.file_ext == ext)
            sdata['exts'].append(ext)

            val = self._key_check(spl[0])
            filtered = True

        if 'size' in vars:
            fsize = vars['size']

            if isinstance(fsize, list):
                fsize = int(fsize[0])

                sizes = {
                    0: '*',
                    1: (0, 8388600),
                    2: (8388600, 134220000),
                    3: (134220000, 536870912),
                    4: (536870912, 2147483648),
                    5: (2147483648, 8589934592),
                    6: (8589934592)
                }

                if fsize == 0:
                    pass
                elif 1 <= fsize <= 5:
                    q = q.filter(Files.file_size > sizes[fsize][0],
                                 Files.file_size < sizes[fsize][1])
                    filtered = True
                elif fsize == 6:
                    q = q.filter(Files.file_size > sizes[fsize])
                    filtered = True

                sdata['fsize'] = fsize
                sort = 'file_size'

        if 'path' in vars:
            path = vars['path']

            if isinstance(path, list):
                path = path[0]

                if len(path) > 3:
                    path = quote_plus(path)
                    q = q.filter(Files.file_path.like(path + '%'))
                    filtered = True

        if 'host' in vars:
            host = vars['host']

            if isinstance(host, list):
                host = int(host[0])
                q = q.filter(Files.host_id == host)
                filtered = True

        q = q.filter(Files.searchable.like('%' + val + '%')).limit(600)

        results = {}
        results['data'] = q.all()
        results['load_dbtime'] = (datetime.now() -
                                  start_dbtime).total_seconds()

        if sort:
            results['data'] = sorted(results['data'],
                                     key=lambda k: k.file_size,
                                     reverse=True)

        # to-do: dont do this here
        for r in results['data']:
            host = self.findex.get_host_objects(r.host_id)
            setattr(r, 'host', host)

        results['data'] = self.findex.set_humanize(results['data'])
        results['data'] = self.findex.set_icons(results['data'])
        sdata['filtered'] = filtered

        return {
            'sdata': sdata,
            'results': results,
            'key': jinja2.escape(vars['key'][0])
        }
Exemplo n.º 3
0
class Browser():
    def __init__(self, db):
        self.db = db
        self.findex = Findex(db)

        self.env = {}
        self.files = None

    def parse_incoming_path(self, path):
        self.env['isdir'] = path.endswith('/')

        spl = path.split('/')
        self.env['host'] = spl[0]
        self.env['file_path'] = '/' + '/'.join(spl[1:-1])

        if not self.env['isdir']: self.env['file_name'] = path.split('/')[-1]
        if self.env['file_path'] != '/': self.env['file_path'] += '/'

        self.env['file_path_quoted'] = quote_plus(self.env['file_path'])

    def fetch_files(self):
        host = self.db.query(Hosts).filter_by(
            address=self.env['host']
        ).first()

        if not host:
            raise BrowseException('No host found')

        self.env['host_id'] = host.id
        files = self.findex.get_files_objects(host_id=host.id, file_path=self.env['file_path_quoted'])

        if not files:
            raise BrowseException('No files found')

        self.files = files

    def prepare_files(self, sort=None):
        # sort files
        self.files = sorted(self.files, key=lambda k: k.file_name)  # alphabetically
        self.files = sorted(self.files, key=lambda k: k.file_isdir, reverse=True)  # folders always on top

        if not self.env['file_path'] == '/':  # add CDUP dirs
            x = Files(
                file_name='..', file_path='../', file_ext='', file_format=-1,
                file_isdir=True, file_modified=datetime.now(), file_perm=None, searchable=None,
                file_size=0, host=self.env['host']
            )
            setattr(x, 'file_name_human', '..')
            self.files.insert(0, x)

        self.files = self.findex.set_icons(self.files)

    def sort(self):
        # calculate total folder file size (non-recursive)
        total_size = 0
        for f in self.files:
            total_size += f.file_size

    def generate_action_fetches(self):
        url = 'ftp://%s' % self.env['host']

        if self.env['file_path'] == '/':
            path = ''
        elif self.env['file_path'].startswith('/') and url.endswith('/'):
            path = self.env['file_path'][1:]

        wget_extras = ''
        lftp_extras = ''

        # if self.source.crawl_authtype:
        #     wget_extras = 'user=%s password=%s ' % (self.source.crawl_username, self.source.crawl_password)
        #
        #     if self.source.crawl_authtype == 'HTTP_DIGEST':
        #         lftp_extras = Debug('Authentication using DIGEST is not supported by lftp')
        #     else:
        #         lftp_extras = '-u %s,%s ' % (self.source.crawl_username, self.source.crawl_password)

        wget = 'wget %s-r -nH --no-parent \'%s\'' % (wget_extras, url + self.env['file_path'])

        #if not isinstance(lftp_extras, Debug):
        lftp = 'lftp %s-e \'mirror\' \'%s\'' % (lftp_extras, url + self.env['file_path'])
        #else:
        #    lftp = lftp_extras.message

        return dict(wget=wget, lftp=lftp)

    def breadcrumbs(self):
        data = [self.env['host']]
        data += [z for z in self.env['file_path'].split('/')[1:] if z]

        return data

    def output_json(self):
        data = []

        for source_file in self.files:
            if source_file.filename_human == '..':
                continue

            data.append('[%s] %s%s%s' % (
                'D' if source_file.is_directory else 'F',
                self.folder.source.crawl_url,
                source_file.filepath_human,
                source_file.filename_human))

        return '\n'.join(data)
Exemplo n.º 4
0
class Browser():
    def __init__(self, db):
        self.db = db
        self.findex = Findex(db)

        self.env = {}
        self.files = None

    def parse_incoming_path(self, path):
        self.env['isdir'] = path.endswith('/')

        spl = path.split('/')
        self.env['host'] = spl[0]
        self.env['file_path'] = '/' + '/'.join(spl[1:-1])

        if not self.env['isdir']: self.env['file_name'] = path.split('/')[-1]
        if self.env['file_path'] != '/': self.env['file_path'] += '/'

        self.env['file_path_quoted'] = quote_plus(self.env['file_path'])

    def fetch_files(self):
        host = self.db.query(Hosts).filter_by(address=self.env['host']).first()

        if not host:
            raise BrowseException('No host found')

        self.env['host_id'] = host.id
        files = self.findex.get_files_objects(
            host_id=host.id, file_path=self.env['file_path_quoted'])

        if not files:
            raise BrowseException('No files found')

        self.files = files

    def prepare_files(self, sort=None):
        # sort files
        self.files = sorted(self.files,
                            key=lambda k: k.file_name)  # alphabetically
        self.files = sorted(self.files,
                            key=lambda k: k.file_isdir,
                            reverse=True)  # folders always on top

        if not self.env['file_path'] == '/':  # add CDUP dirs
            x = Files(file_name='..',
                      file_path='../',
                      file_ext='',
                      file_format=-1,
                      file_isdir=True,
                      file_modified=datetime.now(),
                      file_perm=None,
                      searchable=None,
                      file_size=0,
                      host=self.env['host'])
            setattr(x, 'file_name_human', '..')
            self.files.insert(0, x)

        self.files = self.findex.set_icons(self.files)

    def sort(self):
        # calculate total folder file size (non-recursive)
        total_size = 0
        for f in self.files:
            total_size += f.file_size

    def generate_action_fetches(self):
        url = 'ftp://%s' % self.env['host']

        if self.env['file_path'] == '/':
            path = ''
        elif self.env['file_path'].startswith('/') and url.endswith('/'):
            path = self.env['file_path'][1:]

        wget_extras = ''
        lftp_extras = ''

        # if self.source.crawl_authtype:
        #     wget_extras = 'user=%s password=%s ' % (self.source.crawl_username, self.source.crawl_password)
        #
        #     if self.source.crawl_authtype == 'HTTP_DIGEST':
        #         lftp_extras = Debug('Authentication using DIGEST is not supported by lftp')
        #     else:
        #         lftp_extras = '-u %s,%s ' % (self.source.crawl_username, self.source.crawl_password)

        wget = 'wget %s-r -nH --no-parent \'%s\'' % (wget_extras, url +
                                                     self.env['file_path'])

        #if not isinstance(lftp_extras, Debug):
        lftp = 'lftp %s-e \'mirror\' \'%s\'' % (lftp_extras,
                                                url + self.env['file_path'])
        #else:
        #    lftp = lftp_extras.message

        return dict(wget=wget, lftp=lftp)

    def breadcrumbs(self):
        data = [self.env['host']]
        data += [z for z in self.env['file_path'].split('/')[1:] if z]

        return data

    def output_json(self):
        data = []

        for source_file in self.files:
            if source_file.filename_human == '..':
                continue

            data.append(
                '[%s] %s%s%s' %
                ('D' if source_file.is_directory else 'F',
                 self.folder.source.crawl_url, source_file.filepath_human,
                 source_file.filename_human))

        return '\n'.join(data)
Exemplo n.º 5
0
class Searcher():
    def __init__(self, cfg, db):
        self.cfg = cfg
        self.db = db
        self.findex = Findex(self.db)

    def _key_check(self, keyword):
        if isinstance(keyword, dict):
            if not 'key' in keyword or not keyword['key']:
                raise SearchException('Search query must contain 4 characters or more')

            keyword = keyword['key'][0]

        block = ['-', ',', '+', '_', '%']
        for b in block:
            keyword = keyword.replace(b, ' ')

        if len(keyword) < 4:
            raise SearchException('Search query must contain 4 characters or more')

        return keyword

    def search(self, vars):
        val = self._key_check(vars)
        val = val.lower()

        filtered = False
        start_dbtime = datetime.now()

        # to-do: move this to API (or make api.py use this class)
        q = self.db.query(Files)

        # if this is later set with Files.<column_name>, it will be sorted on this.
        sort = ''

        sdata = {
            'protocols': [],
            'hosts': [],
            'exts': [],
            'cats': [],
            'fsize': 0
        }

        if 'protocols' in vars:
            protocols = [z.lower() for z in vars['protocols']]
            plookup = {'ftp': 0, 'http': 1, 'smb': 2}

            protocols_ids = []
            if isinstance(protocols, list):
                protocols = [z.lower() for z in protocols]

                for p in protocols:
                    if p in plookup and not plookup[p] in protocols_ids:
                        protocols_ids.append(plookup[p])

            if protocols_ids:
                sdata['protocols'] = protocols_ids

        else:
            sdata['protocols'] = [0, 1, 2]

        if 'hosts' in vars:
            dhosts = vars['hosts']

            if isinstance(dhosts, list):
                if not dhosts[0] == '*':
                    host_ids = []
                    for host in dhosts:
                        host_results = self.db.query(Hosts).filter(Hosts.address==host).filter(Hosts.protocol.in_(sdata['protocols'])).all()

                        for host_result in host_results:
                            host_ids.append(host_result.id)

                    if host_ids:
                        sdata['hosts'] = host_ids
                    else:
                        raise SearchException('Could not find any host entries for specified host(s)')

        if sdata['hosts']:
            q = q.filter(Files.host_id.in_(sdata['hosts']))
            filtered = True

        if 'cats' in vars:
            clookup = {
                'unknown': 0,
                'documents': 1,
                'movies': 2,
                'music': 3,
                'pictures': 4
            }

            dformats = []
            for cat in [z.lower() for z in vars['cats']]:
                if cat in clookup:
                    dformats.append(clookup[cat])
                else:
                    dformats.append(int(cat))

            if isinstance(dformats, list):
                q = q.filter(Files.file_format.in_(dformats))

                for dformat in dformats:
                    sdata['cats'].append(dformat)
        else:
            sdata['cats'] = [0, 1, 2, 3, 4]

        for i in [0, 1, 2, 3, 4]:
            if not i in sdata['cats']:
                filtered = True

        if 'exts' in vars:
            exts = vars['exts']

            if isinstance(exts, list):
                exts = [z.replace('.', '') for z in exts if z]

                q = q.filter(Files.file_ext.in_(exts))
                filtered = True

                for ext in exts:
                    sdata['exts'].append(ext)
        elif '.' in val:
            spl = val.split('.', 1)
            ext = spl[1].replace(',', '').strip()
            q = q.filter(Files.file_ext == ext)
            sdata['exts'].append(ext)

            val = self._key_check(spl[0])
            filtered = True

        if 'size' in vars:
            fsize = vars['size']

            if isinstance(fsize, list):
                fsize = int(fsize[0])

                sizes = {
                    0: '*',
                    1: (0, 8388600),
                    2: (8388600, 134220000),
                    3: (134220000, 536870912),
                    4: (536870912, 2147483648),
                    5: (2147483648, 8589934592),
                    6: (8589934592)
                }

                if fsize == 0:
                    pass
                elif 1 <= fsize <= 5:
                    q = q.filter(Files.file_size > sizes[fsize][0], Files.file_size < sizes[fsize][1])
                    filtered = True
                elif fsize == 6:
                    q = q.filter(Files.file_size > sizes[fsize])
                    filtered = True

                sdata['fsize'] = fsize
                sort = 'file_size'

        if 'path' in vars:
            path = vars['path']

            if isinstance(path, list):
                path = path[0]

                if len(path) > 3:
                    path = quote_plus(path)
                    q = q.filter(Files.file_path.like(path+'%'))
                    filtered = True

        if 'host' in vars:
            host = vars['host']

            if isinstance(host, list):
                host = int(host[0])
                q = q.filter(Files.host_id == host)
                filtered = True

        q = q.filter(Files.searchable.like('%'+val+'%')).limit(600)

        results = {}
        results['data'] = q.all()
        results['load_dbtime'] = (datetime.now() - start_dbtime).total_seconds()

        if sort:
            results['data'] = sorted(results['data'], key=lambda k: k.file_size, reverse=True)

        # to-do: dont do this here
        for r in results['data']:
            host = self.findex.get_host_objects(r.host_id)
            setattr(r, 'host', host)

        results['data'] = self.findex.set_humanize(results['data'])
        results['data'] = self.findex.set_icons(results['data'])
        sdata['filtered'] = filtered

        return {'sdata': sdata, 'results': results, 'key': jinja2.escape(vars['key'][0])}