Example #1
def test():
    # Creating a client with a given index name
    client = Client('myIndex')

    # Creating the index definition and schema
    client.drop_index()
    client.create_index([TextField('title', weight=5.0), TextField('body')])

    # Indexing a document
    client.add_document(
        'doc1',
        title='RediSearch',
        body='Redisearch implements a search engine on top of redis')

    # Simple search
    res = client.search("search engine")

    # the result has the total number of results, and a list of documents
    print(res.total)  # "1"
    print(res.docs[0])

    # Searching with snippets
    # res = client.search("search engine", snippet_sizes={'body': 50})

    # Searching with complex parameters:
    q = Query("search engine").verbatim().no_content().paging(0, 5)
    res = client.search(q)
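    # Hedged follow-up (not part of the original snippet): the query above uses
    # no_content(), so the returned documents carry only their ids.
    for doc in res.docs:
        print(doc.id)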
Example #2
class TAS_Import():
    def __init__(self, index_name, host="172.16.20.7", port=6382, db=0):
        self.client = Client(index_name, host, port)
        self.host = host
        self.port = port
        self.index_name = index_name
        self.redis = Redis()

    def add_indexing(self, schema):
        self.client.create_index(schema, False, False, [])
        return ["Done"]

    def add_data(self, data):
        for i, rr in enumerate(data):
            index = i + 1
            print(rr)
            name, age, location = rr['name'], rr['age'], rr['location']
            self.client.add_document(index,
                                     NAME=name,
                                     AGE=age,
                                     LOCATION=location)
        return ["Done"]

    def drop_index(self):
        try:
            self.client.drop_index()
        except:
            pass
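
A hedged usage sketch for the class above; the index name, host/port and schema below are assumptions for illustration, not values from the original code.

from redisearch import TextField, NumericField

# Hypothetical driver for TAS_Import (connection details and fields are assumed)
importer = TAS_Import('people_index', host='127.0.0.1', port=6379)
importer.drop_index()
importer.add_indexing([TextField('NAME'), NumericField('AGE'), TextField('LOCATION')])
importer.add_data([
    {'name': 'Alice', 'age': 30, 'location': 'Lisbon'},
    {'name': 'Bob', 'age': 42, 'location': 'Porto'},
])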
Example #3
def create_website_items_index():
    "Creates Index Definition."

    # CREATE index
    client = Client(make_key(WEBSITE_ITEM_INDEX), conn=frappe.cache())

    try:
        client.drop_index()  # drop if already exists
    except ResponseError:
        # will most likely raise a ResponseError if index does not exist
        # ignore and create index
        pass
    except Exception:
        raise_redisearch_error()

    idx_def = IndexDefinition([make_key(WEBSITE_ITEM_KEY_PREFIX)])

    # Index fields mentioned in e-commerce settings
    idx_fields = frappe.db.get_single_value("E Commerce Settings",
                                            "search_index_fields")
    idx_fields = idx_fields.split(",") if idx_fields else []

    if "web_item_name" in idx_fields:
        idx_fields.remove("web_item_name")

    idx_fields = list(map(to_search_field, idx_fields))

    client.create_index(
        [TextField("web_item_name", sortable=True)] + idx_fields,
        definition=idx_def,
    )

    reindex_all_web_items()
    define_autocomplete_dictionary()
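
The function above depends on a to_search_field helper that is not shown in this snippet; a plausible minimal version (an assumption, not the project's actual implementation) could simply wrap each configured field name in a TextField:

def to_search_field(field):
    # Hypothetical helper: map a configured field name to a RediSearch TextField
    return TextField(field.strip())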
Example #4
def create_product_search_index_async():
    print('Creating Search Index')
    client = Client('productIndex')
    client.create_index([
        TextField('title', weight=5.0),
        TextField('description'),
        TextField('tags'),
        TextField('category')
    ])
    products = Product.objects.filter(active=True)
    cache.set('Search_index_total', len(products), timeout=None)
    index = 0
    for product in products:
        title = product.name
        description = product.description
        category = ','.join([cat.name for cat in product.category.all()])
        tag = product.tags
        tag_maps = ProductTagMap.objects.filter(product=product)
        for tag_map in tag_maps:
            tag = tag + tag_map.tag.tag + ' '
        category_varients = []
        for pv in ProductVarientList.objects.filter(product=product):
            for cv in pv.key.all():
                category_varients.append(cv.value)
        tag += ' '.join(list(set(category_varients)))
        client.add_document(str(product.id),
                            title=title,
                            description=description,
                            tags=tag,
                            category=category)
        cache.set('Search_index_index', index, timeout=None)
        index += 1
    return True
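
A hedged companion sketch: the loop above stores progress counters under the cache keys Search_index_total and Search_index_index, so a hypothetical helper (not in the original code) could report indexing progress like this.

def product_search_index_progress():
    # Hypothetical progress reader built on the cache keys set above
    total = cache.get('Search_index_total') or 0
    done = cache.get('Search_index_index') or 0
    return '{}/{} products indexed'.format(done, total)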
Example #5
class SearchDemo:
    def __init__(self, args):
        self.index = args.index
        self.client = Client(self.index, host=args.host, port=args.port)

    def create(self):
        try:
            self.client.drop_index()
        except:
            pass

        self.client.create_index([
            NumericField('WORDCOUNT', sortable=True),
            TextField('BYLINE', no_stem=True, sortable=True),
            TextField('DOCUMENTTYPE', sortable=True),
            TextField('HEADLINE', sortable=True),
            TagField('KEYWORDS', separator=';'),
            NumericField('MULTIMEDIA', sortable=True),
            TextField('NEWDESK', sortable=True),
            NumericField('PRINTPAGE', sortable=True),
            NumericField('PUBDATE', sortable=True),
            TextField('SECTIONNAME', sortable=True),
            TextField('SNIPPET', sortable=True),
            TextField('TYPEOFMATERIAL', sortable=True),
            TextField('WEBURL')
        ])
Example #6
class RediSearchClient(object):
    def __init__(self, index_name):
        self.client = Client(index_name)
        self.index_name = index_name

    def build_index(self, line_doc_path, n_docs):
        line_pool = LineDocPool(line_doc_path)

        try:
            self.client.drop_index()
        except:
            pass

        self.client.create_index([TextField('title'), TextField('url'), TextField('body')])

        for i, d in enumerate(line_pool.doc_iterator()):
            self.client.add_document(i, nosave=True, title=d['doctitle'],
                                     url=d['url'], body=d['body'])

            if i + 1 == n_docs:
                break

            if i % 1000 == 0:
                print("{}/{} building index".format(i, n_docs))

    def search(self, query):
        q = Query(query).paging(0, 5).verbatim()
        res = self.client.search(q)
        # print res.total # "1"
        return res
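
A hedged usage sketch for the class above; the index name, file path and document count are placeholders, not values from the original code.

# Hypothetical driver for RediSearchClient
rs = RediSearchClient('wiki_lines')
rs.build_index('docs.linedoc', n_docs=10000)
print(rs.search('search engine').total)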
Example #7
def create_website_items_index():
    "Creates Index Definition."

    # CREATE index
    client = Client(make_key(WEBSITE_ITEM_INDEX), conn=frappe.cache())

    # DROP if already exists
    try:
        client.drop_index()
    except Exception:
        pass

    idx_def = IndexDefinition([make_key(WEBSITE_ITEM_KEY_PREFIX)])

    # Based on e-commerce settings
    idx_fields = frappe.db.get_single_value('E Commerce Settings',
                                            'search_index_fields')
    idx_fields = idx_fields.split(',') if idx_fields else []

    if 'web_item_name' in idx_fields:
        idx_fields.remove('web_item_name')

    idx_fields = list(map(to_search_field, idx_fields))

    client.create_index(
        [TextField("web_item_name", sortable=True)] + idx_fields,
        definition=idx_def,
    )

    reindex_all_web_items()
    define_autocomplete_dictionary()
Example #8
def cache_to_redis(data: dict):
    if REDIS_HOSTNAME == '':
        print('REDIS_HOSTNAME environment variable is not set')
        return
    client = Client('games', host=REDIS_HOSTNAME, port=REDIS_PORT)
    indexCreated = False
    maxAltNames = len(max(data.values(), key=lambda d: len(d['alt_names']))['alt_names'])
    while not indexCreated:
        try:
            client.create_index([TextField('name', weight=10),
                                 *[TextField('alt_name_%d' % i, weight=10) for i in range(maxAltNames)],
                                 TextField('summary', weight=1),
                                 TextField('cover', weight=0),
                                 TextField('thumb', weight=0)])
            indexCreated = True
        except Exception:
            print('Failed to create index, retrying')
            time.sleep(3)

    for k, v in data.items():
        client.add_document(k,
                            name=v['name'],
                            **{'alt_name_%d' % i: n for i, n in enumerate(v['alt_names'])},
                            cover=v['cover'],
                            thumb=v['thumb'],
                            summary=v['summary'])
    print('done')
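
A hedged read-back sketch (the query text is an assumption): once cache_to_redis has populated the 'games' index, it can be queried by name.

# Hypothetical lookup against the 'games' index built above
search_client = Client('games', host=REDIS_HOSTNAME, port=REDIS_PORT)
res = search_client.search('zelda')
for doc in res.docs:
    print(doc.name)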
Example #9
def build_ipa_index():
    start_time = time.time()
    rc = redis.Redis(password=os.environ.get('REDIS_PASSWORD', ''))
    rs_client = Client('IPAIndex', conn=rc)

    print(
        'Getting file `amministrazioni.txt` from https://www.indicepa.gov.it',
        flush=True)
    ipa_index_amm_url = 'https://www.indicepa.gov.it/public-services/opendata-read-service.php?dstype=FS&filename=amministrazioni.txt'
    ipa_index_amm = pd.read_csv(ipa_index_amm_url, sep='\t', dtype=str)

    print('Getting file `ou.txt` from https://www.indicepa.gov.it', flush=True)
    ipa_index_ou_url = 'https://www.indicepa.gov.it/public-services/opendata-read-service.php?dstype=FS&filename=ou.txt'
    ipa_index_ou = pd.read_csv(ipa_index_ou_url,
                               sep='\t',
                               na_values=['da_indicare', '*****@*****.**'],
                               dtype=str)
    ipa_index_ou = ipa_index_ou.loc[lambda ipa_index_ou: ipa_index_ou['cod_ou']
                                    == 'Ufficio_Transizione_Digitale']

    try:
        rs_client.drop_index()
    except:
        pass  # Index already dropped

    rs_client.create_index([
        TextField('ipa_code', weight=2.0),
        TextField('name', weight=2.0, sortable=True),
        TextField('site'),
        TextField('pec'),
        TextField('city', weight=1.4),
        TextField('county'),
        TextField('region'),
        TagField('type'),
        TextField('rtd_name'),
        TextField('rtd_pec'),
        TextField('rtd_mail'),
    ])
    print('Created index `IPAIndex`', flush=True)

    print('Feeding `IPAIndex` with data from `amministrazioni.txt`',
          flush=True)
    for index, row in ipa_index_amm.iterrows():
        rs_client.add_document(row['cod_amm'],
                               language='italian',
                               replace=True,
                               **get_ipa_amm_item(row))

    print('Feeding `IPAIndex` with data from `ou.txt`', flush=True)
    for index, row in ipa_index_ou.iterrows():
        rs_client.add_document(row['cod_amm'],
                               partial=True,
                               **get_ipa_rtd_item(row))

    finish_time = time.time()
    print('`IPAIndex` build completed in {0} seconds'.format(
        round(finish_time - start_time, 2)),
          flush=True)
Example #10
def create_index(cls):
    error_message = "Unable to create Index. Try Again"
    redis_enabled = os.getenv("REDIS_SEARCH", False)
    if redis_enabled:
        client = Client("tower", port=6379, host=os.getenv('REDIS_HOST'))
        try:
            client.create_index(document)
            cls.build_index(client)
            print("Watcher Index created successfully")
        except ResponseError as err:
            print(err)
    else:
        print(error_message)
Example #11
def insert():
    # insertion of search/suggestion data
    suggestion_client = Client('movie')
    suggestion_client.create_index([TextField('title'), TagField('genres', separator = '|')])

    for i in range(0, len(movie_df)):
        suggestion_client.add_document(movie_df['tmdbId'][i], title = movie_df['title'][i], genres = movie_df['genres'][i])

    # insertion of auto-completion data
    completion_client = AutoCompleter('ac')

    for i in range(0, len(movie_df)):
        completion_client.add_suggestions(Suggestion(movie_df['title'][i]))
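
A hedged sketch of the read side (the prefix and options are assumptions): the 'ac' suggestion dictionary filled above can be queried with AutoCompleter.get_suggestions.

# Hypothetical autocomplete lookup against the 'ac' dictionary populated above
completion_client = AutoCompleter('ac')
for suggestion in completion_client.get_suggestions('toy', fuzzy=True, num=5):
    print(suggestion.string)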
Example #12
    def clientpush(self):
        client = Client('Checkout')

        client.create_index([
            NumericField('Key'),
            TextField('UsageClass'),
            TextField('CheckoutType'),
            TextField('MaterialType'),
            NumericField('CheckoutYear'),
            NumericField('CheckoutMonth'),
            NumericField('Checkouts'),
            TextField('Title'),
            TextField('Creator'),
            TextField('Subjects'),
            TextField('Publisher'),
            TextField('PublicationYear')
        ])

        db_connection, _ = self.connect()
        cursor = db_connection.cursor()
        cursor.execute('SELECT * FROM customers')
        results = cursor.fetchall()
        i = 0
        for result in results:
            client.add_document('doc%s' % i,
                                Key=result[0],
                                UsageClass=result[1],
                                CheckoutType=result[2],
                                MaterialType=result[3],
                                CheckoutYear=result[4],
                                CheckoutMonth=result[5],
                                Checkouts=result[6],
                                Title=result[7],
                                Creator=result[8],
                                Subjects=result[9],
                                Publisher=result[10],
                                PublicationYear=result[11])
            i += 1
            print(i)
        res = client.search('BOOK')

        print("{}   {}".format(res.total, res.docs[0].Title))
        res1 = client.search("use")
        print(res1)
        q = Query('use').verbatim().no_content().paging(0, 5)
        res1 = client.search(q)
        print(res1)
        cursor.close()
        db_connection.close()
Example #13
class RandomWikipediaImport(object):

    def __init__(self):
        self.rs = Client('wikipedia')
        self.rs.create_index((TextField('title', weight=5.0), TextField('body')))
        print(f'>>> Created index')

    def insert_random_loop(self):
        i = 1
        while True:
            ra = wikipedia.random()
            article = wikipedia.page(ra)
            self.rs.add_document(f'doc{i}', title=article.title, body=article.content)
            print(f'>>> Inserted {article.title}')
            i += 1
Example #14
class TAS_Import():
    def __init__(self, index_name, host=ip, port=port, db=db):
        self.client = Client(index_name, host, port)
        self.host = host
        self.port = port
        #self.redis = Redis()

    def add_indexing_schema(self, schema):
        self.client.create_index(schema, False, False, [])
        return ["Done"]

    def add_data(self, rdata, company, doc_id, project):
        for i, rr in enumerate(rdata):
            index = doc_id + company + "CMDIC" + str(i + 1) + project
            l1, l2, l3 = rr
            l1 = config_obj.StringEscape(l1)
            self.client.add_document(index, DATA=l1, PAGE=l2, BBOX=l3)
        return ["Done"]

    def drop_index(self):
        try:
            self.client.drop_index()
        except Exception as e:
            #print 'Error',e
            pass

    def start(self, data, doc_id, company, project):
        status = 1
        index_name = project + "_DOCUMENT_" + str(doc_id)
        self.drop_index()
        self.client = Client(index_name, self.host, self.port)
        status = 2
        schema = [
            NumericField('INDEX'),
            TextField('DATA'),
            TextField('PAGE'),
            TextField('BBOX')
        ]
        status = 3
        self.add_indexing_schema(schema)
        status = 4
        self.add_data(data, company, doc_id, project)
        status = 5
        return [status]
Example #15
class EventProcessor():
    def __init__(self):
        self.r = redis.from_url(config.EVENT_BROKER_URL)
        self.client = Client('CCTV_DATA')
        try:
            self.client.create_index([TextField('CCTV_ID'), TagField('TAGS')])
        except Exception as error:
            print("Error while creatign index", error)

        # self.client.create_index([TextField('title', weight=5.0), TextField('body')])

    def get_objects_in_image(self, image):
        # TODO: call RedisAI module
        objects = [
            "key", "passport", "wallet", "car", "bag", "watch", "book",
            "satchel", "laptop", "camera", "mobile_phone"
        ]
        tags = []
        tags.append(objects[r.randint(0, 10)])
        tags.append(objects[r.randint(0, 10)])
        tags.append(objects[r.randint(0, 10)])
        tags.append(objects[r.randint(0, 10)])

        return tags

    def process(self, msg):
        print("Going to process message and and store it", msg)
        # print(float(msg["LON"]), float(msg["LAT"]), msg["CCTV_ID"])
        # print(type(float(msg["LON"])), type(float(msg["LAT"])), msg["CCTV_ID"])
        try:
            self.r.geoadd("CCTV_LOCATION", float(msg["LON"]),
                          float(msg["LAT"]), msg["CCTV_ID"])
            msg["TAGS"] = self.get_objects_in_image(msg.get("IMAGE", ""))
            # print("Going to store this in search", msg)

            doc_unique_key = msg["CCTV_ID"] + "_" + msg["TS"]

            self.client.add_document(doc_unique_key,
                                     CCTV_ID=doc_unique_key,
                                     TAGS=",".join(msg["TAGS"]))

        except Exception as error:
            print("Error while adding ccty data", error)
Example #16
def index():
    client = Client('sh')
    #    client.drop_index()
    client.create_index(txt=1.0)
    chapters = {}
    with open('will_play_text.csv') as fp:

        r = csv.reader(fp, delimiter=';')
        for line in r:
            #['62816', 'Merchant of Venice', '9', '3.2.74', 'PORTIA', "I'll begin it,--Ding, dong, bell."]

            play, chapter, character, text = line[1], line[2], line[4], line[5]

            d = chapters.setdefault('{}:{}'.format(play, chapter), {})
            d['play'] = play
            d['text'] = d.get('text', '') + ' ' + text

    for chapter, doc in chapters.items():
        print(chapter, doc)
        client.add_document(chapter, nosave=True, txt=doc['text'])
Example #18
    def get(self, request):
        # data=request.data
        mes = {}
        search_key = request.GET.get('key')
        print(search_key)
        all_classes = Course.objects.all()
        print("开始创建索引——————————————————————————")
        # 创建一个客户端与给定索引名称
        client = Client('CII' + str(datetime.now()), host=settings.SIP, port='6666')

        # 创建索引定义和模式
        client.create_index((TextField('title'), TextField('body')))
        print('索引创建完毕————————————————————————————————')
        print('开始添加数据————————————————————————————————')

        for i in all_classes:
            print(str(i.id) + str(i.title))
            # Index the document
            client.add_document('result' + str(datetime.now()), title=i.title + '@' + str(i.id), info=i.info,
                                language='chinese')
            print(333333333)
        print('Data added ----------------------------')
        print(client.info())
        # Search
        client = Client('CII' + str(datetime.now()), host=settings.SIP, port='6666')

        res = client.search(search_key)
        print('Query finished ----------------------------')
        id_list = []
        print(res.docs)
        for i in res.docs:
            # print(i.title)  # take the title, split on '@', use the course ID to query, then serialize for display
            id = i.title.split('@')[1]
            id_list.append(id)
        course = Course.objects.filter(id__in=id_list).all()
        c = CourseSerializersModel(course, many=True)
        mes['course'] = c.data
        mes['code'] = 200
        mes['message'] = 'Search finished'
        return Response(mes)
Example #19
class SearchDemo:
    def __init__(self, args):
        self.index = args.index
        self.client = Client(self.index, host=args.host, port=args.port)

    def create(self):
        try:
            self.client.drop_index()
        except:
            pass

        self.client.create_index([
            NumericField('ORDERNUMBER'),
            NumericField('QUANTITYORDERED', sortable=True),
            NumericField('PRICEEACH', sortable=True),
            NumericField('ORDERLINENUMBER'),
            NumericField('SALES', sortable=True),
            TextField('ORDERDATE'),
            TextField('STATUS', sortable=True),
            NumericField('QTR_ID', sortable=True),
            NumericField('MONTH_ID', sortable=True),
            NumericField('YEAR_ID', sortable=True),
            TextField('PRODUCTLINE', sortable=True),
            NumericField('MSRP', sortable=True),
            TextField('PRODUCTCODE', sortable=True),
            TextField('CUSTOMERNAME', sortable=True),
            TextField('PHONE'),
            TextField('ADDRESSLINE1'),
            TextField('ADDRESSLINE2'),
            TextField('CITY', sortable=True),
            TextField('STATE', sortable=True),
            TextField('POSTALCODE', sortable=True),
            TextField('COUNTRY', sortable=True),
            TextField('TERRITORY', sortable=True),
            TextField('CONTACTLASTNAME'),
            TextField('CONTACTFIRSTNAME'),
            TextField('DEALSIZE', sortable=True)
        ])
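
    def top_sales(self, n=10):
        # Hedged illustrative helper (not in the original class): SALES is
        # declared sortable above, so results can be ordered by it directly.
        # Assumes Query is imported from redisearch.
        q = Query('*').sort_by('SALES', asc=False).paging(0, n)
        return self.client.search(q)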
Example #20
class Hub(object):
    dconn = None  # document store connection
    sconn = None  # search index connection
    qconn = None  # queue connection
    gh = None
    autocomplete = None
    repo = None
    _ts = None
    _hubkey = 'hub:catalog'
    _ixname = 'ix'
    _acname = 'ac'

    def __init__(self,
                 ghlogin_or_token=None,
                 docs_url=None,
                 search_url=None,
                 queue_url=None,
                 repo=None):
        timestamp = datetime.utcnow()
        logger.info('Initializing temporary hub {}'.format(timestamp))

        if ghlogin_or_token:
            self.gh = Github(ghlogin_or_token)
        elif 'GITHUB_TOKEN' in os.environ:
            self.gh = Github(os.environ['GITHUB_TOKEN'])
        else:
            logger.info("Env var 'GITHUB_TOKEN' not found")

        if docs_url:
            pass
        elif 'DOCS_REDIS_URL' in os.environ:
            docs_url = os.environ['DOCS_REDIS_URL']
        else:
            logger.critical('No Redis for document storage... bye bye.')
            raise RuntimeError('No Redis for document storage... bye bye.')
        self.dconn = ReJSONClient().from_url(docs_url)

        if search_url:
            pass
        elif 'SEARCH_REDIS_URL' in os.environ:
            search_url = os.environ['SEARCH_REDIS_URL']
        else:
            search_url = docs_url
        conn = Redis(connection_pool=ConnectionPool().from_url(search_url))
        self.sconn = RediSearchClient(self._ixname, conn=conn)
        self.autocomplete = AutoCompleter(self._acname, conn=conn)

        if queue_url:
            pass
        elif 'QUEUE_REDIS_URL' in os.environ:
            queue_url = os.environ['QUEUE_REDIS_URL']
        else:
            queue_url = docs_url
        self.qconn = StrictRedis.from_url(queue_url)

        if repo:
            pass
        elif 'REDISMODULES_REPO' in os.environ:
            repo = os.environ['REDISMODULES_REPO']
        else:
            logger.critical('No REDISMODULES_REPO... bye bye.')
            raise RuntimeError('No REDISMODULES_REPO... bye bye.')
        self.repo = repo

        # Check if hub exists
        if self.dconn.exists(self._hubkey):
            self._ts = datetime.fromtimestamp(
                float(self.dconn.jsonget(self._hubkey, Path('.created'))))
            logger.info('Latching to hub {}'.format(self._ts))
        else:
            self._ts = timestamp
            logger.info('Creating hub {}'.format(self._ts))
            self.createHub()
            self.addModulesRepo(self.repo)

    def get_repo_url(self):
        return 'https://github.com/{}'.format(self.repo)

    def createHub(self):
        logger.info('Creating the hub in the database {}'.format(self._ts))
        # Store the master modules catalog as an object
        self.dconn.jsonset(
            self._hubkey, Path.rootPath(), {
                'created': str(_toepoch(self._ts)),
                'modules': {},
                'submissions': [],
                'submit_enabled': False
            })

        # Create a RediSearch index for the modules
        # TODO: catch errors
        self.sconn.create_index(
            (TextField('name', sortable=True), TextField('description'),
             NumericField('stargazers_count', sortable=True),
             NumericField('forks_count', sortable=True),
             NumericField('last_modified', sortable=True)),
            stopwords=stopwords)

    def deleteHub(self):
        # TODO
        pass

    def addModule(self, mod):
        logger.info('Adding module to hub {}'.format(mod['name']))
        # Store the module object as a document
        m = RedisModule(self.dconn, self.sconn, self.autocomplete, mod['name'])
        m.save(mod)

        # Add a reference to it in the master catalog
        self.dconn.jsonset(
            self._hubkey, Path('.modules["{}"]'.format(m.get_id())), {
                'id': m.get_id(),
                'key': m.get_key(),
                'created': str(_toepoch(self._ts)),
            })

        # Schedule a job to refresh repository statistics, starting from now and every hour
        s = Scheduler(connection=self.qconn)
        job = s.schedule(
            scheduled_time=datetime(1970, 1, 1),
            func=callRedisModuleUpateStats,
            args=[m.get_id()],
            interval=60 * 60,  # every hour
            repeat=None,  # indefinitely
            ttl=0,
            result_ttl=0)
        return m

    """
    Adds modules to the hub from a local directory
    TODO: deprecate asap
    """

    def addModulesPath(self, path):
        logger.info('Loading modules from local path {}'.format(path))
        # Iterate module JSON files
        for filename in os.listdir(path):
            if filename.endswith(".json"):
                with open('{}/{}'.format(path, filename)) as fp:
                    mod = json.load(fp)

                m = self.addModule(mod)

    """
    Adds modules to the hub from a GitHub repository
    """

    def addModulesRepo(self, name, path='/modules/'):
        # TODO: check for success
        q = Queue(connection=self.qconn)
        q.enqueue(callLoadModulesFromRepo, name, path)

    def loadModulesFromRepo(self, name, path):
        logger.info('Loading modules from Github {} {}'.format(name, path))
        # TODO: error handling, sometimes not all contents are imported?
        repo = self.gh.get_repo(name)
        files = repo.get_dir_contents(path)
        for f in files:
            mod = json.loads(f.decoded_content)
            m = self.addModule(mod)

    """
    Submits a module to the hub
    """

    def submitModule(self, repo_id, **kwargs):
        logger.info('Module submitted to hub {}'.format(repo_id))
        repo_id = repo_id.lower()
        ts = datetime.utcnow()
        res = {'id': repo_id, 'status': 'failed'}

        if not self.dconn.jsonget(self._hubkey, Path('submit_enabled')):
            res['message'] = 'Module submission is currently disabled'
            return res

        # Check if the module is already listed
        m = RedisModule(self.dconn, self.sconn, self.autocomplete, repo_id)
        if m.exists:
            # TODO: return in search results
            res['message'] = 'Module already listed in the hub'
            return res

        # Check if there's an active submission, or if the failure was too recent
        submission = Submission(self.dconn, repo_id)
        if submission.exists:
            status = submission.status
            if status != 'failed':
                res['status'] = 'active'
                res['message'] = 'Active submission found for module'
                return res
            else:
                # TODO: handle failed submissions
                res['message'] = 'Module already submitted to the hub and had failed, please reset manually for now'
                return res

        # Store the new submission
        submission.save(**kwargs)

        # Record the submission in the catalog
        # TODO: find a good use for that, e.g. 5 last submissions
        self.dconn.jsonarrappend(self._hubkey, Path('.submissions'), {
            'id': submission.get_id(),
            'created': submission.created,
        })

        # Add a job to process the submission
        q = Queue(connection=self.qconn)
        job = q.enqueue(callProcessSubmission, submission.get_id())
        if job is None:
            res['message'] = 'Submission job could not be created'
            # TODO: design retry path
            logger.error(
                'Could not create submission processing job for {}'.format(
                    submission.get_id()))
        else:
            res['status'] = 'queued'
            submission.status = res['status']
            submission.job = job.id

        return res

    def viewSubmissionStatus(self, repo_id):
        submission = Submission(self.dconn, repo_id)
        if submission.exists:
            res = {
                'id': submission.get_id(),
                'status': submission.status,
                'message': submission.message,
            }
            if 'finished' == res['status']:
                res['pull_number'] = submission.pull_number
                res['pull_url'] = submission.pull_url
            return res

    def processSubmission(self, repo_id):
        logger.info('Processing submission for {}'.format(repo_id))
        submission = Submission(self.dconn, repo_id)
        if submission.exists:
            return submission.process(self.gh, self.repo)

    def viewModules(self, query=None, sort=None):
        if not query:
            # Use a purely negative query to get all modules
            query = '-etaoinshrdlu'
        q = Query(query).no_content().paging(0, 1000)
        if sort:
            if sort == 'relevance':
                pass
            elif sort == 'update':
                q.sort_by('last_modified')
            elif sort == 'stars':
                q.sort_by('stargazers_count', asc=False)
            elif sort == 'forks':
                q.sort_by('forks_count', asc=False)
            elif sort == 'name':
                q.sort_by('name')

        results = self.sconn.search(q)
        mods = []
        fetch_duration = 0
        # TODO: this should be pipelined
        for doc in results.docs:
            m = RedisModule(self.dconn, self.sconn, self.autocomplete, doc.id)
            res, duration = _durationms(m.to_dict)
            mods.append(res)
            fetch_duration += duration

        return {
            'results': results.total,
            'search_duration': '{:.3f}'.format(results.duration),
            'fetch_duration': '{:.3f}'.format(fetch_duration),
            'total_duration':
            '{:.3f}'.format(fetch_duration + results.duration),
            'modules': mods,
        }

    def viewSearchSuggestions(self, prefix):
        suggestions = self.autocomplete.get_suggestions(prefix)
        return [s.string for s in suggestions]
Example #21
#--------------------------------------------
# Import a whole directory into RediSearch
# Create the index and the documents
# Change dirt to your documents' path
#--------------------------------------------

import os
from redisearch import Client, Query, TextField

dirt = "/path/to/the/documents/"  # Change it to your own path

client = Client("BoxGroup", port=6379)  # 6379 as default
client.create_index([TextField('title'), TextField('body')])

filelist = os.listdir(dirt)
filelist = sorted(filelist)
try:
    filelist.remove(".git")
except:
    print("git目录不存在,已跳过")
filecounter = 0
for filename in filelist:
    openfilename = dirt + filename
    with open(openfilename, "r+") as f:
        data = f.read()
        try:
            client.add_document(filecounter,
                                title=filename,
                                body=data,
                                language="chinese")
        except:
Example #22
import pandas as pd 
import json
from tqdm import tqdm
from redisearch import Client, TextField, NumericField, Query
from time import sleep
from rediscluster import StrictRedisCluster

sleep(15)

nodes = [{'host': "173.17.0.2", 'port': "7000"}]
rc = StrictRedisCluster(startup_nodes=nodes, decode_responses=True)


client=Client('week1', conn=rc)
client.create_index([TextField('name'), TextField('surname'), TextField('job')])
dat = pd.read_csv("test.csv")


for idx, row in tqdm(dat.iterrows()):
	client.add_document(f"{row['index']}", replace=True, partial=True, name = f"{row['name']}", surname = f"{row['surname']}", job = f"{row['job']}")
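
A hedged read-back sketch (the field query is an assumption): once the CSV rows are indexed, the week1 index on the cluster connection can be searched by field.

# Hypothetical field-scoped query against the 'week1' index loaded above
res = client.search(Query('@job:engineer').paging(0, 10))
for doc in res.docs:
    print(doc.name, doc.surname, doc.job)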
Example #23
import hashlib 
import gpxpy 
import gpxpy.gpx 
from redisearch import Client, Query, TextField, GeoField, NumericField


client = Client(
   'attractions',
   host='127.0.0.1',
   password='',
   port=6379
   )

client.create_index([
   TextField('title', weight=5.0),
   TextField('description'),
   NumericField('verified', sortable=True),
   GeoField('geo'),
])


gpx_file = open('All_States_Offbeat_Tourist_Attractions.gpx', 'r', encoding='utf-8')

gpx = gpxpy.parse(gpx_file)

for waypoint in gpx.waypoints:
    if "Verified" in waypoint.comment:
        v = 1
    else:
        v = 0
    t = "%s,%s,%s" %(waypoint.name, waypoint.longitude, waypoint.latitude)
    client.add_document(
Example #24
class UserCache:
    def __init__(self):
        self.client = Client("api_user_index", app.config["REDIS_HOST"],
                             app.config["REDIS_PORT"])

    def create_user_index(self, users):
        """
        Creates a new user index if not exists
        :param users:
        :return:
        """
        definition = IndexDefinition(prefix=['doc:', 'user:'])

        try:
            self.client.create_index(
                (TextField("first_name"), TextField("last_name"),
                 TextField("email"), NumericField("age"),
                 NumericField("is_employee"),
                 NumericField("user_id", sortable=True)),
                definition=definition)
        except redis.exceptions.ResponseError:
            return False

        indexer = self.client.batch_indexer(chunk_size=len(users))

        for user in users:
            fields = {
                "first_name":
                user.first_name.translate(str.maketrans({"-": r"\-"})),
                "last_name":
                user.last_name.translate(str.maketrans({"-": r"\-"})),
                "email":
                user.email.translate(str.maketrans({"-": r"\-"})),
                "age":
                user.age,
                "user_id":
                user.id,
                "is_employee":
                int(user.is_employee),
            }
            indexer.add_document(f"doc:{user.id}", **fields)
        indexer.commit()

        return True

    def cache_single_user(self, user):
        """
        Caches a single user
        :param user:
        :return:
        """
        self.client.redis.hset(
            f"doc:{user.id}",
            mapping={
                "first_name":
                user.first_name.translate(str.maketrans({"-": r"\-"})),
                "last_name":
                user.last_name.translate(str.maketrans({"-": r"\-"})),
                "email":
                user.email.translate(str.maketrans({"-": r"\-"})),
                "age":
                user.age,
                "user_id":
                user.id,
                "is_employee":
                int(user.is_employee),
            })

        return True

    def search(self, filters, page, per_page):
        """
        Searches through redis
        :return:
        """
        q = Query(self.build_query(filters)).paging(
            (page - 1) * per_page, per_page).sort_by("user_id")

        return self.client.search(q)

    def build_query(self, filters):
        query = []
        age = "+@age:[minAge maxAge]"

        for filter_name, value in filters.items():
            # Ugly non-solid way
            if value is not None:
                if filter_name == "firstName" and len(value) > 1:
                    query.append(f"+@first_name:{value}*")
                if filter_name == "lastName" and len(value) > 1:
                    query.append(f"+@last_name:{value}*")
                if filter_name == "email" and len(value) > 1:
                    query.append(f"+@email:{value}*")
                if filter_name == "minAge":
                    age = age.replace("minAge", str(value))
                if filter_name == "maxAge":
                    age = age.replace("maxAge", str(value))
                if filter_name == "isEmployee":
                    query.append(f"+@is_employee:{int(value)}")

        age = age.replace("minAge", "0")
        age = age.replace("maxAge", "100")

        query.append(age)

        return " ".join(query)
Example #25
# Creating a client with a given index name
client = Client("cveIndex")
try:
    client.info()
except Exception as e:
    if e.args[0] != "Unknown Index name":
        print("You must be running a redis server with the redisearch module installed")
        exit()

# IndexDefinition is available for RediSearch 2.0+
definition = IndexDefinition(prefix=['cve:'])

# Creating the index definition and schema
try:
    client.create_index((TextField("id"), TextField("description"), TextField("configurations")), definition=definition)
except:
    # Index already exists. Delete and recreate
    client.drop_index()
    print("Index already exists. Dropping. Delete keys and try again.")
    exit()

def process_CVE_file(file):
    with open(file, 'r', encoding="utf8") as f:
        json = ujson.decode(f.read())
        cve_items = json['CVE_Items']
        for cve_item in cve_items:
            cve_id = cve_item['cve']['CVE_data_meta']['ID']
            cve_desc = cve_item['cve']['description']['description_data'][0]['value']
            cve_configurations = str(cve_item['configurations']['nodes'])
            # Sanitizing special characters to prevent them from being tokenized away
Example #26
def open_redis():
    if not os.path.isdir('./nvd_data_feeds/'):
        os.mkdir('./nvd_data_feeds/')

    print('Creating the docker container with redislabs/redisearch\n')
    Popen([
        'docker', 'run', '--rm', '--name', 'amadeus', '-p', '6379:6379',
        'redislabs/redisearch:latest'
    ])
    sleep(6)

    urls = [
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2021.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2020.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2019.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2018.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2017.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2016.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2015.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2014.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2013.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2012.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2011.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2010.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2009.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2008.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2007.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2006.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2005.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2004.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2003.json.zip',
        'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2002.json.zip',
        'https://nvd.nist.gov/feeds/json/cpematch/1.0/nvdcpematch-1.0.json.zip'
    ]

    print('\nDownloading and unzipping JSON feeds')
    if not os.path.isdir('./downloads/'):
        os.mkdir('./downloads/')
    tam = len(urls)
    dl = 0
    for url in urls:
        name = url.split('/')[-1]
        response = get(url)
        open('./downloads/' + name, 'wb').write(response.content)
        with ZipFile('./downloads/' + name, 'r') as zip_ref:
            zip_ref.extractall('./nvd_data_feeds/')
        dl += 1
        prog = dl / tam
        done = int(50 * prog)
        stdout.write('\r[%s%s%s]%s' %
                     ('Progress > ', '=' * (done - 1) + '>', ' ' *
                      (50 - done), str(round(prog * 100)) + '%'))
    rmtree('./downloads/')
    print('\n')

    print('Start processing CVE feeds')

    # Create a normal redis connection
    conn = redis.Redis('localhost')

    # Creating a client with a given index name
    client = Client('cveIndex')

    # IndexDefinition is available for RediSearch 2.0+
    definition = IndexDefinition(prefix=['cve:'])

    # Creating the index definition and schema
    try:
        client.create_index((TextField('id'), TextField('description'),
                             TextField('configurations')),
                            definition=definition)
    except:
        # Index already exists. Delete and recreate
        client.drop_index()
        print('Index already exists\nDropping\nDelete keys and try again')
        exit()

    def process_CVE_file(file):
        with open(file, 'r', encoding='utf8') as f:
            json = ujson.decode(f.read())
            cve_items = json['CVE_Items']
            for cve_item in cve_items:
                cve_id = cve_item['cve']['CVE_data_meta']['ID']
                cve_desc = cve_item['cve']['description']['description_data'][
                    0]['value']
                cve_configurations = str(cve_item['configurations']['nodes'])
                # Sanitizing special characters to prevent them from being tokenized away
                cve_desc_sanitized = cve_desc.replace(':', 'cc11').replace(
                    '.', 'pp22').replace('*', 'ss33')
                cve_configurations_sanitized = cve_configurations.replace(
                    ':', 'cc11').replace('.', 'pp22').replace('*', 'ss33')
                # Indexing a document for RediSearch 2.0+
                client.redis.hset('cve:' + cve_id,
                                  mapping={
                                      'id':
                                      cve_id,
                                      'description':
                                      cve_desc_sanitized,
                                      'configurations':
                                      cve_configurations_sanitized
                                  })
            print('Processed ' + file)

    with ThreadPoolExecutor(max_workers=20) as pool:
        futures = []
        for i in range(2002, 2021):
            future = pool.submit(
                process_CVE_file,
                './nvd_data_feeds/nvdcve-1.1-{0}.json'.format(i))
            futures.append(future)
        json_list = [x.result() for x in as_completed(futures)]

    print('Done processing CVE feeds\nProcessing NVD CPE match feed')

    with open('./nvd_data_feeds/nvdcpematch-1.0.json', 'r',
              encoding='utf8') as f:
        json = ujson.decode(f.read())
        matches = json['matches']
        for match in matches:
            rootUri = match['cpe23Uri']
            keyName = rootUri
            if 'versionStartIncluding' in match:
                keyName += ';;versionStartIncluding=' + match[
                    'versionStartIncluding']
            if 'versionStartExcluding' in match:
                keyName += ';;versionStartExcluding=' + match[
                    'versionStartExcluding']
            if 'versionEndIncluding' in match:
                keyName += ';;versionEndIncluding=' + match[
                    'versionEndIncluding']
            if 'versionEndExcluding' in match:
                keyName += ';;versionEndExcluding=' + match[
                    'versionEndExcluding']
            if len(match['cpe_name']) > 0:
                # if CPE list is empty no need to include it in cache
                valueString = ';;'.join(x['cpe23Uri']
                                        for x in match['cpe_name'])
                conn.set(keyName, valueString)

    print('\nAMADEUS is already launched!')
Example #27
import re
import sys
from datetime import datetime
import json
import logging
from random import randint
from time import sleep

from redis import ResponseError
from redisearch import Client, TextField

#stagger reading and indexing for parallel
sleep(randint(1, 10))

logging.basicConfig(filename='parse.log',level=logging.INFO)

client = Client('medline')

try:
        client.create_index([TextField('abstract')])

except ResponseError:
        pass

with open(sys.argv[1], 'r') as f:
        data=f.read()

recs = data.split("<PubmedArticle>");
recs = recs[1:]

indexer = client.batch_indexer(chunk_size=500)

count = 0

for r in recs:
Example #28
from redisearch import Client, TextField

# Creating a client with a given index name
client = Client('myIndex')

# Creating the index definition and schema
client.create_index((TextField('title', weight=5.0), TextField('body')))

# Indexing a document
client.add_document(
    'doc1',
    title='RediSearch',
    body='Redisearch implements a search engine on top of redis')
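
A minimal hedged follow-up (the query string is an assumption): searching the index the snippet above just populated.

# Hedged follow-up: query the freshly indexed document
res = client.search('search engine')
print(res.total, res.docs[0].title)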
Example #29
import time
from redisearch import Client, TextField, NumericField, Query
from redis.exceptions import ResponseError

file = open('test_set_tweets.txt', 'r')
client = Client('Tweets')
client.redis.flushdb()
client.create_index([TextField('tweet'), TextField('timestamp')])
start = time.time()
for x, line in enumerate(file.readlines()):
    content = line.strip().split('\t')
    try:
        if len(content) == 4:  # has a date
            client.add_document('-'.join(content[:2]),
                                tweet=content[-2],
                                timestamp=content[-1])
        else:
            client.add_document('-'.join(content[:2]),
                                tweet=content[-1],
                                timestamp='')
    except ResponseError:
        pass
    if x % 1000 == 0:
        print(x, 'lines indexed...')

end = time.time()
print("Indexing time elapsed", end - start)

total = 0
for i in range(30):
    start = time.time()
Example #30
class BaseSearchHandler(BaseSearchHandlerSupport):
    def __init__(self):
        super().__init__()
        self._entity = None
        # This will only be here as an example
        self.is_replacement = False
        self.current_replacement = None
        self.is_set_entity = False

        # Subs are all of the subfields we would need to search through
        self.subs: Dict[str, BaseSearchHandler] = {}

        self.current_doc_id = None
        self.current_doc_id_list = set()
        self.current_client = None

        self.print_sub = False
        self.use_sub_query = False

        self._super_ids = []
        self._sub_ids = []
        self.finished_alter = False

        self.search_sub = False
        self._processor: Optional[Processor] = None

    def __setitem__(self, key: str, value: Any):
        if key not in self.requirements.keys() and (not self.is_replacement):
            return
        self.is_set_entity = False
        self.current_client = None
        if isinstance(value, dict):
            if len(value) == 0:
                return
            self.handle_input_dict_key(key, value)
        else:
            _instance_type = type(value)
            # check that the value is the right type
            if is_generic(_instance_type):
                _str_type = to_str(_instance_type)
                self.query_builder.insert_by_type_str(_str_type, key, value)
                self.insert_builder.insert_by_type_str(_str_type, key, value)

    @property
    def replacement(self):
        if self.current_replacement is None:
            self.current_replacement = BaseSearchHandler()
            self.current_replacement.is_replacement = True
            self.current_replacement.insert_builder.is_replacement = True
            # self.current_replacement.
        return self.current_replacement

    @property
    def entity(self):
        if self._entity is None:
            raise AttributeError("You haven't set entity yet")
        return self._entity

    @entity.setter
    def entity(self, _entity: str):
        self._entity = _entity

    @property
    def processor(self):
        if self._processor is None:
            raise AttributeError("The processor hasn't been set yet.")
        return self._processor

    @processor.setter
    def processor(self, _processor: Processor):
        self._processor = _processor
        self.set_sub_processors()

    @property
    def requirements(self):
        return self._requirements_str

    @requirements.setter
    def requirements(self, _requirements: dict):
        """If we set it here we'd go through each dict item and create string version of each key"""
        # Document id will allow us to figure out which documents are involved with subkeys

        _requirements["entity"] = str
        _requirements["super_id"] = str
        self.process_requirements(_requirements)
        if not self.is_sub_key:
            self.create_sub_handlers()

    @property
    def dreq(self):
        return self._dreq

    @dreq.setter
    def dreq(self, _req):
        self._dreq = _req
        self.reset()
        self.requirements = _req
        self.replacement.requirements = _req

    @property
    def allrequirements(self):
        return self._dreq

    @allrequirements.setter
    def allrequirements(self, _req):
        self._dreq = _req
        self.reset()
        self.requirements = _req
        self.replacement.requirements = _req

    @property
    def doc_id(self):
        """ We get the current doc_id if it exists"""

        return self.current_doc_id

    @doc_id.setter
    def doc_id(self, _doc_id: str):
        self.current_doc_id = _doc_id

    @property
    def verbatim(self):
        return self.query_builder.build_exact()

    @property
    def client(self):
        """client

        Get the client for the user. If it doesn't exist yet, create a new one with the given stop words. 
        
        For subkeys it adds fields if we've created them recently. 

        .. code-block:: python

            >>> self.client.add_document(_id, payload, **records)


        Returns
        -------
        [type]
            A redis connected client. Gets connection from Jamboree processor.
        """
        # self.processor
        if self.current_client is None:
            # We would insert a connection here. Use the connection from the search processor to operate.
            with logger.catch(ResponseError):
                self.current_client = Client(self.index,
                                             conn=self.processor.rconn)
                if len(self.indexable) > 0:
                    self.current_client.create_index(
                        self.indexable,
                        stopwords=[
                            "but", "there", "these", "they", "this", "to"
                        ],
                    )

        if self.is_sub_key:
            if not self.finished_alter:
                for i in self.indexable:
                    with suppress(ResponseError):
                        self.current_client.alter_schema_add([i])
                self.finished_alter = True

        return self.current_client

    @property
    def general(self):
        return self.query_builder.general

    @general.setter
    def general(self, term: str):
        """ Push a general term into the query. It can only be done once. Don't put it to a filter key."""
        if not isinstance(term, str):
            logger.error("Term isn't a string")
            return
        self.query_builder.general = term

    """
        This is when things get weird
    """

    def create_sub_handlers(self):
        """ Creates subhandlers for the given index"""
        for name, subkey in self.subfields.items():
            subhandler = BaseSearchHandler()
            subhandler.is_sub_key = True
            subhandler.index = subkey
            subhandler.insert_builder.is_sub = True

            self.replacement.subs[name] = copy(subhandler)
            self.subs[name] = subhandler

    def set_sub_processors(self):
        """ If there are any sub queries, set processors to them """
        if len(self.subfields) > 0:
            self.use_sub_query = True
            for name in self.subfields.keys():
                self.subs[name].processor = self.processor
                with suppress(Exception):
                    self.replacement.subs[name].processor = self.processor

    def set_entity(self):
        if self.is_set_entity is False:
            self["entity"] = {
                "type": "TEXT",
                "is_filter": True,
                "values": {
                    "is_exact": True,
                    "term": self.entity
                },
            }
            self.is_set_entity = True

    def verbatim_docs(self):
        built = self.query_builder.build_exact()
        q = Query(built).no_stopwords().paging(0, 1000000)
        results = self.client.search(q)
        result_docs = results.docs
        return result_docs

    def general_docs(self):
        built = self.query_builder.build()
        q = Query(built).paging(0, 1000000)
        results = self.client.search(q)
        result_docs = results.docs
        return result_docs

    def verbatim_sub_ids(self):
        super_id_set = set()
        sub_id_set = set()

        for key, sub in self.subs.items():
            sub.print_sub = True

            built = sub.query_builder.build()
            # logger.warning(built)
            built = built.strip(" ")
            is_falsy = not built
            if is_falsy:
                continue
            # logger.error(built)
            verb_items = sub.general_docs()
            current_super_ids = []
            current_subs = []
            for verb in verb_items:
                try:
                    _verb_id = verb.id
                    _super_id = verb.super_id
                    full_dict = verb.__dict__

                    self.keystore.add(_super_id, key, full_dict)
                    current_subs.append(_verb_id)
                    current_super_ids.append(_super_id)
                except Exception as e:
                    logger.error(str(e))

            if len(current_super_ids) > 0:
                if len(super_id_set) == 0:
                    super_id_set.update(current_super_ids)
                else:
                    super_id_set = super_id_set.intersection(current_super_ids)

            sub_id_set.update(current_subs)

        return list(super_id_set), list(sub_id_set)

    def verbatim_doc_ids(self):
        q = Query(self.verbatim).no_content().paging(0, 1000000)
        results = self.client.search(q)
        ids = [res.id for res in results.docs]
        return ids

    def handle_input_dict_key(self, name: str, item: dict):
        """ Figures out where to put the input dictionary for the query """
        if self.is_sub(name):
            # If this is a subkey we'll run the same operation again
            # Check to see if the subkey is empty and has information that is reducible to "type"
            self.use_sub_query = True
            self.search_sub = True
            reqs = self.loaded_dict_to_requirements(item)
            # logger.debug(reqs)
            self.subs[name].requirements = reqs
            for k, v in item.items():
                self.subs[name][k] = v
        else:
            # If it's not queryable don't try adding anything
            if not is_queryable_dict(item):
                return
            self.insert_builder.from_dict(name, item)
            self.query_builder.from_dict(name, item)

    def normal_find(self, limit_ids=None):
        built = self.query_builder.build()
        q = Query(built).paging(0, 1000000)
        if limit_ids is not None and len(limit_ids) > 0:
            q.limit_ids(*limit_ids)

        results = self.client.search(q)
        result_docs = results.docs
        return result_docs

    def normal_find_ids(self, limit_ids=None):
        _query = self.query_builder.build()
        q = Query(_query).no_content().paging(0, 1000000)
        if limit_ids is not None and len(limit_ids) > 0:
            q.limit_ids(*limit_ids)
        results = self.client.search(q)
        result_docs = results.docs
        return [res.id for res in result_docs]

    def sub_find(self):
        sup_ids, sub_ids = self.verbatim_sub_ids()
        if len(sub_ids) == 0:
            return []
        results = self.normal_find(limit_ids=sup_ids)
        results_dicts = []
        for result in results:
            _id, idict = split_doc(result)

            idict.pop("payload", None)
            subitems = self.keystore.get(_id)
            idict.update(subitems)
            results_dicts.append(idict)
        return results_dicts

    def normal_insert(self, allow_duplicates=False):
        if not allow_duplicates:
            verbatim_docs = self.verbatim_docs()
            if len(verbatim_docs) > 0:
                # Not adding docs because we're not allowing duplicates
                return verbatim_docs[0].id, False
        insert_variables = self.insert_builder.build()
        _doc_id = self.insert_builder.doc_id
        index_name = self.client.index_name
        fields = [i.redis_args()[0] for i in self.indexable]
        with logger.catch(message=f"{index_name} - {fields}", reraise=True):
            self.client.add_document(_doc_id,
                                     payload=_doc_id,
                                     **insert_variables)

        return _doc_id, True

    def sub_insert(self, allow_duplicates=False):
        _super_id, _did_insert = self.normal_insert(
            allow_duplicates=allow_duplicates)
        # logger.info(f'Did insert: {_did_insert}')
        if _did_insert:
            for key, sub in self.subs.items():
                if len(sub.insert_builder._insert_dict) > 0:
                    sub.insert_builder.super_id = _super_id
                    sub.normal_insert(allow_duplicates=True)
        return _super_id

    def find_sub_dictionaries(self, super_id):
        """ Finds a subdictionary by superid inside of the database. """
        # Should use the find within function for every subkey
        mega_dict = ADict()
        for key, sub in self.subs.items():
            key_dict = ADict()
            try:
                res = sub.client.search(f'"{super_id}"')
                if res.total == 0:
                    continue
                dd = [dictify(doc, False) for doc in res.docs]
                key_dict[key] = dd[0]
            except ResponseError:
                pass
            mega_dict.update(key_dict)
        return mega_dict

    def find(self):
        """Given the items we've set, find all matching items"""

        self.set_entity()
        self.keystore.reset()
        if self.use_sub_query and self.search_sub:
            return self.sub_find()
        normal = self.normal_find()
        if len(self.subs) == 0:
            if len(normal) > 0:
                return [doc_convert(x) for x in normal]
            return normal
        ndicts = []
        for i in normal:
            _i = dictify(i)
            mega = self.find_sub_dictionaries(_i.id)
            if len(mega) > 0:
                _i.update(mega.to_dict())
            ndicts.append(_i)
        return ndicts

    def pick(self, _id: str):
        """ 
            Given an id find the element with the top level id. We aren't searching lower level_ids. 
            
            After we pull all of the 
        """
        self.set_entity()
        self.keystore.reset()
        doc = self.client.load_document(_id)
        dd = doc.__dict__
        doc = ADict(**dd)
        _id = doc.pop("id", None)
        doc.pop("payload", None)
        doc_z = len(doc) > 0
        if len(self.subs) == 0:
            if not doc_z:
                return None
            doc.update({"id": _id})
            return doc

        if doc_z:
            sub_dicts = self.find_sub_dictionaries(_id)
            # if len(sub_dicts) > 0:
            doc.update(sub_dicts)
            doc.update({"id": _id})
            return doc

        return None

    def update(self):
        """
            # UPDATE

            Given the items or ID we've set, partially update every matching document.
            If we already have the document ids, replace those items.
        """
        self.set_entity()
        self.keystore.reset()

        replacement_variables = self.replacement.insert_builder.build()
        if not self.use_sub_query:
            doc_ids = self.verbatim_doc_ids()
            batcher = self.client.batch_indexer(chunk_size=len(doc_ids))
            for doc_id in doc_ids:
                batcher.add_document(doc_id,
                                     replace=True,
                                     partial=True,
                                     **replacement_variables)
            batcher.commit()
        else:
            sup_ids, sub_ids = self.verbatim_sub_ids()
            norm_ids = self.normal_find_ids(limit_ids=sup_ids)
            batcher = self.client.batch_indexer(chunk_size=len(norm_ids))
            for doc_id in norm_ids:
                batcher.add_document(doc_id,
                                     replace=True,
                                     partial=True,
                                     **replacement_variables)
            batcher.commit()

            for sub in self.subs.values():
                subreplacement = sub.insert_builder.build()
                if len(subreplacement) > 0:
                    subbatcher = sub.client.batch_indexer(
                        chunk_size=len(sub_ids))
                    for _id in sub_ids:
                        # Route the partial update through the sub-index
                        # batcher so it lands in the sub index, not the parent
                        subbatcher.add_document(_id,
                                                replace=True,
                                                partial=True,
                                                **subreplacement)
                    subbatcher.commit()

    def update_id(self, _id):
        self.set_entity()
        self.keystore.reset()
        doc = self.client.load_document(_id)
        doc_dict, is_exist = single_doc_check_convert(doc)

        if not is_exist:
            return

        replacement_variables = self.replacement.insert_builder.build()
        self.client.add_document(_id,
                                 replace=True,
                                 partial=True,
                                 **replacement_variables)
        doc = self.client.load_document(_id)
        # if len(self.subs) > 0:
        #     subreplacement = sub.insert_builder.build()

    # def insert_many(self, list_of_items):
    #     self.client.ba
    #     pass

    def insert(self, allow_duplicates=False):
        """
            # INSERT

            Given all of the items we've set, add documents
        """
        self.set_entity()
        self.keystore.reset()
        previous_id = None
        if self.use_sub_query:
            previous_id = self.sub_insert(allow_duplicates=allow_duplicates)
        else:
            previous_id, _ = self.normal_insert(
                allow_duplicates=allow_duplicates)
        return previous_id

    def remove(self):
        """Remove all documents that match a query.

        Given a query, remove every document that matches the results of that query. 

        ::
            >>> search['name'] = 'sample_name'
            >>> search['category'] = 'sample_query'
            >>> search.remove() 


        """
        self.set_entity()
        self.keystore.reset()
        if self.use_sub_query and self.search_sub:
            sup_ids, sub_ids = self.verbatim_sub_ids()
            norm_ids = self.normal_find_ids(limit_ids=sup_ids)
            # Build the removable set from the intersection of the two result
            # sets; starting from an empty set would always intersect to empty
            removable = set(sup_ids).intersection(norm_ids)

            for _id in removable:
                self.client.delete_document(_id)
            for sub in self.subs.values():
                for _id in sub_ids:
                    sub.client.delete_document(_id)
        else:
            norm_ids = self.normal_find_ids()
            for _id in norm_ids:
                self.client.delete_document(_id)

    def reset(self):
        """Reset all local variables"""
        self.reset_builders()
        self.is_set_entity = False
        self.is_replacement = False
        self.current_replacement = None
        self.current_client = None
        self.use_sub_query = False
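
As a rough usage sketch of the wrapper above (mirroring the pattern in the remove() docstring), assuming `search` is an already-constructed instance of this class with 'name' and 'category' as indexed TEXT fields:

# Hypothetical usage; `search` and its field names are assumptions, not part
# of the original example.
search['name'] = 'sample_name'
search['category'] = 'sample_query'

doc_id = search.insert(allow_duplicates=False)  # add the document, or reuse an existing duplicate
matches = search.find()                         # every document matching the fields set above
picked = search.pick(doc_id)                    # fetch one document by its top-level id
search.reset()                                  # clear the local query state afterwards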
Example No. 31
import csv

from redisearch import (AutoCompleter, Client, IndexDefinition, NumericField,
                        Suggestion, TagField, TextField)


def load_data(redis_server, redis_port, redis_password):
    # Client for the versioned index; an alias is added at the end
    load_client = Client(
        'fortune500-v1',
        host=redis_server,
        password=redis_password,
        port=redis_port
    )
    # Autocomplete suggestions are stored under the 'ac' key
    load_ac = AutoCompleter('ac', conn=load_client.redis)

    definition = IndexDefinition(
        prefix=['fortune500:'],
        language='English',
        score_field='title',
        score=0.5
    )
    load_client.create_index(
        (
            TextField('title', weight=5.0),
            TextField('website'),
            TextField('company'),
            NumericField('employees', sortable=True),
            TextField('industry', sortable=True),
            TextField('sector', sortable=True),
            TextField('hqcity', sortable=True),
            TextField('hqstate', sortable=True),
            TextField('ceo'),
            TextField('ceoTitle'),
            NumericField('rank', sortable=True),
            NumericField('assets', sortable=True),
            NumericField('revenues', sortable=True),
            NumericField('profits', sortable=True),
            NumericField('equity', sortable=True),
            TagField('tags'),
            TextField('ticker')
        ),
        definition=definition)

    with open('./fortune500.csv', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            # Skip the header row, then index each company as a hash
            if line_count > 0:
                load_ac.add_suggestions(Suggestion(row[1].replace('"', ''), 1.0))
                load_client.redis.hset(
                    "fortune500:%s" % (row[1].replace(" ", '')),
                    mapping={
                        'title': row[1],
                        'company': row[1],
                        'rank': row[0],
                        'website': row[2],
                        'employees': row[3],
                        'sector': row[4],
                        'tags': ",".join(row[4].replace('&', '').replace(',', '').replace('  ', ' ').split()).lower(),
                        'industry': row[5],
                        'hqcity': row[8],
                        'hqstate': row[9],
                        'ceo': row[12],
                        'ceoTitle': row[13],
                        'ticker': row[15],
                        'revenues': row[17],
                        'profits': row[19],
                        'assets': row[21],
                        'equity': row[22]
                    })
            line_count += 1
    # Finally, create the alias so queries can use the stable name
    load_client.aliasadd("fortune500")