Example #1
def modify_index(index_id, sql, queue, values = None, retries = 0):
  '''
  Either writes to the index directly or queues the statement
  for asynchronous execution by storing it in Redis.
  If either searchd or Redis is unresponsive, up to MAX_RETRIES attempts
  are made to hand the request over to the alternative backend.
  '''
  if retries > settings.MAX_RETRIES:
    return E(message = 'Maximum retries %d exceeded' % settings.MAX_RETRIES)
  queue_action = None
  if sql.startswith('INSERT'):
    queue_action = 'insert'
  elif sql.startswith('UPDATE'):
    queue_action = 'update'
  elif sql.startswith('DELETE'):
    queue_action = 'delete'
  response = None
  cache = Cache()
  if not queue:
    try:
      c = connections['sphinx:' + str(index_id)]
      cursor = c.cursor()
      if queue_action == 'delete':
        cursor.execute(sql)
      elif queue_action == 'update':
        cursor.execute(sql, values)
      elif queue_action == 'insert':
        cursor.executemany(sql, values)
      cache.dirty(index_id)
      response = { 'searchd' : 'ok' }
    except Exception as e:
      # searchd is unreachable: fall back to queueing the statement in Redis
      response = modify_index(index_id, sql, True, values, retries + 1)
  else:
    try:
      rkey = rqueue(queue_action, index_id, sql, values)
      response = { 'redis' : rkey }
    except Exception as e:
      response = modify_index(index_id, sql, False, values, retries + 1)
  return response
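
A minimal usage sketch (the index id, SQL, and values below are illustrative; it assumes the 'sphinx:<index_id>' database alias and the rqueue helper are configured as in the code above):

# insert one document into index 3 directly, falling back to the Redis queue
# if searchd is unreachable; values must be a list of tuples for executemany
sql = 'INSERT INTO documents (id, title, content) VALUES (%s, %s, %s)'
values = [ (838393, 'a title', 'a document with lots of text') ]
response = modify_index(3, sql, queue = False, values = values)
# expected result: { 'searchd' : 'ok' } on success, or { 'redis' : '<queue key>' } after fallback
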
Example #2
def fetch_index_name(index_id):
  ''' Fetch index name by id '''
  try:
    c = Cache()
    index_name = None
    if not c.exists('structures:indexes'):
      # warm the cache with all known indexes while looking up the requested one
      for index in Index.objects.all():
        if index_id == index.id:
          index_name = index.name
        c.hset('structures:indexes', str(index.id), index.name, True)
      if index_name is None:
        return E(message = 'No such index')
    else:
      indexes = c.hget('structures:indexes')
      index_id = str(index_id)
      if index_id in indexes:
        index_name = indexes[index_id]
      else:
        return E(message = 'No such index')
    return index_name
  except Exception as e:
    return E(message = 'Error while retrieving index')
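
For reference, a sketch of the lookup this function performs (index ids and names are illustrative; it assumes the Cache wrapper keeps a Redis hash under 'structures:indexes'):

# Redis hash after the warm-up pass, e.g. structures:indexes = { '1' : 'documents', '2' : 'products' }
name = fetch_index_name(1)   # 'documents' on a hit, or an E(...) error response otherwise
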
Example #3
def excerpts(request, index_id):
  '''
  Returns highlighted snippets
  Caches responses in Redis
  '''
  cache = Cache()
  index_id = int(index_id)
  index = fetch_index_name(index_id)
  r = request_data(request)
  cache_key = md5(index + json.dumps(r)).hexdigest()
  lock_key = 'lock:' + cache_key
  version = cache.version(index_id)
  cache_key = 'cache:excerpts:%s:%d:%s' % (cache_key, index_id, version)
  if 'docs' not in r:
    return R({})
  if settings.EXCERPTS_CACHE:
    try:
      response = cache.get(cache_key)
      if response is not None:
        return R(response, request, code = 200, serialize = False)
      ''' lock this key for re-caching '''
      start = time.time()
      lock = cache.get(lock_key)
      while lock is not None:
        if (time.time() - start) > settings.CACHE_LOCK_TIMEOUT:
          return E(message = 'Cache lock wait timeout exceeded')
        time.sleep(0.1) # back off briefly instead of hammering Redis
        lock = cache.get(lock_key)
      ''' check if key now exists in cache '''
      response = cache.get(cache_key)
      if response is not None:
        return R(response, request, code = 200, serialize = False)
      ''' otherwise acquire lock for this session '''
      cache.set(lock_key, 1, True, settings.CACHE_LOCK_TIMEOUT) # expires after CACHE_LOCK_TIMEOUT seconds
    except:
      return E(message = 'Error while examining excerpts cache')    

  options = {
      "before_match"      : '<b>',
      "after_match"       : '</b>',
      "chunk_separator"   : '...',
      "limit"             : 256,
      "around"            : 5,    
      "exact_phrase"      : False,
      "use_boundaries"    : False,
      "query_mode"        : True,
      "weight_order"      : False,
      "force_all_words"   : False,
      "limit_passages"    : 0,
      "limit_words"       : 0,
      "start_passage_id"  : 1,
      "html_strip_mode"   : 'index',
      "allow_empty"       : False,
      "passage_boundary"  : 'paragraph',
      "emit_zones"        : False
  }
  for k, v in options.iteritems():
    if k in r:
      # check bool before int: bool is a subclass of int in Python
      if isinstance(v, bool):
        options[k] = bool(r[k])
      elif isinstance(v, int):
        options[k] = int(r[k])
      else:
        options[k] = r[k]
  if 'ttl' in r:      
    cache_expiration = int(r['ttl'])
  else:
    cache_expiration = settings.EXCERPTS_CACHE_EXPIRE
  if isinstance(r['docs'], dict):
    document_ids = r['docs'].keys()
    documents = r['docs'].values()
  elif isinstance(r['docs'], list):
    document_ids = range(len(r['docs'])) # get a list of numeric indexes from the list
    documents = r['docs']
  else:
    return E(message = 'Documents are passed as a list or dictionary structure')
  del r['docs'] # free up some memory
  '''
  docs = { 838393 : 'a document with lots of text', 119996 : 'another document with text' }
  '''
  ci = ConfigurationIndex.objects.filter(sp_index_id = index_id)[0]
  searchd_id = ConfigurationSearchd.objects.filter(sp_configuration_id = ci.sp_configuration_id)[0].sp_searchd_id
  ''' TODO: convert hard coded option ids to constants '''
  so = SearchdOption.objects.filter(sp_searchd_id = searchd_id, sp_option_id = 138,).exclude(value__endswith = ':mysql41')
  sphinx_port = int(so[0].value)
  try:
    so = SearchdOption.objects.filter(sp_searchd_id = searchd_id, sp_option_id = 188,)
    if so:
      sphinx_host = so[0].value
    else:
      sphinx_host = 'localhost'
  except:
    sphinx_host = 'localhost'
  try:
    cl = SphinxClient()
    cl.SetServer(host = sphinx_host, port = sphinx_port)
    excerpts = cl.BuildExcerpts( documents, index, r['q'], options )
    del documents
    if not excerpts:
      return E(message = 'Sphinx Excerpts Error: ' + cl.GetLastError())
    else:      
      if settings.EXCERPTS_CACHE:
        cache.set(cache_key, excerpts, True, cache_expiration, lock_key)
      excerpts = { 
        'excerpts' : dict(zip(document_ids, excerpts)), 
        'cache-key' : cache_key,        
        }
      return R(json.dumps(excerpts), request)
  except Exception as e:
    return E(message = 'Error while building excerpts ' + str(e))
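
A sketch of the request payload this view expects, assuming request_data() returns the decoded request body as a dict; any key matching the options dictionary above overrides the corresponding default:

payload = {
  'q'      : 'lots of text',
  'docs'   : { '838393' : 'a document with lots of text',
               '119996' : 'another document with text' },
  'ttl'    : 300,    # per-request override of EXCERPTS_CACHE_EXPIRE
  'around' : 3,      # example of overriding a default excerpt option
}
# the JSON response wraps { 'excerpts' : { '838393' : '... <b>lots of text</b> ...', ... },
#                           'cache-key' : 'cache:excerpts:<md5>:<index_id>:<version>' }
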
Example #4
def search(request, index_id):
  ''' Search wrapper with SphinxQL '''
  cache = Cache()
  index_id = int(index_id)
  index = fetch_index_name(index_id)
  r = request_data(request)
  if settings.SEARCH_CACHE:
    cache_key = md5(index + request.REQUEST['data']).hexdigest()
    lock_key = 'lock:' + cache_key
    version = cache.version(index_id)
    cache_key = 'cache:search:%s:%d:%s' % (cache_key, index_id, version)
    try:
      response = cache.get(cache_key)
      if response is not None:
        return R(response, 200, False)
      else:
        ''' lock this key for re-caching '''
        start = time.time()
        lock = cache.get(lock_key)
        while lock is not None:
          if (time.time() - start) > settings.CACHE_LOCK_TIMEOUT:
            return E(message = 'Cache lock wait timeout exceeded')
          time.sleep(0.1) # back off briefly instead of hammering Redis
          lock = cache.get(lock_key)
        ''' check if key now exists in cache '''
        response = cache.get(cache_key)
        if response is not None:
          return R(response, 200, False)
        ''' otherwise acquire lock for this session '''
        cache.set(lock_key, 1, True, settings.CACHE_LOCK_TIMEOUT) # expires after CACHE_LOCK_TIMEOUT seconds
    except:
      pass    
  
  option_mapping = {
    'mode' : {
        'extended' : SPH_MATCH_EXTENDED2,
        'boolean'  : SPH_MATCH_BOOLEAN,
        'all'      : SPH_MATCH_ALL,
        'phrase'   : SPH_MATCH_PHRASE,
        'fullscan' : SPH_MATCH_FULLSCAN,
        'any'      : SPH_MATCH_ANY,
      }
  }
  options = {
      'sortby'      : '',
      'mode'        : 'extended',
      'groupby'     : '',
      'groupsort'   : '',
      'offset'      : 0,
      'limit'       : 1000,
      'max_matches' : 0,
      'cutoff'      : 0,
      'fields'      : '*',
    }
  
  sphinxql_list_options = {
    'ranker' : [ 'proximity_bm25', 'bm25', 'none', 'wordcount', 'proximity',
                 'matchany', 'fieldmask', 'sph04', 'expr', 'export' ],
    'idf' : [ 'normalized', 'plain'],
    'sort_method'  : ['pq', 'kbuffer' ]
  }
  sphinxql_options = { 
    'agent_query_timeout' : 10000,
    'boolean_simplify' : 0,
    'comment' : '',
    'cutoff'  : 0,
    'field_weights' : '',
    'global_idf' : '',
    'idf' : 'normalized',
    'index_weights'  : '',
    'max_matches' : 10000,
    'max_query_time' : 10000,
    'ranker' : 'proximity_bm25',
    'retry_count' : 2,
    'retry_delay' : 100,
    'reverse_scan' : 0,
    'sort_method'  : 'pq'
  }
  order_direction = {
    '-1'   : 'DESC',
    'DESC' : 'DESC',
    '1'    : 'ASC',
    'ASC'  : 'ASC',
  }

  try:
    ''' Check attributes from request with stored options (sp_index_option) '''
    ''' Preload host and ports per index '''
    '''
    SELECT
    select_expr [, select_expr ...]
    FROM index [, index2 ...]
    [WHERE where_condition]
    [GROUP BY {col_name | expr_alias}]
    [WITHIN GROUP ORDER BY {col_name | expr_alias} {ASC | DESC}]
    [ORDER BY {col_name | expr_alias} {ASC | DESC} [, ...]]
    [LIMIT [offset,] row_count]
    [OPTION opt_name = opt_value [, ...]]
    '''
    sql_sequence = [ ('SELECT', 'fields'), ('FROM', 'indexes'), ('WHERE', 'where'), 
                     ('GROUP BY', 'group_by'), ('WITHIN GROUP ORDER BY', 'order_within_group'), 
                     ('ORDER BY', 'order_by'), ('LIMIT', 'limit'), ('OPTION', 'option') ]
    sql = {}
    for sql_clause, key in sql_sequence:
      sql[key] = ''
      if not key in r:
        r[key] = ''
    # r['indexes'] defaults to '' above; only extend the FROM list when extra indexes are passed
    if isinstance(r['indexes'], list) and r['indexes']:
      sql['indexes'] = ',' . join([ index ] + r['indexes'])
    else:
      sql['indexes'] = index
    if isinstance(r['fields'], list):
      sql['fields'] = ',' . join(r['fields'])
    else:
      sql['fields'] = options['fields']
    if r['group_by'] != '':
      sql['group_by'] = r['group_by']
    if not isinstance(r['limit'], dict):
      r['limit'] = { 'offset' : '0', 'count' : options['limit'] }
    sql['limit'] = '%(offset)s, %(count)s' % r['limit']
    if r['order_by'] != '':
      sql['order_by'] = ',' . join([ '%s %s' % (order[0], order_direction[str(order[1]).upper()]) for order in r['order_by'] ])
    if r['order_within_group'] != '':
      sql['order_within_group'] = ',' . join([ '%s %s' % (order[0], order_direction[str(order[1]).upper()]) for order in r['order_within_group'] ])
    sql['where'] = [] #dictionary e.g. { 'date_from' : [[ '>' , 13445454350] ] } 
    value_list = []
    if isinstance(r['where'], dict):
      for field, conditions in r['where'].iteritems():
        for condition in conditions:
          operator, value = condition
          value_list.append(value)
          sql['where'].append('%s%s%%s' % (field, operator,))
    value_list.append(r['q'])
    sql['where'].append('MATCH(%s)')
    sql['where'] = ' AND ' . join(sql['where'])
    if isinstance(r['option'], dict):
      sql['option'] = []
      for option_name, option_value in r['option'].iteritems():
        # named-group values (dicts) are rendered as (k1 = v1, ...); scalar options pass through as-is
        if isinstance(option_value, dict):
          option_value = '(' + (',' . join([ '%s = %s' % (k, option_value[k]) for k in option_value.keys() ])) + ')'
        sql['option'].append('%s = %s' % (option_name, option_value))
      sql['option'] = ',' . join(sql['option'])
    response = { 'results' : None, 'meta' : None }
    try:    
      cursor = connections['sphinx:' + index].cursor()
      sql =  ' ' . join([ clause[0] + ' ' + sql[clause[1]] for clause in sql_sequence if sql[clause[1]] != '' ]) 
      cursor.execute(sql, value_list)
      response['results'] = cursorfetchall(cursor)
    except Exception as e:
      error_message = 'Sphinx Search Query failed with error "%s"' % str(e)
      return E(message = error_message)
    try:
      cursor.execute('SHOW META')
      response['meta'] = cursorfetchall(cursor)
    except:
      pass
    if settings.SEARCH_CACHE:
      cache.set(cache_key, response, True, settings.SEARCH_CACHE_EXPIRE, lock_key)
  except Exception as e:
    return E(message = str(e))
  return R(response)
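
A sketch of a search request and the SphinxQL it is assembled into (field names and values are illustrative; the %s placeholders are bound from value_list by the cursor):

payload = {
  'q'        : 'hello world',
  'fields'   : [ 'id', 'title' ],
  'where'    : { 'date_from' : [ [ '>', 13445454350 ] ] },
  'order_by' : [ [ 'date_from', 'DESC' ] ],
  'limit'    : { 'offset' : '0', 'count' : 20 },
  'option'   : { 'ranker' : 'bm25', 'max_query_time' : 5000 },
}
# roughly produces:
# SELECT id,title FROM <index> WHERE date_from>%s AND MATCH(%s)
# ORDER BY date_from DESC LIMIT 0, 20 OPTION ranker = bm25,max_query_time = 5000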