from elasticsearch import Elasticsearch
from scipy.sparse import coo_matrix
import numpy as np
import networkx as nx
import community  # python-louvain


def text_similarity_scores(index, doc_type, **kwargs):
    """
    Return a sparse matrix whose (i, j) entry is the similarity score of
    document i and document j of /index_name/doc_type.
    """
    _build_default(kwargs)  # fills in default mlt parameters; defined elsewhere in this module
    es = Elasticsearch()
    # total number of documents in the index/type
    n = es.search(index=index,
                  doc_type=doc_type,
                  body={"query": {"match_all": {}}},
                  search_type="count").get('hits').get('total', 0)
    I = []
    J = []
    V = []
    # assumes document _ids are the integers 0 .. n-1
    for i in range(n):
        rv = es.mlt(index, doc_type, id=i, **kwargs)
        results = rv['hits']['hits']
        if len(results) > 0:
            # keep only the single best match for document i
            j = int(results[0]['_id'])
            score = results[0]['_score']
            I.append(i)
            J.append(j)
            V.append(score)
    return coo_matrix((V, (I, J)), shape=(n, n))
def text_similarity_score_by_id(index, doc_type, id, **kwargs):
    """
    Find the document most like the given id.

    Return: (id, score) if any match is found; otherwise, (-1, -np.inf).
    """
    _build_default(kwargs)
    es = Elasticsearch()
    rv = es.mlt(index=index, doc_type=doc_type, id=id, **kwargs)
    results = rv['hits']['hits']
    if len(results) > 0:
        j = int(results[0]['_id'])
        score = results[0]['_score']
        return (j, score)
    else:
        return (-1, -np.inf)
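# The two helpers above call es.mlt(), which wraps the _mlt endpoint that was
# removed in Elasticsearch 2.x (and later dropped from the Python client).
# The sketch below is an assumed equivalent using a more_like_this query
# against the regular _search endpoint; the "text" field name is a placeholder.
def most_like_this_search(es, index, doc_type, doc_id, fields=("text",)):
    body = {
        "query": {
            "more_like_this": {
                "fields": list(fields),
                "like": [{"_index": index, "_type": doc_type, "_id": doc_id}],
                "min_term_freq": 1,
                "min_doc_freq": 1,
            }
        },
        "size": 1,
    }
    rv = es.search(index=index, doc_type=doc_type, body=body)
    hits = rv['hits']['hits']
    if hits:
        return (int(hits[0]['_id']), hits[0]['_score'])
    return (-1, -np.inf)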
import os

import pandas as pd
import scipy.io as sio
from elasticsearch import Elasticsearch


class UMF_Analyzer:

    def __init__(self):
        self.es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
        self.scheme = ['bm25', 'ib', 'lmd', 'lmj', 'ngram', 'tfidf', 'dfr']
        self.umf_query = 'umf_query'
        self.umf_document = 'umf_document'
        self.docMap = pd.read_csv(open('doc_map.csv'), sep='\t', index_col=False)

    # Save vector as mat file
    def save_mat(self, filename, vector):
        v = vector.as_matrix()
        sio.savemat(filename, {'data': v})

    # Build similarity vectors between all pairs of users
    def build_similarity_vector(self, directory):
        labels = []
        ids = []
        for filename in os.listdir(directory):
            labels.append(filename.split('_')[3].split('.')[0])
            ids.append(filename.split('.')[0])

        qVector = {}
        for s in self.scheme:
            qVector[s] = pd.DataFrame()
        # document similarity vector, built alongside the query vectors
        dVector = pd.DataFrame()

        # Build Query and Document Similarity Vectors
        for filename1 in os.listdir(directory):
            data1 = pd.read_csv(open(directory + '/' + filename1), sep='\t',
                                names=['query', 'document', 'time'])
            pivot_id = filename1.split('.')[0]
            v = {}
            for s in self.scheme:
                v[s] = []
            dv = []
            for filename2 in os.listdir(directory):
                data2 = pd.read_csv(open(directory + '/' + filename2), sep='\t',
                                    names=['query', 'document', 'time'])
                qSim = self.calculate_query_similarities(data1['query'], data2['query'])
                dSim = self.calculate_document_similarities(data1['document'], data2['document'])
                for s in self.scheme:
                    v[s].append(qSim[s])
                dv.append(dSim)

            for s in self.scheme:
                temp = {}
                for idx, l in enumerate(ids):
                    temp[l] = v[s][idx]
                a = pd.DataFrame(temp, index=[pivot_id], columns=ids)
                qVector[s] = qVector[s].append(a)

            temp = {}
            for idx, l in enumerate(ids):
                temp[l] = dv[idx]
            dVector = dVector.append(pd.DataFrame(temp, index=[pivot_id], columns=ids))

        return qVector, dVector

    # remove duplicates
    def remove_duplicates(self, queries):
        return set(queries)

    # Query refinement: strip quoting and list punctuation from a raw query string
    def query_refine(self, q):
        q = q.replace('\'', '')
        q = q.replace(']', '')
        q = q.replace('[', '')
        q = q.replace(',', '')
        q = q.replace('\"', '')
        return q

    def display_analysis(self):
        print "############Query Inner Similarity#############"
        scores = self.inner_similarity_query('data')
        for i in range(9):
            print "######################Q", i + 1, "#################"
            for s in self.scheme:
                print s, ":", scores[i][s]
        print "############Document Inner Similarity#############"
        scores = self.inner_similarity_document('data')
        for i in range(9):
            print "Q", i + 1, ":", scores[i]

    def inner_similarity_query(self, directory, display=False):
        l = []
        for i in range(9):
            files = []
            avg = {}
            cnt = 0
            for s in self.scheme:
                avg[s] = 0
            for filename in os.listdir(directory):
                if filename.split('_')[3] == str(i + 1) + '.csv':
                    files.append(filename)
            for k in range(len(files) - 1):
                qSet1 = pd.read_csv(open(directory + '/' + files[k]), sep='\t',
                                    names=['query', 'document', 'time'])
                for j in range(k + 1, len(files)):
                    qSet2 = pd.read_csv(open(directory + '/' + files[j]), sep='\t',
                                        names=['query', 'document', 'time'])
                    Sim_kj = self.calculate_query_similarities(qSet1['query'], qSet2['query'])
                    for s in self.scheme:
                        avg[s] = avg[s] + Sim_kj[s]
                    cnt = cnt + 1
                    if display:
                        print Sim_kj
            for s in self.scheme:
                avg[s] = avg[s] / cnt
            l.append(avg)
        return l

    def inner_similarity_document(self, directory, display=False):
        l = []
        for i in range(9):
            files = []
            avg = 0
            cnt = 0
            for filename in os.listdir(directory):
                if filename.split('_')[3] == str(i + 1) + '.csv':
                    files.append(filename)
            for k in range(len(files) - 1):
                dSet1 = pd.read_csv(open(directory + '/' + files[k]), sep='\t',
                                    names=['query', 'document', 'time'])
                for j in range(k + 1, len(files)):
                    dSet2 = pd.read_csv(open(directory + '/' + files[j]), sep='\t',
                                        names=['query', 'document', 'time'])
                    Sim_kj = self.calculate_document_similarities(dSet1['document'], dSet2['document'])
                    avg = avg + Sim_kj
                    cnt = cnt + 1
                    if display:
                        print Sim_kj
            l.append(avg / cnt)
        return l

    def outer_similarity_document(self, directory, display=False):
        for i in range(9):
            files1 = []
            files2 = []
            cnt = 0
            avg = {}
            for s in self.scheme:
                avg[s] = 0
            for filename in os.listdir(directory):
                if filename.split('_')[3] == str(i + 1) + '.csv':
                    files1.append(filename)
            for filename in os.listdir(directory):
                if not filename.split('_')[3] == str(i + 1) + '.csv':
                    files2.append(filename)
            # compare every file of cluster i+1 against every file outside the cluster
            for k in range(len(files1)):
                qSet1 = pd.read_csv(open(directory + '/' + files1[k]), sep='\t',
                                    names=['query', 'document', 'time'])
                for j in range(len(files2)):
                    qSet2 = pd.read_csv(open(directory + '/' + files2[j]), sep='\t',
                                        names=['query', 'document', 'time'])
                    Sim_kj = self.calculate_query_similarities(qSet1['query'], qSet2['query'])
                    for s in self.scheme:
                        avg[s] = avg[s] + Sim_kj[s]
                    cnt = cnt + 1
                    if display:
                        print Sim_kj
            for s in self.scheme:
                avg[s] = avg[s] / cnt
            print "Average:", avg

    def calculate_cluster_query_similarity(self, directory, num1, num2):
        avg = {}
        files1 = []
        files2 = []
        cnt = 0
        for s in self.scheme:
            avg[s] = 0
        for filename in os.listdir(directory):
            if filename.split('_')[3] == str(num1) + '.csv':
                files1.append(filename)
        for filename in os.listdir(directory):
            if filename.split('_')[3] == str(num2) + '.csv':
                files2.append(filename)
        for i in range(len(files1) - 1):
            qSet1 = pd.read_csv(open(directory + '/' + files1[i]), sep='\t',
                                names=['query', 'document', 'time'])
            for j in range(i + 1, len(files2)):
                qSet2 = pd.read_csv(open(directory + '/' + files2[j]), sep='\t',
                                    names=['query', 'document', 'time'])
                Sim_ij = self.calculate_query_similarities(qSet1['query'], qSet2['query'])
                for s in self.scheme:
                    avg[s] = avg[s] + Sim_ij[s]
                cnt = cnt + 1
        for s in self.scheme:
            avg[s] = avg[s] / cnt
        return avg

    def calculate_cluster_document_similarity(self, directory, num1, num2):
        avg = 0
        files1 = []
        files2 = []
        cnt = 0
        for filename in os.listdir(directory):
            if filename.split('_')[3] == str(num1) + '.csv':
                files1.append(filename)
        for filename in os.listdir(directory):
            if filename.split('_')[3] == str(num2) + '.csv':
                files2.append(filename)
        for i in range(len(files1) - 1):
            qSet1 = pd.read_csv(open(directory + '/' + files1[i]), sep='\t',
                                names=['query', 'document', 'time'])
            for j in range(i + 1, len(files2)):
                qSet2 = pd.read_csv(open(directory + '/' + files2[j]), sep='\t',
                                    names=['query', 'document', 'time'])
                Sim_ij = self.calculate_document_similarities(qSet1['document'], qSet2['document'])
                avg = avg + Sim_ij
                cnt = cnt + 1
        return avg / cnt

    # Calculates similarities between two sets of queries by calling
    # calculate_query_similarity() for every pair of refined queries.
    def calculate_query_similarities(self, qSet1, qSet2):
        qSet1 = self.remove_duplicates(qSet1)
        qSet2 = self.remove_duplicates(qSet2)
        scores = {}
        for s in self.scheme:
            scores[s] = 0
        for q1 in qSet1:
            for q2 in qSet2:
                rQ1 = self.query_refine(q1)
                rQ2 = self.query_refine(q2)
                score = self.calculate_query_similarity(rQ1, rQ2)
                for s in self.scheme:
                    scores[s] = scores[s] + score[s]
        cnt = len(qSet1) * len(qSet2)
        for s in self.scheme:
            scores[s] = scores[s] / cnt
        return scores

    # Calculates the similarity between two queries under each ranking scheme
    def calculate_query_similarity(self, q1, q2):
        scores = {}
        for s in self.scheme:
            scores[s] = 0
            analyzer = 'my_' + s + '_analyzer'
            content = q1.replace(r"/", ",")
            res = self.es.search(index=self.umf_query + '_' + s, q=content,
                                 doc_type='query', analyzer=analyzer, size=4000)
            for entry in res['hits']['hits']:
                if q2 == entry['_source']['query']:
                    scores[s] = entry['_score']
                    break
        return scores

    # Get the body text extracted from a web page; return the extracted document
    def getDocumentFromURL(self, url):
        # from goose import Goose
        # g = Goose()
        # article = g.extract(url=url)
        # text = ''.join([i if ord(i) < 128 else '' for i in article.cleaned_text])
        # return text
        for idx, entry in self.docMap.iterrows():
            if entry['key'] == url:
                if type(entry['value']) == float:
                    return None
                return entry['value']

    def getDocumentIDFromURL(self, url):
        for idx, entry in self.docMap.iterrows():
            if entry['key'] == url:
                return entry['id']

    def getDocumentIDFromDocument(self, document):
        for idx, entry in self.docMap.iterrows():
            if entry['value'] == document:
                return entry['id']

    def document_preprocess(self, dSet):
        dSet = self.remove_duplicates(dSet)  # remove duplicates
        # drop search-engine landing pages and blank tabs
        if 'https://google.com/' in dSet:
            dSet.remove('https://google.com/')
        if 'http://google.com/' in dSet:
            dSet.remove('http://google.com/')
        if 'http://www.google.com/' in dSet:
            dSet.remove('http://www.google.com/')
        if 'https://www.google.com/' in dSet:
            dSet.remove('https://www.google.com/')
        if 'http://www.google.co.kr/' in dSet:
            dSet.remove('http://www.google.co.kr/')
        if 'http://www.google.com/webhp?hl=en' in dSet:
            dSet.remove('http://www.google.com/webhp?hl=en')
        if 'about:blank' in dSet:
            dSet.remove('about:blank')
        if 'google.com' in dSet:
            dSet.remove('google.com')
        return dSet

    def calculate_document_similarities(self, dSet1, dSet2):
        dSet1 = self.document_preprocess(dSet1)
        dSet2 = self.document_preprocess(dSet2)
        score = 0
        for d1 in dSet1:
            for d2 in dSet2:
                rD1 = self.getDocumentFromURL(d1)
                rD2 = self.getDocumentFromURL(d2)
                if rD1 is None or rD2 is None:
                    continue
                score = score + self.calculate_document_similarity(rD1, rD2)
        cnt = len(dSet1) * len(dSet2)
        return score / cnt

    def calculate_document_similarity(self, d1, d2):
        ID1 = self.getDocumentIDFromDocument(d1)
        ID2 = self.getDocumentIDFromDocument(d2)
        if ID1 == ID2:
            return 1.0
        res = self.es.mlt(index=self.umf_document + '_bm25', doc_type='document',
                          id=ID1, search_size=200)
        for entry in res['hits']['hits']:
            if entry['_id'] == ID2:
                return entry['_score']
        return 0.0
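# A minimal usage sketch for UMF_Analyzer. It assumes the per-user CSV logs
# live in a ./data directory and that the umf_query_<scheme> and
# umf_document_bm25 indices already exist; the constructor additionally
# hard-codes doc_map.csv and localhost:9200.
umf = UMF_Analyzer()

# Pairwise similarity matrices: one DataFrame per ranking scheme for queries,
# plus a single DataFrame for documents.
qVector, dVector = umf.build_similarity_vector('data')

# Persist the bm25 query-similarity matrix for later analysis in MATLAB.
umf.save_mat('query_sim_bm25.mat', qVector['bm25'])

# Print the per-cluster similarity summary.
umf.display_analysis()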
def text_similarity_clustering(index, doc_type, searchsize, cutv, **kwargs):
    _build_default(kwargs)
    es = Elasticsearch()

    # --- retrieve the document _ids
    from elasticsearch_dsl import Search
    s = Search(using=es, index=index, doc_type=doc_type)
    s = s.fields([])  # only get ids, otherwise `fields` takes a list of field names
    ids = [h.meta.id for h in s.scan()]
    n = s.count()

    # restrict searchsize to the range 1..10
    searchsize = 10 if searchsize >= 10 else searchsize
    searchsize = 1 if searchsize < 1 else searchsize
    # restrict cutv to be >= 0
    cutv = 0 if cutv < 0 else cutv

    # store the sparse adjacency matrix as (row, col, value) triples
    I = []
    J = []
    V = []
    RV = []
    for i in ids:
        rv = es.mlt(index, doc_type, id=i, **kwargs)
        results = rv['hits']['hits']
        if len(results) > 0:
            # never read past the number of hits actually returned
            for loop in xrange(min(searchsize, len(results))):
                j = results[loop]['_id']
                score = results[loop]['_score']
                if score >= cutv:
                    # map both endpoints to their positions in the ids list
                    I.append(ids.index(i))
                    J.append(ids.index(j))
                    V.append(score)
                    RV.append(score)

    # construct the adjacency matrices
    A = coo_matrix((V, (I, J)), shape=(n, n))
    RA = coo_matrix((RV, (I, J)), shape=(n, n))

    # construct a graph
    G = nx.from_scipy_sparse_matrix(A)

    # obtain the degree values for all the nodes,
    # e.g. {0: 0, 1: 1, 2: 1, 3: 2, 4: 3, ...}
    D = G.degree().values()

    # partition the graph by modularity; returns a dictionary whose key is the
    # document id and whose value is the partition id,
    # e.g. {0: 0, 1: 1, 2: 1, 3: 2, 4: 3, ...}
    partition = community.best_partition(G)

    RAC = RA.tocsr()
    # get_map_document() is a helper defined elsewhere in this module
    return get_map_document(partition, ids, D, RAC)
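# The clustering stage above does not depend on Elasticsearch. The toy sketch
# below runs the same coo_matrix -> networkx -> python-louvain pipeline on a
# made-up 4x4 similarity matrix; it assumes the older (pre-3.0) networkx API
# used in text_similarity_clustering().
import networkx as nx
import community  # python-louvain
from scipy.sparse import coo_matrix

# Illustrative scores only: documents 0/1 are similar, as are documents 2/3.
rows = [0, 1, 2, 3]
cols = [1, 0, 3, 2]
vals = [0.9, 0.9, 0.8, 0.8]
A = coo_matrix((vals, (rows, cols)), shape=(4, 4))

G = nx.from_scipy_sparse_matrix(A)
partition = community.best_partition(G)  # {node_id: partition_id}
print(partition)                         # e.g. {0: 0, 1: 0, 2: 1, 3: 1}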
class VWCollection(VWCallback):
    # Excerpt from a larger library: qdsl, querybuilder, config, VWBase,
    # VWCollectionGen, NoResultsFound, is_analyzed, etc. are defined elsewhere
    # in the package.

    def __init__(self, items=[], **kwargs):
        self.bulk_chunk_size = kwargs.get('bulk_chunk_size', config.bulk_chunk_size)
        self._sort = []
        self.results_per_page = kwargs.get('results_per_page', config.results_per_page)
        self._querybody = querybuilder.QueryBody()  # sets up the new query bodies

        if kwargs.get('base_obj'):
            self.base_obj = kwargs.get('base_obj')
        else:
            try:
                self.base_obj = self.__class__.__model__
            except AttributeError:
                raise AttributeError('Base object must contain a model or pass base_obj')

        self._es = Elasticsearch(config.dsn)
        self._esc = client.IndicesClient(self._es)

        if '__index__' in dir(self.base_obj):
            idx = self.base_obj.__index__
        else:
            idx = config.default_index

        self._search_params = []
        self._raw = {}
        self.idx = idx
        self.type = self.base_obj.__type__
        self._special_body = {}

        # special list of items that can be committed in bulk
        self._items = items

    def search(self, q):
        self._search_params.append(q)
        return self

    # setup a raw request
    def raw(self, raw_request):
        self._raw = raw_request
        return self

    def filter_by(self, condition='and', **kwargs):
        if kwargs.get('condition'):
            condition = kwargs.get('condition')
            del kwargs['condition']

        condition = self._translate_bool_condition(condition)

        for k, v in kwargs.iteritems():
            if k == 'id' or k == 'ids':
                id_filter = v
                if not isinstance(id_filter, list):
                    id_filter = [id_filter]
                self._querybody.chain(qdsl.ids(id_filter), condition=condition)
            else:
                try:
                    analyzed = is_analyzed(getattr(self.base_obj, k))
                except AttributeError:
                    analyzed = is_analyzed(v)

                q_type = 'filter'
                if analyzed:
                    q_type = 'query'

                if isinstance(v, list):
                    # lists are treated like an "OR": terms() on not_analyzed
                    # fields, bool/should of match() on analyzed fields
                    if analyzed:
                        match_queries = []
                        for item in v:
                            match_queries.append(qdsl.match(k, item))
                        self._querybody.chain(qdsl.bool(qdsl.should(match_queries)),
                                              condition=condition, type=q_type)
                    else:
                        self._querybody.chain(qdsl.terms(k, v),
                                              condition=condition, type=q_type)
                else:
                    if analyzed:
                        self._querybody.chain(qdsl.match(unicode(k), v),
                                              condition=condition, type=q_type)
                    else:
                        self._querybody.chain(qdsl.term(unicode(k), v),
                                              condition=condition, type=q_type)

        return self

    def multi_match(self, fields, query, **kwargs):
        self._querybody.chain(qdsl.multi_match(query, fields),
                              condition=kwargs.get('condition', None), type='query')
        return self

    def exact(self, field, value, **kwargs):
        try:
            field_template = getattr(self.base_obj, field)
            if type(field_template) != ESType:
                field_template = create_es_type(field_template)

            for estype in [String, IP, Attachment]:
                if isinstance(field_template, estype) and field_template.analyzed == True:
                    logger.warn('%s types may not exact match correctly if they are analyzed'
                                % unicode(estype.__name__))
        except AttributeError:
            logger.warn('%s is not in the base model.' % unicode(field))

        kwargs['type'] = 'filter'
        if isinstance(value, list):
            self._querybody.chain(qdsl.terms(field, value), **kwargs)
        else:
            self._querybody.chain(qdsl.term(field, value), **kwargs)

        return self

    def or_(self, *args):
        return ' OR '.join(args)

    def and_(self, *args):
        return ' AND '.join(args)

    def get(self, id, **kwargs):
        try:
            params = {'index': self.idx, 'doc_type': self.type, 'id': id}
            params.update(kwargs)
            doc = self._es.get(**params)
            if doc:
                return VWCollectionGen(self.base_obj, {'docs': [doc]})[0]
            return None
        except Exception:
            # TODO. Discuss this. Should get() return None even on exceptions?
            return None

    def refresh(self, **kwargs):
        self._esc.refresh(index=self.idx, **kwargs)

    def get_in(self, ids, **kwargs):
        if len(ids) > 0:  # check for ids. empty list returns an empty list (instead of exception)
            params = {'index': self.idx, 'doc_type': self.type, 'body': {'ids': ids}}
            params.update(kwargs)
            res = self._es.mget(**params)
            if res and res.get('docs'):
                return VWCollectionGen(self.base_obj, res)

        return []

    def get_like_this(self, doc_id, **kwargs):
        params = {'index': self.idx, 'doc_type': self.type, 'id': doc_id}
        params.update(kwargs)
        res = self._es.mlt(**params)

        if res and res.get('docs'):
            return VWCollectionGen(self.base_obj, res)
        else:
            return []

    def sort(self, **kwargs):
        for k, v in kwargs.iteritems():
            v = v.lower()
            if v not in ['asc', 'desc']:
                v = 'asc'
            self._sort.append('%s:%s' % (k, v))
        return self

    def clear_previous_search(self):
        self._raw = {}
        self._search_params = []
        self._special_body = {}
        self._querybody = querybuilder.QueryBody()

    def _create_search_params(self, **kwargs):
        # before_query_build() is allowed to manipulate the object's internal
        # state before we build the query
        self._querybody = self.execute_callbacks('before_query_build', self._querybody)

        q = {'index': self.idx, 'doc_type': self.type}

        if self._raw:
            q['body'] = self._raw
        elif len(self._search_params) > 0:
            kwargs['type'] = 'query'
            self._querybody.chain(qdsl.query_string(self.and_(*self._search_params)), **kwargs)
        else:
            q['body'] = qdsl.query(qdsl.match_all())

        if self._querybody.is_filtered() or self._querybody.is_query():
            q['body'] = self._querybody.build()

        # after_query_build() can manipulate the final query before being sent to ES
        # this is generally considered a bad idea but might be useful for logging
        q = self.execute_callbacks('after_query_build', q)

        logger.debug(json.dumps(q))
        return q

    def count(self):
        params = self._create_search_params()
        resp = self._es.count(**params)
        return resp.get('count')

    def __len__(self):
        return self.count()

    def limit(self, count):
        self.results_per_page = count
        return self

    def all(self, **kwargs):
        params = self._create_search_params()
        if not params.get('size'):
            params['size'] = self.results_per_page

        if kwargs.get('results_per_page') != None:
            kwargs['size'] = kwargs.get('results_per_page')
            del kwargs['results_per_page']

        if kwargs.get('start') != None:
            kwargs['from_'] = kwargs.get('start')
            del kwargs['start']

        logger.debug(json.dumps(self._sort))

        params.update(kwargs)
        if len(self._sort) > 0:
            if params.get('sort') and isinstance(params['sort'], list):
                params['sort'].extend(self._sort)
            else:
                params['sort'] = self._sort

        if params.get('sort'):
            if isinstance(params['sort'], list):
                params['sort'] = ','.join(params.get('sort'))
            else:
                raise TypeError('"sort" argument must be a list')

        logger.debug(json.dumps(params))
        results = self._es.search(**params)

        return VWCollectionGen(self.base_obj, results)

    def one(self, **kwargs):
        kwargs['results_per_page'] = 1
        results = self.all(**kwargs)
        try:
            return results[0]
        except IndexError:
            raise NoResultsFound('No result found for one()')

    # this is for legacy purposes in filter_by
    def _translate_bool_condition(self, _bool_condition):
        if _bool_condition == 'and':
            _bool_condition = 'must'
        elif _bool_condition == 'or':
            _bool_condition = 'should'
        elif _bool_condition == 'not':
            _bool_condition = 'must_not'

        # this is for things like geo_distance where we explicitly want the
        # true and/or/not
        elif _bool_condition == 'explicit_and':
            _bool_condition = 'and'
        elif _bool_condition == 'explicit_or':
            _bool_condition = 'or'
        elif _bool_condition == 'explicit_not':
            _bool_condition = 'not'

        return _bool_condition

    def range(self, field, **kwargs):
        search_options = {}
        for opt in ['condition', 'minimum_should_match']:
            if opt in kwargs:
                search_options[opt] = kwargs.get(opt)
                del kwargs[opt]

        q = qdsl.range(field, **kwargs)
        if self._querybody.is_filtered():
            d = {'filter': q}
        else:
            d = {'query': q}

        if search_options:
            d.update(search_options)

        self._querybody.chain(d)
        return self

    def search_geo(self, field, distance, lat, lon, **kwargs):
        condition = kwargs.get('condition', 'and')
        if 'condition' in kwargs:
            del kwargs['condition']

        self._querybody.chain(
            qdsl.filter_(qdsl.geo_distance(field, [lon, lat], distance, **kwargs)),
            condition=condition)
        return self

    def missing(self, field, **kwargs):
        self._querybody.chain(qdsl.filter_(qdsl.missing(field)))
        return self

    def exists(self, field, **kwargs):
        self._querybody.chain(qdsl.filter_(qdsl.exists(field, **kwargs)))
        return self

    def delete(self, **kwargs):
        params = self._create_search_params()
        params.update(kwargs)
        self._es.delete_by_query(**params)

    def delete_in(self, ids):
        if not isinstance(ids, list):
            raise TypeError('argument to delete in must be a list.')

        bulk_docs = []
        for i in ids:
            this_id = i
            this_type = self.base_obj.__type__
            this_idx = self.idx
            if isinstance(i, VWBase):
                this_id = i.id
                this_type = i.__type__
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            bulk_docs.append({'_op_type': 'delete', '_type': this_type,
                              '_index': this_idx, '_id': this_id})

        return helpers.bulk(self._es, bulk_docs, chunk_size=self.bulk_chunk_size)

    # commits items in bulk
    def commit(self, callback=None):
        bulk_docs = []

        if callback:
            if not callable(callback):
                raise TypeError('Argument 2 to commit() must be callable')

        # allow for a search to work if there are no _items
        if len(self._items) == 0:
            items = self.all()
        else:
            items = self._items

        for i in items:
            if callback:
                i = callback(i)

            i = self.execute_callbacks('on_bulk_commit', i)

            this_dict = {}
            this_id = ''
            this_idx = self.idx
            this_type = self.base_obj.__type__
            if isinstance(i, VWBase):
                this_dict = i._create_source_document()
                this_type = i.__type__
                this_id = i.id
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass
            elif isinstance(i, dict):
                this_dict = i
                this_id = i.get('id')
            else:
                raise TypeError('Elements passed to the collection must be type of "dict" or "VWBase"')

            if not this_id:
                this_id = str(uuid4())

            bulk_docs.append({'_op_type': 'index', '_type': this_type,
                              '_index': this_idx, '_id': this_id, '_source': this_dict})

        return helpers.bulk(self._es, bulk_docs, chunk_size=self.bulk_chunk_size)
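# A usage sketch for the collection above. `Article` and `some_article_id`
# are hypothetical: any VWBase model with a not_analyzed `status` field and a
# `pub_date` field would do, and the mlt keyword arguments are simply passed
# through to the legacy es.mlt() call.
articles = VWCollection(base_obj=Article)

published = (articles
             .filter_by(status='published')
             .sort(pub_date='desc')
             .limit(20)
             .all())

# Documents similar to a known id, via the legacy more-like-this endpoint.
related = articles.get_like_this(some_article_id, mlt_fields='title')
print(len(published), len(related))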
class ESControl():

    def __init__(self, index=None, doc_type=None):
        b = models.Blog.objects.all()[0]
        self._index = index or b.elastic_search_index
        self._doc_type = doc_type or b.elastic_search_doc_type
        self._es = Elasticsearch()

    @staticmethod
    def _hit_item_element(hit_item, element, fill_source=True):
        highlight = hit_item.get('highlight')
        source = hit_item.get('_source')
        if highlight and highlight.get(element):
            return '...'.join(highlight.get(element))
        elif fill_source:
            return source.get(element)
        else:
            return ''

    def remove_entry(self, entry):
        self._es.delete(index=self._index, doc_type=self._doc_type, id=entry.id)

    def import_entry(self, entry):
        doc = {'title': entry.title,
               'slug': entry.slug,
               'content': strip_tags(entry.content),
               'category': entry.category.name,
               'pub_date': entry.pub_date}
        self._es.index(index=self._index, doc_type=self._doc_type, id=entry.id, body=doc)

    def import_entries(self):
        entries = models.Entry.objects.all()
        for entry in entries:
            self.import_entry(entry)

    def delete_index(self):
        self._es.indices.delete(self._index)

    def create_index(self):
        self._es.indices.create(self._index, essettings.index_definition(self._doc_type))

    def update_analyzer_kuromoji(self):
        self._es.indices.close(self._index)
        self._es.indices.put_settings(essettings.kuromoji_analyzer_def(), self._index)
        self._es.indices.open(self._index)

    def search_entries(self, query):
        hits = self._es.search(index=self._index, doc_type=self._doc_type,
                               body=essettings.search_query_body(query))['hits']
        hit_list = []
        for hit in hits['hits']:
            item = {
                'score': hit.get('_score'),
                'title': self._hit_item_element(hit, 'title'),
                'slug': self._hit_item_element(hit, 'slug'),
                'slug_source': hit.get('_source').get('slug'),
                'content': self._hit_item_element(hit, 'content', fill_source=False),
                'category': self._hit_item_element(hit, 'category'),
                'pub_date': self._hit_item_element(hit, 'pub_date'),
            }
            hit_list.append(item)
        return hit_list

    def more_like_this(self, entry, **kwargs):
        """
        :param entry: the entry to compare against
        :return: a list of entries similar to the passed entry.
        """
        query_result = self._es.mlt(self._index, self._doc_type, entry.id, **kwargs)
        entry_list = []
        for hit in query_result['hits']['hits']:
            hit_info = {'entry': hit.get('_source'), 'score': hit.get('_score')}
            entry_list.append(hit_info)
        return entry_list
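# A usage sketch for ESControl. It assumes the Django `models` module and the
# `essettings` helpers imported by the original file are available, and that
# at least one Blog and one Entry exist.
control = ESControl()
control.create_index()
control.import_entries()

# Full-text search over the indexed entries.
for item in control.search_entries('elasticsearch'):
    print(item['title'], item['score'])

# Entries similar to a given Entry instance (legacy _mlt endpoint);
# mlt_fields / min_term_freq are passed straight through to es.mlt().
entry = models.Entry.objects.all()[0]
for hit in control.more_like_this(entry, mlt_fields='content', min_term_freq=1):
    print(hit['score'], hit['entry']['title'])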
from datetime import datetime
from elasticsearch import Elasticsearch

# Creating the elasticsearch client and index
# BONSAI_URL (the cluster URL) is assumed to be defined elsewhere.
es = Elasticsearch(BONSAI_URL)
es.indices.create(index='news', ignore=400)

# Adding and retrieving a document
doc = {
    'domain': 'CNN',
    'date': datetime(2010, 10, 10, 10, 10, 10),
    'text': 'This is an article.'
}
res = es.index(index="news", doc_type='article', id=1, body=doc)
res = es.get(index="news", doc_type='article', id=1)
es.indices.refresh(index="news")

# Returning all documents
res = es.search(index="news", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print("%(date)s %(domain)s: %(text)s" % hit["_source"])

# Finding similarity
# NOTE: similarity does not work well for test sentences.
# A sentence will generally not be similar enough regardless.
es.mlt(index='news', doc_type="article", id=1, mlt_fields="text",
       search_size=7, min_term_freq=0, min_doc_freq=0,
       percent_terms_to_match=0)
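# Sketch only: the mlt call above has nothing to match against a single test
# sentence, so index a couple of extra toy articles first (their content is
# made up for illustration) and rerun the query.
more_docs = [
    {'domain': 'BBC', 'date': datetime(2010, 10, 11, 9, 0, 0),
     'text': 'This is another article about the same topic.'},
    {'domain': 'Reuters', 'date': datetime(2010, 10, 12, 9, 0, 0),
     'text': 'A short article that repeats the words article and topic.'},
]
for i, d in enumerate(more_docs, start=2):
    es.index(index='news', doc_type='article', id=i, body=d)
es.indices.refresh(index='news')

similar = es.mlt(index='news', doc_type='article', id=1, mlt_fields='text',
                 min_term_freq=1, min_doc_freq=1)
for hit in similar['hits']['hits']:
    print(hit['_id'], hit['_score'], hit['_source']['text'])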
class VWCollection(VWCallback):

    def __init__(self, items=[], **kwargs):
        self.bulk_chunk_size = kwargs.get('bulk_chunk_size', config.bulk_chunk_size)
        self._sort = []
        self.results_per_page = kwargs.get('results_per_page', config.results_per_page)
        self._querybody = querybuilder.QueryBody()  # sets up the new query bodies

        if kwargs.get('base_obj'):
            self.base_obj = kwargs.get('base_obj')
        else:
            try:
                self.base_obj = self.__class__.__model__
            except AttributeError:
                raise AttributeError(
                    'Base object must contain a model or pass base_obj')

        self._es = Elasticsearch(config.dsn)
        self._esc = client.IndicesClient(self._es)

        if '__index__' in dir(self.base_obj):
            idx = self.base_obj.__index__
        else:
            idx = config.default_index

        self._search_params = []
        self._raw = {}
        self.idx = idx
        self.type = self.base_obj.__type__
        self._special_body = {}
        self._items = items  # special list of items that can be committed in bulk

        # these values are used in _build_body() to determine where additional
        # _build_body() options should exist. Defaults to and/must
        self._last_top_level_boolean = None
        self._last_boolean = None

    def search(self, q):
        self._search_params.append(q)
        return self

    # setup a raw request
    def raw(self, raw_request):
        self._raw = raw_request
        return self

    def filter_by(self, condition='and', **kwargs):
        if kwargs.get('condition'):
            condition = kwargs.get('condition')
            del kwargs['condition']

        condition = self._translate_bool_condition(condition)

        for k, v in kwargs.iteritems():
            if k == 'id' or k == 'ids':
                id_filter = v
                if not isinstance(id_filter, list):
                    id_filter = [id_filter]
                #self._build_body( filter={"ids": {"values": id_filter } }, condition=condition )
                self._querybody.chain(qdsl.ids(id_filter), condition=condition)
            else:
                try:
                    analyzed = is_analyzed(getattr(self.base_obj, k))
                except AttributeError:
                    analyzed = is_analyzed(v)

                q_type = 'filter'
                if analyzed:
                    q_type = 'query'

                if isinstance(v, list):
                    # lists are treated like an "OR"
                    #search_value = " or ".join( [ unicode(vitem) for vitem in v] )
                    #search_value = "(" + search_value + ")"
                    self._querybody.chain(qdsl.terms(k, v),
                                          condition=condition, type=q_type)
                else:
                    #search_value = unicode(v)
                    if analyzed:
                        self._querybody.chain(qdsl.match(unicode(k), v),
                                              condition=condition, type=q_type)
                    else:
                        self._querybody.chain(qdsl.term(unicode(k), v),
                                              condition=condition, type=q_type)

        return self

    def multi_match(self, fields, query, **kwargs):
        #self._build_body(query={"multi_match": { "fields": fields, "query": query } }, condition=kwargs.get('condition', None))
        self._querybody.chain(qdsl.multi_match(query, fields),
                              condition=kwargs.get('condition', None), type='query')
        return self

    def exact(self, field, value, **kwargs):
        try:
            field_template = getattr(self.base_obj, field)
            if type(field_template) != ESType:
                field_template = create_es_type(field_template)

            for estype in [String, IP, Attachment]:
                if isinstance(field_template, estype) and field_template.analyzed == True:
                    logger.warn(str(estype.__name__) +
                                ' types may not exact match correctly if they are analyzed')
        except AttributeError:
            logger.warn(str(field) + ' is not in the base model.')

        kwargs['type'] = 'filter'
        if isinstance(value, list):
            #self._build_body( filter={"terms": { field: value } }, **kwargs )
            self._querybody.chain(qdsl.terms(field, value), **kwargs)
        else:
            #self._build_body( filter={"term": { field: value } }, **kwargs )
            self._querybody.chain(qdsl.term(field, value), **kwargs)

        return self

    def or_(self, *args):
        return ' OR '.join(args)

    def and_(self, *args):
        return ' AND '.join(args)

    def get(self, id, **kwargs):
        try:
            params = dict(index=self.idx, doc_type=self.type, id=id)
            params.update(kwargs)
            #return self._create_obj( self._es.get(**params) )
            doc = self._es.get(**params)
            if doc:
                return VWCollectionGen(self.base_obj, {'docs': [doc]})[0]
            return None
        except Exception:
            return None

    def refresh(self, **kwargs):
        self._esc.refresh(index=self.idx, **kwargs)

    def get_in(self, ids, **kwargs):
        if len(ids) > 0:  # check for ids. empty list returns an empty list (instead of exception)
            params = dict(index=self.idx, doc_type=self.type, body={'ids': ids})
            params.update(kwargs)
            res = self._es.mget(**params)
            if res and res.get('docs'):
                return VWCollectionGen(self.base_obj, res)

        return []

    def get_like_this(self, doc_id, **kwargs):
        params = dict(index=self.idx, doc_type=self.type, id=doc_id)
        params.update(kwargs)
        res = self._es.mlt(**params)

        if res and res.get('docs'):
            return VWCollectionGen(self.base_obj, res)
        else:
            return []

    def sort(self, **kwargs):
        for k, v in kwargs.iteritems():
            v = v.lower()
            if v not in ['asc', 'desc']:
                v = 'asc'
            self._sort.append('%s:%s' % (k, v))
        return self

    def clear_previous_search(self):
        self._raw = {}
        self._search_params = []
        self._special_body = {}
        self._querybody = querybuilder.QueryBody()

    def _create_search_params(self, **kwargs):
        # before_query_build() is allowed to manipulate the object's internal
        # state before we build the query
        self._querybody = self.execute_callbacks('before_query_build', self._querybody)

        q = {'index': self.idx, 'doc_type': self.type}

        if self._raw:
            q['body'] = self._raw
        elif len(self._search_params) > 0:
            kwargs['type'] = 'query'
            #q['body'] = self._build_body(query=qdsl.query_string( self.and_(*self._search_params), **kwargs) )
            self._querybody.chain(
                qdsl.query_string(self.and_(*self._search_params)), **kwargs)
        else:
            q['body'] = qdsl.query(qdsl.match_all())

        if self._querybody.is_filtered() or self._querybody.is_query():
            q['body'] = self._querybody.build()

        # after_query_build() can manipulate the final query before being sent to ES
        # this is generally considered a bad idea but might be useful for logging
        q = self.execute_callbacks('after_query_build', q)

        logger.debug(json.dumps(q))
        return q

    def count(self):
        params = self._create_search_params()
        resp = self._es.count(**params)
        return resp.get('count')

    def __len__(self):
        return self.count()

    def limit(self, count):
        self.results_per_page = count
        return self

    def all(self, **kwargs):
        params = self._create_search_params()
        if not params.get('size'):
            params['size'] = self.results_per_page

        if kwargs.get('results_per_page') != None:
            kwargs['size'] = kwargs.get('results_per_page')
            del kwargs['results_per_page']

        if kwargs.get('start') != None:
            kwargs['from_'] = kwargs.get('start')
            del kwargs['start']

        logger.debug(json.dumps(self._sort))

        params.update(kwargs)
        if len(self._sort) > 0:
            if params.get('sort') and isinstance(params['sort'], list):
                params['sort'].extend(self._sort)
            else:
                params['sort'] = self._sort

        if params.get('sort'):
            if isinstance(params['sort'], list):
                params['sort'] = ','.join(params.get('sort'))
            else:
                raise TypeError('"sort" argument must be a list')

        logger.debug(json.dumps(params))
        results = self._es.search(**params)
        #rows = results.get('hits').get('hits')

        return VWCollectionGen(self.base_obj, results)

    def one(self, **kwargs):
        kwargs['results_per_page'] = 1
        results = self.all(**kwargs)
        try:
            return results[0]
        except IndexError:
            raise NoResultsFound('No result found for one()')

    # this is for legacy purposes in filter_by
    def _translate_bool_condition(self, _bool_condition):
        if _bool_condition == 'and':
            _bool_condition = 'must'
        elif _bool_condition == 'or':
            _bool_condition = 'should'
        elif _bool_condition == 'not':
            _bool_condition = 'must_not'

        # this is for things like geo_distance where we explicitly want the
        # true and/or/not
        elif _bool_condition == 'explicit_and':
            _bool_condition = 'and'
        elif _bool_condition == 'explicit_or':
            _bool_condition = 'or'
        elif _bool_condition == 'explicit_not':
            _bool_condition = 'not'

        return _bool_condition

    def range(self, field, **kwargs):
        search_options = {}
        for opt in ['condition', 'minimum_should_match']:
            if opt in kwargs:
                search_options[opt] = kwargs.get(opt)
                del kwargs[opt]

        q = qdsl.range(field, **kwargs)
        if self._querybody.is_filtered():
            d = {'filter': q}
        else:
            d = {'query': q}

        if search_options:
            d.update(search_options)

        #self._build_body(**d)
        self._querybody.chain(d)
        return self

    def search_geo(self, field, distance, lat, lon, **kwargs):
        condition = kwargs.get('condition', 'and')
        if 'condition' in kwargs:
            del kwargs['condition']

        #self._build_body( filter={"geo_distance": { "distance": distance, field: [lon,lat] } }, condition='explicit_and', **kwargs )
        self._querybody.chain(qdsl.filter_(
            qdsl.geo_distance(field, [lon, lat], distance, **kwargs)),
            condition=condition)
        return self

    def missing(self, field, **kwargs):
        #kwargs['filter'] = {"missing":{"field": field } }
        #self._build_body( **kwargs )
        self._querybody.chain(qdsl.filter_(qdsl.missing(field)))
        return self

    def exists(self, field, **kwargs):
        #kwargs['filter'] = {"exists": { "field": field } }
        #self._build_body( **kwargs )
        self._querybody.chain(qdsl.filter_(qdsl.exists(field, **kwargs)))
        return self

    def delete(self, **kwargs):
        params = self._create_search_params()
        params.update(kwargs)
        self._es.delete_by_query(**params)

    def delete_in(self, ids):
        if not isinstance(ids, list):
            raise TypeError('argument to delete in must be a list.')

        bulk_docs = []
        for i in ids:
            this_id = i
            this_type = self.base_obj.__type__
            this_idx = self.idx
            if isinstance(i, VWBase):
                this_id = i.id
                this_type = i.__type__
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            bulk_docs.append({
                '_op_type': 'delete',
                '_type': this_type,
                '_index': this_idx,
                '_id': this_id
            })

        return helpers.bulk(self._es, bulk_docs, chunk_size=self.bulk_chunk_size)

    # commits items in bulk
    def commit(self, callback=None):
        bulk_docs = []

        if callback:
            if not callable(callback):
                raise TypeError('Argument 2 to commit() must be callable')

        # allow for a search to work if there are no _items
        if len(self._items) == 0:
            items = self.all()
        else:
            items = self._items

        for i in items:
            if callback:
                i = callback(i)

            i = self.execute_callbacks('on_bulk_commit', i)

            this_dict = {}
            this_id = ''
            this_idx = self.idx
            this_type = self.base_obj.__type__
            if isinstance(i, VWBase):
                this_dict = i._create_source_document()
                this_type = i.__type__
                this_id = i.id
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass
            elif isinstance(i, dict):
                this_dict = i
                this_id = i.get('id')
            else:
                raise TypeError(
                    'Elements passed to the collection must be type of "dict" or "VWBase"'
                )

            if not this_id:
                this_id = str(uuid4())

            bulk_docs.append({
                '_op_type': 'index',
                '_type': this_type,
                '_index': this_idx,
                '_id': this_id,
                '_source': this_dict
            })

        return helpers.bulk(self._es, bulk_docs, chunk_size=self.bulk_chunk_size)
from elasticsearch import Elasticsearch

es = Elasticsearch()

q = {
    "query": {
        "bool": {
            "must": [
                {"match": {"text": "obama"}},
                {"match": {"text": "kerry"}}
            ]
        }
    }
}

res = es.search(index="haystack", body=q)
#print("Got %d Hits:" % res['hits']['total'])

for hit in res['hits']['hits']:
    #print("%(id)s: %(text)s" % hit["_source"])
    similar = es.mlt(index='haystack',
                     id=hit['_source']['id'],
                     doc_type='modelresult',
                     percent_terms_to_match=.1)
    if similar['hits']['total'] > 0:
        print(similar['hits']['hits'][0]['_source'])