Example #1
from elasticsearch import Elasticsearch
from scipy.sparse import coo_matrix


def text_similarity_scores(index, doc_type, **kwargs):
    """
    Return a sparse matrix whose (i, j) entry is the similarity score between
    document i and its most similar document j in /index/doc_type.
    """

    _build_default(kwargs)
    es = Elasticsearch()
    n = es.search(index=index,
                  doc_type=doc_type,
                  body={
                      "query": {
                          "match_all": {}
                      }
                  },
                  search_type="count").get('hits').get('total', 0)
    I = []
    J = []
    V = []
    for i in range(n):
        rv = es.mlt(index, doc_type, id=i, **kwargs)
        results = rv['hits']['hits']
        if len(results) > 0:
            # keep only the single best match for document i
            j = int(results[0]['_id'])
            score = results[0]['_score']
            I.append(i)
            J.append(j)
            V.append(score)
    return coo_matrix((V, (I, J)), shape=(n, n))
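A minimal usage sketch for the helper above (hedged: the index/doc_type names and the more-like-this options are placeholders, and `_build_default` is assumed to come from the same module):

# Hypothetical call: pair every document in news/article with its best match
sim = text_similarity_scores('news', 'article', mlt_fields='text', min_term_freq=1)

# coo_matrix exposes its non-zero entries directly
for i, j, score in zip(sim.row, sim.col, sim.data):
    print("%d -> %d : %.3f" % (i, j, score))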
Example #2
import numpy as np
from elasticsearch import Elasticsearch


def text_similarity_score_by_id(index, doc_type, id, **kwargs):
    """
    Find the document most similar to the given id.
    Return (id, score) if any match is found; otherwise (-1, -np.inf).
    """

    _build_default(kwargs)
    es = Elasticsearch()
    rv = es.mlt(index=index, doc_type=doc_type, id=id, **kwargs)
    results = rv['hits']['hits']
    if len(results) > 0:
        j = int(results[0]['_id'])
        score = results[0]['_score']
        return (j, score)
    else:
        return (-1, -np.inf)
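A small sketch of how a caller might handle the sentinel return value (the index name and keyword arguments are again placeholders):

doc_id, score = text_similarity_score_by_id('news', 'article', id=1, mlt_fields='text')
if doc_id == -1 and score == -np.inf:
    print("no similar document found")
else:
    print("most similar document: %d (score %.3f)" % (doc_id, score))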
Example #3
class UMF_Analyzer:

    def __init__(self):
        self.es = Elasticsearch([{'host':'localhost', 'port': 9200}])
        self.scheme = ['bm25','ib','lmd','lmj','ngram','tfidf','dfr']
        self.umf_query = 'umf_query'
        self.umf_document = 'umf_document'
        self.docMap = pd.read_csv(open('doc_map.csv'),sep='\t',index_col=False)

    # Save vector as mat file
    def save_mat(self,filename,vector):
        v = vector.as_matrix()
        sio.savemat(filename,{'data':v})

    # Build Similarity vector between all pairs of users
    def build_similarity_vector(self,directory):
        labels = []
        ids = []
        
        for filename in os.listdir(directory):
            labels.append(filename.split('_')[3].split('.')[0])
            ids.append(filename.split('.')[0])
            
        qVector = {}
        dVector = pd.DataFrame()
        for s in self.scheme:
            qVector[s] = pd.DataFrame()
            
        # Build Query Similarity Vector
        for filename1 in os.listdir(directory):
            data1 = pd.read_csv(open(directory + '/'+filename1),sep='\t',names=['query','document','time'])
            pivot_id = filename1.split('.')[0]
            v = {}
            d = []

            for s in self.scheme:
                v[s] = []

            for filename2 in os.listdir(directory):
                data2 = pd.read_csv(open(directory + '/' + filename2),sep='\t',names=['query','document','time'])
                qSim = self.calculate_query_similarities(data1['query'],data2['query'])
                dSim = self.calculate_document_similarities(data1['document'],data2['document'])
                d.append(dSim)

                for s in self.scheme:
                    v[s].append(qSim[s])
            
            for s in self.scheme:
                temp = {}
                for idx,l in enumerate(ids):
                    temp[l] = v[s][idx]

                a = pd.DataFrame(temp,index=[pivot_id],columns=ids)
                qVector[s] = qVector[s].append(a)

            # Build Document Similarity Vector row for this pivot user
            dTemp = {}
            for idx,l in enumerate(ids):
                dTemp[l] = d[idx]
            dVector = dVector.append(pd.DataFrame(dTemp,index=[pivot_id],columns=ids))

        return qVector,dVector

    # remove duplicates
    def remove_duplicates(self,queries):
        return set(queries)
    
    # Query Refinement
    def query_refine(self,q):
        q = q.replace('\'','')
        q = q.replace(']','')
        q = q.replace('[','')
        q = q.replace(',','')
        q = q.replace('\"','')
        return q

    def display_analysis(self):
        print "############Query Inner Similarity#############"
        scores = self.inner_similarity_query('data')
        for i in range(9):
            print "######################Q",i+1,"#################"
            for s in self.scheme:
                print s,":",scores[i][s]

        print "############Document Inner Similarity#############"
        scores = self.inner_similarity_document('data')
        for i in range(9):
            print "Q",i+1,":",scores[i]
        

    def inner_similarity_query(self,directory,display=False):
        l = []
        for i in range(9):
            files = []
            avg = {}
            cnt = 0
            for s in self.scheme:
                avg[s] = 0

            for filename in os.listdir(directory):
                if filename.split('_')[3] == str(i+1) + '.csv':
                    files.append(filename)
    
            for k in range(len(files)-1):
                qSet1 = pd.read_csv(open(directory + '/' + files[k]),sep='\t',names=['query','document','time'])
                for j in range(k+1,len(files)):
                    qSet2 = pd.read_csv(open(directory + '/' +files[j]),sep='\t',names=['query','document','time'])
                    Sim_kj = self.calculate_query_similarities(qSet1['query'],qSet2['query'])
                    
                    for s in self.scheme:
                        avg[s] = avg[s] + Sim_kj[s]
                    cnt = cnt + 1
                    if display:
                        print Sim_kj

            for s in self.scheme:
                avg[s] = avg[s]/cnt

            l.append(avg)
        return l

    def inner_similarity_document(self,directory,display=False):
        l = []
        for i in range(9):
            files = []
            avg = 0
            cnt = 0
            
            for filename in os.listdir(directory):
                if filename.split('_')[3] == str(i+1) + '.csv':
                    files.append(filename)
                    
            for k in range(len(files)-1):
                dSet1 = pd.read_csv(open(directory + '/' + files[k]),sep='\t',names=['query','document','time'])
                for j in range(k+1,len(files)):
                    dSet2 = pd.read_csv(open(directory + '/' + files[j]),sep='\t',names=['query','document','time'])
                    Sim_kj = self.calculate_document_similarities(dSet1['document'],dSet2['document'])
                    
                    avg = avg+Sim_kj
                    cnt = cnt + 1
                    if display:
                        print Sim_kj
    
            l.append(avg/cnt)
        return l

    def outer_similarity_document(self,directory,display=False):
        for i in range(9):
            files1 = []
            files2 = []
            cnt = 0
            avg = {}

            for s in self.scheme:
                avg[s] = 0

            for filename in os.listdir(directory):
                if filename.split('_')[3] == str(i+1) + '.csv':
                    files1.append(filename)

            for filename in os.listdir(directory):
                if not filename.split('_')[3] == str(i+1) + '.csv':
                    files2.append(filename)


            for k in range(len(files1)):
                qSet1 = pd.read_csv(open(directory + '/' + files1[k]),sep='\t',names=['query','document','time'])
                for j in range(len(files2)):
                    qSet2 = pd.read_csv(open(directory + '/' + files2[j]),sep='\t',names=['query','document','time'])
                    Sim_kj = self.calculate_query_similarities(qSet1['query'],qSet2['query'])
                    
                    for s in self.scheme:
                        avg[s] = avg[s] + Sim_kj[s]
                    cnt = cnt + 1    
                    if display:
                        print Sim_kj

            for s in self.scheme:
                avg[s] = avg[s]/cnt
            print "Average:",avg
            
            
    def calculate_cluster_query_similarity(self,directory, num1,num2):
        avg = {}
        files1 = []
        files2 = []
        cnt = 0

        for s in self.scheme:
            avg[s] = 0

        for filename in os.listdir(directory):
            if filename.split('_')[3] == str(num1) + '.csv':
                files1.append(filename)

        for filename in os.listdir(directory):
            if filename.split('_')[3] == str(num2) + '.csv':
                files2.append(filename)

        for i in range(len(files1) -1):
            qSet1 = pd.read_csv(open(directory + '/' + files1[i]),sep='\t',names=['query','document','time'])
            for j in range(i+1,len(files2)):
                qSet2 = pd.read_csv(open(directory + '/' + files2[j]),sep='\t',names=['query','document','time'])
            
                Sim_ij = self.calculate_query_similarities(qSet1['query'],qSet2['query'])

                for s in self.scheme:
                    avg[s] = avg[s] + Sim_ij[s]
                cnt = cnt+1
        for s in self.scheme:
            avg[s] = avg[s]/cnt
        return avg

    def calculate_cluster_document_similarity(self,directory, num1,num2):
        avg = 0
        files1 = []
        files2 = []
        cnt = 0

        for filename in os.listdir(directory):
            if filename.split('_')[3] == str(num1) + '.csv':
                files1.append(filename)

        for filename in os.listdir(directory):
            if filename.split('_')[3] == str(num2) + '.csv':
                files2.append(filename)

        for i in range(len(files1) -1):
            qSet1 = pd.read_csv(open(directory + '/' + files1[i]),sep='\t',names=['query','document','time'])
            for j in range(i+1,len(files2)):
                qSet2 = pd.read_csv(open(directory + '/' + files2[j]),sep='\t',names=['query','document','time'])
                Sim_ij = self.calculate_document_similarities(qSet1['document'],qSet2['document'])
                avg = avg + Sim_ij
                cnt = cnt+1
    
        return avg/cnt

    # This function calculates similarities between two sets of queries
    # by averaging calculate_query_similarity() over every query pair.
    def calculate_query_similarities(self,qSet1,qSet2):
        qSet1 = self.remove_duplicates(qSet1)
        qSet2 = self.remove_duplicates(qSet2)
        
        scores = {}
        for s in self.scheme:
            scores[s] = 0

        for q1 in qSet1:
            for q2 in qSet2:
                rQ1 = self.query_refine(q1)
                rQ2 = self.query_refine(q2)

                score = self.calculate_query_similarity(rQ1,rQ2)
                for s in self.scheme:
                    scores[s] = scores[s] + score[s]

        cnt = len(qSet1) * len(qSet2)
        for s in self.scheme:
            scores[s] = scores[s]/cnt
            
        return scores
                                     
    # This function calculates similarity between two queries
    def calculate_query_similarity(self,q1,q2):
        scores = {}

        for s in self.scheme:
            scores[s] = 0
            analyzer = 'my_' + s + '_analyzer'
            content = q1.replace(r"/",",")
            res = self.es.search(index=self.umf_query+ '_' + s, q=content,doc_type='query',analyzer=analyzer,size=4000)
            
            for entry in res['hits']['hits']:
                if q2 == entry['_source']['query']:
                    scores[s] = entry['_score']
                    break
                    
        return scores

    # Getting Body Text extracted from Web page
    # Return the extracted document
    def getDocumentFromURL(self,url):
        # from goose import Goose
        # g = Goose()
        # article = g.extract(url=url)
        # text = ''.join([i if ord(i) < 128 else '' for i in article.cleaned_text])
        # return text
        
        for idx,entry in self.docMap.iterrows():
            if entry['key'] == url:
                if type(entry['value']) == float:
                    return None
                return entry['value']
            
    def getDocumentIDFromURL(self,url):
        for idx,entry in self.docMap.iterrows():
            if entry['key'] == url:
                return entry['id']

    def getDocumentIDFromDocument(self,document):
        for idx,entry in self.docMap.iterrows():
            if entry['value'] == document:
                return entry['id']

    def document_preprocess(self,dSet):
        dSet = self.remove_duplicates(dSet) # remove duplicates

        # drop search-engine landing pages and blank tabs; they carry no content
        blacklist = ['https://google.com/',
                     'http://google.com/',
                     'http://www.google.com/',
                     'https://www.google.com/',
                     'http://www.google.co.kr/',
                     'http://www.google.com/webhp?hl=en',
                     'about:blank',
                     'google.com']
        for url in blacklist:
            if url in dSet:
                dSet.remove(url)

        return dSet
    

    
    def calculate_document_similarities(self,dSet1,dSet2):
        dSet1 = self.document_preprocess(dSet1)
        dSet2 = self.document_preprocess(dSet2)

        score = 0
        for d1 in dSet1:
            for d2 in dSet2:
                rD1 = self.getDocumentFromURL(d1)
                rD2 = self.getDocumentFromURL(d2)

                if rD1 is None or rD2 is None:
                    continue

                score = score + self.calculate_document_similarity(rD1,rD2)

        cnt = len(dSet1) * len(dSet2)

        return score/cnt
                

    def calculate_document_similarity(self,d1,d2):
        ID1 = self.getDocumentIDFromDocument(d1)
        ID2 = self.getDocumentIDFromDocument(d2)

        if ID1 == ID2:
            return 1.0

        score = 0
        res = self.es.mlt(index=self.umf_document+'_bm25',doc_type='document',id=ID1,search_size=200)
        for entry in res['hits']['hits']:
            if entry['_id'] == ID2:
                return entry['_score']
        return 0.0
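A hedged usage sketch for the class above; the 'data' directory, the cluster-number filename convention it parses, and the output filenames are assumptions taken from how the methods split filenames:

analyzer = UMF_Analyzer()

# per-scheme query similarity matrices plus one document similarity matrix
qVector, dVector = analyzer.build_similarity_vector('data')

# persist one scheme's matrix for later analysis in MATLAB/Octave
analyzer.save_mat('query_similarity_bm25.mat', qVector['bm25'])
analyzer.save_mat('document_similarity.mat', dVector)

# print average within-cluster similarities
analyzer.display_analysis()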
Example #4
import networkx as nx
import community  # python-louvain
from elasticsearch import Elasticsearch
from scipy.sparse import coo_matrix


def text_similarity_clustering(index, doc_type, searchsize, cutv, **kwargs):

    _build_default(kwargs)
    es = Elasticsearch()

    # --- retrieve the document _ids
    from elasticsearch_dsl import Search
    s = Search(using=es, index=index, doc_type=doc_type)
    s = s.fields(
        [])  # only get ids, otherwise `fields` takes a list of field names
    ids = [h.meta.id for h in s.scan()]

    n = s.count()

    # clamp searchsize to the range [1, 10]
    searchsize = 10 if searchsize >= 10 else searchsize
    searchsize = 1 if searchsize < 1 else searchsize

    # clamp cutv to be non-negative
    cutv = 0 if cutv < 0 else cutv

    # store the sparse adjacency matrix entries

    I = []
    J = []
    V = []
    RV = []
    for i in ids:
        rv = es.mlt(index, doc_type, id=i, **kwargs)
        results = rv['hits']['hits']
        if len(results) > 0:
            # look at up to `searchsize` of the best matches for document i
            for loop in xrange(min(searchsize, len(results))):
                j = int(results[loop]['_id'])
                score = results[loop]['_score']
                if score >= cutv:
                    I.append(ids.index(i))
                    J.append(j)
                    V.append(score)
                    RV.append(score)

    # construct the adjacency matrices using sparse (COO) indices
    A = coo_matrix((V, (I, J)), shape=(n, n))
    RA = coo_matrix((RV, (I, J)), shape=(n, n))

    # construct a graph
    G = nx.from_scipy_sparse_matrix(A)

    # obtain the degree values for all the nodes,
    # e.g. {0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 4, ...}
    D = G.degree().values()

    # partition the graph by modularity; returns a dictionary
    # {document_id: partition_id}, e.g. {0: 0, 1: 1, 2: 1, 3: 2, 4: 3, ...}

    partition = community.best_partition(G)
    RAC = RA.tocsr()

    return get_map_document(partition, ids, D, RAC)
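To make the graph step above concrete, here is a small self-contained sketch (with made-up scores) of the same calls the function relies on: build a graph from a sparse adjacency matrix, read node degrees, and partition by modularity with python-louvain:

import networkx as nx
import community  # python-louvain
from scipy.sparse import coo_matrix

# toy adjacency matrix: documents 0-1 and 1-2 are similar
A = coo_matrix(([1.5, 0.7], ([0, 1], [1, 2])), shape=(3, 3))

G = nx.from_scipy_sparse_matrix(A)
degrees = G.degree()                      # a dict {node: degree} on networkx 1.x
partition = community.best_partition(G)   # {node_id: partition_id}
print(partition)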
Example #5
class VWCollection(VWCallback):
    
    def __init__(self,items=[],**kwargs):
        self.bulk_chunk_size = kwargs.get('bulk_chunk_size',
            config.bulk_chunk_size)
        self._sort = []
        self.results_per_page = kwargs.get('results_per_page',
            config.results_per_page)
        self._querybody = querybuilder.QueryBody() # sets up the new query bodies

        if kwargs.get('base_obj'):
            self.base_obj = kwargs.get('base_obj')
        else:
            try:
                self.base_obj = self.__class__.__model__
            except AttributeError:
                raise AttributeError('Base object must contain a model or pass base_obj')

        self._es = Elasticsearch(config.dsn)
        self._esc = client.IndicesClient(self._es)

        if '__index__' in dir(self.base_obj):
            idx = self.base_obj.__index__
        else:
            idx = config.default_index

        self._search_params = []
        self._raw = {}
        self.idx = idx
        self.type = self.base_obj.__type__
        self._special_body = {}
        
        # special list of items that can be committed in bulk
        self._items = items 

    def search(self,q):
        self._search_params.append(q)
        return self

    # setup a raw request
    def raw(self, raw_request):
        self._raw = raw_request
        return self

    def filter_by(self, condition = 'and',**kwargs):
        if kwargs.get('condition'):
            condition=kwargs.get('condition')
            del kwargs['condition']

        condition = self._translate_bool_condition(condition)

        for k,v in kwargs.iteritems():
            if k == 'id' or k == 'ids':
                id_filter = v
                if not isinstance(id_filter, list):
                    id_filter = [id_filter]

                self._querybody.chain(qdsl.ids(id_filter), condition=condition)
            else:
                try:
                    analyzed = is_analyzed(getattr(self.base_obj, k))
                except AttributeError:
                    analyzed = is_analyzed(v)

                q_type = 'filter'
                if analyzed:
                    q_type = 'query'

                if isinstance(v, list):
                    # lists are treated as an "OR" (terms() on not_analyzed fields, a bool/should of match queries on analyzed fields)
                    if analyzed:
                        match_queries = []
                        for item in v:
                            match_queries.append( qdsl.match(k,item) )
                        self._querybody.chain( qdsl.bool(qdsl.should(match_queries)), condition=condition,type=q_type )
                    else:
                        self._querybody.chain( qdsl.terms(k,v),condition=condition,
                            type=q_type)
                else:
                    #search_value = unicode(v)
                    if analyzed:
                        self._querybody.chain(qdsl.match(unicode(k), v), condition=condition,type=q_type)
                    else:
                        self._querybody.chain(qdsl.term(unicode(k), v), condition=condition,type=q_type)

        return self

    def multi_match(self, fields, query, **kwargs):
        self._querybody.chain(qdsl.multi_match(query, fields), condition=kwargs.get('condition', None), type='query')
        return self

    def exact(self, field, value,**kwargs):
        try:
            field_template = getattr( self.base_obj, field)

            if type(field_template) != ESType:
                field_template = create_es_type(field_template)

            for estype in [String,IP,Attachment]:
                if isinstance(field_template, estype) and field_template.analyzed == True:
                    logger.warn('%s types may not exact match correctly if they are analyzed' % unicode(estype.__name__))

        except AttributeError:
            logger.warn('%s is not in the base model.' % unicode(field))

        kwargs['type'] = 'filter'
        if isinstance(value, list):
            self._querybody.chain(qdsl.terms(field,value), **kwargs)
        else:
            self._querybody.chain(qdsl.term(field, value), **kwargs)

        return self


    def or_(self,*args):
        return ' OR '.join(args)

    def and_(self,*args):
        return ' AND '.join(args)

    def get(self,id, **kwargs):
        try:
            params = {'index':self.idx, 'doc_type':self.type, 'id':id}
            params.update(kwargs)
            doc = self._es.get(**params)
            if doc:
                return VWCollectionGen(self.base_obj, {'docs':[doc]})[0]

            return None

        except:
            # TODO. Discuss this. Should get() return None even on exceptions?
            return None

    def refresh(self, **kwargs):
        self._esc.refresh(index=self.idx, **kwargs)

    def get_in(self, ids,**kwargs):
        if len(ids) > 0: # check for ids. empty list returns an empty list (instead of exception)
            params = {'index':self.idx, 'doc_type':self.type, 'body':{'ids':ids}}
            params.update(kwargs)
            res = self._es.mget(**params)
            if res and res.get('docs'):
                return VWCollectionGen(self.base_obj, res)

        return []

    def get_like_this(self,doc_id,**kwargs):
        params = {'index':self.idx,'doc_type':self.type,'id':doc_id}
        params.update(kwargs)
        res = self._es.mlt(**params)

        if res and res.get('docs'):
            return VWCollectionGen(self.base_obj, res)
        else:
            return []

    def sort(self, **kwargs):
        for k,v in kwargs.iteritems():
            v = v.lower()
            if v not in ['asc','desc']:
                v = 'asc'

            self._sort.append('%s:%s' % (k,v))
        return self

    def clear_previous_search(self):
        self._raw = {}
        self._search_params = []
        self._special_body = {}
        self._querybody = querybuilder.QueryBody()

    def _create_search_params( self, **kwargs ):
        # before_query_build() is allowed to manipulate the object's internal state before we do stuff
        self._querybody = self.execute_callbacks('before_query_build', self._querybody )

        q = {
            'index': self.idx,
            'doc_type': self.type
        }

        if self._raw:
            q['body'] = self._raw
        elif len(self._search_params) > 0:
            kwargs['type'] = 'query'
            self._querybody.chain(qdsl.query_string(self.and_(*self._search_params)), **kwargs)
        else:
            q['body'] = qdsl.query(qdsl.match_all())

        if self._querybody.is_filtered() or self._querybody.is_query():
            q['body'] = self._querybody.build()

        # after_query_build() can manipulate the final query before being sent to ES
        # this is generally considered a bad idea but might be useful for logging
        q = self.execute_callbacks( 'after_query_build', q )

        logger.debug(json.dumps(q))
        return q

    def count(self):
        params = self._create_search_params()
        resp = self._es.count(**params)
        return resp.get('count')

    def __len__(self):
        return self.count()

    def limit(self,count):
        self.results_per_page = count
        return self

    def all(self,**kwargs):

        params = self._create_search_params()
        if not params.get('size'):
            params['size'] = self.results_per_page

        if kwargs.get('results_per_page') != None:
            kwargs['size'] = kwargs.get('results_per_page')
            del kwargs['results_per_page']

        if kwargs.get('start') != None:
            kwargs['from_'] = kwargs.get('start')
            del kwargs['start']

        logger.debug(json.dumps(self._sort))

        params.update(kwargs)
        if len(self._sort) > 0:
            if params.get('sort') and isinstance(params['sort'], list):
                params['sort'].extend(self._sort)
            else:
                params['sort'] = self._sort

        if params.get('sort'):
            if isinstance(params['sort'], list):
                params['sort'] = ','.join(params.get('sort'))
            else:
                raise TypeError('"sort" argument must be a list')

        logger.debug(json.dumps(params))
        results = self._es.search(**params)

        return VWCollectionGen(self.base_obj,results)

    def one(self,**kwargs):
        kwargs['results_per_page'] = 1
        results = self.all(**kwargs)
        try:
            return results[0]
        except IndexError:
            raise NoResultsFound('No result found for one()')

    # this is for legacy purposes in filter_by
    def _translate_bool_condition(self,_bool_condition):
        if _bool_condition == 'and':
            _bool_condition = 'must'
        elif _bool_condition == 'or':
            _bool_condition = 'should'
        elif _bool_condition == 'not':
            _bool_condition = 'must_not'

        # this is for things like geo_distance where we explicitly want the true and/or/not
        elif _bool_condition == 'explicit_and':
            _bool_condition = 'and'
        elif _bool_condition == 'explicit_or':
            _bool_condition = 'or'
        elif _bool_condition == 'explicit_not':
            _bool_condition = 'not'

        return _bool_condition

    def range(self, field, **kwargs):
        search_options = {}
        for opt in ['condition','minimum_should_match']:
            if opt in kwargs:
                search_options[opt] = kwargs.get(opt)
                del kwargs[opt]

        q = qdsl.range(field, **kwargs)
        if self._querybody.is_filtered():
            d = {'filter': q}
        else:
            d = {'query': q}

        if search_options:
            d.update(search_options)

        self._querybody.chain(d)

        return self

    def search_geo(self, field, distance, lat, lon,**kwargs):
        condition = kwargs.get('condition', 'and')
        if 'condition' in kwargs:
            del kwargs['condition']

        self._querybody.chain(qdsl.filter_(qdsl.geo_distance(field, [lon,lat], distance, **kwargs)), condition=condition)
        return self

    def missing( self, field, **kwargs):
        self._querybody.chain(qdsl.filter_(qdsl.missing(field)))
        return self

    def exists( self, field, **kwargs):
        self._querybody.chain(qdsl.filter_(qdsl.exists(field, **kwargs)))
        return self

    def delete(self, **kwargs):
        params = self._create_search_params()
        params.update(kwargs)
        self._es.delete_by_query(**params)

    def delete_in(self, ids):
        if not isinstance(ids, list):
            raise TypeError('argument to delete_in() must be a list.')

        bulk_docs = []
        for i in ids:
            this_id = i
            this_type = self.base_obj.__type__
            this_idx = self.idx
            if isinstance(i, VWBase):
                this_id = i.id
                this_type = i.__type__
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            bulk_docs.append({'_op_type': 'delete', '_type': this_type, '_index': this_idx, '_id': this_id })

        return helpers.bulk( self._es, bulk_docs, chunk_size=self.bulk_chunk_size)

    # commits items in bulk
    def commit(self, callback=None):
        bulk_docs = []

        if callback:
            if not callable(callback):
                raise TypeError('Argument 2 to commit() must be callable')

        # allow a search to supply the items when none were passed in
        if len(self._items) == 0:
            items = self.all()
        else:
            items = self._items

        for i in items:
            if callback:
                i = callback(i)

            i = self.execute_callbacks('on_bulk_commit', i)

            this_dict = {}
            this_id = ''
            this_idx = self.idx
            this_type = self.base_obj.__type__
            if isinstance(i, VWBase):
                this_dict = i._create_source_document()
                this_type = i.__type__
                this_id = i.id
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            elif isinstance(i,dict):
                this_dict = i
                this_id = i.get('id')

            else:
                raise TypeError('Elements passed to the collection must be of type "dict" or "VWBase"')

            if not this_id:
                this_id = str(uuid4())

            bulk_docs.append({'_op_type': 'index', '_type': this_type, '_index': this_idx, '_id': this_id, '_source': this_dict})

        return helpers.bulk(self._es,bulk_docs,chunk_size=self.bulk_chunk_size)
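A hypothetical usage sketch for the collection API above, assuming a VWBase-derived model named Article with __index__/__type__ set; the field names and the gte keyword passed through to qdsl.range() are placeholders:

articles = VWCollection(base_obj=Article)

# chained filters build up a single query body before .all() executes it
recent = (articles
          .filter_by(author='jsmith')
          .range('published', gte='2015-01-01')
          .sort(published='desc')
          .limit(25)
          .all())

# single-document helpers
one_doc = articles.get('some-id')
similar = articles.get_like_this('some-id')

# bulk-index a batch of plain dicts
VWCollection(items=[{'id': '1', 'title': 'hello'}], base_obj=Article).commit()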
Example #6
class ESControl():
    def __init__(self, index=None, doc_type=None):
        b = models.Blog.objects.all()[0]
        self._index = index or b.elastic_search_index
        self._doc_type = doc_type or b.elastic_search_doc_type
        self._es = Elasticsearch()

    @staticmethod
    def _hit_item_element(hit_item, element, fill_source=True):
        highlight = hit_item.get('highlight')
        source = hit_item.get('_source')
        if highlight and highlight.get(element):
            return '...'.join(highlight.get(element))
        elif fill_source:
            return source.get(element)
        else:
            return ''

    def remove_entry(self, entry):
        self._es.delete(index=self._index, doc_type=self._doc_type, id=entry.id)

    def import_entry(self, entry):
        doc = {'title': entry.title,
               'slug': entry.slug,
               'content': strip_tags(entry.content),
               'category': entry.category.name,
               'pub_date': entry.pub_date}
        self._es.index(index=self._index, doc_type=self._doc_type, id=entry.id, body=doc)

    def import_entries(self):
        entries = models.Entry.objects.all()
        for entry in entries:
            self.import_entry(entry)

    def delete_index(self):
        self._es.indices.delete(self._index)

    def create_index(self):
        self._es.indices.create(self._index, essettings.index_definition(self._doc_type))

    def update_analyzer_kuromoji(self):
        self._es.indices.close(self._index)
        self._es.indices.put_settings(essettings.kuromoji_analyzer_def(), self._index)
        self._es.indices.open(self._index)

    def search_entries(self, query):
        hits = self._es.search(index=self._index,
                               doc_type=self._doc_type,
                               body=essettings.search_query_body(query))['hits']
        hit_list = []
        for hit in hits['hits']:
            item = {
                'score': hit.get('_score'),
                'title': self._hit_item_element(hit, 'title'),
                'slug': self._hit_item_element(hit, 'slug'),
                'slug_source': hit.get('_source').get('slug'),
                'content': self._hit_item_element(hit, 'content', fill_source=False),
                'category': self._hit_item_element(hit, 'category'),
                'pub_date': self._hit_item_element(hit, 'pub_date'),
            }
            hit_list.append(item)
        return hit_list

    def more_like_this(self, entry, **kwargs):
        """
        :param entry: the entry to use as the "more like this" seed document
        :return: a list of {'entry', 'score'} dicts, one per similar entry.
        """
        query_result = self._es.mlt(self._index, self._doc_type, entry.id, **kwargs)
        entry_list = []
        for hit in query_result['hits']['hits']:
            hit_info = {'entry': hit.get('_source'),
                        'score': hit.get('_score')}
            entry_list.append(hit_info)

        return entry_list
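A hedged usage sketch for ESControl; it assumes the Django models and essettings helpers referenced above are importable and that at least one Blog object exists to supply the default index name:

control = ESControl()

# (re)build the index and load every blog entry into it
control.create_index()
control.import_entries()

# full-text search with highlighted snippets
for item in control.search_entries('elasticsearch'):
    print("%s (%.2f): %s" % (item['title'], item['score'], item['slug']))

# related-entry lookup for a single entry
entry = models.Entry.objects.all()[0]
for hit in control.more_like_this(entry, mlt_fields='content', min_term_freq=1):
    print("%.2f %s" % (hit['score'], hit['entry']['title']))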
Example #7
from datetime import datetime
from elasticsearch import Elasticsearch

# Creating the Elasticsearch client (BONSAI_URL is the cluster endpoint) and the index
es = Elasticsearch(BONSAI_URL)
es.indices.create(index='news', ignore=400)

# Adding and retrieving a document
doc = {
    'domain': 'CNN',
    'date': datetime(2010, 10, 10, 10, 10, 10),
    'text': 'This is an article.'
}
res = es.index(index="news", doc_type='article', id=1, body=doc)
res = es.get(index="news", doc_type='article', id=1)
es.indices.refresh(index="news")

# Returning all documents
res = es.search(index="news", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])

# Finding similarity
# NOTE: similarity does not work well for test sentences.
# A sentence will generally not be similar enough regardless.
es.mlt(index='news', doc_type="article",
                          id=1, mlt_fields="text", 
                          search_size=7,
                          min_term_freq=0,
                          min_doc_freq=0,
                          percent_terms_to_match= 0)
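
The es.mlt() helper used throughout these examples was removed from later Elasticsearch releases. On newer clusters the same lookup can be expressed as a more_like_this query through es.search(); a rough sketch (exact option names vary between versions):

body = {
    "query": {
        "more_like_this": {
            "fields": ["text"],
            "like": [{"_index": "news", "_type": "article", "_id": 1}],
            "min_term_freq": 1,
            "min_doc_freq": 1
        }
    }
}
res = es.search(index="news", body=body)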

Example #8
class VWCollection(VWCallback):
    def __init__(self, items=[], **kwargs):

        self.bulk_chunk_size = kwargs.get('bulk_chunk_size',
                                          config.bulk_chunk_size)

        self._sort = []

        self.results_per_page = kwargs.get('results_per_page',
                                           config.results_per_page)

        self._querybody = querybuilder.QueryBody(
        )  # sets up the new query bodies

        if kwargs.get('base_obj'):
            self.base_obj = kwargs.get('base_obj')
        else:
            try:
                self.base_obj = self.__class__.__model__
            except AttributeError:
                raise AttributeError(
                    'Base object must contain a model or pass base_obj')

        self._es = Elasticsearch(config.dsn)
        self._esc = client.IndicesClient(self._es)

        if '__index__' in dir(self.base_obj):
            idx = self.base_obj.__index__
        else:
            idx = config.default_index

        self._search_params = []
        self._raw = {}
        self.idx = idx
        self.type = self.base_obj.__type__
        self._special_body = {}
        self._items = items  # special list of items that can be committed in bulk

        # these values are used in the _build_body() to determine where additional _build_body()
        # options should exist. Defaults to and/must
        self._last_top_level_boolean = None
        self._last_boolean = None

    def search(self, q):
        self._search_params.append(q)
        return self

    # setup a raw request
    def raw(self, raw_request):
        self._raw = raw_request
        return self

    def filter_by(self, condition='and', **kwargs):
        if kwargs.get('condition'):
            condition = kwargs.get('condition')
            del kwargs['condition']

        condition = self._translate_bool_condition(condition)

        for k, v in kwargs.iteritems():
            if k == 'id' or k == 'ids':
                id_filter = v
                if not isinstance(id_filter, list):
                    id_filter = [id_filter]

                #self._build_body( filter={"ids": {"values": id_filter } }, condition=condition )
                self._querybody.chain(qdsl.ids(id_filter), condition=condition)
            else:
                try:
                    analyzed = is_analyzed(getattr(self.base_obj, k))
                except AttributeError:
                    analyzed = is_analyzed(v)

                q_type = 'filter'
                if analyzed:
                    q_type = 'query'

                if isinstance(v, list):
                    # lists are treat as like "OR"
                    #search_value = " or ".join( [ unicode(vitem) for vitem in v] )
                    #search_value = "(" + search_value + ")"
                    self._querybody.chain(qdsl.terms(k, v),
                                          condition=condition,
                                          type=q_type)
                else:
                    #search_value = unicode(v)
                    if analyzed:
                        self._querybody.chain(qdsl.match(unicode(k), v),
                                              condition=condition,
                                              type=q_type)
                    else:
                        self._querybody.chain(qdsl.term(unicode(k), v),
                                              condition=condition,
                                              type=q_type)

        return self

    def multi_match(self, fields, query, **kwargs):
        #self._build_body(query={"multi_match": { "fields": fields, "query": query } }, condition=kwargs.get('condition', None))
        self._querybody.chain(qdsl.multi_match(query, fields),
                              condition=kwargs.get('condition', None),
                              type='query')
        return self

    def exact(self, field, value, **kwargs):
        try:
            field_template = getattr(self.base_obj, field)

            if type(field_template) != ESType:
                field_template = create_es_type(field_template)

            for estype in [String, IP, Attachment]:
                if isinstance(field_template,
                              estype) and field_template.analyzed == True:
                    logger.warn(
                        str(estype.__name__) +
                        ' types may not exact match correctly if they are analyzed'
                    )

        except AttributeError:
            logger.warn(str(field) + ' is not in the base model.')

        kwargs['type'] = 'filter'
        if isinstance(value, list):
            #self._build_body( filter={"terms": { field: value } }, **kwargs )

            self._querybody.chain(qdsl.terms(field, value), **kwargs)
        else:
            #self._build_body( filter={"term": { field: value } }, **kwargs )
            self._querybody.chain(qdsl.term(field, value), **kwargs)

        return self

    def or_(self, *args):
        return ' OR '.join(args)

    def and_(self, *args):
        return ' AND '.join(args)

    def get(self, id, **kwargs):
        try:
            params = dict(index=self.idx, doc_type=self.type, id=id)
            params.update(kwargs)
            #return self._create_obj( self._es.get(**params) )
            doc = self._es.get(**params)
            if doc:
                return VWCollectionGen(self.base_obj, {'docs': [doc]})[0]

            return None

        except:
            return None

    def refresh(self, **kwargs):
        self._esc.refresh(index=self.idx, **kwargs)

    def get_in(self, ids, **kwargs):

        # check for ids; an empty list returns an empty list (instead of an exception)
        if len(ids) > 0:
            params = dict(index=self.idx,
                          doc_type=self.type,
                          body={'ids': ids})
            params.update(kwargs)
            res = self._es.mget(**params)
            if res and res.get('docs'):
                return VWCollectionGen(self.base_obj, res)

        return []

    def get_like_this(self, doc_id, **kwargs):

        params = dict(index=self.idx, doc_type=self.type, id=doc_id)
        params.update(kwargs)
        res = self._es.mlt(**params)

        if res and res.get('docs'):
            return VWCollectionGen(self.base_obj, res)
        else:
            return []

    def sort(self, **kwargs):
        for k, v in kwargs.iteritems():
            v = v.lower()
            if v not in ['asc', 'desc']:
                v = 'asc'

            self._sort.append('%s:%s' % (k, v))
        return self

    def clear_previous_search(self):
        self._raw = {}
        self._search_params = []
        self._special_body = {}
        self._querybody = querybuilder.QueryBody()

    def _create_search_params(self, **kwargs):

        # before_query_build() is allowed to manipulate the object's internal state before the query is built
        self._querybody = self.execute_callbacks('before_query_build',
                                                 self._querybody)

        q = {'index': self.idx, 'doc_type': self.type}

        if self._raw:
            q['body'] = self._raw
        elif len(self._search_params) > 0:
            kwargs['type'] = 'query'
            #q['body'] = self._build_body(query=qdsl.query_string( self.and_(*self._search_params), **kwargs) )
            self._querybody.chain(
                qdsl.query_string(self.and_(*self._search_params)), **kwargs)
        else:
            q['body'] = qdsl.query(qdsl.match_all())

        if self._querybody.is_filtered() or self._querybody.is_query():
            q['body'] = self._querybody.build()

        # after_query_build() can manipulate the final query before being sent to ES
        # this is generally considered a bad idea but might be useful for logging
        q = self.execute_callbacks('after_query_build', q)

        logger.debug(json.dumps(q))
        return q

    def count(self):
        params = self._create_search_params()
        resp = self._es.count(**params)
        return resp.get('count')

    def __len__(self):
        return self.count()

    def limit(self, count):
        self.results_per_page = count
        return self

    def all(self, **kwargs):

        params = self._create_search_params()
        if not params.get('size'):
            params['size'] = self.results_per_page

        if kwargs.get('results_per_page') != None:
            kwargs['size'] = kwargs.get('results_per_page')
            del kwargs['results_per_page']

        if kwargs.get('start') != None:
            kwargs['from_'] = kwargs.get('start')
            del kwargs['start']

        logger.debug(json.dumps(self._sort))

        params.update(kwargs)
        if len(self._sort) > 0:
            if params.get('sort') and isinstance(params['sort'], list):
                params['sort'].extend(self._sort)
            else:
                params['sort'] = self._sort

        if params.get('sort'):
            if isinstance(params['sort'], list):
                params['sort'] = ','.join(params.get('sort'))
            else:
                raise TypeError('"sort" argument must be a list')

        logger.debug(json.dumps(params))
        results = self._es.search(**params)
        #rows = results.get('hits').get('hits')

        return VWCollectionGen(self.base_obj, results)

    def one(self, **kwargs):
        kwargs['results_per_page'] = 1
        results = self.all(**kwargs)
        try:
            return results[0]
        except IndexError:
            raise NoResultsFound('No result found for one()')

    # this is for legacy purposes in filter_by
    def _translate_bool_condition(self, _bool_condition):
        if _bool_condition == 'and':
            _bool_condition = 'must'
        elif _bool_condition == 'or':
            _bool_condition = 'should'
        elif _bool_condition == 'not':
            _bool_condition = 'must_not'

        # this is for things like geo_distance where we explicitly want the true and/or/not
        elif _bool_condition == 'explicit_and':
            _bool_condition = 'and'
        elif _bool_condition == 'explicit_or':
            _bool_condition = 'or'
        elif _bool_condition == 'explicit_not':
            _bool_condition = 'not'

        return _bool_condition

    def range(self, field, **kwargs):
        search_options = {}
        for opt in ['condition', 'minimum_should_match']:
            if opt in kwargs:
                search_options[opt] = kwargs.get(opt)
                del kwargs[opt]

        q = qdsl.range(field, **kwargs)
        if self._querybody.is_filtered():
            d = {'filter': q}
        else:
            d = {'query': q}

        if search_options:
            d.update(search_options)

        #self._build_body(**d)
        self._querybody.chain(d)

        return self

    def search_geo(self, field, distance, lat, lon, **kwargs):

        condition = kwargs.get('condition', 'and')
        if 'condition' in kwargs:
            del kwargs['condition']

        #self._build_body( filter={"geo_distance": { "distance": distance, field: [lon,lat] } }, condition='explicit_and', **kwargs )
        self._querybody.chain(qdsl.filter_(
            qdsl.geo_distance(field, [lon, lat], distance, **kwargs)),
                              condition=condition)
        return self

    def missing(self, field, **kwargs):
        #kwargs['filter'] = {"missing":{"field": field } }
        #self._build_body( **kwargs )
        self._querybody.chain(qdsl.filter_(qdsl.missing(field)))
        return self

    def exists(self, field, **kwargs):
        #kwargs['filter'] = {"exists": { "field": field } }
        #self._build_body( **kwargs )
        self._querybody.chain(qdsl.filter_(qdsl.exists(field, **kwargs)))
        return self

    def delete(self, **kwargs):
        params = self._create_search_params()
        params.update(kwargs)
        self._es.delete_by_query(**params)

    def delete_in(self, ids):
        if not isinstance(ids, list):
            raise TypeError('argument to delete_in() must be a list.')

        bulk_docs = []
        for i in ids:
            this_id = i
            this_type = self.base_obj.__type__
            this_idx = self.idx
            if isinstance(i, VWBase):
                this_id = i.id
                this_type = i.__type__
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            bulk_docs.append({
                '_op_type': 'delete',
                '_type': this_type,
                '_index': this_idx,
                '_id': this_id
            })

        return helpers.bulk(self._es,
                            bulk_docs,
                            chunk_size=self.bulk_chunk_size)

    # commits items in bulk
    def commit(self, callback=None):
        bulk_docs = []

        if callback:
            if not callable(callback):
                raise TypeError('Argument 2 to commit() must be callable')

        # allow a search to supply the items when none were passed in
        if len(self._items) == 0:
            items = self.all()
        else:
            items = self._items

        for i in items:
            if callback:
                i = callback(i)

            i = self.execute_callbacks('on_bulk_commit', i)

            this_dict = {}
            this_id = ''
            this_idx = self.idx
            this_type = self.base_obj.__type__
            if isinstance(i, VWBase):
                this_dict = i._create_source_document()
                this_type = i.__type__
                this_id = i.id
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            elif isinstance(i, dict):
                this_dict = i
                this_id = i.get('id')

            else:
                raise TypeError(
                    'Elements passed to the collection must be of type "dict" or "VWBase"'
                )

            if not this_id:
                this_id = str(uuid4())

            bulk_docs.append({
                '_op_type': 'index',
                '_type': this_type,
                '_index': this_idx,
                '_id': this_id,
                '_source': this_dict
            })

        return helpers.bulk(self._es,
                            bulk_docs,
                            chunk_size=self.bulk_chunk_size)
Example #9
from elasticsearch import Elasticsearch
es = Elasticsearch()


q = {
    "query": {
        "bool": {
            "must": [
                {"match": {"text": "obama"}},
                {"match": {"text": "kerry"}}
            ]
        }
    }
}

res = es.search(index="haystack", body=q)
#print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    #print("%(id)s: %(text)s" % hit["_source"])

    similar = es.mlt(
        index='haystack',
        id=hit['_source']['id'],
        doc_type='modelresult',
        percent_terms_to_match=.1)
    if similar['hits']['total'] > 0:
        print(similar['hits']['hits'][0]['_source'])