Example #1
def compute_minhash(lst):
    m1 = MinHash(num_perm=128)
    for d in set(lst):
        m1.update(d.encode('utf8'))
    return m1
Example #2
def create_minhash(data):
    minhash = MinHash(HASH_SIZE, seed=12)
    for qgram in data:
        minhash.update(qgram.encode('utf-8'))
    return minhash
Example #3
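# Assumes `from datasketch import MinHash, MinHashLSH`, `import time`, and that
# dataset, numv (vertex count), nume (edge count), lsh_thres and per (num_perm)
# are defined earlier. ptr/idx store the graph in CSR form: the neighbours of
# vertex i are idx[ptr[i]:ptr[i+1]].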
f = open("../data/" + dataset + ".graph")
ptr = f.readline().strip("\n").strip(" ").split(" ")
idx = f.readline().strip("\n").strip(" ").split(' ')
if len(idx) != nume:
    print("error idx", len(idx))
    exit()
if len(ptr) != numv + 1:
    print("error ptr", len(ptr))
    exit()

t0 = time.time()
lsh = MinHashLSH(threshold=lsh_thres, num_perm=per)
allver = []
lists = [[] for i in range(numv)]
for i in range(numv):
    m = MinHash(num_perm=per)
    for j in range(int(ptr[i]), int(ptr[i + 1])):
        m.update(str(idx[j]).encode('utf-8'))
        lists[i].append(idx[j])
    lsh.insert(str(i), m)
    allver.append(m)
#res = lsh.query(allver[0])
#print(res)
t1 = time.time()
print("init LSH", t1 - t0)


def jd(l1, l2):
    # Exact Jaccard similarity of two lists (0 if either is empty).
    if len(l1) == 0 or len(l2) == 0:
        return 0
    s1 = set(l1)
    s2 = set(l2)
    return len(s1 & s2) / len(s1 | s2)
Example #4
from datasketch import MinHash

k_sig = 1024
minhash = MinHash(num_perm=k_sig)
minhash2 = MinHash(num_perm=k_sig)
minhash3 = MinHash(num_perm=k_sig)

minhash.update(
    "This is a good algorithm that can perform wild ranges of services".encode(
        'utf-8'))

minhash2.update(
    "The computing algorithm for this is very cool and works really well".
    encode('utf-8'))

minhash3.update(
    "The computing algorithm for this is very cool and works really wellx".
    encode('utf-8'))

# print(minhash.hashvalues)
# print(minhash2.hashvalues)
# print(minhash3.hashvalues)

print(minhash2.jaccard(minhash3))
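# minhash2 and minhash3 were each updated with a single (different) sentence, so
# they represent disjoint one-element sets and the estimate above prints 0.0.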
# print(minhash2.permutations)
# print(minhash3.permutations)
Example #5
def get_minhash(item_str):
    temp = MinHash()
    for d in item_str:
        temp.update(d.encode('utf8'))
    return temp
Example #6
from datasketch import MinHash

data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents']

m1, m2 = MinHash(), MinHash()
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

s1 = set(data1)
s2 = set(data2)
actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))
print("Actual Jaccard for data1 and data2 is", actual_jaccard)

print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m1))

print(MinHash(hashvalues=m1.hashvalues).jaccard(m1))
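# MinHash(hashvalues=m1.hashvalues) rebuilds an equivalent MinHash from the stored
# hash values, so its Jaccard estimate against m1 is 1.0.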
Example #7
 def test_bytesize(self):
     m1 = MinHash(4, 1, hashfunc=fake_hash_func)
     lm1 = LeanMinHash(m1)
     self.assertTrue(lm1.bytesize() == (4 * 4) + 4 + 8)
Example #8
def getHashSig(tagsListOfPep):
    minHash = MinHash(num_perm=NUM_PERMUTATION)
    for tag in tagsListOfPep:
        minHash.update(tag.encode('utf-8'))
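    # digest() exports the signature as an array of hash values rather than the MinHash object.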

    return minHash.digest()
Example #9
    def min_hash(self,
                 num_perm=64,
                 seed=42,
                 use_components=None,
                 type_option=None,
                 n_char=None,
                 n_word=None,
                 npz=None,
                 isrequest=False):
        """
        Minhash function.
        
        Parameters
        --------
        num_perm: int
            Number of permutations.
        seed: int
            For random permutations.
        use_components: str, optional: ['name', 'addr'] or ['name'] or ['addr'].
            Components to use.
        type_option: str, optional: ['char', 'word'] or ['char'] or ['word'].
            Components to use.
        n_char: list of int
            sizes of char grams. 
        n_word: list of int
            sizes of word grams.
        isrequest: bool
            If this LpuList created for base, we have False.
        npz: bool
            Indicator or using npz files.
        
        """

        if npz:
            self.options = type_option
            self.num_perm = num_perm

            n = n_char if type_option == 'char' else n_word
            self.features['not_weighed_{}_{}_{}minhash'.format(use_components[0], type_option[0], n[0])] = \
                np.load(npz)['min_hash']
            return self

        use_components = use_components or ['name']
        type_option = type_option or ['char']
        n_char = n_char or [3]
        n_word = n_word or [1]

        if 'char' not in type_option and 'word' not in type_option:
            assert False, "Check the value of the type_option parameter."

        if 'name' not in use_components and 'addr' not in use_components:
            assert False, "Check the value of the use_components parameter."

        self.options = type_option
        self.num_perm = num_perm

        for i in use_components:
            for j in type_option:
                n_list = n_char if j == 'char' else n_word
                for n in n_list:
                    help_list = []
                    for idx, name in enumerate(
                            self.features['{}_{}_{}grams'.format(i, j, n)]):
                        minhash = MinHash(num_perm, seed=seed)
                        for ngram in name:
                            minhash.update(ngram.encode('utf8'))
                        lean_minhash = LeanMinHash(minhash)
                        help_list.append(lean_minhash)

                    self.features['not_weighed_{}_{}_{}minhash'.format(
                        i, j, n)] = np.array(help_list)
                    file_path = 'data/min_hash_dadata/{}_{}_{}_not_weighed_minhash.npz'.format(
                        i, j, n)
                    if not isrequest:
                        np.savez_compressed(file_path,
                                            min_hash=np.array(help_list))

        return self
Example #10
# m1 = MinHash(num_perm=128)
# m2 = MinHash(num_perm=128)
# m3 = MinHash(num_perm=128)
# for d in set1:
#     m1.update(d.encode('utf8'))
# for d in set2:
#     m2.update(d.encode('utf8'))
# for d in set3:
#     m3.update(d.encode('utf8'))

# Create LSH index
# lsh = MinHashLSH(threshold=0.2, num_perm=128, params = (2,3))

k_sig = 1024

m1 = MinHash(num_perm=k_sig)
m2 = MinHash(num_perm=k_sig)
m3 = MinHash(num_perm=k_sig)
m4 = MinHash(num_perm=k_sig)
m5 = MinHash(num_perm=k_sig)

for d in set1:
    m1.update(d.encode('utf8'))
for d in set2:
    m2.update(d.encode('utf8'))
for d in set3:
    m3.update(d.encode('utf8'))
for d in set4:
    m4.update(d.encode('utf8'))
for d in set5:
    m5.update(d.encode('utf8'))
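
# The signatures can now be compared pairwise (e.g. m1.jaccard(m2)) or inserted
# into the MinHashLSH index sketched in the commented-out code above.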
Example #11
if __name__ == '__main__':

    # data1 = 'VGFGEEWEDAAWCN'
    data1 = ['TAG', 'VGF', 'GTB', 'EEW']
    # data1 = ['TAG', 'RBT', 'WCDS']

    data2 = 'TAGDSAFDVGFGTEEWEQWWRFRSDAAWCDSNBH'
    data3 = 'TAGSSAFDDBFDTWEEWTDWWRFRSCASWCDSQBH'

    data2 = [data2[i:i + 3] for i in range(len(data2) - 2)]
    data3 = [data3[i:i + 3] for i in range(len(data3) - 2)]

    # for i in range(len(data2)-2):
    #     print(data2[i:i+3])

    m1, m2 = MinHash(), MinHash()
    # for d in data1:
    d = 'AVB'
    m1.update(d.encode('utf8'))
    # for d in data2:
    d = 'BVA'
    m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m2.jaccard(m1))

    # sumSimi = 0
    # m2 = MinHash()
    # m2.update(data2.encode('utf8'))
    # for d in data1:
    #     print(d)
    #     m1 = MinHash()
    #     m1.update(d.encode('utf8'))
Example #12
def main():

    
    path = '../../../Desktop' # Directory in local computer
    """Shows basic usage of the Drive v3 API.
    Prints the names and ids of the first 10 files the user has access to.
    """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server()
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)


    service = build('drive', 'v3', credentials=creds)

    

    # Call the Drive v3 API
    results = service.files().list(
        pageSize=1000, fields="nextPageToken, files(parents, name, id, modifiedTime, mimeType, md5Checksum)").execute()
    items = results.get('files', [])

    

    currentTimesinceEpoc = time.time()
    currentday = time.strftime('%d', time.localtime(currentTimesinceEpoc))
    currentmonth = time.strftime('%m', time.localtime(currentTimesinceEpoc))
    currentyear = time.strftime('%Y', time.localtime(currentTimesinceEpoc))
    currentdate = dt.datetime(int(currentyear),int(currentmonth), int(currentday),0,0,0)

    folderModifTimes = {}

    if not items:
        print('No files found.')
    else:
        '''location hierarchy for Drive files'''
        
        for item in items:

            
            modifyear, modifmonth, modifday = item['modifiedTime'].split('-')
            modifday = modifday.split('T')[0]
            modifdate = dt.datetime(int(modifyear),int(modifmonth), int(modifday),0,0,0)
            modificationTimesinceEpoc = str(currentTimesinceEpoc - (currentdate-modifdate).total_seconds())

            item ['modificationTimesinceEpoc'] = modificationTimesinceEpoc

            if item.get('parents') != None and len(item.get('parents')) > 1:  # more than one parent is not handled
                print("many parents")
            if item.get('parents') == None:
                if item['mimeType'] == 'application/vnd.google-apps.folder':
                    parentname = 'Drive'
                    parentid = 'drive'
                else:
                    parentname = 'Parent-less Files'
                    parentid = 'parentlessfiles'
            else:
                parentname = (service.files().get(fileId=item.get('parents')[0]).execute())['name']
                parentid = item.get('parents')[0]
                
            if folderModifTimes.get(parentid) == None or float(folderModifTimes[parentid]) < float(modificationTimesinceEpoc):
                folderModifTimes[parentid] = modificationTimesinceEpoc
 
            item['parentid'] =  parentid
            item['parentname'] = parentname

        new_items = []

        print('location' + '^' + 'dir' + 'root' + '^' + 'Root'  + '^' + str(currentTimesinceEpoc) + '^' + 'dir'+os.path.abspath(path)+ '^' + 'Desktop'  + '^' + str(currentTimesinceEpoc))
        print('location' + '^' + 'dir' + 'root' + '^' + 'Root'  + '^' + str(currentTimesinceEpoc) + '^' + 'dir'+'https://drive.google.com/open?id=' +'drive'+ '^' + 'Drive'  + '^' + str(currentTimesinceEpoc))
        print('location' + '^' + 'dir' + 'https://drive.google.com/open?id=' +'drive' + '^' + 'Drive'  + '^' + str(currentTimesinceEpoc) + '^' + 'dir'+'https://drive.google.com/open?id=' +'parentlessfiles'+ '^' + 'Parent-less Files'  + '^' + folderModifTimes['parentlessfiles'])

        for item in items:
            
            if item['mimeType'] == 'application/vnd.google-apps.folder': #if a folder
                if folderModifTimes.get(item['id']) == None:
                    print('location' + '^' + 'dir' + 'https://drive.google.com/open?id=' +item['parentid'] + '^' + item['parentname']  + '^' + folderModifTimes[item['parentid']] + '^' + 'dir'+'https://drive.google.com/open?id=' +item['id']+ '^' + item['name']  + '^' + item ['modificationTimesinceEpoc'])
                else:
                    print('location' + '^' + 'dir' + 'https://drive.google.com/open?id=' +item['parentid'] + '^' + item['parentname']  + '^' + folderModifTimes[item['parentid']] + '^' + 'dir'+'https://drive.google.com/open?id=' +item['id']+ '^' + item['name']  + '^' + folderModifTimes[item['id']])
            else: #if not a folder
                new_items.append(item) #further links would only be between files
                print('location' + '^' + 'dir' +'https://drive.google.com/open?id=' + item['parentid'] + '^' + item['parentname']  + '^' + folderModifTimes[item['parentid']] + '^' +'https://drive.google.com/open?id=' +item['id']+ '^' + item['name']  + '^' + item ['modificationTimesinceEpoc'])
        
        items = new_items

        for r, d, f in os.walk(path):
            f = [file for file in f if not (file[0] == '.' or file[0] == '_')]
            d[:] = [dr for dr in d if not (dr[0] == '.' or dr[0] == '_')]
            for file in f:
                    filepath = os.path.join(r, file)
                    filepath2 = filepath.split('/')
                    filename = filepath2[-1]
                    filemtime = str(os.path.getmtime(os.path.abspath(filepath)))
                    mime = magic.Magic(mime=True)
                    mimeType = mime.from_file(os.path.abspath(filepath))
                    md5Checksum = hs.fileChecksum(os.path.abspath(filepath), "md5")
                    item = {'id': os.path.abspath(filepath),'name' :filename, 'modificationTimesinceEpoc': filemtime, 'mimeType': mimeType, 'md5Checksum': md5Checksum }
                    items.append(item)

        with open('email_metadata', 'r') as f:
            attachments = f.read().split('\n')

        for attachment in attachments:
                
            if len(attachment.split("^"))==1:
                break
   
            attachment_id, attachment_name, attachment_mimeType, attachment_md5Checksum, attachment_mtime = attachment.split("^")
            item = {'id': attachment_id+'~'+attachment_name,'name' : attachment_name, 'modificationTimesinceEpoc': attachment_mtime, 'mimeType': attachment_mimeType, 'md5Checksum': attachment_md5Checksum }
            items.append(item)


        ''' same hash'''

        for f1 in items:
            for f2 in items:
                if  f1['id']!=f2['id'] and f1.get('md5Checksum') != None and f2.get('md5Checksum') != None and f1.get('md5Checksum') == f2.get('md5Checksum'):
                    f1mtime = f1['modificationTimesinceEpoc']
                    f2mtime = f2['modificationTimesinceEpoc']
                    print('content' + '^'+ f1['id'] + '^' + f1['name'] + '^' + f1mtime + '^'+ f2['id'] + '^' + f2['name'] + '^' + f2mtime)

        '''content similarity'''


        text_files = []

        '''scanning the directory'''
        for f in items:
            filepath = f['id'] 
            file_type = f['mimeType']
            major_minor = file_type.split('/')
                    
            if major_minor[0] == 'text' or file_type == 'application/vnd.google-apps.document':
                text_files.append(f)

        documents =[]

        os.mkdir('DriveTextFiles') #To temporarily store drive text files

        for f in text_files:
            try:
                documents.append((open(f['id']).read(),f))
            except FileNotFoundError: # downloading drive text files
                
                try :
                    file_id = f['id']
                    if f['mimeType'] == 'application/vnd.google-apps.document':
                        request = service.files().export_media(fileId=file_id, mimeType='text/plain')
                    else:
                        request = service.files().get_media(fileId=file_id)
                    fh = io.FileIO(os.path.join('DriveTextFiles',f['id']), 'wb')
                    downloader = googleapiclient.http.MediaIoBaseDownload(fh, request)
                    done = False
                    while done is False:
                        status, done = downloader.next_chunk()
                    documents.append((open(os.path.join('DriveTextFiles',f['id'])).read(),f))
                except (googleapiclient.errors.HttpError, FileNotFoundError):
                    pass

        lsh = MinHashLSH(threshold=0.3, num_perm=128)
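
        # Each text document is reduced to its set of whitespace-separated tokens,
        # MinHashed, and inserted under a composite "id^name^mtime" key.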

        for f in documents:
            setdoc = set(f[0].split())  
            m = MinHash(num_perm=128)
            for d in setdoc:
                m.update(d.encode('utf8'))
            lsh.insert(f[1]['id'] + '^' +f[1]['name']+ '^' +f[1]['modificationTimesinceEpoc'], m)


        results = []
        for doc in documents:
            setdoc = set(doc[0].split())    
            m = MinHash(num_perm=128)
            for d in setdoc:
                m.update(d.encode('utf8'))
            result = lsh.query(m)
            results.append((doc[1]['id'] + '^' +doc[1]['name']+ '^' +doc[1]['modificationTimesinceEpoc'],result)) 


        '''forming links between files with similar content'''

        for result in results:
            f2mtime = result[0].split('^')[2]
            for r in result[1]:
                if r!=result[0]: 
                    f1mtime = r.split('^')[2]
                    print('content' + '^'+ r.split('^')[0] + '^' + r.split('^')[1] + '^' + f1mtime +'^'+ result[0].split('^')[0] + '^' + result[0].split('^')[1] + '^' + f2mtime)

        shutil.rmtree('DriveTextFiles')

        '''name similarity'''

        for f1 in items:
            for f2 in items:
                distance1 = textdistance.jaro.distance(f1['name'],f2['name'])
                distance2 = textdistance.levenshtein.distance(f1['name'],f2['name']) 
                if ((distance1<=0.30 and f1['id']!=f2['id'] and distance2<0.75*min(len(f1['name']),len(f2['name']),8)) or distance1<=0.15 or distance2<=0.25*min(len(f1['name']),len(f2['name']),8)) and f1['id']!=f2['id']:
                    f1mtime = f1['modificationTimesinceEpoc']
                    f2mtime = f2['modificationTimesinceEpoc']
                    print('name' + '^'+ f1['id'] + '^' + f1['name'] + '^' + f1mtime +'^' + f2['id'] + '^' + f2['name'] + '^' + f2mtime)


        '''time similarity'''

        file_threshhold = 10
        filetimes = []
        i = 0

        for item in items:
            filetimes.append([float(item['modificationTimesinceEpoc'])])
            i = i+1

        kmeans = KMeans(n_clusters=int(i/file_threshhold) +1, random_state=0).fit(filetimes)
        labels = kmeans.labels_

        for j in range(int(i/file_threshhold)+1) : #iterating through all clusters
            idx = [] 
            for i in range(0, len(labels)) : 
                if labels[i] == j : 
                    idx.append(i) 


            filesj = [items[i] for i in idx] #all the files in a cluster
    
            #forming similar time links
            for f1 in filesj:
                for f2 in filesj:
                    if  f1['id']!=f2['id'] :
                        f1mtime = f1['modificationTimesinceEpoc']
                        f2mtime = f2['modificationTimesinceEpoc']
                        print('time' + '^'+ f1['id'] + '^' + f1['name'] + '^' + f1mtime + '^'+ f2['id'] + '^' + f2['name'] + '^' + f2mtime)
Example #13
import database as mydb
from datasketch import MinHashLSHForest, MinHash, MinHashLSH
import pdb
from Translator import translate
from Article import Article
# Create MinHash objects

lsh = MinHashLSH(threshold=0.1,
                 num_perm=128,
                 storage_config={
                     "type": "redis",
                     "basename": b"docs_tech",
                     "redis": {
                         "host": "localhost",
                         "port": 6379
                     }
                 })
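# Note: the reassignment below discards the Redis-backed index above and uses an in-memory MinHashLSH instead.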
lsh = MinHashLSH(threshold=0.2, num_perm=128)
articles_en = mydb.execute_query("SELECT id, keyword, title FROM english")
keywords_en = [
    Article(id=item[0], keyword=item[1], content=item[2])
    for item in articles_en
]
for item in keywords_en:
    minhash = MinHash(num_perm=128)
    list_keyword = item.keyword.split(",")
    for k in list_keyword:
        minhash.update(k.encode("utf-8"))
    lsh.insert(str(item.id), minhash)
    # forest.add(str(item.id), minhash)
# forest.index()
Example #14
def compare(first_signature_input, second_signature_input):
    first_minhash = MinHash(hashvalues=first_signature_input)
    second_minhash = MinHash(hashvalues=second_signature_input)

    return first_minhash.jaccard(second_minhash)
Example #15
    for file in f:
        filepath = os.path.join(r, file)
        mime = magic.Magic(mime=True)
        file_type = mime.from_file(filepath)
        major_minor = file_type.split('/')

        if major_minor[0] == 'text':
            text_files.append(os.path.abspath(filepath))


documents = [(open(f).read(), f) for f in text_files]
lsh = MinHashLSH(threshold=0.4, num_perm=128)

for f in documents:
    setdoc = set(f[0].split())
    m = MinHash(num_perm=128)
    for d in setdoc:
        m.update(d.encode('utf8'))
    lsh.insert(f[1], m)


results = []
for doc in documents:
    setdoc = set(doc[0].split())
    m = MinHash(num_perm=128)
    for d in setdoc:
        m.update(d.encode('utf8'))
    result = lsh.query(m)
    results.append(result)

results = np.array(results)
Example #16
	def minHash(self, code_tokens):
		minHash = MinHash(num_perm=self.num_perm)
		for d in code_tokens: # TODO modify this for n-grams
			minHash.update("".join(d).encode('utf-8'))

		return minHash
Example #17
def compare(signature1, signature2):
    mh1, mh2 = MinHash(hashvalues=signature1), MinHash(hashvalues=signature2)

    return mh1.jaccard(mh2)
Example #18
 def generate_minhash(self):
     self.minhash = MinHash(num_perm=400)
     for shingle in self.get_n_best_shingles(n=400):
         self.minhash.update(shingle.encode('utf8'))
Example #19
 def test_is_empty(self):
     m = MinHash()
     lm = LeanMinHash(m)
     self.assertTrue(lm.is_empty())
Example #20
def minhash_IDs(IDs):
    m = MinHash()
    for ID in IDs:
        m.update(ID.encode('utf-8'))
    return m
Example #21
def hash(s):
    # Lowercase and strip punctuation before extracting N-character shingles
    # (N and the string module are assumed to be defined elsewhere in the source file).
    s = s.lower().translate(str.maketrans('', '', string.punctuation))
    mh = MinHash(num_perm=128)
    for d in [s[i:i + N] for i in range(len(s) - N + 1)]:
        mh.update(d.encode('utf8'))
    return mh
Example #22
File: main.py  Project: tukraus/LIICD
def run(codebase_path, updates_file_path, commits):

    print("Creating Clone Index from HEAD~" + str(commits + 1))

    result = subprocess.run([
        'git', '-C',
        str(codebase_path), 'log', '-' + str(commits + 1), '--no-merges',
        '--pretty=format:"%h"'
    ],
                            stdout=subprocess.PIPE)

    result_commits = result.stdout.decode('utf-8')
    result_commits = result_commits.replace('"', '')
    result_commits = result_commits.split('\n')

    # checkout to the current commit
    subprocess.run([
        'git', '-C',
        str(codebase_path), 'checkout', result_commits[len(result_commits) - 1]
    ],
                   stdout=subprocess.DEVNULL,
                   stderr=subprocess.STDOUT)

    # start the timer
    index_cr_start = timer()

    codebase = CodebaseReader(codebase_path)
    lines_per_files = codebase.get_lines_per_file()

    # Create LSH index
    lsh_index = MinHashLSH(threshold=config.THRESHOLD,
                           num_perm=config.PERMUTATIONS)

    for file in lines_per_files:
        min_hash = MinHash(num_perm=config.PERMUTATIONS)
        for line in lines_per_files[file]:
            min_hash.update(line.encode('utf8'))
        lsh_index.insert(file, min_hash)

    index_cr_end = timer()
    index_cr_diff = round(index_cr_end - index_cr_start, 5)

    incremental_step_time = 0
    commits_processed = 0  # we use this instead of len(data['commits']) because there might be commits that only affect
    # excluded (e.g. test) files, and in that case the specific commit does not get processed

    try:
        with open(updates_file_path) as f:
            data = json.load(f)
            commits = data['commits']

            for commit in commits:
                creates_lst = []
                updates_lst = []
                deletes_lst = []
                renames_lst = []

                print('========> Running Analysis for codebase @commit: ',
                      commit['id'], "<========")

                # checkout to the current commit
                subprocess.run([
                    'git', '-C',
                    str(codebase_path), 'checkout', commit['id']
                ],
                               stdout=subprocess.DEVNULL,
                               stderr=subprocess.STDOUT)

                is_processed = False

                for change in commit['changes']:
                    change_type = change['type']

                    if change_type in ['A', 'M', 'D']:
                        affected_filename = change['filename']
                        file_path = Path(affected_filename)

                        # skip directories not read when creating the initial index & skip invalid files
                        if is_in_exlcuded_dir(
                                file_path) or is_in_excluded_format(file_path):
                            continue

                        is_processed = True  # if I get here then there is at least 1 change in that commit that is processed

                        file_path = codebase_path / file_path

                        print('-> Parsing change [', change_type,
                              '] for file [', file_path, ']')

                        if change_type == 'A':
                            creates_lst.append(str(file_path))
                        elif change_type == 'M':
                            updates_lst.append(str(file_path))
                        elif change_type == 'D':
                            deletes_lst.append(str(file_path))
                    else:
                        affected_filenames = change['filename']
                        from_filename = Path(affected_filenames[0])
                        to_filename = Path(affected_filenames[1])

                        # skip directories not read when creating the initial index & skip invalid files
                        if is_in_exlcuded_dir(from_filename) or is_in_excluded_format(from_filename) or \
                                is_in_exlcuded_dir(to_filename) or is_in_excluded_format(to_filename):
                            continue

                        is_processed = True  # if I get here then there is at least 1 change in that commit that is processed

                        from_filename = codebase_path / from_filename
                        to_filename = codebase_path / to_filename

                        print('-> Parsing change [', change_type,
                              '] for renamed/moved file [', from_filename, ']',
                              'to [', to_filename, ']')

                        renames_lst.append(
                            (str(from_filename), str(to_filename)))

                if is_processed:
                    changes_handler = ChangesHandler(lsh_index, codebase,
                                                     deletes_lst, updates_lst,
                                                     creates_lst, renames_lst)
                    # start incremental step timer
                    start = timer()
                    # handle commit changes
                    changes_handler.handle_changes()
                    # end incremental step timer
                    end = timer()
                    time_diff = round(end - start, 5)
                    print("Detection/Index update time: " + str(time_diff) +
                          " seconds")

                    commits_processed += 1
                    incremental_step_time += time_diff
                else:
                    print("Commit " + commit['id'] +
                          " was skipped because all files were excluded")

                # checkout back to HEAD
                subprocess.run(
                    ['git', '-C',
                     str(codebase_path), 'checkout', '-'],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)

            print(
                "============================================================")
            print("Total LOCs: ", codebase.get__initial_codebase_lines())
            print("Total Index creation time: ", index_cr_diff, " seconds")
            print("Total commits: ", len(commits))
            print("Total commits processed: ", commits_processed)
            if commits_processed > 0:
                print(
                    "Average Incremental Step Time: ",
                    round(incremental_step_time / float(commits_processed), 5),
                    " seconds")
            else:
                print("0 commits out of ", len(commits),
                      " were processed. Something went terribly wrong!")
            print(
                "============================================================")

        f.close()
    except IOError:
        traceback.print_exc()
        print("File \"" + str(updates_file_path) + "\" not found.")
Example #23
 def query(self, v, n):
     m = MinHash(num_perm=self._n_perm)
     for e in v:
         m.update(str(e).encode('utf8'))
     return map(int, self._index.query(m, n))
Example #24
def get_minhash_signature(shingle_set):
    mh = MinHash()
    for el in shingle_set:
        mh.update(el.encode('utf8'))
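    # The raw hashvalues returned here can be turned back into a MinHash with
    # MinHash(hashvalues=...), as in the compare() examples above.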
    return mh.hashvalues
Example #25
def _main():
    if len(sys.argv) != 5:
        usage()

    folder = sys.argv[1]
    label_fn = sys.argv[2]
    k = int(sys.argv[3])
    outFn = sys.argv[4]

    # Get sample labels
    labels = get_labels(folder,label_fn)

    # Randomly choose samples from labels with at least 10 samples in them
    samples = dict()
    for c in labels:
        if len(labels[c]) < k:
            continue
        for s in random.sample(labels[c],k):
            samples[s] = c

    stats = dict()
    history = dict()

    # Iterate over samples and calculate their similarities
    for s1 in samples:
        c1 = samples[s1]

        if s1 not in history:
            history[s1] = set()
        if c1 not in stats:
            stats[c1] = dict()
            stats[c1]['jaccard'] = dict()
            stats[c1]['lsh'] = dict()
            stats[c1]['hamming'] = dict()

        for s2 in samples:
            # Don't duplicate similarity measurements
            if s1 == s2:
                continue
            if s2 in history:
                if s1 in history[s2]:
                    continue

            c2 = samples[s2]
            if c2 not in stats:
                stats[c2] = dict()
                stats[c2]['jaccard'] = dict()
                stats[c2]['lsh'] = dict()
                stats[c2]['hamming'] = dict()
            if c2 not in stats[c1]['jaccard']:
                stats[c1]['jaccard'][c2] = list()
                stats[c1]['lsh'][c2] = Counter()
                stats[c1]['hamming'][c2] = list()
            if c1 not in stats[c2]['jaccard']:
                stats[c2]['jaccard'][c1] = list()
                stats[c2]['lsh'][c1] = Counter()
                stats[c2]['hamming'][c1] = list()

            # Note that we've compared these samples now
            history[s1].add(s2)

            # Read API sequences
            lseq1 = readFile(folder,s1)
            lseq2 = readFile(folder,s2)

            seq1 = set(lseq1)
            seq2 = set(lseq2)

            # https://ekzhu.github.io/datasketch/lsh.html
            # Compare these two samples
            m1 = MinHash(num_perm=128)
            m2 = MinHash(num_perm=128)
            for d in seq1:
                m1.update(d.encode('utf8'))
            for d in seq2:
                m2.update(d.encode('utf8'))

            # Calculate LSH similarity
            lsh = MinHashLSH(threshold=0.7, num_perm=128)
            lsh.insert(samples[s1],m1)
            result = lsh.query(m2)
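            # The query result is non-empty when the LSH index judges the estimated
            # Jaccard of m1 and m2 to be roughly at or above the 0.7 threshold.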
            if len(result) == 1:
                rl = True
            else:
                rl = False

            # Calculate Jaccard similarity
            rj = float(len(seq1.intersection(seq2)))/float(len(seq1.union(seq2)))

            # Pad smallest sequence
            if len(lseq1) < len(lseq2):
                diff = len(lseq2) - len(lseq1)
                lseq1.extend([0]*diff)
            elif len(lseq1) > len(lseq2):
                diff = len(lseq1) - len(lseq2)
                lseq2.extend([0]*diff)

            # Calculate Hamming distance
            rh = sum(s1 != s2 for s1,s2 in zip(lseq1,lseq2))

            # Keep track of similarities
            stats[c1]['jaccard'][c2].append(rj)
            stats[c1]['lsh'][c2][rl] += 1
            stats[c1]['hamming'][c2].append(rh)

            stats[c2]['jaccard'][c1].append(rj)
            stats[c2]['lsh'][c1][rl] += 1
            stats[c2]['hamming'][c1].append(rh)

            # Print status
            sys.stdout.write('{0} {4}  {1} {5}: Jaccard similarity: {2}  |  > 0.7 LSH similarity: {3} | Hamming distance: {6}\n'.format(samples[s1],samples[s2],rj,rl,s1,s2,rh))

    # Print summary stats
    with open(outFn,'w') as fw:
        fw.write('class:\n')
        fw.write('    class jaccard_average LSH_similar_counts hamming_average\n')
        fw.write('\n')

        for c in stats:
            fw.write('{0}:\n'.format(c))
            for c2 in stats[c]['jaccard']:

                add = float(sum(stats[c]['jaccard'][c2]))
                total = float(len(stats[c]['jaccard'][c2]))

                add2 = float(sum(stats[c]['hamming'][c2]))
                total2 = float(len(stats[c]['hamming'][c2]))

                fw.write('    {0} {1} {2} {3}\n'.format(c2, add/total, stats[c]['lsh'][c2], add2/total2))
Example #26
def get_mh(values, permutations=512):
    mh = MinHash(num_perm=permutations)
    for el in values:
        mh.update(str(el).encode('utf8'))
    return mh
Example #27
from datasketch import MinHash, MinHashLSH

data1 = ['这个', '程序', '代码', '太乱', '那个', '代码', '规范']
data2 = ['这个', '程序', '代码', '不', '规范', '那个', '更', '规范']
data3 = ['这个', '程序', '代码', '不', '规范', '那个', '规范', '些']

# Create MinHash objects
m1 = MinHash()
m2 = MinHash()
m3 = MinHash()
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))
# Create the LSH index
lsh = MinHashLSH(threshold=0.5, num_perm=128)
lsh.insert("m2", m2)
lsh.insert("m3", m3)
result = lsh.query(m1)
print("Approximate neighbours with Jaccard similarity > 0.5:", result)
Example #28
 def make_min_hash(self,words):
     min_hash = MinHash(self.__num_permutation)
     for word in words:
         min_hash.update(word.encode('utf8'))
     return min_hash
Example #29
from datasketch import MinHash, MinHashLSHForest

data1 = [
    'minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
    'estimating', 'the', 'similarity', 'between', 'datasets'
]
data2 = [
    'minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
    'estimating', 'the', 'similarity', 'between', 'documents'
]
data3 = [
    'minhash', 'is', 'probability', 'data', 'structure', 'for', 'estimating',
    'the', 'similarity', 'between', 'documents'
]

# Create MinHash objects
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))

# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

# Add m2 and m3 into the index
forest.add("m2", m2)
forest.add("m3", m3)
Example #30
artist_shingle = defaultdict(list)
corpus = processLyrics(corpus)
for artist,lyrics in corpus.items():
    tokens = clean_text(lyrics)
    artist_shingle[artist].append(tokens)


from datasketch import MinHashLSHForest, MinHash
from sklearn.metrics import jaccard_similarity_score

g = []

listlsh = []
lsh = MinHashLSHForest(num_perm=128)
for artist,sets in artist_shingle.items():
    a = MinHash(num_perm=128)
    for d in sets[0]:
        a.update(d.encode('utf8'))
    listlsh.append(a)
    lsh.add(artist,a)

lsh.index()
tester = {}
with open('tester.json') as file:
    tester = json.loads(file.read().encode('latin-1'))
numcorrect_1 =0
numcorrect_5 = 0
numcorrect_10 = 0
total = 0
for artist,songlist in tester.items():
    for song in songlist: