Пример #1
0
def simhash_test():
    """Exercise ``Simhash`` and ``SimhashIndex`` on three sample sentences.

    Prints, for every sample, its phrases and its Simhash ``value``; builds a
    ``SimhashIndex`` with ``k=3``; then prints what ``get_near_dups`` reports
    for a fourth sentence before and after that sentence is added to the
    index under the key ``'4'``.

    Every ``print`` call receives one pre-formatted string, so the function
    parses and emits the same text on both Python 2 and Python 3 (the bare
    ``print`` statement is a SyntaxError on Python 3).
    """
    data = {
        1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: u'How are you i am fine. blar blar blar blar blar than',
        3: u'This is simhash test.',
    }
    for k, v in data.items():
        print('%s %s' % (k, get_phrases(v)))
    for k, v in data.items():
        print('%s %s' % (k, Simhash(get_phrases(v)).value))

    # The index is keyed by the stringified sample id.
    objs = [(str(k), Simhash(get_phrases(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)

    print(index.bucket_size())

    s1 = Simhash(get_phrases(u'How are you i am fine. blar blar blar blar blar thank'))
    print(index.get_near_dups(s1))

    # Query again once the probe itself has been indexed.
    index.add('4', s1)
    print(index.get_near_dups(s1))
Пример #2
0
def use_simhash_index():
    """Build a ``SimhashIndex`` from three sample sentences and query it.

    Prints the index's bucket size, then the result of ``get_near_dups`` for
    a probe sentence, first as-is and again after the probe has been added
    to the index under the key ``"4"``.
    """
    samples = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }

    # Pair each stringified id with the Simhash of the sentence's features.
    entries = []
    for ident, sentence in samples.items():
        fingerprint = Simhash(get_features(sentence))
        entries.append((str(ident), fingerprint))
    index = SimhashIndex(entries, k=3)

    print(index.bucket_size())

    probe = Simhash(
        get_features(u"How are you i am fine. blar blar blar blar blar thank")
    )
    print(index.get_near_dups(probe))

    index.add("4", probe)
    print(index.get_near_dups(probe))
Пример #3
0
def use_simhash_index():
    """Demonstrate indexing and near-duplicate lookup with ``SimhashIndex``.

    Three fixed sentences are indexed with ``k=3``. The function prints the
    bucket size, then queries the index with a fourth sentence both before
    and after inserting it under the key ``"4"``, printing each result.
    """
    corpus = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    query_text = u"How are you i am fine. blar blar blar blar blar thank"

    # Keys handed to the index are the stringified sentence ids.
    hashed = [
        (str(doc_id), Simhash(get_features(text)))
        for doc_id, text in corpus.items()
    ]
    index = SimhashIndex(hashed, k=3)
    print(index.bucket_size())

    query = Simhash(get_features(query_text))
    print(index.get_near_dups(query))

    # Same lookup again, now that the query itself is part of the index.
    index.add("4", query)
    print(index.get_near_dups(query))
Пример #4
0
class TestSimhashIndex(TestCase):
    """Unit tests for a ``SimhashIndex`` built from three sample sentences."""

    def setUp(self):
        # Index each raw sentence under its stringified id, using the
        # default SimhashIndex settings.
        samples = {
            1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
            2: u'How are you i am fine. blar blar blar blar blar than',
            3: u'This is simhash test.',
        }
        entries = []
        for ident, sentence in samples.items():
            entries.append((str(ident), Simhash(sentence)))
        self.index = SimhashIndex(entries)

    def test_bucket_size(self):
        # The fixture index is expected to report six buckets.
        buckets = self.index.bucket_size()
        self.assertEqual(buckets, 6)

    def test_get_near_dup(self):
        # The probe sentence is expected to match two indexed entries.
        probe = Simhash(u'How are you i am fine. blar blar blar blar blar thank')
        matches = self.index.get_near_dups(probe)

        self.assertEqual(len(matches), 2)

def get_features(s):
    """Return the overlapping 3-character slices of the normalised string.

    *s* is lower-cased and every run of non-word characters is removed
    before slicing. When the cleaned text is shorter than the window, the
    result is a one-element list holding that whole (possibly empty) text.
    """
    width = 3
    cleaned = re.sub(r'[^\w]+', '', s.lower())
    # Always take at least one slice so short inputs still yield a feature.
    window_count = max(len(cleaned) - width + 1, 1)
    features = []
    for start in range(window_count):
        features.append(cleaned[start:start + width])
    return features

# Sample sentences keyed by integer id.
data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

# Index each sentence's Simhash under its stringified id.
objs = [
    (str(k), Simhash(get_features(v)))
    for k, v in data.items()
]
index = SimhashIndex(objs, k=3)

print(index.bucket_size())

# Query with a probe sentence that is not in the index yet ...
s1 = Simhash(
    get_features(u'How are you i am fine. blar blar blar blar blar thank')
)
print(index.get_near_dups(s1))

# ... then add it under key '4' and run the same query again.
index.add('4', s1)
print(index.get_near_dups(s1))


def main():
    """Script entry point; currently performs no work."""
    return None


# Run main() only when this file is executed directly.
if __name__ == '__main__':
    main()
Пример #6
0
import re
from simhash import Simhash, SimhashIndex


def get_features(s):
    """Split *s* into overlapping three-character features.

    The text is lower-cased and stripped of every run of non-word
    characters first. At least one slice is always returned, so an input
    that cleans down to fewer than three characters yields a single feature
    containing whatever is left (which may be the empty string).
    """
    width = 3
    text = s.lower()
    text = re.sub(r'[^\w]+', '', text)
    last_start = max(len(text) - width + 1, 1)
    starts = range(last_start)
    return [text[begin:begin + width] for begin in starts]


# Three sample sentences, keyed by integer id.
data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

# (key, Simhash) pairs; keys are the stringified ids.
objs = [
    (str(k), Simhash(get_features(v)))
    for k, v in data.items()
]
index = SimhashIndex(objs, k=3)

print(index.bucket_size())

# Look up a probe sentence before it is part of the index.
s1 = Simhash(get_features(
    u'How are you i am fine. blar blar blar blar blar thank'
))
print(index.get_near_dups(s1))

# Insert the probe under key '4' and repeat the lookup.
index.add('4', s1)
print(index.get_near_dups(s1))
Пример #7
0
    # NOTE(review): this fragment is the body of an `if` whose header is not
    # visible here, and it is Python 2 only (print statements, `unicode`).
    # Build the index from the previously prepared `objs` pairs.
    index = SimhashIndex(objs, k=3)

    # Only load paths when the database file given by args.db exists.
    if os.path.isfile(args.db):
        print 'MatchMeta.Info Database Located'
        print 'Patience...Loading Index...'
        conn = sqlite3.connect(args.db)
        # Select every path that does not contain 'winsxs'.
        meta = conn.execute(
            "SELECT path FROM MatchMeta WHERE path NOT LIKE '%winsxs%'")
        count = 1

        for line in meta:
            # line[0] is the single selected column (path).
            item = Simhash(get_features(unicode(line[0])))
            # Incremented before use, so the first row is added under key 2.
            count = count + 1
            index.add(count, item)

        print index.bucket_size()
        print 'Excluding the WINSXS Directory'
        print '---------------------------------'
        print ' MatchMeta.Info Database Loaded'
        print '---------------------------------'
        # NOTE(review): not reached if the loop above raises; the connection
        # is closed only on the success path.
        conn.close()
    else:
        # Database file missing: report and terminate the script.
        print 'MatchMeta.Info Database -- FAILED'
        sys.exit()

# Remaining branches of the enclosing check on args.near.
elif (args.near.upper() == 'N'):
    print 'Skipping MatchMeta.Info Database Fuzzing'
else:
    print 'Please use only Y or N'

#########################################################################################################################################
Пример #8
0


# Duplicate detection: hash every entry of `fourgram`, index the hashes, and
# write one "<key>\t<near-duplicate keys>" line per hashed entry.
keys = fourgram.keys()
# NOTE(review): f1 is not closed anywhere in the visible code; it is left
# open here because later parts of the script may still write to it.
f1 = open('rezFinalNoDuplicates.txt', 'w')
objs = []
# A loop rather than a comprehension so that one entry that cannot be hashed
# is reported and skipped instead of aborting the whole run.
for k in fourgram:
    try:
        objs.append((k, Simhash(fourgram[k])))
    except Exception as e:
        print(e)
index = SimhashIndex(objs, k=3)

# Single pre-formatted argument: same output on Python 2 and Python 3.
print("bucket_size %s" % index.bucket_size())

# Reuse the hashes computed above instead of hashing every entry a second
# time. This also keeps the best-effort behaviour consistent: an entry that
# failed (and was reported) in the first loop is skipped here rather than
# raising the same error again outside any try block.
for key, s1 in objs:
    duplicates = ", ".join(index.get_near_dups(s1))
    f1.write(key + "\t" + duplicates + "\n")
    print("%s %s" % (key, duplicates))

'''
while len(keys) > 0:
    key = keys.pop()
    keysJ = list(keys)
    f1.write(key + '\t' + text[key])

    while len(keysJ) > 0:
        j = keysJ.pop()
Пример #9
0
    # NOTE(review): this fragment is the body of an `if` whose header is not
    # visible here, and it is Python 2 only (print statements, `unicode`).
    # Seed the index with the entries of `data`, keyed by stringified id.
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)

    # Only load paths when the database file given by args.db exists.
    if os.path.isfile(args.db):
        print 'MatchMeta.Info Database Located'
        print 'Patience...Loading Index...'
        conn = sqlite3.connect(args.db)
        # Select every path that does not contain 'winsxs'.
        meta = conn.execute("SELECT path FROM MatchMeta WHERE path NOT LIKE '%winsxs%'")
        count = 1

        for line in meta:
            # line[0] is the single selected column (path).
            item = Simhash(get_features(unicode(line[0])))
            # Incremented before use, so the first row is added under key 2.
            # These keys are ints, whereas the seed entries use string keys.
            count = count+1
            index.add(count,item)

        print index.bucket_size()
        print 'Excluding the WINSXS Directory'
        print '---------------------------------'
        print ' MatchMeta.Info Database Loaded'
        print '---------------------------------'
        # NOTE(review): not reached if the loop above raises; the connection
        # is closed only on the success path.
        conn.close()
    else:
        # Database file missing: report and terminate the script.
        print 'MatchMeta.Info Database -- FAILED'
        sys.exit()

# Remaining branches of the enclosing check on args.near.
elif(args.near.upper() == 'N'):
    print 'Skipping MatchMeta.Info Database Fuzzing'
else:
    print 'Please use only Y or N'
#########################################################################################################################################
# Wrap read_file so each call can be handed to the Parallel pool below.
# NOTE(review): `delayed` / `Parallel` are presumably joblib's -- confirm
# against the file's imports.
func = delayed(read_file)

# Keep only the first `cutoff_users` inputs.
F = F[:cutoff_users]

# Merge every worker result into the shared `objs` list and `data` mapping.
with Parallel(-1) as MP:
    for res in MP(func(x) for x in tqdm(F)):
        # A None result contributes nothing.
        if res is None:
            continue
        ox, dx = res

        objs.extend(ox)
        data.update(dx)

# Single pre-formatted argument per print call: the bare print statement is a
# SyntaxError on Python 3, while this form emits the same text on 2 and 3.
print("OBJS SIZE %d" % len(objs))
index = SimhashIndex(objs, f=f_dim, k=3)
print("Bucket size %s" % index.bucket_size())

# State for the de-duplication pass that follows.
accounted_keys = set()
dataset = []
C = collections.Counter()

for key, val in tqdm(objs):
    # Skip if we've seen this pattern before
    if key in accounted_keys:
        continue

    dupes = index.get_near_dups(val)
    tweet = data[key]

    # Don't report self-matches
    if len(dupes) <= 1: