from unittest import TestCase

from simhash import Simhash, SimhashIndex


class TestSimhashIndex(TestCase):
    data = {
        1: u'How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.',
        2: u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than',
        3: u'This is a different one.',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs)

    def test_get_near_dup(self):
        s1 = Simhash(
            u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank'
        )
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 1)

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 1)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(len(dups), 2)
from unittest import TestCase

from simhash import Simhash, SimhashIndex


class TestSimhashIndex(TestCase):
    data = {
        1: 'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: 'How are you i am fine. blar blar blar blar blar than',
        3: 'This is simhash test.',
        4: 'How are you i am fine. blar blar blar blar blar thank1',
    }

    def setUp(self):
        objs = [(str(k), Simhash(v)) for k, v in self.data.items()]
        self.index = SimhashIndex(objs, k=10)

    def test_get_near_dup(self):
        s1 = Simhash(u'How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank')
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.delete('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(2, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))

        self.index.add('1', Simhash(self.data[1]))
        dups = self.index.get_near_dups(s1)
        self.assertEqual(3, len(dups))
def main():
    # user_query = input()
    DOCID = 0
    numPartial = 1
    index = SimhashIndex([])
    totaldocs = 0
    docnum = 0
    validDocFile = open('validDocs2', 'w')
    for root, dirs, files in os.walk(DEVPATH):
        for fname in files:
            if not fname.endswith(".json"):
                continue
            totaldocs += 1
            h2t = html2text.HTML2Text()
            file = open(root + "/" + fname)
            pageDict = json.loads(file.read())
            # close file to get memory back
            file.close()
            # get html formatted content
            htmlContent = pageDict['content']
            print(pageDict['url'])
            plainContent = h2t.handle(htmlContent)
            feat = get_features(plainContent)
            sim = Simhash(feat)
            if len(index.get_near_dups(sim)) > 0:
                continue
            print(docnum, totaldocs)
            index.add(str(docnum), sim)
            validDocFile.write(root + "/" + fname + "\n")
            docnum += 1
    validDocFile.close()
class SpellingCorrector(object):

    def __init__(self, vocab_to_freq, f=64, k=32):
        self.vocab_to_freq = vocab_to_freq
        self.simhash_index = SimhashIndex([], f=f, k=k)
        self.f = f
        self.k = k
        simhash_index = self.simhash_index
        for w in vocab_to_freq:
            sh = Simhash(w, f=f)
            simhash_index.add(w, sh)

    def add_valid_word(self, word):
        if word not in self.vocab_to_freq:
            sh = Simhash(word, self.f)
            self.simhash_index.add(word, sh)
        self.vocab_to_freq[word] = self.vocab_to_freq.get(word, 0) + 1

    def correct_word(self, word):
        if word in self.vocab_to_freq:
            return word

        # Edit distance between
        sh = Simhash(word, f=self.f)
        candidates = self.simhash_index.get_near_dups(sh)
        if not candidates:
            # No near dups. Oh well. This word will go as it is.
            print 'no candidates'
            return word
        if len(candidates) == 1:
            # Only one candidate, so assume this is the correction
            return candidates[0]

        lev_dist_gen = ((other_w, levenshtein(other_w, word)) for other_w in candidates)
        closest_words, dists = zip(*all_min_or_max(lev_dist_gen, min, lambda item: item[1]))
        if len(closest_words) == 1:
            # One of the candidates had the best edit distance. Return that.
            return closest_words[0]

        # OK, there are multiple closest words. Rely on word frequency to choose the right one.
        vocab_to_freq = self.vocab_to_freq
        word_freq_gen = ((other_w, vocab_to_freq[other_w]) for other_w in closest_words)
        most_freq_words, freqs = zip(*all_min_or_max(word_freq_gen, max, lambda item: item[1]))

        # Using choice because at this point there's no other way to narrow it down, unless we
        # track higher order ngrams.
        return choice(most_freq_words)
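The class above relies on three names that are not defined in this snippet: levenshtein, all_min_or_max, and choice. A minimal sketch of what they are assumed to be (choice is presumably random.choice; the other two are hypothetical reconstructions inferred from how correct_word uses them):

# Hypothetical reconstructions of the helpers assumed by SpellingCorrector above.
from random import choice  # assumed: used by correct_word to break remaining ties


def levenshtein(a, b):
    # Plain dynamic-programming edit distance between two strings.
    if len(a) < len(b):
        a, b = b, a
    previous = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(previous[j] + 1,                 # deletion
                               current[j - 1] + 1,              # insertion
                               previous[j - 1] + (ca != cb)))   # substitution
        previous = current
    return previous[-1]


def all_min_or_max(pairs, min_or_max, key):
    # Assumed contract: return every (item, score) pair tied for the extreme
    # score, so that zip(*...) yields the tied items and their scores.
    pairs = list(pairs)
    best = min_or_max(key(pair) for pair in pairs)
    return [pair for pair in pairs if key(pair) == best]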
def find_near_matches(session, collection, index_size, probability_index_near_match):
    from simhash import Simhash, SimhashIndex

    logging.getLogger().setLevel(logging.CRITICAL)

    tweet_id_simhash_value = session.execute(
        sa.select([model.Tweet.tweet_id, model.Tweet.features['filter', 'simhash']])
        .where(model.Tweet.collection == collection)
    )

    simhash_index = SimhashIndex([], k=7)

    insert_relation_stmt = pg.insert(model.relation)
    # insert_tweet_near_matches_stmt = insert_tweet_near_matches_stmt.on_conflict_do_update(
    #     index_elements=['tweet_id', 'collection'],
    #     set_={
    #         'earliest_near_match_id': insert_tweet_near_matches_stmt.excluded.earliest_near_match_id
    #     }
    # )

    indexed_tweet_ids = []
    for i, (tweet_id, simhash_value) in enumerate(tweet_id_simhash_value):
        if (i % 100000) == 1000:
            logger.info('Processed %s tweets. Committing.', i)
            session.commit()

        simhash = Simhash(simhash_value)
        near_matches_ids = simhash_index.get_near_dups(simhash)

        if not near_matches_ids:
            simhash_index.add(tweet_id, simhash)
            indexed_tweet_ids.append((tweet_id, simhash))
            if len(indexed_tweet_ids) > index_size:
                simhash_index.delete(*indexed_tweet_ids.pop(0))

        if near_matches_ids:
            near_match_id = min(near_matches_ids)
            logger.debug('A near match %s for tweet %s', near_match_id, tweet_id)
            session.execute(
                insert_relation_stmt.values(
                    [(tweet_id, collection, 'near_match', near_match_id)]
                )
            )

    session.commit()
def use_simhash_index():
    data = {
        1: "How are you? I Am fine. blar blar blar blar blar Thanks.",
        2: "How are you i am fine. blar blar blar blar blar than",
        3: "This is simhash test.",
    }
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    print(index.bucket_size())

    s1 = Simhash(get_features(u"How are you i am fine. blar blar blar blar blar thank"))
    print(index.get_near_dups(s1))

    index.add("4", s1)
    print(index.get_near_dups(s1))
def process_graph(self, project_id):
    visits = defaultdict(list)
    p = 0
    hashtags_db = Hashtag.objects.filter(project_id=project_id)
    logger.info("Total hashtags to process " + str(len(hashtags_db)))
    for hashtag_entry in hashtags_db:
        visits[hashtag_entry.user_id].append(hashtag_entry.hashtag)
        p += 1
    logger.info("Hashtag read")
    logger.info("Hashtag processed " + str(p))
    logger.info("Visits count " + str(len(visits)))

    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    for user, hashtags in visits.iteritems():
        if len(hashtags) > MIN_HASHTAG_PER_USER:
            simhash = Simhash(hashtags, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simhash index built for %i out of %i users" % (cant_processed, len(visits)))

    cant_processed = 0
    for user, hashtags in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(hashtags, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                hashtag_near_dups = visits[user_near_dups]
                intersect = set(hashtags).intersection(hashtag_near_dups)
                ratio = len(intersect) * 1.0 / len(hashtag_near_dups)
                if ratio >= 0.1:
                    hashtag_graph = HashtagGraph(user_oid_i=user, user_oid_j=user_near_dups, ratio=ratio)
                    hashtag_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)
def process_graph(self, project_id):
    visits = defaultdict(list)
    processed = 0
    urls_db = Urls.objects.filter(project_id=project_id)
    logger.info("Total urls to process " + str(len(urls_db)))
    for url_entry in urls_db:
        visits[url_entry.user_id].append(url_entry.url)
        processed += 1
    logger.info("Urls read")
    logger.info("Urls processed " + str(processed))
    logger.info("Visits count " + str(len(visits)))

    objs = []
    cant_users = 0
    cant_processed = 0
    index = SimhashIndex(objs, f=f1, k=k1)
    for user, urls in visits.iteritems():
        if len(urls) > MIN_URLS_PER_USER:
            simhash = Simhash(urls, f=f1)
            index.add(user, simhash)
            cant_processed += 1
        cant_users += 1
        if cant_users % 10000 == 0:
            logger.info("%s processed" % cant_users)
    logger.info("Simhash index built for %i out of %i users" % (cant_processed, len(visits)))

    cant_processed = 0
    for user, urls in visits.iteritems():
        near_dups = index.get_near_dups(Simhash(urls, f=f1))
        for user_near_dups in near_dups:
            user_near_dups = long(user_near_dups)
            if user_near_dups != long(user):
                urls_near_dups = visits[user_near_dups]
                intersect = set(urls).intersection(urls_near_dups)
                ratio = len(intersect) * 1.0 / len(urls_near_dups)
                if ratio >= 0.1:
                    url_graph = UrlsGraph(user_oid_i=user, user_oid_j=user_near_dups, ratio=ratio)
                    url_graph.save()
        cant_processed += 1
        if cant_processed % 10000 == 0:
            logger.info("%i processed" % cant_processed)
def simhash_test():
    data = {
        1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
        2: u'How are you i am fine. blar blar blar blar blar than',
        3: u'This is simhash test.',
    }

    for k, v in data.items():
        print k, get_phrases(v)

    for k, v in data.items():
        print k, Simhash(get_phrases(v)).value

    objs = [(str(k), Simhash(get_phrases(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    print index.bucket_size()

    s1 = Simhash(get_phrases(u'How are you i am fine. blar blar blar blar blar thank'))
    print index.get_near_dups(s1)

    index.add('4', s1)
    print index.get_near_dups(s1)
index = SimhashIndex({})
try:
    for one in weibocollection.find({'create_time': {'$gte': lasttimestamp, '$lt': curtimestamp}}):
        weibo_id = str(one['_id'])
        weibo_text = one['data']['text'].strip()
        text_sh = Simhash(weibo_text)
        if len(index.get_near_dups(text_sh)) == 0:
            # no similar text found; cut (segment) the text
            text_seg = jieba.cut(weibo_text)
            text_result = list(set(text_seg) - stopwords)
            content = ' 1 '.join(text_result)
            if content != '':
                fdoc.write(weibo_id + '\t' + weibo_text.encode('utf-8') + '\n')
                fcut.write(content.encode('utf-8') + ' 1\n')
                cutnum += 1
            simnum += 1
        num += 1
        index.add(num, text_sh)
except pymongo.errors, e:
    logger.critical('mongo find error: %s' % e)
    sys.exit(-2)

logger.info('simnum: %d ' % simnum)
logger.info('cutnum: %d ' % cutnum)

connection.close()
fdoc.close()
fcut.close()


def main():
    curtimestamp = 0
    lasttimestamp = 0
    if len(sys.argv) == 4 and sys.argv[1] == '-BETime':
        lasttimestamp = long(sys.argv[2])
        curtimestamp = long(sys.argv[3])
    elif len(sys.argv) == 2 and sys.argv[1] == '-SYSTime':
import re

from simhash import Simhash, SimhashIndex


def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)
print(index.bucket_size())

s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))
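As a side note, two fingerprints can also be compared directly, without an index: Simhash.distance() returns the Hamming distance between the two values, which is the same comparison SimhashIndex applies with tolerance k. A small sketch reusing get_features from the example above:

a = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
b = Simhash(get_features(u'How are you? I Am fine. blar blar blar blar blar Thanks.'))

print(a.distance(b))         # Hamming distance between the two 64-bit fingerprints
print(a.distance(b) <= 3)    # same tolerance as SimhashIndex(objs, k=3)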
if (args.near.upper() == 'Y'):
    data = {}
    objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
    index = SimhashIndex(objs, k=3)
    if os.path.isfile(args.db):
        print 'MatchMeta.Info Database Located'
        print 'Patience...Loading Index...'
        conn = sqlite3.connect(args.db)
        meta = conn.execute(
            "SELECT path FROM MatchMeta WHERE path NOT LIKE '%winsxs%'")
        count = 1
        for line in meta:
            item = Simhash(get_features(unicode(line[0])))
            count = count + 1
            index.add(count, item)
        print index.bucket_size()
        print 'Excluding the WINSXS Directory'
        print '---------------------------------'
        print ' MatchMeta.Info Database Loaded'
        print '---------------------------------'
        conn.close()
    else:
        print 'MatchMeta.Info Database -- FAILED'
        sys.exit()
elif (args.near.upper() == 'N'):
    print 'Skipping MatchMeta.Info Database Fuzzing'
else:
    print 'Please use only Y or N'
    'utf-8', 'ignore')
s2 = 'How are you i am fine. blar blar blar blar blar than'.decode(
    'utf-8', 'ignore')
s3 = 'This is simhash test.'.decode('utf-8', 'ignore')

# print get_features(s1)
#
# print Simhash(get_features('How are you? I am fine. Thanks.')).value

sh1 = Simhash(s1)
sh2 = Simhash(s2)
sh3 = Simhash(s3)
# print sh.value
# print sh1.distance(sh2)

shIndex = SimhashIndex([], k=3)
shIndex.add('1', sh1)
shIndex.add('2', sh2)
# shIndex.add('3', sh3)

if shIndex.get_near_dups(sh3):
    print 'YES'
else:
    print 'NO'

# print shIndex.get_near_dups(sh2)
class NearDuplicate:

    def __init__(self, filenames, k=2, metadata_dictionary=None):
        self.filenames = filenames
        self.simhash_index = None
        self.image_dictionary = {}
        self.metadata_dictionary = metadata_dictionary
        self.k = k
        # Need to store the image hashes in some fashion
        # Possibly cluster the hashes (k-means)

    def tika_metadata(self, filename):
        """Use the tika-py module to grab metadata for a file"""
        parsed = parser.from_file(filename)
        return parsed.get("metadata", {})

    def exifread_metadata(self, filename):
        """Use the exifread module to grab metadata for a file"""
        f = open(filename, 'rb')
        tags = exifread.process_file(f)
        return tags

    def generate_features_from_dict(self, filename):
        """Use this function when we provide json metadata information
        from the tika java module"""
        # Find the metadata object from the json metadata file for the image_file named 'filename'
        metadata = self.metadata_dictionary.get(filename, {})

        # The tags or type of metadata we want
        feature_tags = [
            "Image Height", "Image Width", "File Size", "Content-Type",
            "Image Bytes", "File Name Suffix"
        ]

        # Create a feature array using these metadata values
        features = []
        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "File Size": 2,
            "Content-Type": 3,
            "Image Bytes": 6,
            "File Name Suffix": 2
        }

        # Grab the bytes of the entire file
        image_bytes = "NONE"
        try:
            image_bytes = open(filename, 'rb').read()
        except OSError:
            image_bytes = "NONE"

        # Get the central bytes
        image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
        byte_offset = len(image_bytes_str) // 4
        filename_suffix = filename[-10:]

        modified_metadata = {
            "Image Height": metadata.get("Image Height", "NONE"),
            "Image Width": metadata.get("Image Width", "NONE"),
            "File Size": metadata.get("File Size", "NONE"),
            "Content-Type": metadata.get("Content-Type", "NONE"),
            "Image Bytes": image_bytes_str[byte_offset:-byte_offset],
            "File Name Suffix": filename_suffix
        }

        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(),
                                                                 modified_metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))

        return features

    def generate_features(self, filename):
        """Given an image generate a feature vector.

        Since Tika-Py requires a server call (i.e. slower), do native image
        metadata grabbing, and fall back on tika if the image can't be opened
        (i.e., it's an svg or gif).
        """
        im, use_tika = None, False
        try:
            im = Image.open(filename)
            use_tika = False
        except IOError:
            use_tika = True

        # Grab the metadata for the image
        metadata = {}
        # We'll store features to use for simhash in a tuple array [(token, weight)]
        features = []

        if use_tika:
            # Use only metadata from tika
            # The image file can't be opened using PIL.Image, so that means
            # a diff type of image besides jpg, png
            metadata = self.tika_metadata(filename)

            # Grab the bytes of the entire file
            image_bytes = open(filename).read()

            # Get the central bytes
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            #image_bytes_str = str(image_bytes)
            byte_offset = len(image_bytes_str) // 4
            metadata["Image Bytes"] = image_bytes_str[byte_offset:-byte_offset]

            feature_tags = [
                "Image Height", "Image Width", "File Size", "Content-Type",
                "Image Bytes"
            ]
            features = [
                tag + ":" + metadata.get(tag, "NONE") for tag in feature_tags
            ]
            return features

        """
        FEATURES
        We'll resize the image so all images are normalized to a certain size
        Also make sure to retain aspect ratio
        Features to use (in order of importance)
            - center region bytes
            - color histogram
            - content type
            - image width
            - image height
        We can take subregions of the image, and hash those
        """
        # Resize the image so all images are normalized
        width = im.size[0]
        height = im.size[1]
        resize_width = 30
        resize_height = resize_width * height / width
        resize_im = None
        histogram_bytes, histogram_weight = "", 0
        center_region_bytes, center_region_weight = "", 5
        extension = ""
        try:
            resize_im = im.resize((resize_width, resize_height), Image.ANTIALIAS)
            # Crop sub regions
            height_padding, width_padding = resize_height / 5, resize_width / 5
            box = (width_padding, height_padding, resize_width - width_padding,
                   resize_height - height_padding)
            sub_region = resize_im.crop(box)
            # Generate a histogram
            histogram_bytes, histogram_weight = str(resize_im.histogram()), 4
            center_region_bytes, center_region_weight = str(
                list(sub_region.getdata())), 3
        except OSError:
            # Couldn't resize the image.
            print >> sys.stderr, "Couldn't resize the image. Prob an eps or svg"
            resize_im = im
            resize_width = im.size[0]
            resize_height = im.size[1]
            sub_region = im
            # Grab the bytes of the entire file
            image_bytes = open(filename).read()
            # Get the central bytes
            #image_bytes_str = str(image_bytes)
            histogram_bytes = "NONE"
            image_bytes_str = unicode(str(image_bytes), 'utf-8', "ignore")
            byte_offset = len(image_bytes_str) // 4
            center_region_bytes = image_bytes_str[byte_offset:-byte_offset]

        extension = resize_im.format if resize_im.format != None else os.path.splitext(
            filename)[1]
        # Figure out the content type (png, jpg, etc.)
        content_type = "image/" + str(extension.lower())

        feature_weight_dict = {
            "Image Height": 1,
            "Image Width": 1,
            "Image Histogram": histogram_weight,
            "Content-Type": 5,
            "Center Region Bytes": center_region_weight
        }
        metadata = {
            "Image Height": str(width),
            "Image Width": str(height),
            "Image Histogram": histogram_bytes,
            "Content-Type": content_type,
            "Center Region Bytes": center_region_bytes
        }

        # Create an array of (token, weight) tuples. These are our features and weights
        # to be used for the Simhash
        for (feature_tag, weight), (meta_tag, meta_value) in zip(feature_weight_dict.items(),
                                                                 metadata.items()):
            features.append((meta_tag + ":" + meta_value, weight))

        return features

    def merge_near_duplicate_dictionaries(self, nd):
        """Merge the current near duplicate instance with another near duplicate instance"""
        smaller_nd = self if len(self.image_dictionary) <= len(nd.image_dictionary) else nd
        larger_nd = self if len(self.image_dictionary) > len(nd.image_dictionary) else nd
        final_dict = larger_nd.image_dictionary

        # Iterate over the smaller near duplicate instance
        for key in smaller_nd.image_dictionary.keys():
            # If an exact duplicate exists, just grab it and merge them
            if larger_nd.image_dictionary.get(key, None) != None:
                arr = smaller_nd.image_dictionary.get(key, []) +\
                    larger_nd.image_dictionary.get(key, [])
                final_dict[key] = arr
                continue

            # Find the closest near duplicate in the larger dictionary by
            # using its index
            simhash_obj = smaller_nd.image_dictionary[key][0]["hash_object"]
            near_duplicates_keys = larger_nd.simhash_index.get_near_dups(simhash_obj)

            # If a near duplicate exists
            if len(near_duplicates_keys) > 0:
                # Grab the array of images at that key in the larger dictionary
                # and merge it with the array of images in the smaller dictionary
                near_dup_key = near_duplicates_keys[0]
                arr = smaller_nd.image_dictionary.get(key, []) +\
                    larger_nd.image_dictionary.get(near_dup_key, [])
                # create an entry in the new dictionary
                final_dict[near_dup_key] = arr
                continue

            # Otherwise we should just add this key-object from the dictionary
            # to this array
            final_dict[key] = smaller_nd.image_dictionary[key]
            # Add this simhash to the Index for efficient searching
            larger_nd.simhash_index.add(key, simhash_obj)

        self.image_dictionary = final_dict
        self.simhash_index = larger_nd.simhash_index
        nd.image_dictionary = final_dict
        nd.simhash_index = larger_nd.simhash_index

        # Now simply return this final dict
        return final_dict

    def simhash_value_to_key(self, simhash):
        """Given a simhash object, convert its value to a hexadecimal key.
        This key will be used in our image_file dictionary
        """
        return str(hex(simhash.value))

    def deduplicate_images(self):
        """Given a list of image files "self.filenames", deduplicate the images
        using near deduplication
        """
        # Iterate through our files
        for image_file in self.filenames:
            feature_array = []
            if self.metadata_dictionary != None:
                # Will use a java tika program to generate metadata
                # Metadata will be a json file with {filename : metadata} objects
                feature_array = self.generate_features_from_dict(image_file)
            else:
                # Use our own function for grabbing metadata
                # Create a list of features
                feature_array = self.generate_features(image_file)

            # Simhash this list of features
            sHash = Simhash(feature_array)

            if self.simhash_index == None:
                # First image, so we create the index, add it to the dictionary
                # And move on to next iteration
                key = self.simhash_value_to_key(sHash)
                # We will use this index to speed up the process for finding
                # nearby simhashes
                self.simhash_index = SimhashIndex([(key, sHash)], k=self.k)
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
                continue

            near_duplicates_keys = self.simhash_index.get_near_dups(sHash)
            if len(near_duplicates_keys) > 0:
                # There are duplicates, so we should add them to the corresponding entry
                # in the file dictionary
                # Get the key for the nearest duplicate image
                near_dup_simhash_key = near_duplicates_keys[0]
                # Get the key for this current image
                current_simhash_key = self.simhash_value_to_key(sHash)
                # Create an object comprised of the image filename and key
                # We'll store this in a dictionary to be used in our merge step
                current_simhash_object = {
                    "filename": image_file,
                    "hash_key": current_simhash_key,
                    "hash_object": sHash
                }
                self.image_dictionary[near_dup_simhash_key].append(current_simhash_object)
            else:
                # No duplicates, so let's create an entry in our image filename dictionary
                key = self.simhash_value_to_key(sHash)
                # Add this simhash to the Index for efficient searching
                self.simhash_index.add(key, sHash)
                # Create an object in our image file dictionary
                self.image_dictionary[key] = [{
                    "filename": image_file,
                    "hash_key": key,
                    "hash_object": sHash
                }]
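A hypothetical usage sketch of the NearDuplicate class above; the file names are invented placeholders, and PIL/tika/exifread are assumed to be importable as in the original module:

# Hypothetical usage; file names below are placeholders, not paths from the project.
filenames = ["photos/a.jpg", "photos/a_copy.jpg", "photos/b.png"]

nd = NearDuplicate(filenames, k=2)
nd.deduplicate_images()

# Each key is a hex simhash value; each value lists the images grouped under it.
for key, group in nd.image_dictionary.items():
    print key, [entry["filename"] for entry in group]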
import re

from simhash import Simhash, SimhashIndex


def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
print(objs)

index = SimhashIndex(objs, k=3)  # k: Hamming distance tolerance
print(index.bucket_size())

s1 = Simhash(
    get_features(u'How are you i am fine. blar blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)  # equivalent to adding s1 as a fourth entry of data and comparing against it
print(index.get_near_dups(s1))
def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

objs = []
# objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)

s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(s1.value)
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))

s2 = Simhash(7604580641891645972)
print(s2.value)
index.add('5', s2)
index.add('3', s2)
print(index.get_near_dups(s2))
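The Simhash(7604580641891645972) call above rebuilds a fingerprint from a plain integer, which is how a previously computed value can be restored (for example after persisting it to a database). A small sketch of that round trip, reusing get_features from the snippet above:

original = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
stored_value = original.value          # a plain int, safe to persist

restored = Simhash(stored_value)       # rebuild without re-hashing the text
assert restored.value == stored_value
assert original.distance(restored) == 0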
def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'How are you? I Am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar than',
    3: u'This is simhash test.',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)
print(index.bucket_size())

s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thank'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))


def main():
    pass


if __name__ == '__main__':
    main()
class DocCollection(object):

    def __init__(self, hash_size=64, hash_tol=3, num_words_to_complete=10):
        """
        Params:
          hash_size : The number of output bits of the hash function used in SimHash.
                      Higher values -> able to handle more noise.
          hash_tol : The number of bits that can differ for a candidate near-match in Simhash
          num_words_to_complete : The number of words to complete given a context when a new
                      document is encountered in get_best_match
        """
        self.num_words_to_complete = num_words_to_complete
        self.hash_size = hash_size
        self.hash_tol = hash_tol

        # This implementation of simhash stores the index in RAM, but it could easily be
        # put on disk.
        self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
        self.author_identifier = LanguageModelAuthorIdentifier()
        self.author_semantic_models = SemanticLanguageModels()

    def generate_simhash(self, tokens):
        # Generate a Simhash from Spacy tokens.
        sh = Simhash(u'', f=self.hash_size)  # silly interface...
        sh.build_by_features(tokens)
        return sh

    def add(self, doc, title, author):
        add_to_index = self.simhash_index.add

        # Index each paragraph in the document into the simhash index
        paras = extract_paragraphs(doc)

        # Update the word shape language model for this author
        para_toks = [tokenize(p) for p in paras]
        flat_tokens = [item for sublist in para_toks for item in sublist]
        self.author_semantic_models.add_doc(flat_tokens, author)

        # Update the semantic model for this author
        self.author_identifier.add_doc(flat_tokens, author)

        # Add each paragraph to the simhash index
        for para_num, tokens in enumerate(para_toks, 1):
            if not tokens:
                continue
            sh = self.generate_simhash(tokens)
            self.simhash_index.add((tokens, title, author, para_num), sh)

    def get_best_match(self, snippet):
        get_near_dups = self.simhash_index.get_near_dups
        generate_simhash = self.generate_simhash
        title_author_to_count = {}

        paras = extract_paragraphs(snippet)
        # evenly distribute the corrupted paragraphs
        #shuffle(paras)

        # For each paragraph, get the closest matching previously encountered paragraphs.
        # If multiple matches, prune via edit distance.
        # The work of art that matches the most paragraphs is the winner (if it matches enough)
        paras_done = 0
        for para in paras:
            tokens = tokenize(para)
            if not tokens:
                continue
            paras_done += 1
            sh = generate_simhash(tokens)
            candidates = [make_tuple(match) for match in get_near_dups(sh)]
            # Increment the count of these works
            for candidate in candidates:
                _, title, author, para_num = candidate
                k = (title, author)
                title_author_to_count[k] = title_author_to_count.get(k, 0) + 1

        if title_author_to_count:
            # OK, what work was the most frequent, and what was that frequency?
            (title, author), f = max(title_author_to_count.iteritems(), key=lambda item: item[1])
            score = 1. * f / paras_done
            if score >= 0.1:
                return {'title': title, 'author': author, 'score': score,
                        'author_score': None, 'completion': None}

        # This is either so corrupt that we can't tell what it is, or is a new work.
        # Guess the author
        tokens = [item for sublist in [tokenize(p) for p in paras] for item in sublist]
        author_guess, author_score = self.author_identifier.predict_author(tokens)
        completion = self.author_semantic_models.complete(author_guess, tokens,
                                                          self.num_words_to_complete, 1)
        return {'title': None, 'author': author_guess, 'score': None,
                'author_score': author_score, 'completion': completion}

    def clear(self):
        self.simhash_index = SimhashIndex(objs=[], f=self.hash_size, k=self.hash_tol)
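A hypothetical usage sketch of DocCollection above; the texts are invented placeholders, and the supporting helpers (extract_paragraphs, tokenize, the author models) are assumed to come from the surrounding project:

# Hypothetical usage; the documents below are invented placeholders.
collection = DocCollection(hash_size=64, hash_tol=3)

collection.add(doc=u'Call me Ishmael. Some years ago...', title='Moby-Dick', author='Melville')
collection.add(doc=u'Emma Woodhouse, handsome, clever, and rich...', title='Emma', author='Austen')

result = collection.get_best_match(u'Call me Ishmael. Some years ago - never mind how long precisely...')
print result  # e.g. {'title': ..., 'author': ..., 'score': ..., ...}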
import re

from simhash import Simhash, SimhashIndex


def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]


data = {
    1: u'2018-02-17 0:00:00,2018-02-17 0:00:00,2018-02-17 0:00:00,86000300159583,2018-01-26 0:00:00',
    2: u'2018-02-16 0:00:00,60125170993,2018-02-16 0:00:00,86000300257742,2018-01-26 0:00:00',
    3: u'2018-02-15 0:00:00,60125170993,2018-02-15 0:00:00,86011600116290,2018-01-26 0:00:00',
    4: u'2018-02-14 0:00:00,60125170993,2018-02-14 0:00:00,86008501214219,2018-01-26 0:00:00',
    5: u'2018-02-13 0:00:00,60125170993,2018-02-13 0:00:00,86000300420496,2018-01-26 0:00:00',
    6: u'2018-02-12 0:00:00,60125170993,2018-02-12 0:00:00,86000300656419,2018-01-26 0:00:00',
    7: u'2018-02-11 0:00:00,60125170993,2018-02-11 0:00:00,86553802671042,2018-01-26 0:00:00',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)
print index.bucket_size()

s1 = Simhash(get_features(u'2018-02-17 0:00:00,2018-03-17 0:00:00,2018-02-17 0:00:00,86000300159583,2018-01-26 0:00:00'))
print index.get_near_dups(s1)

index.add('0', s1)
print index.get_near_dups(s1)